mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Restore CLI compat and plugin dependency handling
This commit is contained in:
@@ -13,16 +13,16 @@ def process(tmp_path):
|
||||
def disable_extractors_dict():
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"USE_WGET": "false",
|
||||
"USE_SINGLEFILE": "false",
|
||||
"USE_READABILITY": "false",
|
||||
"USE_MERCURY": "false",
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"USE_GIT": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
|
||||
@@ -145,8 +145,8 @@ def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_
|
||||
assert snapshot is not None, "Should create at least one snapshot"
|
||||
|
||||
|
||||
def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that crawl creates a Seed object for input."""
|
||||
def test_crawl_persists_input_urls_on_crawl(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that crawl input URLs are stored on the Crawl record."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
@@ -158,10 +158,11 @@ def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
c = conn.cursor()
|
||||
seed = c.execute("SELECT id FROM crawls_seed").fetchone()
|
||||
crawl_urls = c.execute("SELECT urls FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
|
||||
conn.close()
|
||||
|
||||
assert seed is not None, "Seed should be created for crawl input"
|
||||
assert crawl_urls is not None, "Crawl should be created for crawl input"
|
||||
assert 'https://example.com' in crawl_urls[0], "Crawl should persist input URLs"
|
||||
|
||||
|
||||
class TestCrawlCLI:
|
||||
@@ -178,7 +179,7 @@ class TestCrawlCLI:
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--depth' in result.stdout or '-d' in result.stdout
|
||||
assert 'create' in result.stdout
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -3,7 +3,7 @@ import json as pyjson
|
||||
|
||||
|
||||
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
|
||||
disable_extractors_dict.update({"SAVE_SINGLEFILE": "true"})
|
||||
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
|
||||
@@ -11,7 +11,7 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
|
||||
assert output_file.exists()
|
||||
|
||||
def test_readability_works(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"USE_READABILITY": "true"})
|
||||
disable_extractors_dict.update({"SAVE_READABILITY": "true"})
|
||||
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
|
||||
@@ -27,7 +27,7 @@ def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
|
||||
assert output_file.exists()
|
||||
|
||||
def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"})
|
||||
disable_extractors_dict.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
|
||||
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
output_str = add_process.stdout.decode("utf-8")
|
||||
|
||||
@@ -39,6 +39,17 @@ def test_add_real_world_example_domain(tmp_path):
|
||||
env = os.environ.copy()
|
||||
env["TMP_DIR"] = str(tmp_short)
|
||||
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
|
||||
env["SAVE_TITLE"] = "True"
|
||||
env["SAVE_WGET"] = "True"
|
||||
env["SAVE_SINGLEFILE"] = "True"
|
||||
env["SAVE_READABILITY"] = "False"
|
||||
env["SAVE_HTMLTOTEXT"] = "True"
|
||||
env["SAVE_HEADERS"] = "True"
|
||||
env["SAVE_PDF"] = "False"
|
||||
env["SAVE_SCREENSHOT"] = "False"
|
||||
env["SAVE_ARCHIVEDOTORG"] = "False"
|
||||
env["SAVE_YTDLP"] = "False"
|
||||
env["SAVE_GIT"] = "False"
|
||||
|
||||
init = subprocess.run(
|
||||
["archivebox", "init"],
|
||||
@@ -50,7 +61,7 @@ def test_add_real_world_example_domain(tmp_path):
|
||||
assert init.returncode == 0, f"archivebox init failed: {init.stderr}"
|
||||
|
||||
result = subprocess.run(
|
||||
["archivebox", "add", "https://example.com"],
|
||||
["archivebox", "add", "--plugins=title,wget,singlefile,htmltotext,headers", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=900,
|
||||
@@ -115,19 +126,13 @@ def test_add_real_world_example_domain(tmp_path):
|
||||
)
|
||||
|
||||
text_hits = 0
|
||||
for path in (
|
||||
*snapshot_dir.glob("*_readability/content.txt"),
|
||||
snapshot_dir / "readability" / "content.txt",
|
||||
):
|
||||
if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
|
||||
text_hits += 1
|
||||
for path in (
|
||||
*snapshot_dir.glob("*_htmltotext/htmltotext.txt"),
|
||||
snapshot_dir / "htmltotext" / "htmltotext.txt",
|
||||
):
|
||||
if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
|
||||
text_hits += 1
|
||||
assert text_hits >= 2, (
|
||||
"Expected multiple text extractors to contain Example Domain "
|
||||
f"(readability/htmltotext hits={text_hits})."
|
||||
assert text_hits >= 1, (
|
||||
"Expected htmltotext output to contain Example Domain "
|
||||
f"(htmltotext hits={text_hits})."
|
||||
)
|
||||
|
||||
@@ -22,16 +22,16 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
# Disable most extractors
|
||||
"USE_WGET": "false",
|
||||
"USE_SINGLEFILE": "false",
|
||||
"USE_READABILITY": "false",
|
||||
"USE_MERCURY": "false",
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"USE_GIT": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
@@ -122,16 +122,16 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
|
||||
# Enable only parse_html_urls for this test
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"USE_WGET": "false",
|
||||
"USE_SINGLEFILE": "false",
|
||||
"USE_READABILITY": "false",
|
||||
"USE_MERCURY": "false",
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"USE_GIT": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
@@ -202,12 +202,22 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"URL_ALLOWLIST": r"monadical\.com/.*", # Only crawl same domain
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
})
|
||||
|
||||
# Start a crawl with depth=1 (just one hop to test recursive crawling)
|
||||
# Use file:// URL so it's instant, no network fetch needed
|
||||
proc = subprocess.Popen(
|
||||
['archivebox', 'add', '--depth=1', f'file://{test_html}'],
|
||||
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', f'file://{test_html}'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
|
||||
@@ -12,7 +12,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
|
||||
# Verify snapshot exists
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
|
||||
count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
assert count_before >= 1
|
||||
|
||||
@@ -24,7 +24,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
|
||||
count = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
assert count == 0
|
||||
@@ -59,7 +59,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
|
||||
count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
assert count_before >= 2
|
||||
|
||||
@@ -67,7 +67,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count_after = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
|
||||
count_after = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
assert count_after == 0
|
||||
|
||||
@@ -80,7 +80,7 @@ def test_add_creates_crawls(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
crawl_count = c.execute("SELECT COUNT() from archivebox.crawls.crawl").fetchone()[0]
|
||||
crawl_count = c.execute("SELECT COUNT() FROM crawls_crawl").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
assert crawl_count == 2
|
||||
|
||||
@@ -9,7 +9,10 @@ from pathlib import Path
|
||||
from archivebox.tests.conftest import create_test_url
|
||||
|
||||
|
||||
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool):
|
||||
ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
|
||||
|
||||
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
|
||||
project_root = Path(__file__).resolve().parents[2]
|
||||
script = textwrap.dedent(
|
||||
f"""
|
||||
@@ -31,7 +34,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
|
||||
|
||||
target_url = {request_url!r}
|
||||
|
||||
resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
|
||||
resp = client.get('/web/' + target_url, HTTP_HOST={host!r})
|
||||
assert resp.status_code == 302, resp.status_code
|
||||
|
||||
snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
|
||||
@@ -46,7 +49,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
|
||||
)
|
||||
assert resp['Location'] == f"/{{snapshot.url_path}}"
|
||||
|
||||
resp2 = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
|
||||
resp2 = client.get('/web/' + target_url, HTTP_HOST={host!r})
|
||||
assert resp2.status_code == 302, resp2.status_code
|
||||
assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
|
||||
assert resp2['Location'] == f"/{{snapshot.url_path}}"
|
||||
@@ -208,7 +211,7 @@ def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
|
||||
"""/web/https://... should work for authenticated users even when public add is off."""
|
||||
url = create_test_url(domain='example.com', path='savepagenow-auth')
|
||||
request_url = url.replace('https://', '')
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False)
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False, host=ADMIN_HOST)
|
||||
assert result.returncode == 0, (
|
||||
"SavePageNow shortcut (logged-in) test failed.\n"
|
||||
f"stdout:\n{result.stdout}\n"
|
||||
@@ -220,7 +223,7 @@ def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
|
||||
"""/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login."""
|
||||
url = create_test_url(domain='example.com', path='savepagenow-public')
|
||||
request_url = url.replace('https://', '')
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True)
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True, host='web.archivebox.localhost:8000')
|
||||
assert result.returncode == 0, (
|
||||
"SavePageNow shortcut (public add) test failed.\n"
|
||||
f"stdout:\n{result.stdout}\n"
|
||||
|
||||
@@ -6,14 +6,19 @@ from .fixtures import *
|
||||
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that title is extracted from the page."""
|
||||
disable_extractors_dict.update({"SAVE_TITLE": "true"})
|
||||
subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', '--plugins=title', 'https://example.com'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
assert add_process.returncode == 0, add_process.stderr or add_process.stdout
|
||||
|
||||
os.chdir(tmp_path)
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
conn.row_factory = sqlite3.Row
|
||||
c = conn.cursor()
|
||||
c.execute("SELECT title from archivebox.core.snapshot")
|
||||
c.execute("SELECT title FROM core_snapshot")
|
||||
snapshot = c.fetchone()
|
||||
conn.close()
|
||||
|
||||
@@ -27,8 +32,13 @@ def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractor
|
||||
and breaks the layout.
|
||||
"""
|
||||
disable_extractors_dict.update({"SAVE_TITLE": "true"})
|
||||
subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', '--plugins=title', 'https://example.com'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
assert add_process.returncode == 0, add_process.stderr or add_process.stdout
|
||||
list_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True)
|
||||
|
||||
# Should not contain unescaped HTML tags in output
|
||||
|
||||
@@ -1,5 +1,30 @@
|
||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||
from threading import Thread
|
||||
|
||||
from archivebox.misc.util import download_url
|
||||
|
||||
|
||||
class _ExampleHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler that serves one fixed "Example Domain" HTML page."""

    def do_GET(self):
        """Answer any GET request with a 200 response and the canned HTML body."""
        body = b"<html><body><h1>Example Domain</h1></body></html>"
        self.send_response(200)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        # Advertise the exact byte length so the client knows where the body ends.
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, format, *args):
        """Discard request log lines (the base class would write them to stderr).

        The parameter is named ``format`` to match the base-class signature.
        """
        return
|
||||
|
||||
def test_download_url_downloads_content():
    """Check that ``download_url`` returns the page body served over HTTP.

    The page comes from a throwaway server bound to the loopback interface,
    so the test does not depend on reaching the public internet.
    """
    # Port 0 asks the OS for any free port; the chosen one is read back below.
    server = ThreadingHTTPServer(("127.0.0.1", 0), _ExampleHandler)
    thread = Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        text = download_url(f"http://127.0.0.1:{server.server_address[1]}/")
    finally:
        # Stop the serve loop, release the socket, then wait for the thread,
        # even when the download raised.
        server.shutdown()
        server.server_close()
        thread.join(timeout=5)

    assert "Example Domain" in text
|
||||
|
||||
Reference in New Issue
Block a user