Restore CLI compat and plugin dependency handling

Author: Nick Sweeting
Date:   2026-03-15 06:06:18 -07:00
Parent: 6b482c62df
Commit: 1f792d7199

19 changed files with 302 additions and 92 deletions
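The tests below standardize on the SAVE_<EXTRACTOR> spelling for per-extractor config flags and pass an explicit --plugins list to archivebox add. A minimal sketch of that pattern, with the flag and option names taken from the diffs below (the URL, plugin list, and values are illustrative only):

import os
import subprocess

# Assumed convention from the diffs below: SAVE_<EXTRACTOR> env flags toggle
# extractors, and --plugins=<a,b,c> selects which plugins run for this add.
env = os.environ.copy()
env.update({
    "SAVE_WGET": "true",         # enable the wget extractor
    "SAVE_SCREENSHOT": "false",  # leave everything else disabled
    "SAVE_PDF": "false",
})

result = subprocess.run(
    ["archivebox", "add", "--plugins=title,wget", "https://example.com"],
    capture_output=True,
    text=True,
    env=env,
)
assert result.returncode == 0, result.stderr or result.stdout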

View File

@@ -13,16 +13,16 @@ def process(tmp_path):
def disable_extractors_dict():
    env = os.environ.copy()
    env.update({
-       "USE_WGET": "false",
-       "USE_SINGLEFILE": "false",
-       "USE_READABILITY": "false",
-       "USE_MERCURY": "false",
+       "SAVE_WGET": "false",
+       "SAVE_SINGLEFILE": "false",
+       "SAVE_READABILITY": "false",
+       "SAVE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
-       "USE_GIT": "false",
+       "SAVE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",

View File

@@ -145,8 +145,8 @@ def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_
    assert snapshot is not None, "Should create at least one snapshot"

-def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict):
-    """Test that crawl creates a Seed object for input."""
+def test_crawl_persists_input_urls_on_crawl(tmp_path, process, disable_extractors_dict):
+    """Test that crawl input URLs are stored on the Crawl record."""
    os.chdir(tmp_path)
    subprocess.run(
@@ -158,10 +158,11 @@ def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
    seed = c.execute("SELECT id FROM crawls_seed").fetchone()
+   crawl_urls = c.execute("SELECT urls FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
    conn.close()

    assert seed is not None, "Seed should be created for crawl input"
+   assert crawl_urls is not None, "Crawl should be created for crawl input"
+   assert 'https://example.com' in crawl_urls[0], "Crawl should persist input URLs"
class TestCrawlCLI:
@@ -178,7 +179,7 @@ class TestCrawlCLI:
        )
        assert result.returncode == 0
-       assert '--depth' in result.stdout or '-d' in result.stdout
+       assert 'create' in result.stdout
if __name__ == '__main__':

View File

@@ -3,7 +3,7 @@ import json as pyjson
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
-   disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
+   disable_extractors_dict.update({"SAVE_SINGLEFILE": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
        capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
@@ -11,7 +11,7 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
    assert output_file.exists()

def test_readability_works(tmp_path, process, disable_extractors_dict):
-   disable_extractors_dict.update({"USE_READABILITY": "true"})
+   disable_extractors_dict.update({"SAVE_READABILITY": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
        capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
@@ -27,7 +27,7 @@ def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
    assert output_file.exists()

def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
-   disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"})
+   disable_extractors_dict.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
        capture_output=True, env=disable_extractors_dict)
    output_str = add_process.stdout.decode("utf-8")

View File

@@ -39,6 +39,17 @@ def test_add_real_world_example_domain(tmp_path):
    env = os.environ.copy()
    env["TMP_DIR"] = str(tmp_short)
    env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
+   env["SAVE_TITLE"] = "True"
+   env["SAVE_WGET"] = "True"
+   env["SAVE_SINGLEFILE"] = "True"
+   env["SAVE_READABILITY"] = "False"
+   env["SAVE_HTMLTOTEXT"] = "True"
+   env["SAVE_HEADERS"] = "True"
+   env["SAVE_PDF"] = "False"
+   env["SAVE_SCREENSHOT"] = "False"
+   env["SAVE_ARCHIVEDOTORG"] = "False"
+   env["SAVE_YTDLP"] = "False"
+   env["SAVE_GIT"] = "False"

    init = subprocess.run(
        ["archivebox", "init"],
@@ -50,7 +61,7 @@ def test_add_real_world_example_domain(tmp_path):
    assert init.returncode == 0, f"archivebox init failed: {init.stderr}"

    result = subprocess.run(
-       ["archivebox", "add", "https://example.com"],
+       ["archivebox", "add", "--plugins=title,wget,singlefile,htmltotext,headers", "https://example.com"],
        capture_output=True,
        text=True,
        timeout=900,
@@ -115,19 +126,13 @@ def test_add_real_world_example_domain(tmp_path):
    )

    text_hits = 0
-   for path in (
-       *snapshot_dir.glob("*_readability/content.txt"),
-       snapshot_dir / "readability" / "content.txt",
-   ):
-       if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
-           text_hits += 1
    for path in (
        *snapshot_dir.glob("*_htmltotext/htmltotext.txt"),
        snapshot_dir / "htmltotext" / "htmltotext.txt",
    ):
        if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
            text_hits += 1
-   assert text_hits >= 2, (
-       "Expected multiple text extractors to contain Example Domain "
-       f"(readability/htmltotext hits={text_hits})."
+   assert text_hits >= 1, (
+       "Expected htmltotext output to contain Example Domain "
+       f"(htmltotext hits={text_hits})."
    )

View File

@@ -22,16 +22,16 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
    env = os.environ.copy()
    env.update({
        # Disable most extractors
-       "USE_WGET": "false",
-       "USE_SINGLEFILE": "false",
-       "USE_READABILITY": "false",
-       "USE_MERCURY": "false",
+       "SAVE_WGET": "false",
+       "SAVE_SINGLEFILE": "false",
+       "SAVE_READABILITY": "false",
+       "SAVE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
-       "USE_GIT": "false",
+       "SAVE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
@@ -122,16 +122,16 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
    # Enable only parse_html_urls for this test
    env = os.environ.copy()
    env.update({
-       "USE_WGET": "false",
-       "USE_SINGLEFILE": "false",
-       "USE_READABILITY": "false",
-       "USE_MERCURY": "false",
+       "SAVE_WGET": "false",
+       "SAVE_SINGLEFILE": "false",
+       "SAVE_READABILITY": "false",
+       "SAVE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
-       "USE_GIT": "false",
+       "SAVE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
@@ -202,12 +202,22 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
    env = os.environ.copy()
    env.update({
        "URL_ALLOWLIST": r"monadical\.com/.*",  # Only crawl same domain
+       "SAVE_READABILITY": "false",
+       "SAVE_SINGLEFILE": "false",
+       "SAVE_MERCURY": "false",
+       "SAVE_SCREENSHOT": "false",
+       "SAVE_PDF": "false",
+       "SAVE_HEADERS": "false",
+       "SAVE_ARCHIVEDOTORG": "false",
+       "SAVE_GIT": "false",
+       "SAVE_YTDLP": "false",
+       "SAVE_TITLE": "false",
    })

    # Start a crawl with depth=1 (just one hop to test recursive crawling)
    # Use file:// URL so it's instant, no network fetch needed
    proc = subprocess.Popen(
-       ['archivebox', 'add', '--depth=1', f'file://{test_html}'],
+       ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', f'file://{test_html}'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
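URL_ALLOWLIST above is a regular expression; a generic sketch of how such a pattern can gate which discovered links get crawled on the next hop (this illustrates the regex only, not ArchiveBox's actual crawl-filtering code):

import re

allowlist = re.compile(r"monadical\.com/.*")
discovered = [
    "https://monadical.com/posts/some-article.html",
    "https://example.com/unrelated-page",
]
# Only URLs matching the allowlist pattern would be queued for the next hop.
to_crawl = [url for url in discovered if allowlist.search(url)]
assert to_crawl == ["https://monadical.com/posts/some-article.html"]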

View File

@@ -12,7 +12,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
    # Verify snapshot exists
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
-   count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
+   count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
    conn.close()

    assert count_before >= 1
@@ -24,7 +24,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count == 0
@@ -59,7 +59,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count_before >= 2
@@ -67,7 +67,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_after = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count_after = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count_after == 0
@@ -80,7 +80,7 @@ def test_add_creates_crawls(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_count = c.execute("SELECT COUNT() from archivebox.crawls.crawl").fetchone()[0]
crawl_count = c.execute("SELECT COUNT() FROM crawls_crawl").fetchone()[0]
conn.close()
assert crawl_count == 2
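The corrected SQL targets Django's default table names: with no Meta.db_table override, a model's table is the app label plus the lowercased model name, so core.Snapshot lives in core_snapshot and crawls.Crawl in crawls_crawl; the old dotted archivebox.core.snapshot spelling was never a real SQLite table. A quick way to confirm the names against an initialized index.sqlite3, as a sketch:

import sqlite3

conn = sqlite3.connect("index.sqlite3")
# List every table Django created; expect app_model names like core_snapshot, crawls_crawl.
tables = [row[0] for row in conn.execute(
    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
)]
conn.close()
print(tables)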

View File

@@ -9,7 +9,10 @@ from pathlib import Path
from archivebox.tests.conftest import create_test_url

-def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool):
+ADMIN_HOST = 'admin.archivebox.localhost:8000'
+
+def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
    project_root = Path(__file__).resolve().parents[2]
    script = textwrap.dedent(
        f"""
@@ -31,7 +34,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
target_url = {request_url!r}
-resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
+resp = client.get('/web/' + target_url, HTTP_HOST={host!r})
assert resp.status_code == 302, resp.status_code
snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
@@ -46,7 +49,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
)
assert resp['Location'] == f"/{{snapshot.url_path}}"
-resp2 = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
+resp2 = client.get('/web/' + target_url, HTTP_HOST={host!r})
assert resp2.status_code == 302, resp2.status_code
assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
assert resp2['Location'] == f"/{{snapshot.url_path}}"
@@ -208,7 +211,7 @@ def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
"""/web/https://... should work for authenticated users even when public add is off."""
url = create_test_url(domain='example.com', path='savepagenow-auth')
request_url = url.replace('https://', '')
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False)
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False, host=ADMIN_HOST)
assert result.returncode == 0, (
"SavePageNow shortcut (logged-in) test failed.\n"
f"stdout:\n{result.stdout}\n"
@@ -220,7 +223,7 @@ def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
"""/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login."""
url = create_test_url(domain='example.com', path='savepagenow-public')
request_url = url.replace('https://', '')
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True)
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True, host='web.archivebox.localhost:8000')
assert result.returncode == 0, (
"SavePageNow shortcut (public add) test failed.\n"
f"stdout:\n{result.stdout}\n"

View File

@@ -6,14 +6,19 @@ from .fixtures import *
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
    """Test that title is extracted from the page."""
    disable_extractors_dict.update({"SAVE_TITLE": "true"})
-   subprocess.run(['archivebox', 'add', 'https://example.com'],
-       capture_output=True, env=disable_extractors_dict)
+   add_process = subprocess.run(
+       ['archivebox', 'add', '--plugins=title', 'https://example.com'],
+       capture_output=True,
+       text=True,
+       env=disable_extractors_dict,
+   )
+   assert add_process.returncode == 0, add_process.stderr or add_process.stdout

    os.chdir(tmp_path)
    conn = sqlite3.connect("index.sqlite3")
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
-   c.execute("SELECT title from archivebox.core.snapshot")
+   c.execute("SELECT title FROM core_snapshot")
    snapshot = c.fetchone()
    conn.close()
@@ -27,8 +32,13 @@ def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractor
    and breaks the layout.
    """
    disable_extractors_dict.update({"SAVE_TITLE": "true"})
-   subprocess.run(['archivebox', 'add', 'https://example.com'],
-       capture_output=True, env=disable_extractors_dict)
+   add_process = subprocess.run(
+       ['archivebox', 'add', '--plugins=title', 'https://example.com'],
+       capture_output=True,
+       text=True,
+       env=disable_extractors_dict,
+   )
+   assert add_process.returncode == 0, add_process.stderr or add_process.stdout

    list_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True)
    # Should not contain unescaped HTML tags in output

View File

@@ -1,5 +1,30 @@
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from threading import Thread
+
from archivebox.misc.util import download_url

+class _ExampleHandler(BaseHTTPRequestHandler):
+   def do_GET(self):
+       body = b"<html><body><h1>Example Domain</h1></body></html>"
+       self.send_response(200)
+       self.send_header("Content-Type", "text/html; charset=utf-8")
+       self.send_header("Content-Length", str(len(body)))
+       self.end_headers()
+       self.wfile.write(body)
+
+   def log_message(self, format, *args):
+       return
+
def test_download_url_downloads_content():
-   text = download_url("https://example.com")
+   server = ThreadingHTTPServer(("127.0.0.1", 0), _ExampleHandler)
+   thread = Thread(target=server.serve_forever, daemon=True)
+   thread.start()
+   try:
+       text = download_url(f"http://127.0.0.1:{server.server_address[1]}/")
+   finally:
+       server.shutdown()
+       server.server_close()
+       thread.join(timeout=5)
    assert "Example Domain" in text