Restore CLI compat and plugin dependency handling

Author: Nick Sweeting
Date:   2026-03-15 06:06:18 -07:00
Parent: 6b482c62df
Commit: 1f792d7199

19 changed files with 302 additions and 92 deletions
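The tests below standardize on the SAVE_<EXTRACTOR> spelling for per-extractor config flags and pass an explicit --plugins list to archivebox add. A minimal sketch of that pattern, with the flag and option names taken from the diffs below (the URL, plugin list, and values are illustrative only):

import os
import subprocess

# Assumed convention from the diffs below: SAVE_<EXTRACTOR> env flags toggle
# extractors, and --plugins=<a,b,c> selects which plugins run for this add.
env = os.environ.copy()
env.update({
    "SAVE_WGET": "true",         # enable the wget extractor
    "SAVE_SCREENSHOT": "false",  # leave everything else disabled
    "SAVE_PDF": "false",
})

result = subprocess.run(
    ["archivebox", "add", "--plugins=title,wget", "https://example.com"],
    capture_output=True,
    text=True,
    env=env,
)
assert result.returncode == 0, result.stderr or result.stdout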

View File

@@ -13,16 +13,16 @@ def process(tmp_path):
def disable_extractors_dict():
    env = os.environ.copy()
    env.update({
-       "USE_WGET": "false",
-       "USE_SINGLEFILE": "false",
-       "USE_READABILITY": "false",
-       "USE_MERCURY": "false",
+       "SAVE_WGET": "false",
+       "SAVE_SINGLEFILE": "false",
+       "SAVE_READABILITY": "false",
+       "SAVE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
-       "USE_GIT": "false",
+       "SAVE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",

View File

@@ -145,8 +145,8 @@ def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_
    assert snapshot is not None, "Should create at least one snapshot"

-def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict):
-    """Test that crawl creates a Seed object for input."""
+def test_crawl_persists_input_urls_on_crawl(tmp_path, process, disable_extractors_dict):
+    """Test that crawl input URLs are stored on the Crawl record."""
    os.chdir(tmp_path)
    subprocess.run(
@@ -158,10 +158,11 @@ def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
    seed = c.execute("SELECT id FROM crawls_seed").fetchone()
+   crawl_urls = c.execute("SELECT urls FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
    conn.close()

    assert seed is not None, "Seed should be created for crawl input"
+   assert crawl_urls is not None, "Crawl should be created for crawl input"
+   assert 'https://example.com' in crawl_urls[0], "Crawl should persist input URLs"
class TestCrawlCLI:
@@ -178,7 +179,7 @@ class TestCrawlCLI:
        )
        assert result.returncode == 0
-       assert '--depth' in result.stdout or '-d' in result.stdout
+       assert 'create' in result.stdout
if __name__ == '__main__':

View File

@@ -3,7 +3,7 @@ import json as pyjson
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
-   disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
+   disable_extractors_dict.update({"SAVE_SINGLEFILE": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
        capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
@@ -11,7 +11,7 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
    assert output_file.exists()

def test_readability_works(tmp_path, process, disable_extractors_dict):
-   disable_extractors_dict.update({"USE_READABILITY": "true"})
+   disable_extractors_dict.update({"SAVE_READABILITY": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
        capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
@@ -27,7 +27,7 @@ def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
    assert output_file.exists()

def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
-   disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"})
+   disable_extractors_dict.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
        capture_output=True, env=disable_extractors_dict)
    output_str = add_process.stdout.decode("utf-8")

View File

@@ -39,6 +39,17 @@ def test_add_real_world_example_domain(tmp_path):
    env = os.environ.copy()
    env["TMP_DIR"] = str(tmp_short)
    env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
+   env["SAVE_TITLE"] = "True"
+   env["SAVE_WGET"] = "True"
+   env["SAVE_SINGLEFILE"] = "True"
+   env["SAVE_READABILITY"] = "False"
+   env["SAVE_HTMLTOTEXT"] = "True"
+   env["SAVE_HEADERS"] = "True"
+   env["SAVE_PDF"] = "False"
+   env["SAVE_SCREENSHOT"] = "False"
+   env["SAVE_ARCHIVEDOTORG"] = "False"
+   env["SAVE_YTDLP"] = "False"
+   env["SAVE_GIT"] = "False"

    init = subprocess.run(
        ["archivebox", "init"],
@@ -50,7 +61,7 @@ def test_add_real_world_example_domain(tmp_path):
    assert init.returncode == 0, f"archivebox init failed: {init.stderr}"

    result = subprocess.run(
-       ["archivebox", "add", "https://example.com"],
+       ["archivebox", "add", "--plugins=title,wget,singlefile,htmltotext,headers", "https://example.com"],
        capture_output=True,
        text=True,
        timeout=900,
@@ -115,19 +126,13 @@ def test_add_real_world_example_domain(tmp_path):
    )

    text_hits = 0
-   for path in (
-       *snapshot_dir.glob("*_readability/content.txt"),
-       snapshot_dir / "readability" / "content.txt",
-   ):
-       if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
-           text_hits += 1
    for path in (
        *snapshot_dir.glob("*_htmltotext/htmltotext.txt"),
        snapshot_dir / "htmltotext" / "htmltotext.txt",
    ):
        if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
            text_hits += 1
-   assert text_hits >= 2, (
-       "Expected multiple text extractors to contain Example Domain "
-       f"(readability/htmltotext hits={text_hits})."
+   assert text_hits >= 1, (
+       "Expected htmltotext output to contain Example Domain "
+       f"(htmltotext hits={text_hits})."
    )

View File

@@ -22,16 +22,16 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
    env = os.environ.copy()
    env.update({
        # Disable most extractors
-       "USE_WGET": "false",
-       "USE_SINGLEFILE": "false",
-       "USE_READABILITY": "false",
-       "USE_MERCURY": "false",
+       "SAVE_WGET": "false",
+       "SAVE_SINGLEFILE": "false",
+       "SAVE_READABILITY": "false",
+       "SAVE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
-       "USE_GIT": "false",
+       "SAVE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
@@ -122,16 +122,16 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
    # Enable only parse_html_urls for this test
    env = os.environ.copy()
    env.update({
-       "USE_WGET": "false",
-       "USE_SINGLEFILE": "false",
-       "USE_READABILITY": "false",
-       "USE_MERCURY": "false",
+       "SAVE_WGET": "false",
+       "SAVE_SINGLEFILE": "false",
+       "SAVE_READABILITY": "false",
+       "SAVE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
-       "USE_GIT": "false",
+       "SAVE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
@@ -202,12 +202,22 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
    env = os.environ.copy()
    env.update({
        "URL_ALLOWLIST": r"monadical\.com/.*",  # Only crawl same domain
+       "SAVE_READABILITY": "false",
+       "SAVE_SINGLEFILE": "false",
+       "SAVE_MERCURY": "false",
+       "SAVE_SCREENSHOT": "false",
+       "SAVE_PDF": "false",
+       "SAVE_HEADERS": "false",
+       "SAVE_ARCHIVEDOTORG": "false",
+       "SAVE_GIT": "false",
+       "SAVE_YTDLP": "false",
+       "SAVE_TITLE": "false",
    })

    # Start a crawl with depth=1 (just one hop to test recursive crawling)
    # Use file:// URL so it's instant, no network fetch needed
    proc = subprocess.Popen(
-       ['archivebox', 'add', '--depth=1', f'file://{test_html}'],
+       ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', f'file://{test_html}'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
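URL_ALLOWLIST above is a regular expression; a generic sketch of how such a pattern can gate which discovered links get crawled on the next hop (this illustrates the regex only, not ArchiveBox's actual crawl-filtering code):

import re

allowlist = re.compile(r"monadical\.com/.*")
discovered = [
    "https://monadical.com/posts/some-article.html",
    "https://example.com/unrelated-page",
]
# Only URLs matching the allowlist pattern would be queued for the next hop.
to_crawl = [url for url in discovered if allowlist.search(url)]
assert to_crawl == ["https://monadical.com/posts/some-article.html"]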

View File

@@ -12,7 +12,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
    # Verify snapshot exists
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
-   count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
+   count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
    conn.close()

    assert count_before >= 1
@@ -24,7 +24,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count == 0
@@ -59,7 +59,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count_before >= 2
@@ -67,7 +67,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_after = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
count_after = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
conn.close()
assert count_after == 0
@@ -80,7 +80,7 @@ def test_add_creates_crawls(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_count = c.execute("SELECT COUNT() from archivebox.crawls.crawl").fetchone()[0]
crawl_count = c.execute("SELECT COUNT() FROM crawls_crawl").fetchone()[0]
conn.close()
assert crawl_count == 2
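The corrected SQL targets Django's default table names: with no Meta.db_table override, a model's table is the app label plus the lowercased model name, so core.Snapshot lives in core_snapshot and crawls.Crawl in crawls_crawl; the old dotted archivebox.core.snapshot spelling was never a real SQLite table. A quick way to confirm the names against an initialized index.sqlite3, as a sketch:

import sqlite3

conn = sqlite3.connect("index.sqlite3")
# List every table Django created; expect app_model names like core_snapshot, crawls_crawl.
tables = [row[0] for row in conn.execute(
    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
)]
conn.close()
print(tables)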

View File

@@ -9,7 +9,10 @@ from pathlib import Path
from archivebox.tests.conftest import create_test_url

-def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool):
+ADMIN_HOST = 'admin.archivebox.localhost:8000'
+
+def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
    project_root = Path(__file__).resolve().parents[2]
    script = textwrap.dedent(
        f"""
@@ -31,7 +34,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
target_url = {request_url!r}
-resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
+resp = client.get('/web/' + target_url, HTTP_HOST={host!r})
assert resp.status_code == 302, resp.status_code
snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
@@ -46,7 +49,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
)
assert resp['Location'] == f"/{{snapshot.url_path}}"
-resp2 = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
+resp2 = client.get('/web/' + target_url, HTTP_HOST={host!r})
assert resp2.status_code == 302, resp2.status_code
assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
assert resp2['Location'] == f"/{{snapshot.url_path}}"
@@ -208,7 +211,7 @@ def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
"""/web/https://... should work for authenticated users even when public add is off."""
url = create_test_url(domain='example.com', path='savepagenow-auth')
request_url = url.replace('https://', '')
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False)
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False, host=ADMIN_HOST)
assert result.returncode == 0, (
"SavePageNow shortcut (logged-in) test failed.\n"
f"stdout:\n{result.stdout}\n"
@@ -220,7 +223,7 @@ def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
"""/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login."""
url = create_test_url(domain='example.com', path='savepagenow-public')
request_url = url.replace('https://', '')
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True)
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True, host='web.archivebox.localhost:8000')
assert result.returncode == 0, (
"SavePageNow shortcut (public add) test failed.\n"
f"stdout:\n{result.stdout}\n"

View File

@@ -6,14 +6,19 @@ from .fixtures import *
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
    """Test that title is extracted from the page."""
    disable_extractors_dict.update({"SAVE_TITLE": "true"})
-   subprocess.run(['archivebox', 'add', 'https://example.com'],
-       capture_output=True, env=disable_extractors_dict)
+   add_process = subprocess.run(
+       ['archivebox', 'add', '--plugins=title', 'https://example.com'],
+       capture_output=True,
+       text=True,
+       env=disable_extractors_dict,
+   )
+   assert add_process.returncode == 0, add_process.stderr or add_process.stdout

    os.chdir(tmp_path)
    conn = sqlite3.connect("index.sqlite3")
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
-   c.execute("SELECT title from archivebox.core.snapshot")
+   c.execute("SELECT title FROM core_snapshot")
    snapshot = c.fetchone()
    conn.close()
@@ -27,8 +32,13 @@ def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractor
    and breaks the layout.
    """
    disable_extractors_dict.update({"SAVE_TITLE": "true"})
-   subprocess.run(['archivebox', 'add', 'https://example.com'],
-       capture_output=True, env=disable_extractors_dict)
+   add_process = subprocess.run(
+       ['archivebox', 'add', '--plugins=title', 'https://example.com'],
+       capture_output=True,
+       text=True,
+       env=disable_extractors_dict,
+   )
+   assert add_process.returncode == 0, add_process.stderr or add_process.stdout

    list_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True)
    # Should not contain unescaped HTML tags in output

View File

@@ -1,5 +1,30 @@
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from threading import Thread
+
from archivebox.misc.util import download_url

+class _ExampleHandler(BaseHTTPRequestHandler):
+   def do_GET(self):
+       body = b"<html><body><h1>Example Domain</h1></body></html>"
+       self.send_response(200)
+       self.send_header("Content-Type", "text/html; charset=utf-8")
+       self.send_header("Content-Length", str(len(body)))
+       self.end_headers()
+       self.wfile.write(body)
+
+   def log_message(self, format, *args):
+       return
+
def test_download_url_downloads_content():
-   text = download_url("https://example.com")
+   server = ThreadingHTTPServer(("127.0.0.1", 0), _ExampleHandler)
+   thread = Thread(target=server.serve_forever, daemon=True)
+   thread.start()
+   try:
+       text = download_url(f"http://127.0.0.1:{server.server_address[1]}/")
+   finally:
+       server.shutdown()
+       server.server_close()
+       thread.join(timeout=5)
    assert "Example Domain" in text