Stabilize CI against expanded plugin surface

2026-04-06 07:47:53 +10:00 · 2026-03-15 06:31:41 -07:00
parent 1f792d7199
commit 760cf9d6b2
11 changed files with 75 additions and 68 deletions
--- a/archivebox/tests/test_migrations_helpers.py
+++ b/archivebox/tests/test_migrations_helpers.py
--- a/archivebox/tests/test_cli_add_interrupt.py
+++ b/archivebox/tests/test_cli_add_interrupt.py
@@ -24,10 +24,10 @@ def _make_env(data_dir: Path) -> dict:
    env["USE_COLOR"] = "False"
    env["SHOW_PROGRESS"] = "False"
    env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
-    env["PLUGINS"] = "title,favicon"
+    env["PLUGINS"] = "favicon"
    # Keep it fast but still real hooks
-    env["SAVE_TITLE"] = "True"
    env["SAVE_FAVICON"] = "True"
+    env["SAVE_TITLE"] = "False"
    env["SAVE_WGET"] = "False"
    env["SAVE_WARC"] = "False"
    env["SAVE_PDF"] = "False"
@@ -75,7 +75,7 @@ def test_add_parents_workers_to_orchestrator(tmp_path):
    init = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env)
    assert init.returncode == 0, init.stderr

-    add = _run([sys.executable, "-m", "archivebox", "add", "https://example.com"], data_dir, env, timeout=120)
+    add = _run([sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"], data_dir, env, timeout=120)
    assert add.returncode == 0, add.stderr

    conn = sqlite3.connect(data_dir / "index.sqlite3")
@@ -105,7 +105,7 @@ def test_add_interrupt_cleans_orphaned_processes(tmp_path):
    assert init.returncode == 0, init.stderr

    proc = subprocess.Popen(
-        [sys.executable, "-m", "archivebox", "add", "https://example.com"],
+        [sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"],
        cwd=data_dir,
        env=env,
        stdout=subprocess.PIPE,
--- a/archivebox/tests/test_cli_run.py
+++ b/archivebox/tests/test_cli_run.py
@@ -18,6 +18,11 @@ from archivebox.tests.conftest import (
    create_test_snapshot_json,
 )

+RUN_TEST_ENV = {
+    'PLUGINS': 'favicon',
+    'SAVE_FAVICON': 'True',
+}
+

 class TestRunWithCrawl:
    """Tests for `archivebox run` with Crawl input."""
@@ -31,6 +36,7 @@ class TestRunWithCrawl:
            stdin=json.dumps(crawl_record),
            data_dir=initialized_archive,
            timeout=120,
+            env=RUN_TEST_ENV,
        )

        assert code == 0, f"Command failed: {stderr}"
@@ -46,7 +52,7 @@ class TestRunWithCrawl:
        url = create_test_url()

        # First create a crawl
-        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
+        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
        crawl = parse_jsonl_output(stdout1)[0]

        # Run with the existing crawl
@@ -55,6 +61,7 @@ class TestRunWithCrawl:
            stdin=json.dumps(crawl),
            data_dir=initialized_archive,
            timeout=120,
+            env=RUN_TEST_ENV,
        )

        assert code == 0
@@ -74,6 +81,7 @@ class TestRunWithSnapshot:
            stdin=json.dumps(snapshot_record),
            data_dir=initialized_archive,
            timeout=120,
+            env=RUN_TEST_ENV,
        )

        assert code == 0, f"Command failed: {stderr}"
@@ -88,7 +96,7 @@ class TestRunWithSnapshot:
        url = create_test_url()

        # First create a snapshot
-        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
+        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
        snapshot = parse_jsonl_output(stdout1)[0]

        # Run with the existing snapshot
@@ -97,6 +105,7 @@ class TestRunWithSnapshot:
            stdin=json.dumps(snapshot),
            data_dir=initialized_archive,
            timeout=120,
+            env=RUN_TEST_ENV,
        )

        assert code == 0
@@ -113,6 +122,7 @@ class TestRunWithSnapshot:
            stdin=json.dumps(url_record),
            data_dir=initialized_archive,
            timeout=120,
+            env=RUN_TEST_ENV,
        )

        assert code == 0
@@ -128,13 +138,14 @@ class TestRunWithArchiveResult:
        url = create_test_url()

        # Create snapshot and archive result
-        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
+        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
        snapshot = parse_jsonl_output(stdout1)[0]

        stdout2, _, _ = run_archivebox_cmd(
-            ['archiveresult', 'create', '--plugin=title'],
+            ['archiveresult', 'create', '--plugin=favicon'],
            stdin=json.dumps(snapshot),
            data_dir=initialized_archive,
+            env=RUN_TEST_ENV,
        )
        ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')

@@ -144,6 +155,7 @@ class TestRunWithArchiveResult:
            ['archiveresult', 'update', '--status=failed'],
            stdin=json.dumps(ar),
            data_dir=initialized_archive,
+            env=RUN_TEST_ENV,
        )

        # Now run should re-queue it
@@ -152,6 +164,7 @@ class TestRunWithArchiveResult:
            stdin=json.dumps(ar),
            data_dir=initialized_archive,
            timeout=120,
+            env=RUN_TEST_ENV,
        )

        assert code == 0
@@ -189,6 +202,7 @@ class TestRunPassThrough:
            stdin=json.dumps(crawl_record),
            data_dir=initialized_archive,
            timeout=120,
+            env=RUN_TEST_ENV,
        )

        assert code == 0
@@ -217,6 +231,7 @@ class TestRunMixedInput:
            stdin=stdin,
            data_dir=initialized_archive,
            timeout=120,
+            env=RUN_TEST_ENV,
        )

        assert code == 0
--- a/archivebox/tests/test_cli_update.py
+++ b/archivebox/tests/test_cli_update.py
@@ -53,13 +53,13 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor

    # Add multiple snapshots
    subprocess.run(
-        ['archivebox', 'add', '--depth=0', 'https://example.com'],
+        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=90,
    )
    subprocess.run(
-        ['archivebox', 'add', '--depth=0', 'https://example.org'],
+        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.org'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=90,
@@ -83,7 +83,7 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d

    # Add snapshots
    subprocess.run(
-        ['archivebox', 'add', '--depth=0', 'https://example.com'],
+        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=90,
@@ -120,7 +120,7 @@ def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extrac
    os.chdir(tmp_path)

    subprocess.run(
-        ['archivebox', 'add', '--depth=0', 'https://example.com'],
+        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=90,
--- a/archivebox/tests/test_migrations_04_to_09.py
+++ b/archivebox/tests/test_migrations_04_to_09.py
@@ -13,7 +13,7 @@ import tempfile
 import unittest
 from pathlib import Path

-from .test_migrations_helpers import (
+from .migrations_helpers import (
    SCHEMA_0_4,
    seed_0_4_data,
    run_archivebox,
--- a/archivebox/tests/test_migrations_07_to_09.py
+++ b/archivebox/tests/test_migrations_07_to_09.py
@@ -14,7 +14,7 @@ import tempfile
 import unittest
 from pathlib import Path

-from .test_migrations_helpers import (
+from .migrations_helpers import (
    SCHEMA_0_7,
    seed_0_7_data,
    run_archivebox,
--- a/archivebox/tests/test_migrations_08_to_09.py
+++ b/archivebox/tests/test_migrations_08_to_09.py
@@ -18,7 +18,7 @@ import tempfile
 import unittest
 from pathlib import Path

-from .test_migrations_helpers import (
+from .migrations_helpers import (
    SCHEMA_0_8,
    seed_0_8_data,
    run_archivebox,
--- a/archivebox/tests/test_migrations_fresh.py
+++ b/archivebox/tests/test_migrations_fresh.py
@@ -11,7 +11,7 @@ import tempfile
 import unittest
 from pathlib import Path

-from .test_migrations_helpers import run_archivebox
+from .migrations_helpers import run_archivebox


 class TestFreshInstall(unittest.TestCase):
--- a/archivebox/tests/test_recursive_crawl.py
+++ b/archivebox/tests/test_recursive_crawl.py
@@ -35,25 +35,21 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
-        "SAVE_FAVICON": "false",
-        # Enable chrome session (required for background hooks to start)
-        "USE_CHROME": "true",
-        # Parser extractors enabled by default
+        "SAVE_FAVICON": "true",
+        "SAVE_WGET": "true",
    })

    # Start a crawl with depth=1
    proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=1', 'https://monadical.com'],
+        ['archivebox', 'add', '--depth=1', '--plugins=favicon,wget,parse_html_urls', 'https://monadical.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )

-    # Give orchestrator time to run all Crawl hooks and create snapshot
-    # First crawl in a new data dir: ~10-20s (install hooks do full binary lookups)
-    # Subsequent crawls: ~3-5s (Machine config cached, hooks exit early)
-    time.sleep(25)
+    # Give the background hook + parser enough time to create and process the root snapshot.
+    time.sleep(20)

    # Kill the process
    proc.kill()
@@ -141,7 +137,7 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):

    # Add a URL with depth=0 (no recursion yet)
    proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=0', 'https://monadical.com'],
+        ['archivebox', 'add', '--depth=0', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
@@ -297,7 +293,7 @@ def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extract

    # Start a crawl with depth=1
    proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=1', 'https://monadical.com'],
+        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
@@ -371,7 +367,7 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
    os.chdir(tmp_path)

    subprocess.run(
-        ['archivebox', 'add', '--depth=1', 'https://monadical.com'],
+        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
@@ -403,15 +399,15 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p
    # Start a crawl
    env = os.environ.copy()
    env.update({
-        "USE_WGET": "false",
-        "USE_SINGLEFILE": "false",
+        "SAVE_WGET": "true",
+        "SAVE_SINGLEFILE": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
-        "USE_CHROME": "true",  # Enables background hooks
+        "SAVE_FAVICON": "true",
    })

    proc = subprocess.Popen(
-        ['archivebox', 'add', 'https://monadical.com'],
+        ['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', 'https://monadical.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
@@ -430,7 +426,7 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p

    # Get background hooks that are started
    bg_started = c.execute(
-        "SELECT plugin FROM core_archiveresult WHERE plugin IN ('consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status = 'started'"
+        "SELECT plugin FROM core_archiveresult WHERE plugin IN ('favicon') AND status = 'started'"
    ).fetchall()

    # Get parser extractors that should be queued or better
--- a/archivebox/tests/test_worker_config_propagation.py
+++ b/archivebox/tests/test_worker_config_propagation.py
@@ -19,6 +19,7 @@ Config priority order (highest to lowest):

 import os
 import json
+import sys
 import tempfile
 import subprocess
 import time
@@ -51,7 +52,7 @@ def test_config_propagation_through_worker_hierarchy():
        # Step 1: Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
-            ['python', '-m', 'archivebox', 'init'],
+            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
@@ -111,7 +112,7 @@ machine.save()
 print(f"Machine {{machine.hostname}} config updated")
 """
        result = subprocess.run(
-            ['python', '-c', set_machine_config_script],
+            [sys.executable, '-c', set_machine_config_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
@@ -149,7 +150,7 @@ crawl = Crawl.objects.create(
 print(crawl.id)
 """
        result = subprocess.run(
-            ['python', '-c', create_crawl_script],
+            [sys.executable, '-c', create_crawl_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
@@ -193,7 +194,7 @@ snapshot = Snapshot.objects.create(
 print(snapshot.id)
 """
        result = subprocess.run(
-            ['python', '-c', create_snapshot_script],
+            [sys.executable, '-c', create_snapshot_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
@@ -211,7 +212,7 @@ print(snapshot.id)
        # Step 5: Run SnapshotWorker with additional env var
        print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
        result = subprocess.run(
-            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
+            [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **os.environ,
@@ -238,14 +239,9 @@ print(snapshot.id)
        # Check that SnapshotWorker ran successfully
        assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"

-        # Verify config by checking stderr debug output and ArchiveResults in database
+        # Verify config by checking ArchiveResults and merged config state
        print("\n--- Verifying config propagation ---\n")

-        # Check for config debug messages in stderr
-        assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
-            "Expected debug output not found in stderr"
-        print("✓ Config debug output found in stderr")
-
        # Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults
        verify_precedence_script = f"""
 import os
@@ -291,7 +287,7 @@ assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Mach
 print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults")
 """
        result = subprocess.run(
-            ['python', '-c', verify_precedence_script],
+            [sys.executable, '-c', verify_precedence_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
@@ -332,29 +328,29 @@ config = get_config(snapshot=snapshot)

 # 1. Snapshot.config (highest priority)
 timeout = config.get('TIMEOUT')
-print(f"  1. Snapshot.config: TIMEOUT={timeout} (expected: 555)")
+print(f"  1. Snapshot.config: TIMEOUT={{timeout}} (expected: 555)")
 assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"

-wget_enabled = config.get('SAVE_WGET')
-print(f"  1. Snapshot.config: SAVE_WGET={wget_enabled} (expected: False)")
-assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"
+wget_enabled = config.get('WGET_ENABLED')
+print(f"  1. Snapshot.config: WGET_ENABLED={{wget_enabled}} (expected: False)")
+assert wget_enabled == False, f"WGET_ENABLED should be False from snapshot.config, got {{wget_enabled}}"

 custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
-print(f"  1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={custom_snapshot} (expected: from_snapshot_json)")
+print(f"  1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={{custom_snapshot}} (expected: from_snapshot_json)")
 assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"

 # 2. Crawl.config
 custom_crawl = config.get('CUSTOM_CRAWL_KEY')
-print(f"  2. Crawl.config: CUSTOM_CRAWL_KEY={custom_crawl} (expected: from_crawl_json)")
+print(f"  2. Crawl.config: CUSTOM_CRAWL_KEY={{custom_crawl}} (expected: from_crawl_json)")
 assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"

 # 6. Machine.config
 custom_machine = config.get('CUSTOM_MACHINE_KEY')
-print(f"  6. Machine.config: CUSTOM_MACHINE_KEY={custom_machine} (expected: from_machine_config)")
+print(f"  6. Machine.config: CUSTOM_MACHINE_KEY={{custom_machine}} (expected: from_machine_config)")
 assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"

 wget_binary = config.get('WGET_BINARY')
-print(f"  6. Machine.config: WGET_BINARY={wget_binary} (expected: /custom/machine/wget)")
+print(f"  6. Machine.config: WGET_BINARY={{wget_binary}} (expected: /custom/machine/wget)")
 # Note: This might be overridden by environment or other sources, just check it's present
 assert wget_binary is not None, f"WGET_BINARY should be present"

@@ -384,7 +380,7 @@ print("✓ Config priority order verified")
 print("✓ Snapshot successfully sealed")
 """
        result = subprocess.run(
-            ['python', '-c', verify_script],
+            [sys.executable, '-c', verify_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
@@ -424,7 +420,7 @@ def test_config_environment_variable_parsing():

        # Initialize archive
        result = subprocess.run(
-            ['python', '-m', 'archivebox', 'init'],
+            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
@@ -514,7 +510,7 @@ print("\\n✓ All config values correctly parsed from environment")
 """

        result = subprocess.run(
-            ['python', '-c', test_config_types_script],
+            [sys.executable, '-c', test_config_types_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
@@ -561,7 +557,7 @@ def test_parent_environment_preserved_in_hooks():
        # Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
-            ['python', '-m', 'archivebox', 'init'],
+            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
@@ -602,7 +598,7 @@ snapshot = Snapshot.objects.create(
 print(snapshot.id)
 """
        result = subprocess.run(
-            ['python', '-c', create_snapshot_script],
+            [sys.executable, '-c', create_snapshot_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
@@ -619,7 +615,7 @@ print(snapshot.id)
        # Run SnapshotWorker with custom parent environment variable
        print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process")
        result = subprocess.run(
-            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
+            [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **os.environ,
@@ -706,7 +702,7 @@ if node_path:
 print("\\n✓ All environment checks passed")
 """
        result = subprocess.run(
-            ['python', '-c', verify_env_script],
+            [sys.executable, '-c', verify_env_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
@@ -755,7 +751,7 @@ def test_config_auto_fetch_relationships():
        # Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
-            ['python', '-m', 'archivebox', 'init'],
+            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
@@ -846,7 +842,7 @@ print("\\n✓ All auto-fetch tests passed")
 """

        result = subprocess.run(
-            ['python', '-c', create_objects_script],
+            [sys.executable, '-c', create_objects_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
@@ -900,7 +896,7 @@ def test_config_precedence_with_environment_vars():

        # Initialize
        result = subprocess.run(
-            ['python', '-m', 'archivebox', 'init'],
+            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
@@ -962,7 +958,7 @@ print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999)
 """

        result = subprocess.run(
-            ['python', '-c', test_script],
+            [sys.executable, '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,
@@ -1000,7 +996,7 @@ def test_new_environment_variables_added():

        # Initialize
        result = subprocess.run(
-            ['python', '-m', 'archivebox', 'init'],
+            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
@@ -1041,7 +1037,7 @@ print("✓ Lowercase environment variables ignored")
 """

        result = subprocess.run(
-            ['python', '-c', test_script],
+            [sys.executable, '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,