Restore CLI compat and plugin dependency handling

2026-04-06 07:47:53 +10:00 · 2026-03-15 06:06:18 -07:00
parent 6b482c62df
commit 1f792d7199
19 changed files with 302 additions and 92 deletions
--- a/archivebox/cli/init.py
+++ b/archivebox/cli/init.py
@@ -41,6 +41,7 @@ class ArchiveBoxGroup(click.Group):
    archive_commands = {
        # High-level commands
        'add': 'archivebox.cli.archivebox_add.main',
+        'extract': 'archivebox.cli.archivebox_extract.main',
        'list': 'archivebox.cli.archivebox_list.main',
        'remove': 'archivebox.cli.archivebox_remove.main',
        'run': 'archivebox.cli.archivebox_run.main',
@@ -55,6 +56,10 @@ class ArchiveBoxGroup(click.Group):
        # Introspection commands
        'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
    }
+    legacy_model_commands = {
+        'crawl': 'archivebox.cli.archivebox_crawl_compat.main',
+        'snapshot': 'archivebox.cli.archivebox_snapshot_compat.main',
+    }
    all_subcommands = {
        **meta_commands,
        **setup_commands,
@@ -67,12 +72,35 @@ class ArchiveBoxGroup(click.Group):
        'archive': 'add',
        # Old commands replaced by new model commands
        'orchestrator': 'run',
-        'extract': 'archiveresult',
+    }
+    legacy_model_subcommands = {
+        'crawl': {'create', 'list', 'update', 'delete'},
+        'snapshot': {'create', 'list', 'update', 'delete'},
    }
    
    @classmethod
    def get_canonical_name(cls, cmd_name):
        return cls.renamed_commands.get(cmd_name, cmd_name)
+
+    @classmethod
+    def _should_use_legacy_model_command(cls, cmd_name: str) -> bool:
+        """Decide whether `cmd_name` should be routed to its legacy compat entrypoint.
+
+        Only commands listed in `legacy_model_commands` are candidates. The
+        decision is made by inspecting the first argument that follows the
+        first occurrence of `cmd_name` in `sys.argv`:
+
+        - no following argument, or a help flag (`-h` / `--help`) -> False
+        - a known subcommand from `legacy_model_subcommands` -> False
+        - anything else (e.g. a URL or an option) -> True
+        """
+        if cmd_name not in cls.legacy_model_commands:
+            return False
+
+        # The command name may be absent from argv (e.g. it was rewritten from
+        # an alias before reaching here); in that case fall through to the
+        # regular command.
+        try:
+            arg_idx = sys.argv.index(cmd_name)
+        except ValueError:
+            return False
+
+        remaining_args = sys.argv[arg_idx + 1:]
+        if not remaining_args:
+            return False
+
+        first_arg = remaining_args[0]
+        if first_arg in ('-h', '--help'):
+            return False
+
+        # Anything that is not a recognised subcommand is treated as legacy usage.
+        return first_arg not in cls.legacy_model_subcommands[cmd_name]
    

    def get_command(self, ctx, cmd_name):
@@ -82,6 +110,9 @@ class ArchiveBoxGroup(click.Group):
            print(f' [violet]Hint:[/violet] `archivebox {cmd_name}` has been renamed to `archivebox {new_name}`')
            cmd_name = new_name
            ctx.invoked_subcommand = cmd_name
+
+        if self._should_use_legacy_model_command(cmd_name):
+            return self._lazy_load(self.legacy_model_commands[cmd_name])
        
        # handle lazy loading of commands
        if cmd_name in self.all_subcommands:
@@ -91,8 +122,8 @@ class ArchiveBoxGroup(click.Group):
        return super().get_command(ctx, cmd_name)

    @classmethod
-    def _lazy_load(cls, cmd_name):
-        import_path = cls.all_subcommands[cmd_name]
+    def _lazy_load(cls, cmd_name_or_path):
+        import_path = cls.all_subcommands.get(cmd_name_or_path, cmd_name_or_path)
        modname, funcname = import_path.rsplit('.', 1)
        
        # print(f'LAZY LOADING {import_path}')
--- a/archivebox/cli/archivebox_crawl_compat.py
+++ b/archivebox/cli/archivebox_crawl_compat.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox crawl'
+
+import sys
+
+import rich_click as click
+
+from archivebox.cli.archivebox_add import add
+
+
+@click.command(context_settings={'ignore_unknown_options': True})
+@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
+@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
+@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
+@click.option('--wait/--no-wait', 'wait', default=True, help='Accepted for backwards compatibility')
+@click.argument('urls', nargs=-1)
+def main(depth: int, tag: str, status: str, wait: bool, urls: tuple[str, ...]):
+    """Backwards-compatible `archivebox crawl URL...` entrypoint.
+
+    Forwards the given URLs, `depth` and `tag` to `add()` with
+    `index_only=True` and `bg=True`, then exits with status 0.
+    The `status` and `wait` options are parsed so old invocations keep
+    working, but their values are discarded.
+    """
+    # Accepted on the command line only; not passed on to add().
+    del status, wait
+    add(list(urls), depth=depth, tag=tag, index_only=True, bg=True)
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()
--- a/archivebox/cli/archivebox_extract.py
+++ b/archivebox/cli/archivebox_extract.py
@@ -226,7 +226,7 @@ def is_archiveresult_id(value: str) -> bool:


@click.command()
-@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
+@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
 def main(plugins: str, wait: bool, args: tuple):
--- a/archivebox/cli/archivebox_remove.py
+++ b/archivebox/cli/archivebox_remove.py
@@ -12,6 +12,7 @@ import rich_click as click
 from django.db.models import QuerySet

 from archivebox.config import DATA_DIR
+from archivebox.config.constants import CONSTANTS
 from archivebox.config.django import setup_django
 from archivebox.misc.util import enforce_types, docstring
 from archivebox.misc.checks import check_data_folder
@@ -65,6 +66,9 @@ def remove(filter_patterns: Iterable[str]=(),
        for snapshot in snapshots:
            if delete:
                shutil.rmtree(snapshot.output_dir, ignore_errors=True)
+                legacy_path = CONSTANTS.ARCHIVE_DIR / snapshot.timestamp
+                if legacy_path.is_symlink():
+                    legacy_path.unlink(missing_ok=True)
    finally:
        timer.end()

--- a/archivebox/cli/archivebox_snapshot_compat.py
+++ b/archivebox/cli/archivebox_snapshot_compat.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox snapshot'
+
+import sys
+
+import rich_click as click
+
+from archivebox.cli.archivebox_snapshot import create_snapshots
+
+
+@click.command(context_settings={'ignore_unknown_options': True})
+@click.option('--tag', '-t', default='', help='Comma-separated tags to add')
+@click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
+@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
+@click.argument('urls', nargs=-1)
+def main(tag: str, status: str, depth: int, urls: tuple[str, ...]):
+    """Backwards-compatible `archivebox snapshot URL...` entrypoint.
+
+    Passes the URLs and options straight through to `create_snapshots()`
+    and hands its return value to `sys.exit()`.
+    """
+    # NOTE(review): presumably create_snapshots returns an int exit code - confirm.
+    sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
+
+
+if __name__ == '__main__':
+    main()
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1557,6 +1557,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
            'url': self.url,
            'title': self.title,
            'tags': self.tags_str(),
+            'tags_str': self.tags_str(),
            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
            'timestamp': self.timestamp,
@@ -2306,7 +2307,7 @@ class SnapshotMachine(BaseStateMachine):
                status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
            ).count()

-            if remaining_active == 0:
+            if remaining_active == 0 and crawl.status == crawl.StatusChoices.STARTED:
                print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
                # Seal the parent crawl
                crawl.sm.seal()
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -332,8 +332,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
        """
        import time
        from pathlib import Path
-        from archivebox.hooks import run_hook, discover_hooks, process_hook_records
+        from archivebox.hooks import run_hook, discover_hooks, process_hook_records, is_finite_background_hook
        from archivebox.config.configset import get_config
+        from archivebox.machine.models import Binary, Machine

        # Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode)
        debug_log = Path('/tmp/archivebox_crawl_debug.log')
@@ -344,6 +345,43 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
        # Get merged config with crawl context
        config = get_config(crawl=self)

+        machine = Machine.current()
+        declared_binary_names: set[str] = set()
+
+        def install_declared_binaries(binary_names: set[str]) -> None:
+            """Tick every not-yet-installed Binary in `binary_names`, then verify.
+
+            Looks up Binary rows for the current machine whose name is in
+            `binary_names` and whose status is not INSTALLED, and calls
+            `sm.tick()` once on each. Afterwards the same query is repeated;
+            if any row is still not INSTALLED a RuntimeError is raised listing
+            each name and its status. An empty `binary_names` is a no-op.
+
+            Raises:
+                RuntimeError: one or more named binaries are still not INSTALLED.
+            """
+            if not binary_names:
+                return
+
+            pending_binaries = Binary.objects.filter(
+                machine=machine,
+                name__in=binary_names,
+            ).exclude(
+                status=Binary.StatusChoices.INSTALLED,
+            ).order_by('retry_at')
+
+            # Best-effort: an exception from one binary's tick must not stop the
+            # others; failures surface via the unresolved check below.
+            for binary in pending_binaries:
+                try:
+                    binary.sm.tick()
+                except Exception:
+                    continue
+
+            # Re-query so the statuses reflect whatever the ticks changed.
+            unresolved_binaries = list(
+                Binary.objects.filter(
+                    machine=machine,
+                    name__in=binary_names,
+                ).exclude(
+                    status=Binary.StatusChoices.INSTALLED,
+                ).order_by('name')
+            )
+            if unresolved_binaries:
+                binary_details = ', '.join(
+                    f'{binary.name} (status={binary.status})'
+                    for binary in unresolved_binaries
+                )
+                raise RuntimeError(
+                    f'Crawl dependencies failed to install before continuing: {binary_details}'
+                )
+
        # Discover and run on_Crawl hooks
        with open(debug_log, 'a') as f:
            f.write(f'Discovering Crawl hooks...\n')
@@ -378,9 +416,15 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
            if hook_elapsed > 0.5:  # Log slow hooks
                print(f'[yellow]⏱️  Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')

-            # Background hook - still running
+            # Finite background hooks must finish before snapshots start so they can
+            # emit dependency records (Binary, Machine config, etc.).
            if process.status == process.StatusChoices.RUNNING:
-                continue
+                if not is_finite_background_hook(hook.name):
+                    continue
+                try:
+                    process.wait(timeout=process.timeout)
+                except Exception:
+                    continue

            # Foreground hook - process JSONL records
            from archivebox.hooks import extract_records_from_process
@@ -394,33 +438,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
            if stats:
                print(f'[green]✓ Created: {stats}[/green]')

-        # Ensure any newly declared binaries are installed before creating snapshots
-        from archivebox.machine.models import Binary, Machine
-        from django.utils import timezone
+            hook_binary_names = {
+                str(record.get('name')).strip()
+                for record in records
+                if record.get('type') == 'Binary' and record.get('name')
+            }
+            hook_binary_names.discard('')
+            if hook_binary_names:
+                declared_binary_names.update(hook_binary_names)
+                install_declared_binaries(hook_binary_names)

-        machine = Machine.current()
-        while True:
-            pending_binaries = Binary.objects.filter(
-                machine=machine,
-                status=Binary.StatusChoices.QUEUED,
-                retry_at__lte=timezone.now(),
-            ).order_by('retry_at')
-            if not pending_binaries.exists():
-                break
-
-            for binary in pending_binaries:
-                try:
-                    binary.sm.tick()
-                except Exception:
-                    continue
-
-            # Exit if nothing else is immediately retryable
-            if not Binary.objects.filter(
-                machine=machine,
-                status=Binary.StatusChoices.QUEUED,
-                retry_at__lte=timezone.now(),
-            ).exists():
-                break
+        # Safety check: don't create snapshots if any crawl-declared dependency
+        # is still unresolved after all crawl hooks have run.
+        install_declared_binaries(declared_binary_names)

        # Create snapshots from all URLs in self.urls
        with open(debug_log, 'a') as f:
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -121,6 +121,11 @@ def is_background_hook(hook_name: str) -> bool:
    return '.bg.' in hook_name or '__background' in hook_name


+def is_finite_background_hook(hook_name: str) -> bool:
+    """Check if a background hook is finite-lived and should be awaited.
+
+    A hook counts as finite when its name contains the literal marker
+    ``.finite.bg.``. Any such name also contains ``.bg.``, so it is a
+    background hook as well.
+    """
+    return '.finite.bg.' in hook_name
+
+
 def iter_plugin_dirs() -> List[Path]:
    """Iterate over all built-in and user plugin directories."""
    plugin_dirs: List[Path] = []
@@ -904,8 +909,25 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
    # Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
    plugins_whitelist = config.get('PLUGINS', '')
    if plugins_whitelist:
-        # PLUGINS whitelist is specified - only enable plugins in the list
-        plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
+        # PLUGINS whitelist is specified - include transitive required_plugins from config.json
+        plugin_configs = discover_plugin_configs()
+        plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
+        pending = list(plugin_names)
+
+        while pending:
+            current = pending.pop()
+            schema = plugin_configs.get(current, {})
+            required_plugins = schema.get('required_plugins', [])
+            if not isinstance(required_plugins, list):
+                continue
+
+            for required_plugin in required_plugins:
+                required_plugin_name = str(required_plugin).strip().lower()
+                if not required_plugin_name or required_plugin_name in plugin_names:
+                    continue
+                plugin_names.add(required_plugin_name)
+                pending.append(required_plugin_name)
+
        if plugin_name.lower() not in plugin_names:
            # Plugin not in whitelist - explicitly disabled
            enabled = False
--- a/archivebox/machine/migrations/0011_remove_binary_output_dir.py
+++ b/archivebox/machine/migrations/0011_remove_binary_output_dir.py
@@ -1,8 +1,18 @@
-# Generated by Django 6.0 on 2026-01-05 01:09
-
 from django.db import migrations


+def remove_output_dir_if_exists(apps, schema_editor):
+    """Drop the `output_dir` column from `machine_binary` only if it exists.
+
+    Makes the column removal safe to run against databases where the column
+    is already gone. Column presence is checked with SQLite's
+    `PRAGMA table_info`, so this function is SQLite-specific.
+
+    Args:
+        apps: historical app registry supplied by the migration runner.
+        schema_editor: schema editor bound to the database being migrated.
+    """
+    # Use the cursor as a context manager so it is closed once the column
+    # names have been read, instead of being left open.
+    with schema_editor.connection.cursor() as cursor:
+        cursor.execute("PRAGMA table_info(machine_binary)")
+        # table_info rows are (cid, name, type, ...); index 1 is the column name.
+        columns = {row[1] for row in cursor.fetchall()}
+
+    if 'output_dir' not in columns:
+        return
+
+    Binary = apps.get_model('machine', 'Binary')
+    schema_editor.remove_field(Binary, Binary._meta.get_field('output_dir'))
+
+
 class Migration(migrations.Migration):

    dependencies = [
@@ -10,8 +20,15 @@ class Migration(migrations.Migration):
    ]

    operations = [
-        migrations.RemoveField(
-            model_name='binary',
-            name='output_dir',
+        migrations.SeparateDatabaseAndState(
+            database_operations=[
+                migrations.RunPython(remove_output_dir_if_exists, migrations.RunPython.noop),
+            ],
+            state_operations=[
+                migrations.RemoveField(
+                    model_name='binary',
+                    name='output_dir',
+                ),
+            ],
        ),
    ]
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -352,7 +352,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):

        # Case 2: From binaries.json - create queued binary (needs installation)
        if 'binproviders' in record or ('overrides' in record and not abspath):
-            binary, created = Binary.objects.get_or_create(
+            binary, _ = Binary.objects.update_or_create(
                machine=machine,
                name=name,
                defaults={
--- a/archivebox/tests/fixtures.py
+++ b/archivebox/tests/fixtures.py
@@ -13,16 +13,16 @@ def process(tmp_path):
 def disable_extractors_dict():
    env = os.environ.copy()
    env.update({
-        "USE_WGET": "false",
-        "USE_SINGLEFILE": "false",
-        "USE_READABILITY": "false",
-        "USE_MERCURY": "false",
+        "SAVE_WGET": "false",
+        "SAVE_SINGLEFILE": "false",
+        "SAVE_READABILITY": "false",
+        "SAVE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
-        "USE_GIT": "false",
+        "SAVE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
--- a/archivebox/tests/test_crawl.py
+++ b/archivebox/tests/test_crawl.py
@@ -145,8 +145,8 @@ def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_
    assert snapshot is not None, "Should create at least one snapshot"


-def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict):
-    """Test that crawl creates a Seed object for input."""
+def test_crawl_persists_input_urls_on_crawl(tmp_path, process, disable_extractors_dict):
+    """Test that crawl input URLs are stored on the Crawl record."""
    os.chdir(tmp_path)

    subprocess.run(
@@ -158,10 +158,11 @@ def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict

    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
-    seed = c.execute("SELECT id FROM crawls_seed").fetchone()
+    crawl_urls = c.execute("SELECT urls FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
    conn.close()

-    assert seed is not None, "Seed should be created for crawl input"
+    assert crawl_urls is not None, "Crawl should be created for crawl input"
+    assert 'https://example.com' in crawl_urls[0], "Crawl should persist input URLs"


 class TestCrawlCLI:
@@ -178,7 +179,7 @@ class TestCrawlCLI:
        )

        assert result.returncode == 0
-        assert '--depth' in result.stdout or '-d' in result.stdout
+        assert 'create' in result.stdout


 if __name__ == '__main__':
--- a/archivebox/tests/test_extractors.py
+++ b/archivebox/tests/test_extractors.py
@@ -3,7 +3,7 @@ import json as pyjson


 def test_singlefile_works(tmp_path, process, disable_extractors_dict):
-    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
+    disable_extractors_dict.update({"SAVE_SINGLEFILE": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
                                  capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
@@ -11,7 +11,7 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
    assert output_file.exists()

 def test_readability_works(tmp_path, process, disable_extractors_dict):
-    disable_extractors_dict.update({"USE_READABILITY": "true"})
+    disable_extractors_dict.update({"SAVE_READABILITY": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
                                  capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
@@ -27,7 +27,7 @@ def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
    assert output_file.exists()

 def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
-    disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"})
+    disable_extractors_dict.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
                                  capture_output=True, env=disable_extractors_dict)
    output_str = add_process.stdout.decode("utf-8")
--- a/archivebox/tests/test_real_world_add.py
+++ b/archivebox/tests/test_real_world_add.py
@@ -39,6 +39,17 @@ def test_add_real_world_example_domain(tmp_path):
    env = os.environ.copy()
    env["TMP_DIR"] = str(tmp_short)
    env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
+    env["SAVE_TITLE"] = "True"
+    env["SAVE_WGET"] = "True"
+    env["SAVE_SINGLEFILE"] = "True"
+    env["SAVE_READABILITY"] = "False"
+    env["SAVE_HTMLTOTEXT"] = "True"
+    env["SAVE_HEADERS"] = "True"
+    env["SAVE_PDF"] = "False"
+    env["SAVE_SCREENSHOT"] = "False"
+    env["SAVE_ARCHIVEDOTORG"] = "False"
+    env["SAVE_YTDLP"] = "False"
+    env["SAVE_GIT"] = "False"

    init = subprocess.run(
        ["archivebox", "init"],
@@ -50,7 +61,7 @@ def test_add_real_world_example_domain(tmp_path):
    assert init.returncode == 0, f"archivebox init failed: {init.stderr}"

    result = subprocess.run(
-        ["archivebox", "add", "https://example.com"],
+        ["archivebox", "add", "--plugins=title,wget,singlefile,htmltotext,headers", "https://example.com"],
        capture_output=True,
        text=True,
        timeout=900,
@@ -115,19 +126,13 @@ def test_add_real_world_example_domain(tmp_path):
    )

    text_hits = 0
-    for path in (
-        *snapshot_dir.glob("*_readability/content.txt"),
-        snapshot_dir / "readability" / "content.txt",
-    ):
-        if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
-            text_hits += 1
    for path in (
        *snapshot_dir.glob("*_htmltotext/htmltotext.txt"),
        snapshot_dir / "htmltotext" / "htmltotext.txt",
    ):
        if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
            text_hits += 1
-    assert text_hits >= 2, (
-        "Expected multiple text extractors to contain Example Domain "
-        f"(readability/htmltotext hits={text_hits})."
+    assert text_hits >= 1, (
+        "Expected htmltotext output to contain Example Domain "
+        f"(htmltotext hits={text_hits})."
    )
--- a/archivebox/tests/test_recursive_crawl.py
+++ b/archivebox/tests/test_recursive_crawl.py
@@ -22,16 +22,16 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
    env = os.environ.copy()
    env.update({
        # Disable most extractors
-        "USE_WGET": "false",
-        "USE_SINGLEFILE": "false",
-        "USE_READABILITY": "false",
-        "USE_MERCURY": "false",
+        "SAVE_WGET": "false",
+        "SAVE_SINGLEFILE": "false",
+        "SAVE_READABILITY": "false",
+        "SAVE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
-        "USE_GIT": "false",
+        "SAVE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
@@ -122,16 +122,16 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
    # Enable only parse_html_urls for this test
    env = os.environ.copy()
    env.update({
-        "USE_WGET": "false",
-        "USE_SINGLEFILE": "false",
-        "USE_READABILITY": "false",
-        "USE_MERCURY": "false",
+        "SAVE_WGET": "false",
+        "SAVE_SINGLEFILE": "false",
+        "SAVE_READABILITY": "false",
+        "SAVE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
-        "USE_GIT": "false",
+        "SAVE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
@@ -202,12 +202,22 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
    env = os.environ.copy()
    env.update({
        "URL_ALLOWLIST": r"monadical\.com/.*",  # Only crawl same domain
+        "SAVE_READABILITY": "false",
+        "SAVE_SINGLEFILE": "false",
+        "SAVE_MERCURY": "false",
+        "SAVE_SCREENSHOT": "false",
+        "SAVE_PDF": "false",
+        "SAVE_HEADERS": "false",
+        "SAVE_ARCHIVEDOTORG": "false",
+        "SAVE_GIT": "false",
+        "SAVE_YTDLP": "false",
+        "SAVE_TITLE": "false",
    })

    # Start a crawl with depth=1 (just one hop to test recursive crawling)
    # Use file:// URL so it's instant, no network fetch needed
    proc = subprocess.Popen(
-        ['archivebox', 'add', '--depth=1', f'file://{test_html}'],
+        ['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', f'file://{test_html}'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
--- a/archivebox/tests/test_remove.py
+++ b/archivebox/tests/test_remove.py
@@ -12,7 +12,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
    # Verify snapshot exists
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
-    count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
+    count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
    conn.close()
    assert count_before >= 1

@@ -24,7 +24,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
-    count = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
+    count = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
    conn.close()

    assert count == 0
@@ -59,7 +59,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
-    count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
+    count_before = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
    conn.close()
    assert count_before >= 2

@@ -67,7 +67,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
-    count_after = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
+    count_after = c.execute("SELECT COUNT() FROM core_snapshot").fetchone()[0]
    conn.close()
    assert count_after == 0

@@ -80,7 +80,7 @@ def test_add_creates_crawls(tmp_path, process, disable_extractors_dict):

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
-    crawl_count = c.execute("SELECT COUNT() from archivebox.crawls.crawl").fetchone()[0]
+    crawl_count = c.execute("SELECT COUNT() FROM crawls_crawl").fetchone()[0]
    conn.close()

    assert crawl_count == 2
--- a/archivebox/tests/test_savepagenow.py
+++ b/archivebox/tests/test_savepagenow.py
@@ -9,7 +9,10 @@ from pathlib import Path
 from archivebox.tests.conftest import create_test_url


-def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool):
+ADMIN_HOST = 'admin.archivebox.localhost:8000'
+
+
+def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
    project_root = Path(__file__).resolve().parents[2]
    script = textwrap.dedent(
        f"""
@@ -31,7 +34,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte

        target_url = {request_url!r}

-        resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
+        resp = client.get('/web/' + target_url, HTTP_HOST={host!r})
        assert resp.status_code == 302, resp.status_code

        snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
@@ -46,7 +49,7 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
            )
        assert resp['Location'] == f"/{{snapshot.url_path}}"

-        resp2 = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
+        resp2 = client.get('/web/' + target_url, HTTP_HOST={host!r})
        assert resp2.status_code == 302, resp2.status_code
        assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
        assert resp2['Location'] == f"/{{snapshot.url_path}}"
@@ -208,7 +211,7 @@ def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
    """/web/https://... should work for authenticated users even when public add is off."""
    url = create_test_url(domain='example.com', path='savepagenow-auth')
    request_url = url.replace('https://', '')
-    result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False)
+    result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False, host=ADMIN_HOST)
    assert result.returncode == 0, (
        "SavePageNow shortcut (logged-in) test failed.\n"
        f"stdout:\n{result.stdout}\n"
@@ -220,7 +223,7 @@ def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
    """/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login."""
    url = create_test_url(domain='example.com', path='savepagenow-public')
    request_url = url.replace('https://', '')
-    result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True)
+    result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True, host='web.archivebox.localhost:8000')
    assert result.returncode == 0, (
        "SavePageNow shortcut (public add) test failed.\n"
        f"stdout:\n{result.stdout}\n"
--- a/archivebox/tests/test_title.py
+++ b/archivebox/tests/test_title.py
@@ -6,14 +6,19 @@ from .fixtures import *
 def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
    """Test that title is extracted from the page."""
    disable_extractors_dict.update({"SAVE_TITLE": "true"})
-    subprocess.run(['archivebox', 'add', 'https://example.com'],
-                                 capture_output=True, env=disable_extractors_dict)
+    add_process = subprocess.run(
+        ['archivebox', 'add', '--plugins=title', 'https://example.com'],
+        capture_output=True,
+        text=True,
+        env=disable_extractors_dict,
+    )
+    assert add_process.returncode == 0, add_process.stderr or add_process.stdout

    os.chdir(tmp_path)
    conn = sqlite3.connect("index.sqlite3")
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
-    c.execute("SELECT title from archivebox.core.snapshot")
+    c.execute("SELECT title FROM core_snapshot")
    snapshot = c.fetchone()
    conn.close()

@@ -27,8 +32,13 @@ def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractor
    and breaks the layout.
    """
    disable_extractors_dict.update({"SAVE_TITLE": "true"})
-    subprocess.run(['archivebox', 'add', 'https://example.com'],
-                                 capture_output=True, env=disable_extractors_dict)
+    add_process = subprocess.run(
+        ['archivebox', 'add', '--plugins=title', 'https://example.com'],
+        capture_output=True,
+        text=True,
+        env=disable_extractors_dict,
+    )
+    assert add_process.returncode == 0, add_process.stderr or add_process.stdout
    list_process = subprocess.run(["archivebox", "list", "--html"], capture_output=True)

    # Should not contain unescaped HTML tags in output
--- a/archivebox/tests/test_util.py
+++ b/archivebox/tests/test_util.py
@@ -1,5 +1,30 @@
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from threading import Thread
+
 from archivebox.misc.util import download_url

+
+class _ExampleHandler(BaseHTTPRequestHandler):
+    """Minimal HTTP handler that answers every GET with a fixed HTML page.
+
+    The page contains the text "Example Domain", so tests can check
+    downloaded content without contacting a real website.
+    """
+
+    def do_GET(self):
+        # Same static body for every path; Content-Length is sent explicitly.
+        body = b"<html><body><h1>Example Domain</h1></body></html>"
+        self.send_response(200)
+        self.send_header("Content-Type", "text/html; charset=utf-8")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, format, *args):
+        # Override the default per-request logging so test output stays quiet.
+        return
+
def test_download_url_downloads_content():
-    text = download_url("https://example.com")
+    # Serve the fixture page from a local HTTP server bound to port 0 and read
+    # the actual port back from server_address, so no external network is needed.
+    server = ThreadingHTTPServer(("127.0.0.1", 0), _ExampleHandler)
+    thread = Thread(target=server.serve_forever, daemon=True)
+    thread.start()
+    try:
+        text = download_url(f"http://127.0.0.1:{server.server_address[1]}/")
+    finally:
+        # Always stop the server and release the socket, even if the download fails.
+        server.shutdown()
+        server.server_close()
+        thread.join(timeout=5)
+
    assert "Example Domain" in text