move tests into subfolder, add missing install hooks

Nick Sweeting
2026-01-02 00:22:07 -08:00
parent c2afb40350
commit 65ee09ceab
80 changed files with 2659 additions and 859 deletions

.gitignore vendored
View File

@@ -39,11 +39,13 @@ tmp/
data/
data*/
output/
logs/
index.sqlite3
queue.sqlite3
*.sqlite*
data.*
.archivebox_id
ArchiveBox.conf
# vim
*.sw?

View File

@@ -158,6 +158,63 @@ env['SAVE_FAVICON'] = 'False'
#### Timeout Settings
Use appropriate timeouts for migration tests (45s for init, 60s default).
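For example, a CLI-based migration test might apply those budgets like this (a minimal sketch; the test name and pytest `tmp_path` fixture are illustrative, not taken from the actual test suite):
```python
import subprocess

def test_init_then_status(tmp_path):
    # `archivebox init` sets up the DB/index, so it gets the longer 45s budget
    init = subprocess.run(['archivebox', 'init'], cwd=tmp_path,
                          capture_output=True, timeout=45)
    assert init.returncode == 0
    # Subsequent commands use the default 60s timeout
    status = subprocess.run(['archivebox', 'status'], cwd=tmp_path,
                            capture_output=True, timeout=60)
    assert status.returncode == 0
```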
### Plugin Testing & Code Coverage
**Target: 80-90% coverage** for critical plugins (screenshot, chrome, singlefile, dom)
```bash
# Run plugin tests with coverage (both Python + JavaScript)
bash bin/test_plugins.sh screenshot
# View coverage reports
bash bin/test_plugins.sh --coverage-report
# Or individual reports:
coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'
```
#### Plugin Test Structure
Tests are **completely isolated** from ArchiveBox - they replicate the production directory structure in temp dirs:
```python
# Correct production paths:
# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
# Crawl-level plugin (e.g., chrome launcher)
crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / 'crawl-123'
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True)
# Snapshot-level plugin (e.g., screenshot)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-456'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Run hook in its output directory
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=snap-456'],
cwd=str(screenshot_dir),
env=get_test_env(),
capture_output=True,
timeout=120
)
```
#### Coverage Improvement Loop
To improve from ~20% to 80%+:
1. **Run tests**: `bash bin/test_plugins.sh screenshot` → Shows: `19.1% (13/68 ranges)`
2. **Identify gaps**: Check hook file for untested paths (session connection vs fallback, config branches, error cases)
3. **Add tests**: Test both execution paths (connect to session + launch own browser), skip conditions, error cases, config variations
4. **Verify**: Re-run tests → Should show: `85%+ (58+/68 ranges)`
**Critical**: JavaScript hooks have TWO paths that both must be tested (connect to session ~50% + launch browser ~30% + shared ~20%). Testing only one path = max 50% coverage possible!
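A minimal sketch of covering both paths, reusing the isolated temp-dir layout above (`get_test_env()` and `SCREENSHOT_HOOK` come from the plugin's test helpers as in the earlier example; the `CHROME_CDP_URL` variable and `chrome_session_cdp_url` fixture are hypothetical stand-ins for however the hook actually detects an existing session):
```python
import subprocess
import tempfile
from pathlib import Path

def run_screenshot_hook(extra_env: dict) -> subprocess.CompletedProcess:
    """Run the screenshot hook in an isolated snapshot output dir."""
    with tempfile.TemporaryDirectory() as tmpdir:
        out_dir = (Path(tmpdir) / 'users' / 'testuser' / 'snapshots' /
                   '20240101' / 'example.com' / 'snap-456' / 'screenshot')
        out_dir.mkdir(parents=True)
        env = {**get_test_env(), **extra_env}
        return subprocess.run(
            ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=snap-456'],
            cwd=str(out_dir), env=env, capture_output=True, timeout=120,
        )

def test_launches_own_browser():
    # Fallback path: no session available, the hook must launch its own browser
    assert run_screenshot_hook({}).returncode == 0

def test_connects_to_existing_session(chrome_session_cdp_url):
    # Session path: point the hook at an already-running Chrome (fixture is hypothetical)
    assert run_screenshot_hook({'CHROME_CDP_URL': chrome_session_cdp_url}).returncode == 0
```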
## Database Migrations
### Generate and Apply Migrations

View File

@@ -41,9 +41,11 @@ class ArchiveBoxGroup(click.Group):
archive_commands = {
# High-level commands
'add': 'archivebox.cli.archivebox_add.main',
'remove': 'archivebox.cli.archivebox_remove.main',
'run': 'archivebox.cli.archivebox_run.main',
'update': 'archivebox.cli.archivebox_update.main',
'status': 'archivebox.cli.archivebox_status.main',
'search': 'archivebox.cli.archivebox_search.main',
'config': 'archivebox.cli.archivebox_config.main',
'schedule': 'archivebox.cli.archivebox_schedule.main',
'server': 'archivebox.cli.archivebox_server.main',

View File

@@ -13,8 +13,15 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def install(dry_run: bool=False) -> None:
"""Detect and install ArchiveBox dependencies by running a dependency-check crawl"""
def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None:
"""Detect and install ArchiveBox dependencies by running a dependency-check crawl
Examples:
archivebox install # Install all dependencies
archivebox install wget curl # Install only wget and curl
archivebox install --binproviders=pip yt-dlp # Install yt-dlp using only pip
archivebox install --binproviders=brew,apt # Install all deps using only brew or apt
"""
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.config.paths import ARCHIVE_DIR
@@ -24,7 +31,14 @@ def install(dry_run: bool=False) -> None:
if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
init() # must init full index because we need a db to store Binary entries in
print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')
# Show what we're installing
if binaries:
print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]')
else:
print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]')
if binproviders != '*':
print(f'[green][+] Using providers: {binproviders}[/green]')
if IS_ROOT:
EUID = os.geteuid()
@@ -49,6 +63,19 @@ def install(dry_run: bool=False) -> None:
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
# Build config for this crawl using existing PLUGINS filter
crawl_config = {}
# Combine binary names and provider names into PLUGINS list
plugins = []
if binaries:
plugins.extend(binaries)
if binproviders != '*':
plugins.extend(binproviders.split(','))
if plugins:
crawl_config['PLUGINS'] = ','.join(plugins)
crawl, created = Crawl.objects.get_or_create(
urls='archivebox://install',
defaults={
@@ -56,6 +83,7 @@ def install(dry_run: bool=False) -> None:
'created_by_id': created_by_id,
'max_depth': 0,
'status': 'queued',
'config': crawl_config,
}
)
@@ -63,9 +91,12 @@ def install(dry_run: bool=False) -> None:
if not created:
crawl.status = 'queued'
crawl.retry_at = timezone.now()
crawl.config = crawl_config # Update config
crawl.save()
print(f'[+] Created dependency detection crawl: {crawl.id}')
if crawl_config:
print(f'[+] Crawl config: {crawl_config}')
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
# Verify the crawl is in the queue
@@ -100,15 +131,15 @@ def install(dry_run: bool=False) -> None:
print()
# Run version to show full status
archivebox_path = shutil.which('archivebox') or sys.executable
if 'python' in archivebox_path:
os.system(f'{sys.executable} -m archivebox version')
else:
os.system(f'{archivebox_path} version')
# Show version to display full status including installed binaries
# Django is already loaded, so just import and call the function directly
from archivebox.cli.archivebox_version import version as show_version
show_version(quiet=False)
@click.command()
@click.argument('binaries', nargs=-1, type=str, required=False)
@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True)
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
@docstring(install.__doc__)
def main(**kwargs) -> None:

View File

@@ -50,6 +50,9 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
if filter_patterns:
result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
# Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
result = result.select_related('crawl', 'crawl__created_by')
if not result:
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')

View File

@@ -145,16 +145,29 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
# Check if needs migration (0.8.x → 0.9.x)
if snapshot.fs_migration_needed:
try:
snapshot.save() # Triggers migration + creates symlink
# Manually trigger filesystem migration without full save()
# This avoids UNIQUE constraint issues while still migrating files
cleanup_info = None
if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'):
cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0()
# Update only fs_version field using queryset update (bypasses validation)
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
# Commit the transaction
transaction.commit()
# Manually call cleanup since we bypassed normal save() flow
if cleanup_info:
old_dir, new_dir = cleanup_info
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
stats['migrated'] += 1
print(f" [{stats['processed']}] Migrated: {entry_path.name}")
except Exception as e:
# Snapshot already exists in DB with different crawl - skip it
if 'UNIQUE constraint failed' in str(e):
stats['skipped'] += 1
print(f" [{stats['processed']}] Skipped (already in DB): {entry_path.name}")
else:
raise
stats['skipped'] += 1
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
else:
stats['skipped'] += 1

View File

@@ -104,40 +104,47 @@ def version(quiet: bool=False,
failures = []
# Setup Django before importing models
from archivebox.config.django import setup_django
setup_django()
try:
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Machine, Binary
from archivebox.machine.models import Machine, Binary
machine = Machine.current()
machine = Machine.current()
# Get all binaries from the database
all_installed = Binary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
# Get all binaries from the database with timeout protection
all_installed = Binary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
if not all_installed.exists():
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
else:
for installed in all_installed:
# Skip if user specified specific binaries and this isn't one
if binaries and installed.name not in binaries:
continue
if not all_installed.exists():
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
else:
for installed in all_installed:
# Skip if user specified specific binaries and this isn't one
if binaries and installed.name not in binaries:
continue
if installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
else:
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(installed.name)
if installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
else:
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(installed.name)
# Show hint if no binaries are installed yet
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
if not has_any_installed:
# Show hint if no binaries are installed yet
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
if not has_any_installed:
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
except Exception as e:
# Handle database errors gracefully (locked, missing, etc.)
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]')
prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]')
if not binaries:
# Show code and data locations

View File

@@ -116,7 +116,7 @@ def upgrade_core_tables(apps, schema_editor):
retry_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
fs_version VARCHAR(10) NOT NULL DEFAULT '0.8.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
@@ -326,6 +326,16 @@ class Migration(migrations.Migration):
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# Declare fs_version (already created in database with DEFAULT '0.8.0')
migrations.AddField(
model_name='snapshot',
name='fs_version',
field=models.CharField(
max_length=10,
default='0.8.0',
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
),
),
# SnapshotTag table already exists from v0.7.2, just declare it in state
migrations.CreateModel(

View File

@@ -150,11 +150,7 @@ class Migration(migrations.Migration):
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AddField(
model_name='snapshot',
name='fs_version',
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
),
# NOTE: fs_version already added by migration 0023 with default='0.8.0'
# NOTE: modified_at already added by migration 0023
migrations.AddField(
model_name='snapshot',

View File

@@ -8,7 +8,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
('machine', '0003_add_process_type_and_parent'),
('machine', '0007_add_process_type_and_parent'),
]
operations = [

View File

@@ -0,0 +1,388 @@
# Generated by hand on 2026-01-01
# Copies ArchiveResult cmd/pwd/cmd_version data to Process records before removing old fields
from django.db import migrations, connection
import json
from pathlib import Path
def parse_cmd_field(cmd_raw):
"""
Parse cmd field which could be:
1. JSON array string: '["wget", "-p", "url"]'
2. Space-separated string: 'wget -p url'
3. NULL/empty
Returns list of strings.
"""
if not cmd_raw:
return []
cmd_raw = cmd_raw.strip()
if not cmd_raw:
return []
# Try to parse as JSON first
if cmd_raw.startswith('['):
try:
parsed = json.loads(cmd_raw)
if isinstance(parsed, list):
return [str(x) for x in parsed]
except json.JSONDecodeError:
pass
# Fallback: split by spaces (simple approach, doesn't handle quoted strings)
# This is acceptable since old cmd fields were mostly simple commands
return cmd_raw.split()
def get_or_create_current_machine(cursor):
"""Get or create Machine.current() using raw SQL."""
import uuid
import socket
from datetime import datetime
# Simple machine detection - get hostname as guid
hostname = socket.gethostname()
guid = f'host_{hostname}' # Simple but stable identifier
# Check if machine exists
cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid])
row = cursor.fetchone()
if row:
return row[0]
# Create new machine
machine_id = str(uuid.uuid4())
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
cursor.execute("PRAGMA table_info(machine_machine)")
machine_cols = {row[1] for row in cursor.fetchall()}
# Build INSERT statement based on available columns
if 'config' in machine_cols:
# 0.9.x schema with config column
cursor.execute("""
INSERT INTO machine_machine (
id, created_at, modified_at, guid, hostname,
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
os_arch, os_family, os_platform, os_release, os_kernel,
stats, config, num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
'', '', '', '', '', '{}', '{}', 0, 0)
""", [machine_id, now, now, guid, hostname])
else:
# 0.8.x schema without config column
cursor.execute("""
INSERT INTO machine_machine (
id, created_at, modified_at, guid, hostname,
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
os_arch, os_family, os_platform, os_release, os_kernel,
stats, num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
'', '', '', '', '', '{}', 0, 0)
""", [machine_id, now, now, guid, hostname])
return machine_id
def get_or_create_binary(cursor, machine_id, name, abspath, version):
"""
Get or create Binary record.
Args:
cursor: DB cursor
machine_id: Machine FK
name: Binary name (basename of command)
abspath: Absolute path to binary (or just name if path unknown)
version: Version string
Returns:
binary_id (str)
"""
import uuid
from datetime import datetime
# If abspath is just a name without slashes, it's not a full path
# Store it in both fields for simplicity
if '/' not in abspath:
# Not a full path - store as-is
pass
# Check if binary exists with same machine, name, abspath, version
cursor.execute("""
SELECT id FROM machine_binary
WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ?
""", [machine_id, name, abspath, version])
row = cursor.fetchone()
if row:
return row[0]
# Create new binary
binary_id = str(uuid.uuid4())
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
cursor.execute("PRAGMA table_info(machine_binary)")
binary_cols = {row[1] for row in cursor.fetchall()}
# Use only columns that exist in current schema
# 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded
# 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir
if 'binproviders' in binary_cols:
# 0.9.x schema
cursor.execute("""
INSERT INTO machine_binary (
id, created_at, modified_at, machine_id,
name, binproviders, overrides, binprovider, abspath, version, sha256,
status, retry_at, output_dir,
num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '',
'succeeded', NULL, '', 0, 0)
""", [binary_id, now, now, machine_id, name, abspath, version])
else:
# 0.8.x schema (simpler)
cursor.execute("""
INSERT INTO machine_binary (
id, created_at, modified_at, machine_id,
name, binprovider, abspath, version, sha256,
num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0)
""", [binary_id, now, now, machine_id, name, abspath, version])
return binary_id
def map_status(old_status):
"""
Map old ArchiveResult status to Process status and exit_code.
Args:
old_status: One of: queued, started, backoff, succeeded, failed, skipped
Returns:
(process_status, exit_code) tuple
"""
status_map = {
'queued': ('queued', None),
'started': ('running', None),
'backoff': ('queued', None),
'succeeded': ('exited', 0),
'failed': ('exited', 1),
'skipped': ('exited', None), # Skipped = exited without error
}
return status_map.get(old_status, ('queued', None))
def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id):
"""
Create a Process record.
Returns:
process_id (str)
"""
import uuid
from datetime import datetime
process_id = str(uuid.uuid4())
now = datetime.now().isoformat()
# Convert cmd array to JSON
cmd_json = json.dumps(cmd)
# Set retry_at to now for queued processes, NULL otherwise
retry_at = now if status == 'queued' else None
cursor.execute("""
INSERT INTO machine_process (
id, created_at, modified_at, machine_id, parent_id, process_type,
pwd, cmd, env, timeout,
pid, exit_code, stdout, stderr,
started_at, ended_at,
binary_id, iface_id, url,
status, retry_at
) VALUES (?, ?, ?, ?, NULL, 'cli',
?, ?, '{}', 120,
NULL, ?, '', '',
?, ?,
?, NULL, NULL,
?, ?)
""", [
process_id, now, now, machine_id,
pwd, cmd_json,
exit_code,
started_at, ended_at,
binary_id,
status, retry_at
])
return process_id
def copy_archiveresult_data_to_process(apps, schema_editor):
"""
Copy old ArchiveResult execution data (cmd, pwd, cmd_version) to Process records.
For each ArchiveResult without a process_id:
1. Parse cmd field (handle both JSON array and space-separated string)
2. Extract binary name/path from cmd[0]
3. Get or create Binary record with machine, name, abspath, version
4. Create Process record with mapped fields
5. Link ArchiveResult.process_id to new Process
Status mapping:
- queued → queued (exit_code=None)
- started → running (exit_code=None)
- backoff → queued (exit_code=None)
- succeeded → exited (exit_code=0)
- failed → exited (exit_code=1)
- skipped → exited (exit_code=None)
"""
cursor = connection.cursor()
# Check if old fields still exist (skip if fresh install or already migrated)
cursor.execute("PRAGMA table_info(core_archiveresult)")
cols = {row[1] for row in cursor.fetchall()}
print(f'DEBUG 0027: Columns found: {sorted(cols)}')
print(f'DEBUG 0027: Has cmd={("cmd" in cols)}, pwd={("pwd" in cols)}, cmd_version={("cmd_version" in cols)}, process_id={("process_id" in cols)}')
if 'cmd' not in cols or 'pwd' not in cols or 'cmd_version' not in cols:
print('✓ Fresh install or fields already removed - skipping data copy')
return
# Check if process_id field exists (should exist from 0026)
if 'process_id' not in cols:
print('✗ ERROR: process_id field not found. Migration 0026 must run first.')
return
# Get or create Machine.current()
machine_id = get_or_create_current_machine(cursor)
# Get ArchiveResults without process_id that have cmd data
# Use plugin (extractor was renamed to plugin in migration 0025)
cursor.execute("""
SELECT id, snapshot_id, plugin, cmd, pwd, cmd_version,
status, start_ts, end_ts, created_at
FROM core_archiveresult
WHERE process_id IS NULL
AND (cmd IS NOT NULL OR pwd IS NOT NULL)
""")
results = cursor.fetchall()
if not results:
print('✓ No ArchiveResults need Process migration')
return
print(f'Migrating {len(results)} ArchiveResults to Process records...')
migrated_count = 0
skipped_count = 0
error_count = 0
for i, row in enumerate(results):
ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row
if i == 0:
print(f'DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}')
try:
# Parse cmd field
cmd_array = parse_cmd_field(cmd_raw)
if i == 0:
print(f'DEBUG 0027: Parsed cmd: {cmd_array}')
# Extract binary info from cmd[0] if available
binary_id = None
if cmd_array and cmd_array[0]:
binary_name = Path(cmd_array[0]).name or plugin # Fallback to plugin name
binary_abspath = cmd_array[0]
binary_version = cmd_version or ''
# Get or create Binary record
binary_id = get_or_create_binary(
cursor, machine_id, binary_name, binary_abspath, binary_version
)
if i == 0:
print(f'DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}')
# Map status
process_status, exit_code = map_status(status)
# Set timestamps
started_at = start_ts or created_at
ended_at = end_ts if process_status == 'exited' else None
# Create Process record
process_id = create_process(
cursor=cursor,
machine_id=machine_id,
pwd=pwd or '',
cmd=cmd_array,
status=process_status,
exit_code=exit_code,
started_at=started_at,
ended_at=ended_at,
binary_id=binary_id,
)
if i == 0:
print(f'DEBUG 0027: Created Process: id={process_id}')
# Link ArchiveResult to Process
cursor.execute(
"UPDATE core_archiveresult SET process_id = ? WHERE id = ?",
[process_id, ar_id]
)
migrated_count += 1
if i == 0:
print(f'DEBUG 0027: Linked ArchiveResult to Process')
except Exception as e:
print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')
import traceback
traceback.print_exc()
error_count += 1
continue
print(f'✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors')
class Migration(migrations.Migration):
dependencies = [
('core', '0026_add_process_to_archiveresult'),
('machine', '0007_add_process_type_and_parent'),
]
operations = [
# First, copy data from old fields to Process
migrations.RunPython(
copy_archiveresult_data_to_process,
reverse_code=migrations.RunPython.noop,
),
# Now safe to remove old fields (moved from 0025)
migrations.RemoveField(
model_name='archiveresult',
name='cmd',
),
migrations.RemoveField(
model_name='archiveresult',
name='cmd_version',
),
migrations.RemoveField(
model_name='archiveresult',
name='pwd',
),
]

View File

@@ -362,24 +362,22 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Migrate filesystem if needed (happens automatically on save)
if self.pk and self.fs_migration_needed:
from django.db import transaction
with transaction.atomic():
# Walk through migration chain automatically
current = self.fs_version
target = self._fs_current_version()
# Walk through migration chain automatically
current = self.fs_version
target = self._fs_current_version()
while current != target:
next_ver = self._fs_next_version(current)
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
while current != target:
next_ver = self._fs_next_version(current)
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
# Only run if method exists (most are no-ops)
if hasattr(self, method):
getattr(self, method)()
# Only run if method exists (most are no-ops)
if hasattr(self, method):
getattr(self, method)()
current = next_ver
current = next_ver
# Update version (still in transaction)
self.fs_version = target
# Update version
self.fs_version = target
super().save(*args, **kwargs)
if self.url not in self.crawl.urls:
@@ -486,33 +484,58 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Convert index.json to index.jsonl in the new directory
self.convert_index_json_to_jsonl()
# Create backwards-compat symlink (INSIDE transaction)
symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
if symlink_path.is_symlink():
symlink_path.unlink()
# Schedule cleanup AFTER transaction commits successfully
# This ensures DB changes are committed before we delete old files
from django.db import transaction
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
if not symlink_path.exists() or symlink_path == old_dir:
symlink_path.symlink_to(new_dir, target_is_directory=True)
# Return cleanup info for manual cleanup if needed (when called directly)
return (old_dir, new_dir)
# Schedule old directory deletion AFTER transaction commits
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir))
def _cleanup_old_migration_dir(self, old_dir: Path):
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
"""
Delete old directory after successful migration.
Delete old directory and create symlink after successful migration.
Called via transaction.on_commit() after DB commit succeeds.
"""
import shutil
import logging
print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
# Delete old directory
if old_dir.exists() and not old_dir.is_symlink():
print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
try:
shutil.rmtree(old_dir)
print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
except Exception as e:
# Log but don't raise - migration succeeded, this is just cleanup
print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not remove old migration directory {old_dir}: {e}"
)
return # Don't create symlink if cleanup failed
else:
print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
# Create backwards-compat symlink (after old dir is deleted)
symlink_path = old_dir # Same path as old_dir
if symlink_path.is_symlink():
print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
symlink_path.unlink()
if not symlink_path.exists():
print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
try:
symlink_path.symlink_to(new_dir, target_is_directory=True)
print(f"[DEBUG] Successfully created symlink")
except Exception as e:
print(f"[DEBUG] Failed to create symlink: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
)
else:
print(f"[DEBUG] Symlink path already exists: {symlink_path}")
# =========================================================================
# Path Calculation and Migration Helpers
@@ -1616,8 +1639,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
This enables step-based execution where all hooks in a step can run in parallel.
"""
from archivebox.hooks import discover_hooks
from archivebox.config.configset import get_config
hooks = discover_hooks('Snapshot')
# Get merged config with crawl-specific PLUGINS filter
config = get_config(crawl=self.crawl, snapshot=self)
hooks = discover_hooks('Snapshot', config=config)
archiveresults = []
for hook_path in hooks:
@@ -2212,22 +2238,19 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
started = State(value=Snapshot.StatusChoices.STARTED)
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
# Tick Event
# Tick Event (polled by workers)
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished', on='on_started_to_started') |
started.to(sealed, cond='is_finished')
queued.to(started, cond='can_start')
)
# Manual event (triggered by last ArchiveResult finishing)
seal = started.to(sealed)
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
return can_start
def is_finished(self) -> bool:
"""Check if snapshot processing is complete - delegates to model method."""
return self.snapshot.is_finished_processing()
@queued.enter
def enter_queued(self):
self.snapshot.update_and_requeue(
@@ -2237,29 +2260,34 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
@started.enter
def enter_started(self):
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
)
import sys
print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
# Run the snapshot - creates pending archiveresults for all enabled plugins
self.snapshot.run()
# unlock the snapshot after we're done + set status = started
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
status=Snapshot.StatusChoices.STARTED,
)
# Check if any archiveresults were created
ar_count = self.snapshot.archiveresult_set.count()
print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
def on_started_to_started(self):
"""Called when Snapshot stays in started state (archiveresults not finished yet)."""
# Bump retry_at so we check again in a few seconds
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=5),
)
if ar_count == 0:
# No archiveresults created, seal immediately
print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
self.seal()
else:
# Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
# Last AR will manually call self.seal() when done
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(days=365),
status=Snapshot.StatusChoices.STARTED,
)
print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
@sealed.enter
def enter_sealed(self):
import sys
# Clean up background hooks
self.snapshot.cleanup()
@@ -2268,6 +2296,21 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
status=Snapshot.StatusChoices.SEALED,
)
print(f'[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]', file=sys.stderr)
# Check if this is the last snapshot for the parent crawl - if so, seal the crawl
if self.snapshot.crawl:
crawl = self.snapshot.crawl
remaining_active = Snapshot.objects.filter(
crawl=crawl,
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
).count()
if remaining_active == 0:
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
# Seal the parent crawl
crawl.sm.seal()
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
class StatusChoices(models.TextChoices):
@@ -3102,8 +3145,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
end_ts=None,
)
def _check_and_seal_parent_snapshot(self):
"""Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
import sys
snapshot = self.archiveresult.snapshot
# Check if all archiveresults are finished (in final states)
remaining_active = snapshot.archiveresult_set.exclude(
status__in=[
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
]
).count()
if remaining_active == 0:
print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
# Seal the parent snapshot
snapshot.sm.seal()
@succeeded.enter
def enter_succeeded(self):
import sys
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
@@ -3113,8 +3178,15 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
self.archiveresult.cascade_health_update(success=True)
print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)
# Check if this is the last AR to finish - seal parent snapshot if so
self._check_and_seal_parent_snapshot()
@failed.enter
def enter_failed(self):
import sys
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -3124,16 +3196,25 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
self.archiveresult.cascade_health_update(success=False)
print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)
# Check if this is the last AR to finish - seal parent snapshot if so
self._check_and_seal_parent_snapshot()
@skipped.enter
def enter_skipped(self):
import sys
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,
end_ts=timezone.now(),
)
def after_transition(self, event: str, source: State, target: State):
self.archiveresult.snapshot.update_and_requeue() # bump snapshot retry time so it picks up all the new changes
print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)
# Check if this is the last AR to finish - seal parent snapshot if so
self._check_and_seal_parent_snapshot()
# =============================================================================

View File

@@ -240,19 +240,26 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if not first_url:
raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')
# Try to get existing snapshot
try:
return Snapshot.objects.get(crawl=self, url=first_url)
snapshot = Snapshot.objects.get(crawl=self, url=first_url)
# If exists and already queued/started, return it as-is
if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]:
# Update retry_at to now so it can be picked up immediately
snapshot.retry_at = timezone.now()
snapshot.save(update_fields=['retry_at'])
return snapshot
except Snapshot.DoesNotExist:
pass
root_snapshot, _ = Snapshot.objects.update_or_create(
crawl=self, url=first_url,
defaults={
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'depth': 0,
},
# Create new snapshot
root_snapshot = Snapshot.objects.create(
crawl=self,
url=first_url,
status=Snapshot.INITIAL_STATE,
retry_at=timezone.now(),
timestamp=str(timezone.now().timestamp()),
depth=0,
)
return root_snapshot
@@ -362,14 +369,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return created_snapshots
def run(self) -> 'Snapshot':
def run(self) -> 'Snapshot | None':
"""
Execute this Crawl: run hooks, process JSONL, create snapshots.
Called by the state machine when entering the 'started' state.
Returns:
The root Snapshot for this crawl
The root Snapshot for this crawl, or None for system crawls that don't create snapshots
"""
import time
from pathlib import Path
@@ -407,8 +414,18 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Foreground hook - process JSONL records
records = result.get('records', [])
if records:
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
for record in records[:3]: # Show first 3
print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
overrides = {'crawl': self}
process_hook_records(records, overrides=overrides)
stats = process_hook_records(records, overrides=overrides)
if stats:
print(f'[green]✓ Created: {stats}[/green]')
# System crawls (archivebox://*) don't create snapshots - they just run hooks
if first_url.startswith('archivebox://'):
return None
# Create snapshots from URLs
root_snapshot = self.create_root_snapshot()
@@ -498,14 +515,15 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
started = State(value=Crawl.StatusChoices.STARTED)
sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
# Tick Event
# Tick Event (polled by workers)
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished', on='on_started_to_started') |
started.to(sealed, cond='is_finished')
queued.to(started, cond='can_start')
)
# Manual event (triggered by last Snapshot sealing)
seal = started.to(sealed)
def can_start(self) -> bool:
if not self.crawl.urls:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
@@ -516,55 +534,38 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
return False
return True
def is_finished(self) -> bool:
from archivebox.core.models import Snapshot
# Check if any snapshots exist for this crawl
snapshots = Snapshot.objects.filter(crawl=self.crawl)
# If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
if not snapshots.exists():
return True
# If snapshots exist, check if all are sealed
# Snapshots handle their own background hooks via the step system,
# so we just need to wait for all snapshots to reach sealed state
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
return False
return True
@started.enter
def enter_started(self):
# Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
self.crawl.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=30), # Lock for 30 seconds
)
import sys
from archivebox.core.models import Snapshot
print(f'[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr)
try:
# Run the crawl - runs hooks, processes JSONL, creates snapshots
self.crawl.run()
root_snapshot = self.crawl.run()
if root_snapshot:
print(f'[cyan]🔄 Created root snapshot: {root_snapshot.url}[/cyan]', file=sys.stderr)
# Update status to STARTED
# Set retry_at to far future so workers don't claim us (we're waiting for snapshots to finish)
# Last snapshot will manually call self.seal() when done
self.crawl.update_and_requeue(
retry_at=timezone.now() + timedelta(days=365),
status=Crawl.StatusChoices.STARTED,
)
else:
# No snapshots (system crawl like archivebox://install)
print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
# Seal immediately since there's no work to do
self.seal()
# Update status to STARTED once snapshots are created
# Set retry_at to future so we don't busy-loop - wait for snapshots to process
self.crawl.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=5), # Check again in 5s
status=Crawl.StatusChoices.STARTED,
)
except Exception as e:
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
import traceback
traceback.print_exc()
# Re-raise so the worker knows it failed
raise
def on_started_to_started(self):
"""Called when Crawl stays in started state (snapshots not sealed yet)."""
# Bump retry_at so we check again in a few seconds
self.crawl.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=5),
)
@sealed.enter
def enter_sealed(self):
# Clean up background hooks and run on_CrawlEnd hooks

View File

@@ -480,7 +480,7 @@ def run_hook(
returncode=returncode,
stdout=stdout,
stderr=stderr,
output_json=output_json,
output_json=None, # Legacy field, we now use records for JSONL
output_files=new_files,
duration_ms=duration_ms,
hook=str(script),
@@ -922,10 +922,14 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
if plugins_whitelist:
# PLUGINS whitelist is specified - only enable plugins in the list
plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
import sys
print(f"DEBUG: PLUGINS whitelist='{plugins_whitelist}', checking plugin '{plugin_name}', plugin_names={plugin_names}", file=sys.stderr)
if plugin_name.lower() not in plugin_names:
# Plugin not in whitelist - explicitly disabled
print(f"DEBUG: Plugin '{plugin_name}' NOT in whitelist, disabling", file=sys.stderr)
enabled = False
else:
print(f"DEBUG: Plugin '{plugin_name}' IS in whitelist, enabling", file=sys.stderr)
# Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
@@ -935,6 +939,8 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
enabled = enabled.lower() not in ('false', '0', 'no', '')
else:
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
import sys
print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_name}_ENABLED", file=sys.stderr)
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:

View File

@@ -0,0 +1,72 @@
# Generated by hand on 2026-01-01
# Converges machine app for 0.8.6rc0 → 0.9.x migration path
# Drops old InstalledBinary table and ensures Binary table exists
from django.db import migrations, connection
def converge_binary_table(apps, schema_editor):
"""
Drop machine_installedbinary if it exists (0.8.6rc0 path).
Create machine_binary if it doesn't exist (needed by Process model).
"""
cursor = connection.cursor()
# Check what tables exist
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('machine_installedbinary', 'machine_binary')")
existing_tables = {row[0] for row in cursor.fetchall()}
print(f'DEBUG 0005: Existing tables: {existing_tables}')
# Drop old InstalledBinary table if it exists (0.8.6rc0 path)
if 'machine_installedbinary' in existing_tables:
print('✓ Dropping machine_installedbinary table (0.8.6rc0 divergence)')
cursor.execute("DROP TABLE IF EXISTS machine_installedbinary")
# Create Binary table if it doesn't exist
# This handles the case where 0.8.6rc0's 0001_initial didn't create it
if 'machine_binary' not in existing_tables:
print('✓ Creating machine_binary table with correct schema')
cursor.execute("""
CREATE TABLE machine_binary (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
machine_id TEXT NOT NULL REFERENCES machine_machine(id) ON DELETE CASCADE,
name VARCHAR(63) NOT NULL,
binproviders VARCHAR(255) NOT NULL DEFAULT 'env',
overrides TEXT NOT NULL DEFAULT '{}',
binprovider VARCHAR(63) NOT NULL DEFAULT 'env',
abspath VARCHAR(255) NOT NULL,
version VARCHAR(128) NOT NULL,
sha256 VARCHAR(64) NOT NULL DEFAULT '',
status VARCHAR(16) NOT NULL DEFAULT 'succeeded',
retry_at DATETIME NULL,
output_dir VARCHAR(255) NOT NULL DEFAULT ''
)
""")
# Create indexes
cursor.execute("CREATE INDEX machine_binary_machine_id_idx ON machine_binary(machine_id)")
cursor.execute("CREATE INDEX machine_binary_name_idx ON machine_binary(name)")
cursor.execute("CREATE INDEX machine_binary_abspath_idx ON machine_binary(abspath)")
print('✓ machine_binary table created')
else:
print('✓ machine_binary table already exists')
class Migration(migrations.Migration):
dependencies = [
('machine', '0001_initial'),
]
operations = [
migrations.RunPython(
converge_binary_table,
reverse_code=migrations.RunPython.noop,
),
]

View File

@@ -9,7 +9,7 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0001_initial'),
('machine', '0005_converge_binary_model'),
]
operations = [

View File

@@ -7,7 +7,7 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0002_process'),
('machine', '0006_process'),
]
operations = [

View File

@@ -0,0 +1,18 @@
# Generated by Django 6.0 on 2026-01-02 03:36
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0007_add_process_type_and_parent'),
]
operations = [
migrations.AddField(
model_name='process',
name='worker_type',
field=models.CharField(blank=True, db_index=True, default='', help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)', max_length=32),
),
]

View File

@@ -203,13 +203,14 @@ class BinaryManager(models.Manager):
class Binary(ModelWithHealthStats):
"""
Tracks an binary on a specific machine.
Tracks a binary on a specific machine.
Follows the unified state machine pattern:
Simple state machine with 2 states:
- queued: Binary needs to be installed
- started: Installation in progress
- succeeded: Binary installed successfully (abspath, version, sha256 populated)
- failed: Installation failed
- installed: Binary installed successfully (abspath, version, sha256 populated)
Installation is synchronous during queued→installed transition.
If installation fails, Binary stays in queued with retry_at set for later retry.
State machine calls run() which executes on_Binary__install_* hooks
to install the binary using the specified providers.
@@ -217,9 +218,7 @@ class Binary(ModelWithHealthStats):
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SUCCEEDED = 'succeeded', 'Succeeded'
FAILED = 'failed', 'Failed'
INSTALLED = 'installed', 'Installed'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -323,8 +322,31 @@ class Binary(ModelWithHealthStats):
machine = Machine.current()
overrides = overrides or {}
# Case 1: From binaries.jsonl - create queued binary
if 'binproviders' in record or ('overrides' in record and not record.get('abspath')):
# Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders
# This happens when on_Crawl hooks detect already-installed binaries
abspath = record.get('abspath')
version = record.get('version')
binproviders = record.get('binproviders')
if abspath and version and binproviders:
# Binary is already installed, create INSTALLED record with binproviders filter
binary, _ = Binary.objects.update_or_create(
machine=machine,
name=name,
defaults={
'abspath': abspath,
'version': version,
'sha256': record.get('sha256', ''),
'binprovider': record.get('binprovider', 'env'),
'binproviders': binproviders, # Preserve the filter
'status': Binary.StatusChoices.INSTALLED,
'retry_at': None,
}
)
return binary
# Case 2: From binaries.jsonl - create queued binary (needs installation)
if 'binproviders' in record or ('overrides' in record and not abspath):
binary, created = Binary.objects.get_or_create(
machine=machine,
name=name,
@@ -337,25 +359,23 @@ class Binary(ModelWithHealthStats):
)
return binary
# Case 2: From hook output - update with installation results
abspath = record.get('abspath')
version = record.get('version')
if not abspath or not version:
return None
# Case 3: From on_Binary__install hook output - update with installation results
if abspath and version:
binary, _ = Binary.objects.update_or_create(
machine=machine,
name=name,
defaults={
'abspath': abspath,
'version': version,
'sha256': record.get('sha256', ''),
'binprovider': record.get('binprovider', 'env'),
'status': Binary.StatusChoices.INSTALLED,
'retry_at': None,
}
)
return binary
binary, _ = Binary.objects.update_or_create(
machine=machine,
name=name,
defaults={
'abspath': abspath,
'version': version,
'sha256': record.get('sha256', ''),
'binprovider': record.get('binprovider', 'env'),
'status': Binary.StatusChoices.SUCCEEDED,
'retry_at': None,
}
)
return binary
return None
@property
def OUTPUT_DIR(self):
@@ -403,8 +423,7 @@ class Binary(ModelWithHealthStats):
# Discover ALL on_Binary__install_* hooks
hooks = discover_hooks('Binary', config=config)
if not hooks:
self.status = self.StatusChoices.FAILED
self.save()
# No hooks available - stay queued, will retry later
return
# Run each hook - they decide if they can handle this binary
@@ -456,15 +475,21 @@ class Binary(ModelWithHealthStats):
self.version = record.get('version', '')
self.sha256 = record.get('sha256', '')
self.binprovider = record.get('binprovider', 'env')
self.status = self.StatusChoices.SUCCEEDED
self.status = self.StatusChoices.INSTALLED
self.save()
# Symlink binary into LIB_BIN_DIR if configured
from django.conf import settings
lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None)
if lib_bin_dir:
self.symlink_to_lib_bin(lib_bin_dir)
return
except json.JSONDecodeError:
continue
# No hook succeeded
self.status = self.StatusChoices.FAILED
self.save()
# No hook succeeded - leave status as QUEUED (will retry later)
# Don't set to FAILED since we don't have that status anymore
def cleanup(self):
"""
@@ -484,10 +509,75 @@ class Binary(ModelWithHealthStats):
for plugin_dir in output_dir.iterdir():
if not plugin_dir.is_dir():
continue
pid_file = plugin_dir / 'hook.pid'
cmd_file = plugin_dir / 'cmd.sh'
safe_kill_process(pid_file, cmd_file)
def symlink_to_lib_bin(self, lib_bin_dir: str | Path) -> Path | None:
"""
Symlink this binary into LIB_BIN_DIR for unified PATH management.
After a binary is installed by any binprovider (pip, npm, brew, apt, etc),
we symlink it into LIB_BIN_DIR so that:
1. All binaries can be found in a single directory
2. PATH only needs LIB_BIN_DIR prepended (not multiple provider-specific paths)
3. Binary priorities are clear (symlink points to the canonical install location)
Args:
lib_bin_dir: Path to LIB_BIN_DIR (e.g., /data/lib/arm64-darwin/bin)
Returns:
Path to the created symlink, or None if symlinking failed
Example:
>>> binary = Binary.objects.get(name='yt-dlp')
>>> binary.symlink_to_lib_bin('/data/lib/arm64-darwin/bin')
Path('/data/lib/arm64-darwin/bin/yt-dlp')
"""
import sys
from pathlib import Path
if not self.abspath:
return None
binary_abspath = Path(self.abspath).resolve()
lib_bin_dir = Path(lib_bin_dir).resolve()
# Create LIB_BIN_DIR if it doesn't exist
try:
lib_bin_dir.mkdir(parents=True, exist_ok=True)
except (OSError, PermissionError) as e:
print(f"Failed to create LIB_BIN_DIR {lib_bin_dir}: {e}", file=sys.stderr)
return None
# Get binary name (last component of path)
binary_name = binary_abspath.name
symlink_path = lib_bin_dir / binary_name
# Remove existing symlink/file if it exists
if symlink_path.exists() or symlink_path.is_symlink():
try:
# Check if it's already pointing to the right place
if symlink_path.is_symlink() and symlink_path.resolve() == binary_abspath:
# Already correctly symlinked, nothing to do
return symlink_path
# Remove old symlink/file
symlink_path.unlink()
except (OSError, PermissionError) as e:
print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr)
return None
# Create new symlink
try:
symlink_path.symlink_to(binary_abspath)
print(f"Symlinked {binary_name} -> {symlink_path}", file=sys.stderr)
return symlink_path
except (OSError, PermissionError) as e:
print(f"Failed to create symlink {symlink_path} -> {binary_abspath}: {e}", file=sys.stderr)
return None
# =============================================================================
# Process Model
@@ -627,6 +717,16 @@ class Process(models.Model):
help_text='Type of process (cli, worker, orchestrator, binary, supervisord)'
)
# Worker type (only for WORKER processes: crawl, snapshot, archiveresult)
worker_type = models.CharField(
max_length=32,
default='',
null=False,
blank=True,
db_index=True,
help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)'
)
# Execution metadata
pwd = models.CharField(max_length=512, default='', null=False, blank=True,
help_text='Working directory for process execution')
@@ -895,11 +995,16 @@ class Process(models.Model):
ppid = os.getppid()
machine = machine or Machine.current()
# Debug logging
import sys
print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr)
# Get parent process start time from OS
try:
os_parent = psutil.Process(ppid)
os_parent_start = os_parent.create_time()
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr)
return None # Parent process doesn't exist
# Find matching Process record
@@ -910,12 +1015,18 @@ class Process(models.Model):
started_at__gte=timezone.now() - PID_REUSE_WINDOW,
).order_by('-started_at')
print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr)
for candidate in candidates:
if candidate.started_at:
db_start_time = candidate.started_at.timestamp()
if abs(db_start_time - os_parent_start) < START_TIME_TOLERANCE:
time_diff = abs(db_start_time - os_parent_start)
print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr)
if time_diff < START_TIME_TOLERANCE:
print(f"DEBUG _find_parent_process: MATCH! Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr)
return candidate
print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr)
return None # No matching ArchiveBox parent process
@classmethod
@@ -1584,69 +1695,38 @@ class BinaryMachine(BaseStateMachine, strict_states=True):
"""
State machine for managing Binary installation lifecycle.
Hook Lifecycle:
Simple 2-state machine:
┌─────────────────────────────────────────────────────────────┐
│ QUEUED State │
│ • Binary needs to be installed │
└─────────────────────────────────────────────────────────────┘
↓ tick() when can_start()
↓ tick() when can_install()
↓ Synchronous installation during transition
┌─────────────────────────────────────────────────────────────┐
│ STARTED State → enter_started()
1. binary.run()
• discover_hooks('Binary') → all on_Binary__install_*
│ • Try each provider hook in sequence: │
│ - run_hook(script, output_dir, ...) │
│ - If returncode == 0: │
│ * Read stdout.log │
│ * Parse JSONL for 'Binary' record with abspath │
│ * Update self: abspath, version, sha256, provider │
│ * Set status=SUCCEEDED, RETURN │
│ • If no hook succeeds: set status=FAILED │
└─────────────────────────────────────────────────────────────┘
↓ tick() checks status
┌─────────────────────────────────────────────────────────────┐
│ SUCCEEDED / FAILED │
│ • Set by binary.run() based on hook results │
│ • Health stats incremented (num_uses_succeeded/failed) │
INSTALLED State
• Binary installed (abspath, version, sha256 set)
• Health stats incremented
└─────────────────────────────────────────────────────────────┘
If installation fails, Binary stays in QUEUED with retry_at bumped.
"""
model_attr_name = 'binary'
# States
queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
started = State(value=Binary.StatusChoices.STARTED)
succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
failed = State(value=Binary.StatusChoices.FAILED, final=True)
installed = State(value=Binary.StatusChoices.INSTALLED, final=True)
# Tick Event - transitions based on conditions
# Tick Event - install happens during transition
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed')
queued.to.itself(unless='can_install') |
queued.to(installed, cond='can_install', on='on_install')
)
def can_start(self) -> bool:
def can_install(self) -> bool:
"""Check if binary installation can start."""
return bool(self.binary.name and self.binary.binproviders)
def is_succeeded(self) -> bool:
"""Check if installation succeeded (status was set by run())."""
return self.binary.status == Binary.StatusChoices.SUCCEEDED
def is_failed(self) -> bool:
"""Check if installation failed (status was set by run())."""
return self.binary.status == Binary.StatusChoices.FAILED
def is_finished(self) -> bool:
"""Check if installation has completed (success or failure)."""
return self.binary.status in (
Binary.StatusChoices.SUCCEEDED,
Binary.StatusChoices.FAILED,
)
@queued.enter
def enter_queued(self):
"""Binary is queued for installation."""
@@ -1655,43 +1735,48 @@ class BinaryMachine(BaseStateMachine, strict_states=True):
status=Binary.StatusChoices.QUEUED,
)
@started.enter
def enter_started(self):
"""Start binary installation."""
# Lock the binary while installation runs
self.binary.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=300), # 5 min timeout for installation
status=Binary.StatusChoices.STARTED,
)
def on_install(self):
"""Called during queued→installed transition. Runs installation synchronously."""
import sys
# Run installation hooks
print(f'[cyan] 🔄 BinaryMachine.on_install() - installing {self.binary.name}[/cyan]', file=sys.stderr)
# Run installation hooks (synchronous, updates abspath/version/sha256 and sets status)
self.binary.run()
# Save updated status (run() updates status to succeeded/failed)
self.binary.save()
# Check if installation succeeded by looking at updated status
# Note: Binary.run() updates self.binary.status internally but doesn't refresh our reference
self.binary.refresh_from_db()
@succeeded.enter
def enter_succeeded(self):
if self.binary.status != Binary.StatusChoices.INSTALLED:
# Installation failed - abort transition, stay in queued
print(f'[red] ❌ BinaryMachine - {self.binary.name} installation failed, retrying later[/red]', file=sys.stderr)
# Bump retry_at to try again later
self.binary.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=300), # Retry in 5 minutes
status=Binary.StatusChoices.QUEUED, # Ensure we stay queued
)
# Increment health stats for failure
self.binary.increment_health_stats(success=False)
# Abort the transition - this will raise an exception and keep us in queued
raise Exception(f'Binary {self.binary.name} installation failed')
print(f'[cyan] ✅ BinaryMachine - {self.binary.name} installed successfully[/cyan]', file=sys.stderr)
@installed.enter
def enter_installed(self):
"""Binary installed successfully."""
self.binary.update_and_requeue(
retry_at=None,
status=Binary.StatusChoices.SUCCEEDED,
status=Binary.StatusChoices.INSTALLED,
)
# Increment health stats
self.binary.increment_health_stats(success=True)
@failed.enter
def enter_failed(self):
"""Binary installation failed."""
self.binary.update_and_requeue(
retry_at=None,
status=Binary.StatusChoices.FAILED,
)
# Increment health stats
self.binary.increment_health_stats(success=False)
# =============================================================================
# Process State Machine
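The BinaryMachine above folds installation into the queued→installed transition: `can_install` gates the tick, `on_install` runs `binary.run()` synchronously, and raising inside `on_install` aborts the transition so the record stays queued for a later retry. A minimal sketch of that pattern with python-statemachine (class and names here are illustrative, not ArchiveBox code):

```python
from statemachine import StateMachine, State


class InstallMachine(StateMachine):
    """Toy two-state machine mirroring the queued -> installed tick pattern."""
    queued = State(initial=True)
    installed = State(final=True)

    # Same shape as BinaryMachine.tick: stay queued until the condition holds,
    # then perform the install synchronously during the transition itself.
    tick = (
        queued.to.itself(unless="can_install")
        | queued.to(installed, cond="can_install", on="on_install")
    )

    def can_install(self) -> bool:
        # Stands in for "binary.name and binary.binproviders are set"
        return True

    def on_install(self):
        # Stands in for binary.run(); raising here is how the real machine
        # aborts the transition and leaves the Binary queued with retry_at bumped.
        print("installing...")


machine = InstallMachine()
machine.tick()
assert machine.current_state.id == "installed"
```

The real machine additionally persists status and health stats in enter_installed, which the sketch omits.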

View File

@@ -80,8 +80,7 @@ class TestAccessibilityWithChrome(TestCase):
# Run accessibility hook with the active Chrome session
result = subprocess.run(
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,

View File

View File

@@ -39,30 +39,36 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
CHROME_NAVIGATE_HOOK,
)
# Get LIB_DIR and NODE_MODULES_DIR from shared helpers
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = get_node_modules_dir()
NPM_PREFIX = LIB_DIR / 'npm'
# Chromium install location (relative to DATA_DIR)
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
@pytest.fixture(scope="session", autouse=True)
def ensure_chromium_and_puppeteer_installed():
"""Ensure Chromium and puppeteer are installed before running tests."""
def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
"""Ensure Chromium and puppeteer are installed before running tests.
Puppeteer handles Chromium installation automatically in its own cache.
We only need to install puppeteer itself to LIB_DIR/npm.
"""
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
# Set DATA_DIR if not already set (required by abx_pkg)
if not os.environ.get('DATA_DIR'):
# Use isolated temp dir for direct pytest runs
test_data_dir = tmp_path_factory.mktemp('chrome_test_data')
os.environ['DATA_DIR'] = str(test_data_dir)
# Compute paths AFTER setting DATA_DIR
lib_dir = get_lib_dir()
node_modules_dir = get_node_modules_dir()
npm_prefix = lib_dir / 'npm'
# Rebuild pydantic models
NpmProvider.model_rebuild()
# Install puppeteer-core if not available
puppeteer_core_path = NODE_MODULES_DIR / 'puppeteer-core'
# Install puppeteer if not available (it will handle Chromium in its own cache)
puppeteer_core_path = node_modules_dir / 'puppeteer-core'
if not puppeteer_core_path.exists():
print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...")
NPM_PREFIX.mkdir(parents=True, exist_ok=True)
print(f"\n[*] Installing puppeteer to {npm_prefix}...")
npm_prefix.mkdir(parents=True, exist_ok=True)
provider = NpmProvider(npm_prefix=NPM_PREFIX)
provider = NpmProvider(npm_prefix=npm_prefix)
try:
binary = Binary(
name='puppeteer',
@@ -70,36 +76,25 @@ def ensure_chromium_and_puppeteer_installed():
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
)
binary.install()
print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}")
print(f"[*] Puppeteer installed successfully to {npm_prefix}")
except Exception as e:
pytest.skip(f"Failed to install puppeteer: {e}")
# Install Chromium via @puppeteer/browsers if not available
# Find Chromium binary (puppeteer installs it automatically in its cache)
chromium_binary = find_chromium_binary()
if not chromium_binary:
print(f"\n[*] Installing Chromium to {CHROMIUM_INSTALL_DIR}...")
CHROMIUM_INSTALL_DIR.mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
cwd=str(CHROMIUM_INSTALL_DIR.parent),
capture_output=True,
text=True,
timeout=300
)
if result.returncode != 0:
pytest.skip(f"Failed to install Chromium: {result.stderr}")
chromium_binary = find_chromium_binary()
if not chromium_binary:
pytest.skip("Chromium installed but binary not found")
print(f"[*] Chromium installed: {chromium_binary}")
pytest.skip("Chromium not found - puppeteer should install it automatically")
# Set CHROME_BINARY env var for tests
os.environ['CHROME_BINARY'] = chromium_binary
# Get paths from helpers (will use DATA_DIR if set, or compute based on __file__)
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = get_node_modules_dir()
NPM_PREFIX = LIB_DIR / 'npm'
def test_hook_scripts_exist():
"""Verify chrome hooks exist."""
assert CHROME_LAUNCH_HOOK.exists(), f"Hook not found: {CHROME_LAUNCH_HOOK}"
@@ -208,8 +203,7 @@ def test_chrome_launch_and_tab_creation():
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
@@ -269,8 +263,7 @@ def test_chrome_navigation():
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
@@ -281,8 +274,7 @@ def test_chrome_navigation():
# Navigate to URL
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
@@ -417,8 +409,7 @@ def test_multiple_snapshots_share_chrome():
# Create tab for this snapshot
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,8 +80,7 @@ class TestConsolelogWithChrome(TestCase):
# Run consolelog hook with the active Chrome session
result = subprocess.run(
['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect gallery-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if gallery-dl is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True)
gallerydl_binary = get_env('GALLERYDL_BINARY', 'gallery-dl')
if not gallerydl_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=gallerydl_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='gallery-dl')
else:
# Binary not found
output_binary_missing(name='gallery-dl', binproviders='pip')
except Exception:
# Binary not found
output_binary_missing(name='gallery-dl', binproviders='pip')
sys.exit(0)
if __name__ == '__main__':
main()
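Each of these install-detection hooks emits at most one Binary JSONL record on stdout: a "found" record with abspath/version/sha256 and binprovider='env', or a "missing" record carrying only the name and the binproviders that could install it. A rough sketch of how a caller might tell the two apart when collecting hook output (the hook filename below is a placeholder, not the actual path):

```python
import json
import subprocess
import sys

# Placeholder filename: run one detection hook and inspect its Binary record(s).
proc = subprocess.run(
    [sys.executable, 'on_Crawl__XX_gallerydl_install.py'],
    capture_output=True, text=True, timeout=60,
)
for line in proc.stdout.splitlines():
    line = line.strip()
    if not line.startswith('{'):
        continue
    record = json.loads(line)
    if record.get('type') != 'Binary':
        continue
    if record.get('abspath'):
        # Already installed: abspath/version/sha256 populated, binprovider='env'
        print(f"found {record['name']} {record['version']} at {record['abspath']}")
    else:
        # Missing: only name + binproviders, so a BinaryMachine can install it later
        print(f"{record['name']} missing, installable via {record['binproviders']}")
```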

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect git binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if git is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
git_enabled = get_env_bool('GIT_ENABLED', True)
git_binary = get_env('GIT_BINARY', 'git')
if not git_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=git_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='git')
else:
# Binary not found
output_binary_missing(name='git', binproviders='apt,brew')
except Exception:
# Binary not found
output_binary_missing(name='git', binproviders='apt,brew')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -123,8 +123,7 @@ def test_scrolls_page_and_outputs_stats():
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
cwd=str(infiniscroll_dir,
env=get_test_env()),
cwd=str(infiniscroll_dir),
capture_output=True,
text=True,
timeout=60,
@@ -188,8 +187,7 @@ def test_config_scroll_limit_honored():
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
cwd=str(infiniscroll_dir,
env=get_test_env()),
cwd=str(infiniscroll_dir),
capture_output=True,
text=True,
timeout=60,
@@ -248,8 +246,7 @@ def test_config_timeout_honored():
start_time = time.time()
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
cwd=str(infiniscroll_dir,
env=get_test_env()),
cwd=str(infiniscroll_dir),
capture_output=True,
text=True,
timeout=30,

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect mercury-parser binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if mercury-parser is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
mercury_binary = get_env('MERCURY_BINARY', 'mercury-parser')
if not mercury_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=mercury_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='mercury-parser')
else:
# Binary not found
output_binary_missing(name='mercury-parser', binproviders='npm')
except Exception:
# Binary not found
output_binary_missing(name='mercury-parser', binproviders='npm')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Detect readability-extractor binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if readability is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'overrides': {
'packages': ['git+https://github.com/ArchiveBox/readability-extractor.git'],
},
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
readability_enabled = get_env_bool('READABILITY_ENABLED', True)
readability_binary = get_env('READABILITY_BINARY', 'readability-extractor')
if not readability_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=readability_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='readability-extractor')
else:
# Binary not found
output_binary_missing(name='readability-extractor', binproviders='npm')
except Exception:
# Binary not found
output_binary_missing(name='readability-extractor', binproviders='npm')
sys.exit(0)
if __name__ == '__main__':
main()
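Unlike the other detectors, the readability hook's "missing" record also carries an `overrides` key pointing npm at a git URL rather than a published package. The chrome test fixture above installs puppeteer with the same overrides mechanism; a hedged sketch of how such a record could translate into an abx_pkg install (the npm prefix path is assumed, not an ArchiveBox default):

```python
from pathlib import Path
from abx_pkg import Binary, NpmProvider

# Assumed npm prefix; in the tests this comes from get_lib_dir() / 'npm'
npm_prefix = Path('/tmp/archivebox-lib/npm')
npm_prefix.mkdir(parents=True, exist_ok=True)

NpmProvider.model_rebuild()
provider = NpmProvider(npm_prefix=npm_prefix)

# Map the hook's "missing" record + overrides onto an install call
binary = Binary(
    name='readability-extractor',
    binproviders=[provider],
    overrides={'npm': {'packages': ['git+https://github.com/ArchiveBox/readability-extractor.git']}},
)
binary.install()
print(f"installed readability-extractor under {npm_prefix}")
```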

View File

@@ -27,11 +27,21 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
run_hook_and_parse,
LIB_DIR,
NODE_MODULES_DIR,
CHROME_PLUGIN_DIR,
)
# Import chrome test fixture to ensure puppeteer is installed
from archivebox.plugins.chrome.tests.test_chrome import ensure_chromium_and_puppeteer_installed
PLUGIN_DIR = get_plugin_dir(__file__)
SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
# Get Chrome hooks for setting up sessions
CHROME_LAUNCH_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*')
CHROME_TAB_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_tab.*')
CHROME_NAVIGATE_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_navigate.*')
TEST_URL = 'https://example.com'
@@ -53,18 +63,162 @@ def test_verify_deps_with_abx_pkg():
def test_extracts_screenshot_from_example_com():
"""Test full workflow: extract screenshot from real example.com via hook."""
# Prerequisites checked by earlier test
"""Test full workflow: extract screenshot from real example.com via hook.
Replicates production directory structure:
DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/chrome/
DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snap-id}/chrome/
DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snap-id}/screenshot/
This exercises the "connect to existing session" code path, which is the primary
path in production and accounts for ~50% of the code.
"""
import signal
import time
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Replicate exact production directory structure
data_dir = Path(tmpdir)
crawl_id = 'test-screenshot-crawl'
snapshot_id = 'test-screenshot-snap'
# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / crawl_id
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True)
# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / snapshot_id
snapshot_chrome_dir = snapshot_dir / 'chrome'
snapshot_chrome_dir.mkdir(parents=True)
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir()
# Run screenshot extraction hook
env = get_test_env()
print(f"\n[DEBUG] NODE_V8_COVERAGE={env.get('NODE_V8_COVERAGE', 'NOT SET')}", file=sys.stderr)
env['CHROME_HEADLESS'] = 'true'
# Step 1: Launch Chrome session at crawl level (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch
for i in range(15):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
pytest.fail(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
if (chrome_dir / 'cdp_url.txt').exists():
break
time.sleep(1)
assert (chrome_dir / 'cdp_url.txt').exists(), "Chrome CDP URL file should exist"
assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
try:
# Step 2: Create tab at snapshot level
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
env=env
)
assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot CDP URL should exist"
# Step 3: Navigate to URL
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Navigation failed: {result.stderr}"
assert (snapshot_chrome_dir / 'navigation.json').exists(), "Navigation JSON should exist"
# Step 4: Take screenshot (should connect to existing session)
# Screenshot hook runs in screenshot/ dir and looks for ../chrome/cdp_url.txt
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}\nStdout: {result.stdout}"
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert 'screenshot.png' in result_json['output_str'], f"Output should be screenshot.png: {result_json}"
# Verify filesystem output
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), f"screenshot.png not created at {screenshot_file}"
# Verify file is valid PNG
file_size = screenshot_file.stat().st_size
assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"
# Check PNG magic bytes
screenshot_data = screenshot_file.read_bytes()
assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
finally:
# Cleanup: Kill Chrome
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
try:
os.kill(chrome_pid, signal.SIGKILL)
except OSError:
pass
def test_extracts_screenshot_without_session():
"""Test screenshot extraction without existing Chrome session (fallback to own browser)."""
with tempfile.TemporaryDirectory() as tmpdir:
# Create proper snapshot directory structure
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-fallback'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Don't set up Chrome session or staticfile - screenshot should launch its own browser
env = get_test_env()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=tmpdir,
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-fallback'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
@@ -73,7 +227,7 @@ def test_extracts_screenshot_from_example_com():
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
# Parse JSONL output (clean format without RESULT_JSON= prefix)
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
@@ -88,20 +242,54 @@ def test_extracts_screenshot_from_example_com():
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert result_json['output_str'] == 'screenshot.png'
assert 'screenshot.png' in result_json['output_str']
# Verify filesystem output (hook creates screenshot.png directly in working dir)
screenshot_file = tmpdir / 'screenshot.png'
# Verify file created
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "screenshot.png not created"
assert screenshot_file.stat().st_size > 1000, "Screenshot too small"
# Verify file is valid PNG
file_size = screenshot_file.stat().st_size
assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"
# Check PNG magic bytes
screenshot_data = screenshot_file.read_bytes()
assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
def test_skips_when_staticfile_exists():
"""Test that screenshot skips when staticfile extractor already handled the URL."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-skip'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Create staticfile output to simulate staticfile extractor already ran
staticfile_dir = snapshot_dir / 'staticfile'
staticfile_dir.mkdir()
(staticfile_dir / 'index.html').write_text('<html></html>')
env = get_test_env()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-skip'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=30,
env=env
)
assert result.returncode == 0, f"Should exit successfully: {result.stderr}"
# Should emit skipped status
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'skipped', f"Should skip: {result_json}"
def test_config_save_screenshot_false_skips():
@@ -134,13 +322,11 @@ def test_config_save_screenshot_false_skips():
def test_reports_missing_chrome():
"""Test that script reports error when Chrome is not found."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set CHROME_BINARY to nonexistent path
env = os.environ.copy()
env = get_test_env()
env['CHROME_BINARY'] = '/nonexistent/chrome'
result = subprocess.run(
@@ -158,6 +344,59 @@ def test_reports_missing_chrome():
assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined
def test_custom_resolution_and_user_agent():
"""Test that CHROME_RESOLUTION and CHROME_USER_AGENT configs are respected."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-config'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_RESOLUTION'] = '800,600'
env['CHROME_USER_AGENT'] = 'Test/1.0'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-config'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "screenshot.png not created"
# Resolution affects file size
assert screenshot_file.stat().st_size > 500, "Screenshot too small"
def test_ssl_check_disabled():
"""Test that CHROME_CHECK_SSL_VALIDITY=False allows invalid certificates."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ssl'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_CHECK_SSL_VALIDITY'] = 'False'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ssl'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should succeed: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
def test_config_timeout_honored():
"""Test that CHROME_TIMEOUT config is respected."""
import os
@@ -182,5 +421,410 @@ def test_config_timeout_honored():
assert result.returncode in (0, 1), "Should complete without hanging"
def test_missing_url_argument():
"""Test that hook fails gracefully when URL argument is missing."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = get_test_env()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), '--snapshot-id=test-missing-url'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should exit with error
assert result.returncode != 0, "Should fail when URL is missing"
assert 'Usage:' in result.stderr or 'url' in result.stderr.lower()
def test_missing_snapshot_id_argument():
"""Test that hook fails gracefully when snapshot-id argument is missing."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = get_test_env()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should exit with error
assert result.returncode != 0, "Should fail when snapshot-id is missing"
assert 'Usage:' in result.stderr or 'snapshot' in result.stderr.lower()
def test_invalid_resolution_format():
"""Test that invalid CHROME_RESOLUTION format is handled gracefully."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-badres'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Invalid resolution formats to test parseResolution error handling
for bad_resolution in ['invalid', '1440', '1440x2000', 'abc,def']:
env['CHROME_RESOLUTION'] = bad_resolution
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-badres'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should either fail gracefully or fall back to default
# (depending on implementation - script should not crash with uncaught error)
assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}"
def test_boolean_env_var_parsing():
"""Test that boolean environment variables are parsed correctly."""
import time
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-bool'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Test various boolean formats for CHROME_HEADLESS
for bool_val in ['true', '1', 'yes', 'on', 'True', 'TRUE']:
env['CHROME_HEADLESS'] = bool_val
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-bool'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should either succeed or fail, but shouldn't crash on boolean parsing
assert result.returncode in (0, 1), f"Should handle boolean value: {bool_val}"
# Clean up screenshot file if created
screenshot_file = screenshot_dir / 'screenshot.png'
if screenshot_file.exists():
screenshot_file.unlink()
time.sleep(0.5) # Brief pause between attempts
def test_integer_env_var_parsing():
"""Test that integer environment variables are parsed correctly."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-int'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Test valid and invalid integer formats for CHROME_TIMEOUT
test_cases = [
('60', True), # Valid integer
('invalid', True), # Invalid - should use default
('', True), # Empty - should use default
]
for timeout_val, should_work in test_cases:
env['CHROME_TIMEOUT'] = timeout_val
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-int'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should either succeed or fail gracefully, but shouldn't crash on int parsing
assert result.returncode in (0, 1), f"Should handle timeout value: {timeout_val}"
# Clean up screenshot file if created
screenshot_file = screenshot_dir / 'screenshot.png'
if screenshot_file.exists():
screenshot_file.unlink()
def test_extracts_screenshot_with_all_config_options():
"""Test screenshot with comprehensive config to exercise all code paths."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-full'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Set ALL config options to exercise all code paths
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
env['CHROME_RESOLUTION'] = '800,600'
env['CHROME_USER_AGENT'] = 'TestBot/1.0'
env['CHROME_CHECK_SSL_VALIDITY'] = 'false' # Exercises checkSsl branch
env['CHROME_TIMEOUT'] = '60'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-full'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Screenshot should succeed: {result.stderr}"
# Verify JSONL output with success
result_json = None
for line in result.stdout.strip().split('\n'):
if line.strip().startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert 'screenshot.png' in result_json['output_str']
# Verify file created
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "screenshot.png should be created"
assert screenshot_file.stat().st_size > 1000, "Screenshot should have content"
def test_headless_mode_false():
"""Test headless=false code path specifically."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-headless'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Explicitly test headless=false (exercises the ternary false branch)
env['CHROME_HEADLESS'] = 'false'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-headless-false'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should work or fail gracefully
assert result.returncode in (0, 1), f"Headless=false should handle: {result.stderr}"
def test_invalid_url_causes_error():
"""Test error path with invalid URL that causes navigation failure."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-invalid'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_TIMEOUT'] = '5' # Short timeout
# Use invalid URL to trigger error path
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), '--url=http://this-domain-does-not-exist-12345.invalid', '--snapshot-id=snap-invalid'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should fail due to navigation error
assert result.returncode != 0, "Should fail on invalid URL"
# Should NOT emit JSONL (transient error)
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, f"Should not emit JSONL on error: {jsonl_lines}"
def test_with_corrupted_cdp_url_falls_back():
"""Test that corrupted CDP URL file causes fallback to launching browser."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-corrupt-cdp'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Create chrome directory with corrupted CDP URL
chrome_dir = snapshot_dir / 'chrome'
chrome_dir.mkdir()
(chrome_dir / 'cdp_url.txt').write_text('ws://127.0.0.1:99999/invalid')
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
env['CHROME_TIMEOUT'] = '5' # Short timeout for fast test
# Screenshot should try CDP, fail quickly, then fall back to launching own browser
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-corrupt-cdp'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should succeed by falling back to launching browser
assert result.returncode == 0, f"Should fallback and succeed: {result.stderr}"
assert 'Failed to connect to CDP' in result.stderr, "Should log CDP connection failure"
# Verify screenshot was created via fallback path
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "Screenshot should be created via fallback"
def test_user_agent_is_applied():
"""Test that CHROME_USER_AGENT is actually applied when launching browser."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ua'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_USER_AGENT'] = 'CustomBot/9.9.9 (Testing)'
env['CHROME_HEADLESS'] = 'true'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ua'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should succeed with custom user agent
assert result.returncode == 0, f"Should succeed with custom UA: {result.stderr}"
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "Screenshot should be created"
def test_check_ssl_false_branch():
"""Test CHROME_CHECK_SSL_VALIDITY=false adds ignore-certificate-errors arg."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-nossl'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_CHECK_SSL_VALIDITY'] = 'false'
env['CHROME_HEADLESS'] = 'true'
# Pass the string 'false' to exercise the SSL-check-disabled branch
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-nossl'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should work with SSL check disabled: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
def test_alternative_env_var_names():
"""Test fallback environment variable names (TIMEOUT vs CHROME_TIMEOUT, etc)."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-altenv'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Use alternative env var names (without CHROME_ prefix)
env['TIMEOUT'] = '45'
env['RESOLUTION'] = '1024,768'
env['USER_AGENT'] = 'AltBot/1.0'
env['CHECK_SSL_VALIDITY'] = 'false'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-altenv'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should work with alternative env vars: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
def test_very_large_resolution():
"""Test screenshot with very large resolution."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-large'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_RESOLUTION'] = '3840,2160' # 4K resolution
env['CHROME_HEADLESS'] = 'true'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-large'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should handle large resolution: {result.stderr}"
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists()
# 4K screenshot should be larger
assert screenshot_file.stat().st_size > 5000, "4K screenshot should be substantial"
def test_very_small_resolution():
"""Test screenshot with very small resolution."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-small'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_RESOLUTION'] = '320,240' # Very small
env['CHROME_HEADLESS'] = 'true'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-small'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should handle small resolution: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
if __name__ == '__main__':
pytest.main([__file__, '-v'])
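Several of the screenshot tests above repeat the same loop to pull the ArchiveResult record out of a hook's stdout. A small shared helper (purely a suggestion, not an existing test utility) would keep that parsing in one place:

```python
import json


def parse_archiveresult(stdout: str) -> dict | None:
    """Return the first ArchiveResult JSONL record found in hook stdout, else None."""
    for line in stdout.strip().splitlines():
        line = line.strip()
        if not line.startswith('{'):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'ArchiveResult':
            return record
    return None


# Usage inside a test:
#   result_json = parse_archiveresult(result.stdout)
#   assert result_json and result_json['status'] == 'succeeded'
```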

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""
Detect single-file binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if single-file is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
singlefile_enabled = get_env_bool('SINGLEFILE_ENABLED', True)
if not singlefile_enabled:
sys.exit(0)
provider = EnvProvider()
found = False
# Try single-file-cli first, then single-file
for binary_name in ['single-file-cli', 'single-file']:
try:
binary = Binary(name=binary_name, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='single-file')
found = True
break
except Exception:
continue
if not found:
# Binary not found
output_binary_missing(name='single-file', binproviders='npm')
sys.exit(0)
if __name__ == '__main__':
main()

30
archivebox/plugins/wget/on_Crawl__06_wget_install.py Normal file → Executable file
View File

@@ -40,8 +40,8 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def output_binary(binary: Binary, name: str):
"""Output Binary JSONL record to stdout."""
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
@@ -50,7 +50,20 @@ def output_binary(binary: Binary, name: str):
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
@@ -89,16 +102,19 @@ def main():
binary_path = ''
if not binary_path:
if use_wget:
errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.")
# Binary not found
computed['WGET_BINARY'] = ''
if use_wget:
# Emit Binary record for installation
output_binary_missing(name='wget', binproviders='apt,brew')
else:
# Binary found
computed['WGET_BINARY'] = binary_path
wget_version = str(binary.version) if binary.version else 'unknown'
computed['WGET_VERSION'] = wget_version
# Output Binary JSONL record
output_binary(binary, name='wget')
# Output Binary JSONL record for installed binary
output_binary_found(binary, name='wget')
# Check for compression support
if computed.get('WGET_BINARY'):

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect yt-dlp binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if yt-dlp is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True)
ytdlp_binary = get_env('YTDLP_BINARY', 'yt-dlp')
if not ytdlp_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=ytdlp_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='yt-dlp')
else:
# Binary not found
output_binary_missing(name='yt-dlp', binproviders='pip,brew,apt')
except Exception:
# Binary not found
output_binary_missing(name='yt-dlp', binproviders='pip,brew,apt')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -10,6 +10,7 @@ Migration tests from 0.8.x to 0.9.x.
- New fields like depth, retry_at, etc.
"""
import json
import shutil
import sqlite3
import subprocess
@@ -78,29 +79,43 @@ class TestMigrationFrom08x(unittest.TestCase):
self.assertTrue(ok, msg)
def test_migration_preserves_crawls(self):
"""Migration should preserve all Crawl records."""
"""Migration should preserve all Crawl records and create default crawl if needed."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Count snapshots with NULL crawl_id in original data
snapshots_without_crawl = sum(1 for s in self.original_data['snapshots'] if s['crawl_id'] is None)
# Expected count: original crawls + 1 default crawl if any snapshots had NULL crawl_id
expected_count = len(self.original_data['crawls'])
if snapshots_without_crawl > 0:
expected_count += 1 # Migration 0024 creates a default crawl
ok, msg = verify_crawl_count(self.db_path, expected_count)
self.assertTrue(ok, msg)
def test_migration_preserves_snapshot_crawl_links(self):
"""Migration should preserve snapshot-to-crawl relationships."""
"""Migration should preserve snapshot-to-crawl relationships and assign default crawl to orphans."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
# Check EVERY snapshot still has its crawl_id
# Check EVERY snapshot has a crawl_id after migration
for snapshot in self.original_data['snapshots']:
cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot['url'],))
row = cursor.fetchone()
self.assertIsNotNone(row, f"Snapshot {snapshot['url']} not found after migration")
self.assertEqual(row[0], snapshot['crawl_id'],
f"Crawl ID mismatch for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
if snapshot['crawl_id'] is not None:
# Snapshots that had a crawl should keep it
self.assertEqual(row[0], snapshot['crawl_id'],
f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
else:
# Snapshots without a crawl should now have one (the default crawl)
self.assertIsNotNone(row[0],
f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL")
conn.close()
@@ -153,7 +168,7 @@ class TestMigrationFrom08x(unittest.TestCase):
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
result = run_archivebox(self.work_dir, ['list'])
result = run_archivebox(self.work_dir, ['snapshot', 'list'])
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
# Verify ALL snapshots appear in output
@@ -475,357 +490,227 @@ class TestFilesystemMigration08to09(unittest.TestCase):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_filesystem_migration_with_real_archiving(self):
def test_archiveresult_files_preserved_after_migration(self):
"""
Test that filesystem migration works with real archived content.
Test that ArchiveResult output files are reorganized into the new structure.
Steps:
1. Initialize archivebox
2. Archive https://example.com (creates real files)
3. Manually set fs_version to 0.8.0
4. Trigger migration by saving snapshot
5. Verify files are organized correctly
This test verifies that:
1. Migration preserves ArchiveResult data in Process/Binary records
2. Running `archivebox update` reorganizes files into the new structure
3. New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
4. All files are moved (no data loss)
5. Old archive/timestamp/ directories are cleaned up
"""
# Step 1: Initialize
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Use the real 0.7.2 database which has actual ArchiveResults with files
gold_db = Path('/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data')
if not gold_db.exists():
self.skipTest(f"Gold standard database not found at {gold_db}")
# Step 2: Archive example.com with ALL extractors enabled
# This ensures we test migration with all file types
try:
result = run_archivebox(
self.work_dir,
['add', '--depth=0', 'https://example.com'],
timeout=300, # 5 minutes for all extractors
env={
'SAVE_TITLE': 'True',
'SAVE_FAVICON': 'True',
'SAVE_WGET': 'True',
'SAVE_SCREENSHOT': 'True',
'SAVE_DOM': 'True',
'SAVE_SINGLEFILE': 'True',
'SAVE_READABILITY': 'True',
'SAVE_MERCURY': 'True',
'SAVE_PDF': 'True',
'SAVE_YTDLP': 'True',
'SAVE_ARCHIVEDOTORG': 'True',
'SAVE_HEADERS': 'True',
'SAVE_HTMLTOTEXT': 'True',
'SAVE_GIT': 'True',
}
)
except subprocess.TimeoutExpired as e:
# If timeout, still continue - we want to test with whatever files were created
print(f"\n[!] Add command timed out after {e.timeout}s, continuing with partial results...")
# Note: Snapshot may still have been created even if command timed out
# Copy gold database to test directory
import shutil
for item in gold_db.iterdir():
if item.is_dir():
shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
else:
shutil.copy2(item, self.work_dir / item.name)
# Step 3: Get the snapshot and verify files were created
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("SELECT id, url, timestamp, fs_version FROM core_snapshot WHERE url = ?", ('https://example.com',))
row = cursor.fetchone()
conn.close()
if not row:
self.skipTest("Failed to create snapshot for https://example.com")
snapshot_id, url, timestamp, fs_version = row
# Verify initial fs_version is 0.9.0 (current version)
self.assertEqual(fs_version, '0.9.0', f"Expected new snapshot to have fs_version='0.9.0', got '{fs_version}'")
# Verify output directory exists
output_dir = self.work_dir / 'archive' / timestamp
self.assertTrue(output_dir.exists(), f"Output directory not found: {output_dir}")
# List all files created (for debugging)
files_before = list(output_dir.rglob('*'))
files_before_count = len([f for f in files_before if f.is_file()])
print(f"\n[*] Files created by archiving: {files_before_count}")
for f in sorted(files_before):
if f.is_file():
print(f" {f.relative_to(output_dir)}")
# Step 4: Manually set fs_version to 0.8.0 to simulate old snapshot
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("UPDATE core_snapshot SET fs_version = '0.8.0' WHERE id = ?", (snapshot_id,))
conn.commit()
# Verify the update worked
cursor.execute("SELECT fs_version FROM core_snapshot WHERE id = ?", (snapshot_id,))
updated_version = cursor.fetchone()[0]
conn.close()
self.assertEqual(updated_version, '0.8.0', "Failed to set fs_version to 0.8.0")
# Step 5: Trigger migration by running a command that loads and saves the snapshot
# We'll use the Python API directly to trigger save()
import os
import sys
import django
# Setup Django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
os.environ['DATA_DIR'] = str(self.work_dir)
# Add parent dir to path so we can import archivebox
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
try:
django.setup()
from archivebox.core.models import Snapshot
# Load the snapshot (should trigger migration on save)
snapshot = Snapshot.objects.get(url='https://example.com')
# Verify fs_migration_needed returns True
self.assertTrue(snapshot.fs_migration_needed,
f"fs_migration_needed should be True for fs_version='0.8.0'")
# Save to trigger migration
print(f"\n[*] Triggering filesystem migration by saving snapshot...")
snapshot.save()
# Refresh from DB
snapshot.refresh_from_db()
# Verify migration completed
self.assertEqual(snapshot.fs_version, '0.9.0',
f"Migration failed: fs_version is still '{snapshot.fs_version}'")
self.assertFalse(snapshot.fs_migration_needed,
"fs_migration_needed should be False after migration")
print(f"[√] Filesystem migration completed: 0.8.0 -> 0.9.0")
except Exception as e:
self.fail(f"Failed to trigger migration via Django: {e}")
# Step 6: Verify files still exist and are accessible
# For 0.8 -> 0.9, the migration is a no-op, so files should be in the same place
files_after = list(output_dir.rglob('*'))
files_after_count = len([f for f in files_after if f.is_file()])
print(f"\n[*] Files after migration: {files_after_count}")
# Verify no files were lost
self.assertGreaterEqual(files_after_count, files_before_count,
f"Files were lost during migration: {files_before_count} -> {files_after_count}")
class TestDBOnlyCommands(unittest.TestCase):
"""Test that status/search/list commands only use DB, not filesystem."""
def setUp(self):
"""Create a temporary directory with 0.8.x schema and data."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
create_data_dir_structure(self.work_dir)
conn = sqlite3.connect(str(self.db_path))
conn.executescript(SCHEMA_0_8)
conn.close()
self.original_data = seed_0_8_data(self.db_path)
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_status_works_with_empty_archive(self):
"""Status command should work with empty archive/ (queries DB only)."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Add a snapshot to DB
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
# Empty the archive directory (but keep it existing)
# Count archive directories and files BEFORE migration
archive_dir = self.work_dir / 'archive'
if archive_dir.exists():
for item in archive_dir.iterdir():
if item.is_dir():
shutil.rmtree(item)
else:
item.unlink()
dirs_before = list(archive_dir.glob('*')) if archive_dir.exists() else []
dirs_before_count = len([d for d in dirs_before if d.is_dir()])
# Status should still work (queries DB only, doesn't scan filesystem)
result = run_archivebox(self.work_dir, ['status'])
self.assertEqual(result.returncode, 0,
f"Status should work with empty archive: {result.stderr}")
# Count total files in all archive directories
files_before = []
for d in dirs_before:
if d.is_dir():
files_before.extend([f for f in d.rglob('*') if f.is_file()])
files_before_count = len(files_before)
# Should show count from DB
output = result.stdout + result.stderr
self.assertIn('Total', output,
"Status should show DB statistics even with no files")
# Sample some specific files to check they're preserved
sample_files = [
'favicon.ico',
'screenshot.png',
'singlefile.html',
'headers.json',
]
sample_paths_before = {}
for d in dirs_before:
if d.is_dir():
for sample_file in sample_files:
matching = list(d.glob(sample_file))
if matching:
sample_paths_before[f"{d.name}/{sample_file}"] = matching[0]
def test_list_works_with_empty_archive(self):
"""List command should work with empty archive/ (queries DB only)."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
print(f"\n[*] Archive directories before migration: {dirs_before_count}")
print(f"[*] Total files before migration: {files_before_count}")
print(f"[*] Sample files found: {len(sample_paths_before)}")
# Add a snapshot to DB
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
# Run init to trigger migration
result = run_archivebox(self.work_dir, ['init'], timeout=60)
self.assertEqual(result.returncode, 0, f"Init (migration) failed: {result.stderr}")
# Empty the archive directory (but keep it existing)
archive_dir = self.work_dir / 'archive'
if archive_dir.exists():
for item in archive_dir.iterdir():
if item.is_dir():
shutil.rmtree(item)
else:
item.unlink()
# Count archive directories and files AFTER migration
dirs_after = list(archive_dir.glob('*')) if archive_dir.exists() else []
dirs_after_count = len([d for d in dirs_after if d.is_dir()])
# List should still work (queries DB only, doesn't scan filesystem)
result = run_archivebox(self.work_dir, ['list'])
self.assertEqual(result.returncode, 0,
f"List should work with empty archive: {result.stderr}")
files_after = []
for d in dirs_after:
if d.is_dir():
files_after.extend([f for f in d.rglob('*') if f.is_file()])
files_after_count = len(files_after)
# Should show snapshot from DB
output = result.stdout + result.stderr
self.assertIn('example.com', output,
"Snapshot should appear in list output even with no files")
# Verify sample files still exist
sample_paths_after = {}
for d in dirs_after:
if d.is_dir():
for sample_file in sample_files:
matching = list(d.glob(sample_file))
if matching:
sample_paths_after[f"{d.name}/{sample_file}"] = matching[0]
def test_search_works_with_empty_archive(self):
"""Search command should work with empty archive/ (queries DB only)."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
print(f"[*] Archive directories after migration: {dirs_after_count}")
print(f"[*] Total files after migration: {files_after_count}")
print(f"[*] Sample files found: {len(sample_paths_after)}")
# Add a snapshot to DB
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
# Verify files still in old structure after migration (not moved yet)
self.assertEqual(dirs_before_count, dirs_after_count,
f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}")
self.assertEqual(files_before_count, files_after_count,
f"Files lost during migration: {files_before_count} -> {files_after_count}")
# Empty the archive directory (but keep it existing)
archive_dir = self.work_dir / 'archive'
if archive_dir.exists():
for item in archive_dir.iterdir():
if item.is_dir():
shutil.rmtree(item)
else:
item.unlink()
# Search should still work (queries DB only, doesn't scan filesystem)
result = run_archivebox(self.work_dir, ['search'])
self.assertEqual(result.returncode, 0,
f"Search should work with empty archive: {result.stderr}")
# Should show snapshot from DB
output = result.stdout + result.stderr
self.assertIn('example.com', output,
"Snapshot should appear in search output even with no files")
class TestUpdateCommandArchitecture(unittest.TestCase):
"""Test new update command architecture: filters=DB only, no filters=scan filesystem."""
def setUp(self):
"""Create a temporary directory with 0.8.x schema and data."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
create_data_dir_structure(self.work_dir)
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_update_with_filters_uses_db_only(self):
"""Update with filters should only query DB, not scan filesystem."""
# Initialize with data
conn = sqlite3.connect(str(self.db_path))
conn.executescript(SCHEMA_0_8)
conn.close()
seed_0_8_data(self.db_path)
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Run update with filter - should not scan filesystem
# Use a URL from the seeded data
result = run_archivebox(self.work_dir, ['update', 'example.com'], timeout=120)
# Should complete successfully (or with orchestrator error, which is okay)
# The key is it should not scan filesystem
def test_update_without_filters_imports_orphans(self):
"""Update without filters should scan filesystem and import orphaned directories."""
# Initialize empty DB
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Create an orphaned directory in archive/
timestamp = '1609459200'
orphan_dir = self.work_dir / 'archive' / timestamp
orphan_dir.mkdir(parents=True, exist_ok=True)
index_data = {
'url': 'https://orphan.example.com',
'timestamp': timestamp,
'title': 'Orphaned Snapshot',
}
(orphan_dir / 'index.json').write_text(json.dumps(index_data))
(orphan_dir / 'index.html').write_text('<html>Orphan</html>')
# Count snapshots before update
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM core_snapshot")
count_before = cursor.fetchone()[0]
conn.close()
# Run full update (no filters) - should scan filesystem
# Run update to trigger filesystem reorganization
print(f"\n[*] Running archivebox update to reorganize filesystem...")
result = run_archivebox(self.work_dir, ['update'], timeout=120)
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
# Check if orphan was imported
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
('https://orphan.example.com',))
orphan_count = cursor.fetchone()[0]
conn.close()
# Check new filesystem structure
# New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
users_dir = self.work_dir / 'users'
snapshots_base = None
# If update succeeded, orphan should be imported
if result.returncode == 0:
self.assertGreaterEqual(orphan_count, 1,
"Orphaned snapshot should be imported by update")
if users_dir.exists():
# Find the snapshots directory
for user_dir in users_dir.iterdir():
if user_dir.is_dir():
user_snapshots = user_dir / 'snapshots'
if user_snapshots.exists():
snapshots_base = user_snapshots
break
print(f"[*] New structure base: {snapshots_base}")
class TestTimestampUniqueness(unittest.TestCase):
"""Test timestamp uniqueness constraint."""
# Count files in new structure
# Structure: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/files...
files_new_structure = []
new_sample_files = {}
def setUp(self):
"""Create a temporary directory."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
create_data_dir_structure(self.work_dir)
if snapshots_base and snapshots_base.exists():
for date_dir in snapshots_base.iterdir():
if date_dir.is_dir():
for domain_dir in date_dir.iterdir():
if domain_dir.is_dir():
for snap_dir in domain_dir.iterdir():
if snap_dir.is_dir():
# Files are directly in snap-uuid/ directory (no plugin subdirs)
for f in snap_dir.rglob('*'):
if f.is_file():
files_new_structure.append(f)
# Track sample files
if f.name in sample_files:
new_sample_files[f"{snap_dir.name}/{f.name}"] = f
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
files_new_count = len(files_new_structure)
print(f"[*] Files in new structure: {files_new_count}")
print(f"[*] Sample files in new structure: {len(new_sample_files)}")
def test_timestamp_uniqueness_constraint_exists(self):
"""Database should have timestamp uniqueness constraint after migration."""
# Initialize with 0.8.x and migrate
conn = sqlite3.connect(str(self.db_path))
conn.executescript(SCHEMA_0_8)
conn.close()
# Check old structure (should be gone or empty)
old_archive_dir = self.work_dir / 'archive'
old_files_remaining = []
unmigrated_dirs = []
if old_archive_dir.exists():
for d in old_archive_dir.glob('*'):
# Only count REAL directories, not symlinks (symlinks are the migrated ones)
if d.is_dir() and not d.is_symlink() and d.name.replace('.', '').isdigit():
# This is a timestamp directory (old structure)
files_in_dir = [f for f in d.rglob('*') if f.is_file()]
if files_in_dir:
unmigrated_dirs.append((d.name, len(files_in_dir)))
old_files_remaining.extend(files_in_dir)
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
old_files_count = len(old_files_remaining)
print(f"[*] Files remaining in old structure: {old_files_count}")
if unmigrated_dirs:
print(f"[*] Unmigrated directories: {unmigrated_dirs}")
# Check if unique_timestamp constraint exists
# CRITICAL: Verify files were moved to new structure
self.assertGreater(files_new_count, 0,
"No files found in new structure after update")
# CRITICAL: Verify old structure is cleaned up
self.assertEqual(old_files_count, 0,
f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories")
# CRITICAL: Verify all files were moved (total count should match)
total_after_update = files_new_count + old_files_count
self.assertEqual(files_before_count, total_after_update,
f"Files lost during reorganization: {files_before_count} before → {total_after_update} after")
# CRITICAL: Verify sample files exist in new structure
self.assertGreater(len(new_sample_files), 0,
f"Sample files not found in new structure")
# Verify new path format
for path_key, file_path in new_sample_files.items():
# Path should contain: snapshots/YYYYMMDD/domain/snap-uuid/plugin/file
path_parts = file_path.parts
self.assertIn('snapshots', path_parts,
f"New path should contain 'snapshots': {file_path}")
self.assertIn('users', path_parts,
f"New path should contain 'users': {file_path}")
print(f"{path_key}{file_path.relative_to(self.work_dir)}")
# Verify Process and Binary records were created
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
# Query sqlite_master for constraints
cursor.execute("""
SELECT sql FROM sqlite_master
WHERE type='table' AND name='core_snapshot'
""")
table_sql = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
archiveresult_count = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM machine_process")
process_count = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM machine_binary")
binary_count = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NOT NULL")
linked_count = cursor.fetchone()[0]
conn.close()
# Should contain unique_timestamp constraint or UNIQUE(timestamp)
table_sql_lower = table_sql.lower()
has_constraint = ('unique_timestamp' in table_sql_lower) or \
('unique' in table_sql_lower and 'timestamp' in table_sql_lower)
print(f"[*] ArchiveResults: {archiveresult_count}")
print(f"[*] Process records created: {process_count}")
print(f"[*] Binary records created: {binary_count}")
print(f"[*] ArchiveResults linked to Process: {linked_count}")
# Verify data migration happened correctly
# The 0.7.2 gold database has 44 ArchiveResults
self.assertEqual(archiveresult_count, 44,
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}")
# Each ArchiveResult should create one Process record
self.assertEqual(process_count, 44,
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}")
# The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.)
self.assertEqual(binary_count, 7,
f"Expected 7 unique Binary records, got {binary_count}")
# ALL ArchiveResults should be linked to Process records
self.assertEqual(linked_count, 44,
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}")
self.assertTrue(has_constraint,
f"Timestamp uniqueness constraint should exist. Table SQL: {table_sql}")
if __name__ == '__main__':
    unittest.main()

View File

@@ -76,11 +76,11 @@ class Orchestrator:
self.idle_count: int = 0
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
# CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker
# to keep execution strictly sequential and deterministic
# In foreground mode (exit_on_idle=True), limit workers but allow enough
# for crawl progression: 1 CrawlWorker + 1 SnapshotWorker + 1 ArchiveResultWorker
if self.exit_on_idle:
self.MAX_WORKERS_PER_TYPE = 1
self.MAX_TOTAL_WORKERS = 1
self.MAX_TOTAL_WORKERS = 3 # Allow one worker of each type to run concurrently
def __repr__(self) -> str:
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
@@ -157,32 +157,41 @@ class Orchestrator:
self._last_cleanup_time = now
return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)
def get_running_workers_for_type(self, WorkerClass: Type[Worker]) -> int:
"""Get count of running workers for a specific worker type."""
return len(WorkerClass.get_running_workers())
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
"""Determine if we should spawn a new worker of the given type."""
if queue_count == 0:
return False
# Check per-type limit
running_workers = WorkerClass.get_running_workers()
if len(running_workers) >= self.MAX_WORKERS_PER_TYPE:
running_count = len(running_workers)
if running_count >= self.MAX_WORKERS_PER_TYPE:
return False
# Check total limit
if self.get_total_worker_count() >= self.MAX_TOTAL_WORKERS:
total_workers = self.get_total_worker_count()
if total_workers >= self.MAX_TOTAL_WORKERS:
return False
# Check if we already have enough workers for the queue size
# Spawn more gradually - don't flood with workers
if len(running_workers) > 0 and queue_count <= len(running_workers) * WorkerClass.MAX_CONCURRENT_TASKS:
if running_count > 0 and queue_count <= running_count * WorkerClass.MAX_CONCURRENT_TASKS:
return False
return True
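# Illustrative walk-through (comments only), assuming MAX_CONCURRENT_TASKS=1 and the
# foreground limits set in __init__ above (MAX_WORKERS_PER_TYPE=1, MAX_TOTAL_WORKERS=3),
# with the total-worker count still under MAX_TOTAL_WORKERS:
#   queue_count=0                     -> False (nothing to do)
#   queue_count=5, 0 workers running  -> True  (spawn the first worker of this type)
#   queue_count=5, 1 worker running   -> False (per-type limit already reached)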
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
"""Spawn a new worker process. Returns PID or None if spawn failed."""
try:
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
pid = WorkerClass.start(daemon=False, crawl_id=self.crawl_id)
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
# CRITICAL: Block until worker registers itself in Process table
# This prevents race condition where orchestrator spawns multiple workers
@@ -202,6 +211,15 @@ class Orchestrator:
# 3. RUNNING status
# 4. Parent is this orchestrator
# 5. Started recently (within last 10 seconds)
# Debug: Check all processes with this PID first
if elapsed < 0.5:
all_procs = list(Process.objects.filter(pid=pid))
print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]')
for p in all_procs:
print(f'[yellow] -> type={p.process_type} status={p.status} parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]')
worker_process = Process.objects.filter(
pid=pid,
process_type=Process.TypeChoices.WORKER,
@@ -212,6 +230,7 @@ class Orchestrator:
if worker_process:
# Worker successfully registered!
print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]')
return pid
time.sleep(poll_interval)
@@ -244,7 +263,7 @@ class Orchestrator:
Returns dict of queue sizes by worker type.
"""
queue_sizes = {}
for WorkerClass in self.WORKER_TYPES:
# Get queue for this worker type
# Need to instantiate worker to get queue (for model access)
@@ -392,11 +411,18 @@ class Orchestrator:
def _run_orchestrator_loop(self, progress, task_ids):
"""Run the main orchestrator loop with optional progress display."""
last_queue_sizes = {}
last_snapshot_count = None
try:
while True:
# Check queues and spawn workers
queue_sizes = self.check_queues_and_spawn_workers()
# Debug queue sizes (only when changed)
if progress and queue_sizes != last_queue_sizes:
progress.console.print(f'[yellow]DEBUG: Queue sizes: {queue_sizes}[/yellow]')
last_queue_sizes = queue_sizes.copy()
# Update progress bars
if progress:
from archivebox.core.models import Snapshot
@@ -412,6 +438,11 @@ class Orchestrator:
active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
# Debug snapshot count (only when changed)
if len(active_snapshots) != last_snapshot_count:
progress.console.print(f'[yellow]DEBUG: Found {len(active_snapshots)} active snapshots (crawl_id={self.crawl_id})[/yellow]')
last_snapshot_count = len(active_snapshots)
# Track which snapshots are still active
active_ids = set()
@@ -461,7 +492,9 @@ class Orchestrator:
del task_ids[snapshot_id]
# Track idle state
if self.has_pending_work(queue_sizes) or self.has_running_workers():
has_pending = self.has_pending_work(queue_sizes)
has_running = self.has_running_workers()
if has_pending or has_running:
self.idle_count = 0
self.on_tick(queue_sizes)
else:

View File

@@ -60,8 +60,8 @@ class Worker:
# Configuration (can be overridden by subclasses)
MAX_TICK_TIME: ClassVar[int] = 60
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
POLL_INTERVAL: ClassVar[float] = 0.2 # How often to check for new work (seconds)
IDLE_TIMEOUT: ClassVar[int] = 50 # Exit after N idle iterations (10 sec at 0.2 poll interval)
POLL_INTERVAL: ClassVar[float] = 0.1 # How often to check for new work (seconds)
IDLE_TIMEOUT: ClassVar[int] = 100 # Exit after N idle iterations (10 sec at 0.1 poll interval)
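# Illustration: a worker exits after roughly IDLE_TIMEOUT * POLL_INTERVAL
# = 100 * 0.1 = 10 seconds without claimable work -- the same wall-clock budget as the
# previous 50 * 0.2 settings, but with finer-grained polling.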
def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
self.worker_id = worker_id
@@ -93,7 +93,9 @@ class Worker:
Returns the claimed object or None if queue is empty or claim failed.
"""
Model = self.get_model()
obj = self.get_queue().first()
queue = self.get_queue()
obj = queue.first()
if obj is None:
return None
@@ -132,10 +134,17 @@ class Worker:
self.pid = os.getpid()
# Register this worker process in the database
self.db_process = Process.current()
# Explicitly set process_type to WORKER to prevent mis-detection
# Explicitly set process_type to WORKER and store worker type name
update_fields = []
if self.db_process.process_type != Process.TypeChoices.WORKER:
self.db_process.process_type = Process.TypeChoices.WORKER
self.db_process.save(update_fields=['process_type'])
update_fields.append('process_type')
# Store worker type name (crawl/snapshot/archiveresult) in worker_type field
if not self.db_process.worker_type:
self.db_process.worker_type = self.name
update_fields.append('worker_type')
if update_fields:
self.db_process.save(update_fields=update_fields)
# Determine worker type for logging
worker_type_name = self.__class__.__name__
@@ -316,7 +325,12 @@ class Worker:
Process.cleanup_stale_running()
# Convert Process objects to dicts to match the expected API contract
processes = Process.get_running(process_type=Process.TypeChoices.WORKER)
# Filter by worker_type to get only workers of this specific type (crawl/snapshot/archiveresult)
processes = Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
worker_type=cls.name, # Filter by specific worker type
status__in=['running', 'started']
)
# Note: worker_id is not stored on Process model, it's dynamically generated
# We return process_id (UUID) and pid (OS process ID) instead
return [
@@ -334,7 +348,11 @@ class Worker:
"""Get count of running workers of this type."""
from archivebox.machine.models import Process
return Process.get_running_count(process_type=Process.TypeChoices.WORKER)
return Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
worker_type=cls.name, # Filter by specific worker type
status__in=['running', 'started']
).count()
class CrawlWorker(Worker):

View File

@@ -3,18 +3,23 @@
#
# All plugin tests use pytest and are located in pluginname/tests/test_*.py
#
# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage]
# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage] [--coverage-report]
#
# Examples:
# ./bin/test_plugins.sh # Run all plugin tests with coverage
# ./bin/test_plugins.sh chrome # Run chrome plugin tests with coverage
# ./bin/test_plugins.sh parse_* # Run all parse_* plugin tests with coverage
# ./bin/test_plugins.sh --no-coverage # Run all tests without coverage
# ./bin/test_plugins.sh --coverage-report # Just show coverage report without running tests
#
# Coverage results are saved to .coverage and can be viewed with:
# coverage combine
# coverage report
# For running individual hooks with coverage:
# NODE_V8_COVERAGE=./coverage/js node <hook>.js [args] # JS hooks
# coverage run --parallel-mode <hook>.py [args] # Python hooks
#
# Coverage results are saved to .coverage (Python) and coverage/js (JavaScript):
# coverage combine && coverage report
# coverage json
# ./bin/test_plugins.sh --coverage-report
set -e
@@ -30,15 +35,134 @@ ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
# Parse arguments
PLUGIN_FILTER=""
ENABLE_COVERAGE=true
COVERAGE_REPORT_ONLY=false
for arg in "$@"; do
if [ "$arg" = "--no-coverage" ]; then
ENABLE_COVERAGE=false
elif [ "$arg" = "--coverage-report" ]; then
COVERAGE_REPORT_ONLY=true
else
PLUGIN_FILTER="$arg"
fi
done
# Function to show JS coverage report (inlined from convert_v8_coverage.js)
show_js_coverage() {
local coverage_dir="$1"
if [ ! -d "$coverage_dir" ] || [ -z "$(ls -A "$coverage_dir" 2>/dev/null)" ]; then
echo "No JavaScript coverage data collected"
echo "(JS hooks may not have been executed during tests)"
return
fi
node - "$coverage_dir" << 'ENDJS'
const fs = require('fs');
const path = require('path');
const coverageDir = process.argv[2];
const files = fs.readdirSync(coverageDir).filter(f => f.startsWith('coverage-') && f.endsWith('.json'));
if (files.length === 0) {
console.log('No coverage files found');
process.exit(0);
}
const coverageByFile = {};
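// Each NODE_V8_COVERAGE file holds V8 precise-coverage output, roughly this shape (sketch):
//   { result: [ { url, functions: [ { ranges: [ { startOffset, endOffset, count } ] } ] } ] }
// A range with count > 0 executed at least once; coverage below is approximated as
// executed ranges / total ranges per file.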
files.forEach(file => {
const data = JSON.parse(fs.readFileSync(path.join(coverageDir, file), 'utf8'));
data.result.forEach(script => {
const url = script.url;
if (url.startsWith('node:') || url.includes('node_modules')) return;
if (!coverageByFile[url]) {
coverageByFile[url] = { totalRanges: 0, executedRanges: 0 };
}
script.functions.forEach(func => {
func.ranges.forEach(range => {
coverageByFile[url].totalRanges++;
if (range.count > 0) coverageByFile[url].executedRanges++;
});
});
});
});
const allFiles = Object.keys(coverageByFile).sort();
const pluginFiles = allFiles.filter(url => url.includes('archivebox/plugins'));
const otherFiles = allFiles.filter(url => !url.startsWith('node:') && !url.includes('archivebox/plugins'));
console.log('Total files with coverage: ' + allFiles.length + '\n');
console.log('Plugin files: ' + pluginFiles.length);
console.log('Node internal: ' + allFiles.filter(u => u.startsWith('node:')).length + ' (always 0: node: scripts are skipped during collection above)');
console.log('Other: ' + otherFiles.length + '\n');
console.log('JavaScript Coverage Report');
console.log('='.repeat(80));
console.log('');
if (otherFiles.length > 0) {
console.log('Non-plugin files with coverage:');
otherFiles.forEach(url => console.log(' ' + url));
console.log('');
}
if (pluginFiles.length === 0) {
console.log('No plugin files covered');
process.exit(0);
}
let totalRanges = 0, totalExecuted = 0;
pluginFiles.forEach(url => {
const cov = coverageByFile[url];
const pct = cov.totalRanges > 0 ? (cov.executedRanges / cov.totalRanges * 100).toFixed(1) : '0.0';
const match = url.match(/archivebox\/plugins\/.+/);
const displayPath = match ? match[0] : url;
console.log(displayPath + ': ' + pct + '% (' + cov.executedRanges + '/' + cov.totalRanges + ' ranges)');
totalRanges += cov.totalRanges;
totalExecuted += cov.executedRanges;
});
console.log('');
console.log('-'.repeat(80));
const overallPct = totalRanges > 0 ? (totalExecuted / totalRanges * 100).toFixed(1) : '0.0';
console.log('Total: ' + overallPct + '% (' + totalExecuted + '/' + totalRanges + ' ranges)');
ENDJS
}
# If --coverage-report only, just show the report and exit
if [ "$COVERAGE_REPORT_ONLY" = true ]; then
cd "$ROOT_DIR" || exit 1
echo "=========================================="
echo "Python Coverage Summary"
echo "=========================================="
coverage combine 2>/dev/null || true
coverage report --include="archivebox/plugins/*" --omit="*/tests/*"
echo ""
echo "=========================================="
echo "JavaScript Coverage Summary"
echo "=========================================="
show_js_coverage "$ROOT_DIR/coverage/js"
echo ""
echo "For detailed coverage reports:"
echo " Python: coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'"
echo " Python: coverage json # LLM-friendly format"
echo " Python: coverage html # Interactive HTML report"
exit 0
fi
# Set DATA_DIR for tests (required by abx_pkg and plugins)
# Use temp dir to isolate tests from project files
if [ -z "$DATA_DIR" ]; then
export DATA_DIR=$(mktemp -d -t archivebox_plugin_tests.XXXXXX)
# Clean up on exit
trap "rm -rf '$DATA_DIR'" EXIT
fi
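# Hedged usage note: to inspect test artifacts after a run, set DATA_DIR to a persistent
# path yourself so the temp-dir cleanup trap above is never installed, e.g.:
#   DATA_DIR=/tmp/archivebox_plugin_debug ./bin/test_plugins.sh screenshot
# (the path is illustrative, not part of this script)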
# Reset coverage data if collecting coverage
if [ "$ENABLE_COVERAGE" = true ]; then
echo "Resetting coverage data..."
@@ -161,19 +285,14 @@ elif [ $FAILED_PLUGINS -eq 0 ]; then
echo "=========================================="
echo "JavaScript Coverage Summary"
echo "=========================================="
if [ -d "$ROOT_DIR/coverage/js" ] && [ "$(ls -A "$ROOT_DIR/coverage/js" 2>/dev/null)" ]; then
node "$ROOT_DIR/bin/convert_v8_coverage.js" "$ROOT_DIR/coverage/js"
else
echo "No JavaScript coverage data collected"
echo "(JS hooks may not have been executed during tests)"
fi
show_js_coverage "$ROOT_DIR/coverage/js"
echo ""
echo "For detailed coverage reports (from project root):"
echo " Python: coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'"
echo " Python: coverage json # LLM-friendly format"
echo " Python: coverage html # Interactive HTML report"
echo " JavaScript: node bin/convert_v8_coverage.js coverage/js"
echo " JavaScript: ./bin/test_plugins.sh --coverage-report"
fi
exit 0

View File

View File

@@ -1 +0,0 @@
import pytest

View File

@@ -1,97 +0,0 @@
#!/usr/bin/env python3
"""
Tests for archivebox crawl command.
Verify crawl creates snapshots with depth.
"""
import os
import subprocess
import sqlite3
from .fixtures import *
def test_crawl_creates_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that crawl command works on existing snapshots."""
os.chdir(tmp_path)
# First add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Then run crawl on it
result = subprocess.run(
['archivebox', 'crawl', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode in [0, 1, 2] # May succeed or fail depending on URL
# Check snapshot was created
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()
assert count == 1
def test_crawl_with_depth_0(tmp_path, process, disable_extractors_dict):
"""Test crawl with depth=0 works on existing snapshot."""
os.chdir(tmp_path)
# First add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Then crawl it
subprocess.run(
['archivebox', 'crawl', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()
# Should have at least 1 snapshot from the add command
assert count >= 1
def test_crawl_creates_crawl_record(tmp_path, process, disable_extractors_dict):
"""Test that add+crawl creates Crawl records."""
os.chdir(tmp_path)
# First add a snapshot (this creates a Crawl)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Then crawl it
subprocess.run(
['archivebox', 'crawl', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
conn.close()
# Should have at least 1 crawl from the add command
assert crawl_count >= 1

View File

@@ -1,63 +0,0 @@
#!/usr/bin/env python3
"""
Tests for archivebox snapshot command.
Verify snapshot command works with snapshot IDs/URLs.
"""
import os
import subprocess
import sqlite3
from .fixtures import *
def test_snapshot_command_works_with_url(tmp_path, process, disable_extractors_dict):
"""Test that snapshot command works with URL."""
os.chdir(tmp_path)
# Add a snapshot first
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Try to view/interact with snapshot
result = subprocess.run(
['archivebox', 'snapshot', 'https://example.com'],
capture_output=True,
text=True,
env=disable_extractors_dict,
timeout=30,
)
# Should complete (exit code depends on implementation)
assert result.returncode in [0, 1, 2]
def test_snapshot_command_with_timestamp(tmp_path, process, disable_extractors_dict):
"""Test snapshot command with timestamp ID."""
os.chdir(tmp_path)
# Add snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Get snapshot timestamp
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
conn.close()
# Try snapshot command with timestamp
result = subprocess.run(
['archivebox', 'snapshot', str(timestamp)],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode in [0, 1, 2]