more migration id/uuid and config propagation fixes

2026-04-06 07:47:53 +10:00 · 2026-01-04 16:16:26 -08:00
parent 839ae744cf
commit 456aaee287
16 changed files with 789 additions and 94 deletions
--- a/archivebox/base_models/models.py
+++ b/archivebox/base_models/models.py
@@ -111,7 +111,7 @@ class ModelWithOutputDir(ModelWithUUID):
    def save(self, *args, **kwargs):
        super().save(*args, **kwargs)
-        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
        # Note: index.json is deprecated, models should use write_index_jsonl() for full data
    @property
@@ -127,5 +127,5 @@ class ModelWithOutputDir(ModelWithUUID):
        return f'{self.output_dir_parent}/{self.output_dir_name}'
    @property
-    def OUTPUT_DIR(self) -> Path:
+    def output_dir(self) -> Path:
-        return DATA_DIR / self.output_dir_str
+        raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')
--- a/archivebox/config/configset.py
+++ b/archivebox/config/configset.py
@@ -118,12 +118,12 @@ class BaseConfigSet(BaseSettings):
 def get_config(
    scope: str = "global",
    defaults: Optional[Dict] = None,
    persona: Any = None,
    user: Any = None,
    crawl: Any = None,
    snapshot: Any = None,
    machine: Any = None,
 ) -> Dict[str, Any]:
    """
    Get merged config from all sources.
@@ -134,17 +134,18 @@ def get_config(
    3. Per-user config (user.config JSON field)
    4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
    5. Environment variables
-    6. Config file (ArchiveBox.conf)
+    6. Per-machine config (machine.config JSON field - resolved binary paths)
-    7. Plugin schema defaults (config.json)
+    7. Config file (ArchiveBox.conf)
-    8. Core config defaults
+    8. Plugin schema defaults (config.json)
    9. Core config defaults
    Args:
        scope: Config scope ('global', 'crawl', 'snapshot', etc.)
        defaults: Default values to start with
        persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
        user: User object with config JSON field
        crawl: Crawl object with config JSON field
        snapshot: Snapshot object with config JSON field
        machine: Machine object with config JSON field (defaults to Machine.current())
    Returns:
        Merged config dict
@@ -184,6 +185,18 @@ def get_config(
        file_config = BaseConfigSet.load_from_file(config_file)
        config.update(file_config)
    # Apply machine config overrides (cached binary paths, etc.)
    if machine is None:
        # Default to current machine if not provided
        try:
            from archivebox.machine.models import Machine
            machine = Machine.current()
        except Exception:
            pass  # Machine might not be available during early init
    if machine and hasattr(machine, "config") and machine.config:
        config.update(machine.config)
    # Override with environment variables
    for key in config:
        env_val = os.environ.get(key)
@@ -221,8 +234,8 @@ def get_config(
        config.update(crawl.config)
    # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
-    if crawl and hasattr(crawl, "OUTPUT_DIR"):
+    if crawl and hasattr(crawl, "output_dir"):
-        config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR)
+        config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
    # Apply snapshot config overrides (highest priority)
    if snapshot and hasattr(snapshot, "config") and snapshot.config:
@@ -260,7 +273,7 @@ def get_flat_config() -> Dict[str, Any]:
    Replaces abx.pm.hook.get_FLAT_CONFIG()
    """
-    return get_config(scope="global")
+    return get_config()
 def get_all_configs() -> Dict[str, BaseConfigSet]:
--- a/archivebox/config/constants.py
+++ b/archivebox/config/constants.py
@@ -176,6 +176,7 @@ class ConstantsDict(Mapping):
        CRONTABS_DIR_NAME,
        "invalid",
        "users",
        "machine",
        # Backwards compatibility with old directory names
        "user_plugins",          # old name for USER_PLUGINS_DIR (now 'plugins')
        "user_templates",        # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
--- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
+++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
@@ -15,6 +15,7 @@ def get_table_columns(table_name):
 def upgrade_core_tables(apps, schema_editor):
    """Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
    from archivebox.uuid_compat import uuid7
    cursor = connection.cursor()
    # Check if core_archiveresult table exists
@@ -60,8 +61,8 @@ def upgrade_core_tables(apps, schema_editor):
    if has_data:
        if has_uuid and not has_abid:
-            # Migrating from v0.7.2 (has uuid, minimal fields)
+            # Migrating from v0.7.2+ (has uuid column)
-            print('Migrating ArchiveResult from v0.7.2 schema...')
+            print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...')
            cursor.execute("""
                INSERT OR IGNORE INTO core_archiveresult_new (
                    id, uuid, snapshot_id, cmd, pwd, cmd_version,
@@ -86,7 +87,18 @@ def upgrade_core_tables(apps, schema_editor):
                FROM core_archiveresult;
            """)
        else:
-            print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
+            # Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
            print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...')
            cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult")
            old_records = cursor.fetchall()
            for record in old_records:
                new_uuid = uuid7().hex
                cursor.execute("""
                    INSERT OR IGNORE INTO core_archiveresult_new (
                        id, uuid, snapshot_id, cmd, pwd, cmd_version,
                        start_ts, end_ts, status, extractor, output
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9]))
    cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
    cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
--- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
+++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
@@ -33,6 +33,7 @@ def copy_old_fields_to_new(apps, schema_editor):
    # NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
    # transformed by migration 0023, so we don't need to copy them here.
    # NOTE: UUIDs are already populated by migration 0023 for all migration paths
    # Debug: Check Snapshot timestamps at end of RunPython
    cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
--- a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py
+++ b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py
@@ -8,12 +8,20 @@ from archivebox.uuid_compat import uuid7
 def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    """
-    Migrate ArchiveResult from integer PK to UUID PK.
+    Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration).
    Handles both migration paths:
    - 0.7.x: ArchiveResult has integer id, NO uuid field → generate new UUIDs
    - 0.8.x: ArchiveResult has integer id + optional uuid field → reuse existing UUIDs
    Strategy:
-    1. Add old_id field to store current integer IDs
+    1. Create new table with UUID as primary key (no temporary columns)
-    2. Generate UUIDs for any records missing them
+    2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x)
-    3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
+    3. Copy all data with UUID as new id
    4. Drop old table, rename new table
    5. Recreate indexes
    Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid)
    """
    cursor = connection.cursor()
@@ -26,11 +34,13 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
    row_count = cursor.fetchone()[0]
-    if row_count == 0:
+    # Don't skip if table is empty - we still need to recreate to remove uuid column
-        print('No ArchiveResult records to migrate')
+    # (fresh installs create table with uuid from 0025, but model expects no uuid after 0029)
        return
-    print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
+    if row_count == 0:
        print('[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...')
    else:
        print(f'[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
    # Step 0: Check if machine_process table exists, if not NULL out process_id values
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
@@ -40,12 +50,10 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
        print('machine_process table does not exist yet, setting process_id to NULL')
        cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
-    # Step 1: Create new table with UUID as primary key
+    # Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns)
    cursor.execute("""
        CREATE TABLE core_archiveresult_new (
            id TEXT PRIMARY KEY NOT NULL,
            old_id INTEGER,
            uuid TEXT UNIQUE,
            created_at DATETIME NOT NULL,
            modified_at DATETIME NOT NULL,
@@ -78,28 +86,36 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    """)
    # Step 2: Generate UUIDs for records that don't have them
-    cursor.execute("SELECT id, uuid FROM core_archiveresult")
+    # Check if uuid column exists (0.8.x has it, 0.7.x doesn't)
-    records = cursor.fetchall()
+    cursor.execute("PRAGMA table_info(core_archiveresult)")
    columns = cursor.fetchall()
    col_names = [col[1] for col in columns]
    has_uuid_column = 'uuid' in col_names
-    id_to_uuid = {}
+    if has_uuid_column:
-    for old_id, existing_uuid in records:
+        cursor.execute("SELECT id, uuid FROM core_archiveresult")
-        if existing_uuid:
+        records = cursor.fetchall()
-            # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
+        id_to_uuid = {}
-            # (existing UUIDs might be stored with or without dashes in old schema)
+        for old_id, existing_uuid in records:
-            id_to_uuid[old_id] = UUID(existing_uuid).hex
+            if existing_uuid:
-        else:
+                # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
-            # Generate new UUIDv7 (time-ordered) as 32-char hex
+                # (existing UUIDs might be stored with or without dashes in old schema)
-            id_to_uuid[old_id] = uuid7().hex
+                id_to_uuid[old_id] = UUID(existing_uuid).hex
            else:
                # Generate new UUIDv7 (time-ordered) as 32-char hex
                id_to_uuid[old_id] = uuid7().hex
    else:
        # 0.7.x path: no uuid column, generate new UUIDs for all records
        cursor.execute("SELECT id FROM core_archiveresult")
        records = cursor.fetchall()
        id_to_uuid = {old_id: uuid7().hex for (old_id,) in records}
    # Step 3: Copy data with UUIDs as new primary key
    cursor.execute("SELECT * FROM core_archiveresult")
    old_records = cursor.fetchall()
-    # Get column names
+    # col_names already fetched in Step 2
-    cursor.execute("PRAGMA table_info(core_archiveresult)")
+    inserted_count = 0
    columns = cursor.fetchall()
    col_names = [col[1] for col in columns]
    for i, record in enumerate(old_records):
        old_id = record[col_names.index('id')]
        new_uuid = id_to_uuid[old_id]
@@ -107,7 +123,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
        # Build insert with new structure
        values = {col_names[i]: record[i] for i in range(len(col_names))}
-        # Check which fields exist in new table
+        # List of fields to copy (all fields from new schema except id, old_id, uuid)
        fields_to_copy = [
            'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
            'status', 'retry_at', 'start_ts', 'end_ts',
@@ -115,17 +131,31 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
            'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
        ]
-        # Build INSERT statement
+        # Build INSERT statement (only copy fields that exist in source)
        existing_fields = [f for f in fields_to_copy if f in values]
        placeholders = ', '.join(['?'] * (len(existing_fields) + 3))  # +3 for id, old_id, uuid
        field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
-        insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
+        if i == 0:
            print(f'[0029] Source columns: {col_names}')
            print(f'[0029] Copying fields: {existing_fields}')
-        cursor.execute(
+        placeholders = ', '.join(['?'] * (len(existing_fields) + 1))  # +1 for id
-            f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
+        field_list = 'id, ' + ', '.join(existing_fields)
-            insert_values
+
-        )
+        insert_values = [new_uuid] + [values.get(f) for f in existing_fields]
        try:
            cursor.execute(
                f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
                insert_values
            )
            inserted_count += 1
        except Exception as e:
            print(f'[0029] ERROR inserting record {old_id}: {e}')
            if i == 0:
                print(f'[0029] First record values: {insert_values[:5]}...')
                raise
    print(f'[0029] Inserted {inserted_count}/{len(old_records)} records')
    # Step 4: Replace old table with new table
    cursor.execute("DROP TABLE core_archiveresult")
@@ -139,7 +169,6 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
    cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
    cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
    cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")
    print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
@@ -159,23 +188,17 @@ class Migration(migrations.Migration):
                ),
            ],
            state_operations=[
-                # Remove old uuid field
+                # Remove uuid field (was added in 0025, we're merging it into id)
                migrations.RemoveField(
                    model_name='archiveresult',
                    name='uuid',
                ),
-                # Change id from AutoField to UUIDField
+                # Change id from AutoField to UUIDField (absorbing the uuid field)
                migrations.AlterField(
                    model_name='archiveresult',
                    name='id',
                    field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
                ),
                # Add old_id field to preserve legacy integer IDs
                migrations.AddField(
                    model_name='archiveresult',
                    name='old_id',
                    field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
                ),
            ],
        ),
    ]
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1354,7 +1354,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
    def domain(self) -> str:
        return url_domain(self.url)
-    @cached_property
+    @property
    def output_dir(self):
        """The filesystem path to the snapshot's output directory."""
        import os
@@ -1435,8 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]')
        # Clean up .pid files from output directory
-        if self.OUTPUT_DIR.exists():
+        if Path(self.output_dir).exists():
-            for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
+            for pid_file in Path(self.output_dir).glob('**/*.pid'):
                pid_file.unlink(missing_ok=True)
        # Update all STARTED ArchiveResults from filesystem
@@ -2263,9 +2263,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
    # UUID primary key (migrated from integer in 0029)
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    # old_id preserves the legacy integer ID for backward compatibility
    old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
    # Note: uuid field was removed in migration 0029 when id became UUID
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)
@@ -2494,7 +2491,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
    @property
    def output_dir_parent(self) -> str:
-        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
+        return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR))
    # Properties that delegate to Process model (for backwards compatibility)
    # These properties will replace the direct fields after migration is complete
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -180,7 +180,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
        return crawl
    @property
-    def OUTPUT_DIR(self) -> Path:
+    def output_dir(self) -> Path:
        """
        Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
        Domain is extracted from the first URL in the crawl.
@@ -383,7 +383,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                f.flush()
            hook_start = time.time()
            plugin_name = hook.parent.name
-            output_dir = self.OUTPUT_DIR / plugin_name
+            output_dir = self.output_dir / plugin_name
            output_dir.mkdir(parents=True, exist_ok=True)
            # Run hook using Process.launch() - returns Process model
@@ -427,7 +427,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
            f.write(f'Created {len(created_snapshots)} snapshots\n')
            f.write(f'=== Crawl.run() complete ===\n\n')
            f.flush()
-        return created_snapshots[0] if created_snapshots else None
+
        # Return first snapshot for this crawl (newly created or existing)
        # This ensures the crawl doesn't seal if snapshots exist, even if they weren't just created
        return self.snapshot_set.first()
    def is_finished(self) -> bool:
        """Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
@@ -467,8 +470,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                print(f'[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]')
        # Clean up .pid files from output directory
-        if self.OUTPUT_DIR.exists():
+        if self.output_dir.exists():
-            for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
+            for pid_file in self.output_dir.glob('**/*.pid'):
                pid_file.unlink(missing_ok=True)
        # Run on_CrawlEnd hooks
@@ -479,7 +482,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
        for hook in hooks:
            plugin_name = hook.parent.name
-            output_dir = self.OUTPUT_DIR / plugin_name
+            output_dir = self.output_dir / plugin_name
            output_dir.mkdir(parents=True, exist_ok=True)
            process = run_hook(
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -207,7 +207,7 @@ def discover_hooks(
        # Get merged config if not provided (lazy import to avoid circular dependency)
        if config is None:
            from archivebox.config.configset import get_config
-            config = get_config(scope='global')
+            config = get_config()
        enabled_hooks = []
@@ -703,7 +703,7 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
    # Get merged config if not provided
    if config is None:
        from archivebox.config.configset import get_config
-        config = get_config(scope='global')
+        config = get_config()
    # Support explicit ENABLED_PLUGINS override (legacy)
    if 'ENABLED_PLUGINS' in config:
@@ -967,9 +967,9 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
    else:
        # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
        import sys
        print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr)
        enabled_key = f'{plugin_upper}_ENABLED'
        enabled = config.get(enabled_key)
        print(f"DEBUG: NO PLUGINS whitelist in config, checking {enabled_key}={enabled}", file=sys.stderr)
        if enabled is None:
            enabled = True
        elif isinstance(enabled, str):
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -378,7 +378,7 @@ class Binary(ModelWithHealthStats):
        return None
    @property
-    def OUTPUT_DIR(self):
+    def output_dir(self):
        """Return the output directory for this binary installation."""
        from pathlib import Path
        from django.conf import settings
@@ -412,10 +412,10 @@ class Binary(ModelWithHealthStats):
        from archivebox.config.configset import get_config
        # Get merged config (Binary doesn't have crawl/snapshot context)
-        config = get_config(scope='global')
+        config = get_config()
        # Create output directory
-        output_dir = self.OUTPUT_DIR
+        output_dir = self.output_dir
        output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = str(output_dir)
        self.save()
@@ -514,7 +514,7 @@ class Binary(ModelWithHealthStats):
                print(f'[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]')
        # Clean up .pid files from output directory
-        output_dir = self.OUTPUT_DIR
+        output_dir = self.output_dir
        if output_dir.exists():
            for pid_file in output_dir.glob('**/*.pid'):
                pid_file.unlink(missing_ok=True)
@@ -1276,6 +1276,128 @@ class Process(models.Model):
        """Path to stderr log."""
        return Path(self.pwd) / 'stderr.log' if self.pwd else None
    def tail_stdout(self, lines: int = 50, follow: bool = False):
        """
        Tail stdout log file (like `tail` or `tail -f`).

        Args:
            lines: Number of trailing lines to show (default 50). Zero or a
                negative value shows no existing lines.
            follow: If True, keep the file open after the initial lines and
                yield new lines as they are appended. In this mode the
                generator never finishes on its own; the caller must stop
                iterating (or close the generator).

        Yields:
            Lines from stdout with the trailing newline removed.
        """
        import time
        from collections import deque

        log_path = self.stdout_file
        if not log_path or not log_path.exists():
            return

        try:
            # errors='replace' keeps undecodable bytes from aborting the tail
            log = open(log_path, 'r', errors='replace')
        except OSError:
            # Best-effort: the file may have vanished or be unreadable
            return

        with log:
            # A bounded deque streams the file and keeps exactly the last
            # `lines` lines. This avoids both the `[-0:]` slice pitfall (which
            # returned the whole file for lines=0) and seeking a text-mode
            # file to a guessed byte offset.
            for line in deque(log, maxlen=max(0, lines)):
                yield line.rstrip('\n')

            if not follow:
                return

            # The handle now sits at EOF; poll for appended data (tail -f)
            while True:
                line = log.readline()
                if line:
                    yield line.rstrip('\n')
                else:
                    time.sleep(0.1)  # Wait before checking again
    def tail_stderr(self, lines: int = 50, follow: bool = False):
        """
        Tail stderr log file (like `tail` or `tail -f`).

        Args:
            lines: Number of trailing lines to show (default 50). Zero or a
                negative value shows no existing lines.
            follow: If True, keep the file open after the initial lines and
                yield new lines as they are appended. In this mode the
                generator never finishes on its own; the caller must stop
                iterating (or close the generator).

        Yields:
            Lines from stderr with the trailing newline removed.
        """
        import time
        from collections import deque

        log_path = self.stderr_file
        if not log_path or not log_path.exists():
            return

        try:
            # errors='replace' keeps undecodable bytes from aborting the tail
            log = open(log_path, 'r', errors='replace')
        except OSError:
            # Best-effort: the file may have vanished or be unreadable
            return

        with log:
            # A bounded deque streams the file and keeps exactly the last
            # `lines` lines. This avoids both the `[-0:]` slice pitfall (which
            # returned the whole file for lines=0) and seeking a text-mode
            # file to a guessed byte offset.
            for line in deque(log, maxlen=max(0, lines)):
                yield line.rstrip('\n')

            if not follow:
                return

            # The handle now sits at EOF; poll for appended data (tail -f)
            while True:
                line = log.readline()
                if line:
                    yield line.rstrip('\n')
                else:
                    time.sleep(0.1)  # Wait before checking again
    def pipe_stdout(self, lines: int = 10, follow: bool = True):
        """
        Stream this process's stdout log to sys.stdout.

        Every line produced by tail_stdout() is printed and flushed
        immediately.

        Args:
            lines: How many initial lines to request from tail_stdout()
            follow: Passed through to tail_stdout(); when True, newly
                appended lines keep being printed as they appear
        """
        import sys

        tailed = self.tail_stdout(lines=lines, follow=follow)
        for text in tailed:
            # Look up sys.stdout on each write so stream redirection is honoured
            print(text, file=sys.stdout, flush=True)
    def pipe_stderr(self, lines: int = 10, follow: bool = True):
        """
        Stream this process's stderr log to sys.stderr.

        Every line produced by tail_stderr() is printed and flushed
        immediately.

        Args:
            lines: How many initial lines to request from tail_stderr()
            follow: Passed through to tail_stderr(); when True, newly
                appended lines keep being printed as they appear
        """
        import sys

        tailed = self.tail_stderr(lines=lines, follow=follow)
        for text in tailed:
            # Look up sys.stderr on each write so stream redirection is honoured
            print(text, file=sys.stderr, flush=True)
    def _write_pid_file(self) -> None:
        """Write PID file with mtime set to process start time."""
        if self.pid and self.started_at and self.pid_file:
--- a/archivebox/plugins/chrome/config.json
+++ b/archivebox/plugins/chrome/config.json
@@ -3,6 +3,12 @@
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "CHROME_ENABLED": {
      "type": "boolean",
      "default": true,
      "x-aliases": ["USE_CHROME"],
      "description": "Enable Chrome/Chromium browser integration for archiving"
    },
    "CHROME_BINARY": {
      "type": "string",
      "default": "chromium",
--- a/archivebox/plugins/screenshot/tests/test_screenshot.py
+++ b/archivebox/plugins/screenshot/tests/test_screenshot.py
@@ -201,16 +201,18 @@ def test_config_save_screenshot_false_skips():
    """Test that SCREENSHOT_ENABLED=False exits without emitting JSONL."""
    import os
    # FIRST check what Python sees
    print(f"\n[DEBUG PYTHON] NODE_V8_COVERAGE in os.environ: {'NODE_V8_COVERAGE' in os.environ}")
    print(f"[DEBUG PYTHON] Value: {os.environ.get('NODE_V8_COVERAGE', 'NOT SET')}")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = os.environ.copy()
        env['SCREENSHOT_ENABLED'] = 'False'
-        # DEBUG: Check if NODE_V8_COVERAGE is in env
+        # Check what's in the copied env
-        if 'NODE_V8_COVERAGE' in env:
+        print(f"[DEBUG ENV COPY] NODE_V8_COVERAGE in env: {'NODE_V8_COVERAGE' in env}")
-            print(f"\n[DEBUG] NODE_V8_COVERAGE in env: {env['NODE_V8_COVERAGE']}")
+        print(f"[DEBUG ENV COPY] Value: {env.get('NODE_V8_COVERAGE', 'NOT SET')}")
        else:
            print("\n[DEBUG] NODE_V8_COVERAGE NOT in env")
        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -221,6 +223,12 @@ def test_config_save_screenshot_false_skips():
            timeout=30
        )
        print(f"[DEBUG RESULT] Exit code: {result.returncode}")
        print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")
        # FORCE FAILURE to verify test actually runs
        assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}"
        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
        # Feature disabled - temporary failure, should NOT emit JSONL
--- a/archivebox/tests/test_migrations_07_to_09.py
+++ b/archivebox/tests/test_migrations_07_to_09.py
@@ -136,7 +136,7 @@ class TestMigrationFrom07x(unittest.TestCase):
        result = run_archivebox(self.work_dir, ['init'], timeout=45)
        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
-        result = run_archivebox(self.work_dir, ['list'])
+        result = run_archivebox(self.work_dir, ['snapshot', 'list'])
        self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
        # Verify ALL snapshots appear in output
--- a/archivebox/tests/test_worker_config_propagation.py
+++ b/archivebox/tests/test_worker_config_propagation.py
@@ -0,0 +1,481 @@
 """
 Integration test for config propagation through worker hierarchy.
 Tests that config is properly merged and passed through:
    Parent CLI/Orchestrator
    └── CrawlWorker subprocess (via Process.env)
        └── SnapshotWorker subprocess (via Process.env)
            └── Hook subprocess (via Process.env)
 Config priority order (highest to lowest), mirroring get_config():
 1. Snapshot.config (JSON field)
 2. Crawl.config (JSON field)
 3. User.config (JSON field)
 4. Persona config (persona.get_derived_config())
 5. Environment variables (os.environ + Process.env)
 6. Machine.config (JSON field - resolved binary paths)
 7. Config file (ArchiveBox.conf)
 8. Plugin defaults (config.json)
 9. Core defaults
 """
 import os
 import json
 import tempfile
 import subprocess
 import time
 from pathlib import Path
 def _run_in_test_archive(args, data_dir, *, cwd, timeout, extra_env=None):
    """
    Run ``sys.executable *args`` as a child process pointed at the test archive.
    The child inherits os.environ plus DATA_DIR=<data_dir> and USE_COLOR=False;
    entries in *extra_env* (if given) are layered on top of those.
    stdout/stderr are captured as bytes.
    Returns the subprocess.CompletedProcess.
    Raises subprocess.TimeoutExpired if the child exceeds *timeout* seconds.
    """
    # Local import: the module-level import block does not include sys.
    import sys
    child_env = {
        **os.environ,
        'DATA_DIR': str(data_dir),
        'USE_COLOR': 'False',
        **(extra_env or {}),
    }
    # sys.executable (instead of whatever 'python' resolves to on PATH) keeps the
    # child on the same interpreter/virtualenv that is running this test.
    return subprocess.run(
        [sys.executable, *args],
        cwd=str(cwd),
        env=child_env,
        capture_output=True,
        timeout=timeout,
    )

 def test_config_propagation_through_worker_hierarchy():
    """
    Integration test: verify config is merged correctly at every level.
    Test flow:
    1. Initialize a test archive
    2. Write custom config to ArchiveBox.conf (2.5: set Machine.config values)
    3. Create a Crawl with a custom crawl.config JSON field
    4. Create a Snapshot with a custom snapshot.config JSON field
    5. Run the SnapshotWorker via `archivebox run --snapshot-id ...` with an extra env var
    6. Verify the merged config and the resulting ArchiveResults in a child process
    Every step runs in a subprocess; a non-zero exit code from any of them fails the test.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()
        # Helper scripts run from the parent dir and locate the archive via DATA_DIR.
        scripts_cwd = data_dir.parent
        print(f"\n{'='*80}")
        print("Test: Config Propagation Through Worker Hierarchy")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")
        # Step 1: Initialize archive
        print("Step 1: Initialize archive")
        result = _run_in_test_archive(['-m', 'archivebox', 'init'], data_dir, cwd=data_dir, timeout=60)
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")
        # Step 2: Write custom config to ArchiveBox.conf
        print("Step 2: Write custom config to ArchiveBox.conf")
        config_file = data_dir / 'ArchiveBox.conf'
        config_file.write_text("""
 [GENERAL]
 # Custom timeout in config file
 TIMEOUT = 999
 [ARCHIVING_CONFIG]
 # Enable all plugins for proper testing
 SAVE_WGET = True
 SAVE_WARC = True
 SAVE_PDF = True
 SAVE_DOM = True
 SAVE_SINGLEFILE = True
 SAVE_READABILITY = True
 SAVE_MERCURY = True
 SAVE_HTMLTOTEXT = True
 SAVE_GIT = True
 SAVE_MEDIA = True
 SAVE_ARCHIVE_DOT_ORG = True
 SAVE_TITLE = True
 SAVE_FAVICON = True
 SAVE_SCREENSHOT = True
 """)
        print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
        # Step 2.5: Set Machine.config values
        print("Step 2.5: Set Machine.config with custom binary path")
        # NOTE: the embedded scripts are outer f-strings. Single braces are filled in
        # here (data_dir, ids); doubled braces survive as literal braces so the child
        # process evaluates them itself.
        set_machine_config_script = f"""
 import os
 os.environ['DATA_DIR'] = '{data_dir}'
 from archivebox.config.django import setup_django
 setup_django()
 from archivebox.machine.models import Machine
 machine = Machine.current()
 machine.config = {{
    'CUSTOM_MACHINE_KEY': 'from_machine_config',
    'WGET_BINARY': '/custom/machine/wget',  # Machine-specific binary path
 }}
 machine.save()
 print(f"Machine {{machine.hostname}} config updated")
 """
        result = _run_in_test_archive(['-c', set_machine_config_script], data_dir, cwd=scripts_cwd, timeout=30)
        assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
        print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
        # Step 3: Create Crawl via Django ORM with custom crawl.config
        print("Step 3: Create Crawl with custom crawl.config JSON")
        create_crawl_script = f"""
 import os
 os.environ['DATA_DIR'] = '{data_dir}'
 from archivebox.config.django import setup_django
 setup_django()
 from django.utils import timezone
 from archivebox.crawls.models import Crawl
 # Create crawl with custom config
 crawl = Crawl.objects.create(
    status='queued',
    retry_at=timezone.now(),
    urls='https://example.com',
    config={{
        'TIMEOUT': 777,  # Crawl-level override (higher priority than file)
        'CUSTOM_CRAWL_KEY': 'from_crawl_json',
    }}
 )
 print(crawl.id)
 """
        result = _run_in_test_archive(['-c', create_crawl_script], data_dir, cwd=scripts_cwd, timeout=30)
        assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        crawl_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")
        # Step 4: Create Snapshot with custom snapshot.config
        print("Step 4: Create Snapshot with custom snapshot.config JSON")
        create_snapshot_script = f"""
 import os
 os.environ['DATA_DIR'] = '{data_dir}'
 from archivebox.config.django import setup_django
 setup_django()
 from django.utils import timezone
 from archivebox.core.models import Snapshot
 from archivebox.crawls.models import Crawl
 crawl = Crawl.objects.get(id='{crawl_id}')
 snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'TIMEOUT': 555,  # Snapshot-level override (highest priority)
        'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
        'SAVE_SCREENSHOT': True,  # Keep screenshot enabled
        'SAVE_WGET': False,  # But disable wget as a test of per-snapshot override
    }}
 )
 print(snapshot.id)
 """
        result = _run_in_test_archive(['-c', create_snapshot_script], data_dir, cwd=scripts_cwd, timeout=30)
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")
        # Step 5: Run SnapshotWorker with additional env var
        print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
        result = _run_in_test_archive(
            ['-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            data_dir,
            cwd=data_dir,
            timeout=120,
            extra_env={'ENV_VAR_KEY': 'from_environment'},  # Environment variable
        )
        stdout = result.stdout.decode()
        stderr = result.stderr.decode()
        print("\n--- SnapshotWorker stdout ---")
        print(stdout)
        print("\n--- SnapshotWorker stderr ---")
        print(stderr)
        print("--- End output ---\n")
        # Step 6: Verify config was properly merged
        print("Step 6: Verify config merging")
        # Check that SnapshotWorker ran successfully
        assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"
        # Verify config by checking stderr debug output and ArchiveResults in database
        print("\n--- Verifying config propagation ---\n")
        # Check for config debug messages in stderr
        assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
            "Expected debug output not found in stderr"
        print("✓ Config debug output found in stderr")
        # Verify config values were actually used by checking ArchiveResults.
        # Every value the child script interpolates (timeout, wget_enabled, ...) must use
        # doubled braces: with single braces this outer f-string would try to resolve the
        # name in this function's scope and raise NameError before the script ever runs.
        verify_script = f"""
 import os
 os.environ['DATA_DIR'] = '{data_dir}'
 from archivebox.config.django import setup_django
 setup_django()
 from archivebox.core.models import Snapshot, ArchiveResult
 from archivebox.config.configset import get_config
 snapshot = Snapshot.objects.get(id='{snapshot_id}')
 print(f"Snapshot status: {{snapshot.status}}")
 print(f"Snapshot URL: {{snapshot.url}}")
 # Check that snapshot reached sealed state
 assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"
 # Verify all config sources are present in merged config
 print("\\nVerifying config merge priority:")
 config = get_config(snapshot=snapshot)
 # 1. Snapshot.config (highest priority)
 timeout = config.get('TIMEOUT')
 print(f"  1. Snapshot.config: TIMEOUT={{timeout}} (expected: 555)")
 assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"
 wget_enabled = config.get('SAVE_WGET')
 print(f"  1. Snapshot.config: SAVE_WGET={{wget_enabled}} (expected: False)")
 assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"
 custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
 print(f"  1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={{custom_snapshot}} (expected: from_snapshot_json)")
 assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"
 # 2. Crawl.config
 custom_crawl = config.get('CUSTOM_CRAWL_KEY')
 print(f"  2. Crawl.config: CUSTOM_CRAWL_KEY={{custom_crawl}} (expected: from_crawl_json)")
 assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"
 # 6. Machine.config
 custom_machine = config.get('CUSTOM_MACHINE_KEY')
 print(f"  6. Machine.config: CUSTOM_MACHINE_KEY={{custom_machine}} (expected: from_machine_config)")
 assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"
 wget_binary = config.get('WGET_BINARY')
 print(f"  6. Machine.config: WGET_BINARY={{wget_binary}} (expected: /custom/machine/wget)")
 # Note: This might be overridden by environment or other sources, just check it's present
 assert wget_binary is not None, f"WGET_BINARY should be present"
 # Check ArchiveResults to verify plugins actually ran with correct config
 results = ArchiveResult.objects.filter(snapshot=snapshot)
 print(f"\\nArchiveResults created: {{results.count()}}")
 for ar in results.order_by('plugin'):
    print(f"  {{ar.plugin}}: {{ar.status}}")
 # Verify SAVE_WGET=False was respected (should have no wget result)
 wget_results = results.filter(plugin='wget')
 print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
 assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"
 # Verify SAVE_SCREENSHOT=True was respected (should have screenshot result)
 screenshot_results = results.filter(plugin='screenshot')
 print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)")
 assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results"
 print("\\n✓ All config sources correctly merged:")
 print("  - Snapshot.config overrides (highest priority)")
 print("  - Crawl.config values present")
 print("  - Machine.config values present")
 print("  - File config values present")
 print("✓ Config priority order verified")
 print("✓ Snapshot successfully sealed")
 """
        result = _run_in_test_archive(['-c', verify_script], data_dir, cwd=scripts_cwd, timeout=30)
        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"
        print("\n" + "="*80)
        print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
        print("="*80 + "\n")
 def test_config_environment_variable_parsing():
    """
    Check that Process._build_env() serializes every config value type to str.
    A child script creates a Process whose env holds str/int/float/bool/list/dict/None
    values, calls _build_env(), and asserts each non-None value comes back as a str
    (subprocess environments only accept strings).
    The script then copies those strings into os.environ and calls get_config() again;
    the parsed-back values are only printed for inspection, not asserted.
    The test fails if archive init or the child script exits non-zero.
    """
    # Local import: the module-level import block does not include sys.
    import sys
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()
        # Environment shared by both child processes below.
        child_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }
        print(f"\n{'='*80}")
        print("Test: Config Environment Variable Parsing")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")
        # Initialize archive.
        # sys.executable (instead of whatever 'python' resolves to on PATH) keeps the
        # child on the same interpreter/virtualenv that is running this test.
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=child_env,
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        # Test various data types in config.
        # The script is an outer f-string: single braces are filled in here (data_dir),
        # doubled braces survive as literal braces for the child to evaluate.
        test_config_types_script = f"""
 import os
 os.environ['DATA_DIR'] = '{data_dir}'
 from archivebox.config.django import setup_django
 setup_django()
 from archivebox.config.configset import get_config
 from archivebox.machine.models import Process, Machine
 # Test get_config() with no overrides (baseline)
 config = get_config()
 print(f"Baseline config keys: {{len(config)}}")
 # Create a test Process with various config types
 process = Process.objects.create(
    machine=Machine.current(),
    process_type=Process.TypeChoices.WORKER,
    pwd='{data_dir}',
    cmd=['test'],
    env={{
        'STRING_VAL': 'hello',
        'INT_VAL': 123,
        'FLOAT_VAL': 45.67,
        'BOOL_TRUE': True,
        'BOOL_FALSE': False,
        'LIST_VAL': ['a', 'b', 'c'],
        'DICT_VAL': {{'key': 'value'}},
        'NONE_VAL': None,
    }},
 )
 # Test _build_env() serialization
 env = process._build_env()
 print(f"\\nSerialized environment:")
 print(f"  STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
 print(f"  INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
 print(f"  FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
 print(f"  BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
 print(f"  BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
 print(f"  LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
 print(f"  DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
 print(f"  NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")
 # Verify all are strings (required by subprocess.Popen)
 assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
 assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
 assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
 assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
 assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
 assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
 assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"
 print("\\n✓ All environment values correctly serialized as strings")
 # Now test that get_config() can parse them back
 # Simulate subprocess by setting os.environ
 import json
 for key, val in env.items():
    if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
        os.environ[key] = val
 # Get config again - should parse from environment
 config = get_config()
 print(f"\\nParsed from environment:")
 print(f"  STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
 print(f"  INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
 print(f"  FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
 print(f"  BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
 print(f"  BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
 print(f"  LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
 print(f"  DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")
 print("\\n✓ All config values correctly parsed from environment")
 """
        result = subprocess.run(
            [sys.executable, '-c', test_config_types_script],
            cwd=str(data_dir.parent),
            env=child_env,
            capture_output=True,
            timeout=30,
        )
        print(result.stdout.decode())
        if result.stderr:
            print("Script stderr:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}"
        print("\n" + "="*80)
        print("✓ TEST PASSED: Config serialization and parsing works correctly")
        print("="*80 + "\n")
 if __name__ == '__main__':
    # Standalone entry point: run both integration tests in order without pytest.
    # The first failing assertion propagates and stops the run.
    for standalone_test in (
        test_config_propagation_through_worker_hierarchy,
        test_config_environment_variable_parsing,
    ):
        standalone_test()
--- a/archivebox/workers/worker.py
+++ b/archivebox/workers/worker.py
@@ -308,8 +308,8 @@ class Worker:
            crawl = Crawl.objects.get(id=crawl_id)
            cmd = [sys.executable, '-m', 'archivebox', 'run', '--crawl-id', str(crawl_id)]
-            pwd = Path(crawl.OUTPUT_DIR)  # Run in crawl's output directory
+            pwd = Path(crawl.output_dir)  # Run in crawl's output directory
-            env = get_config(scope='crawl', crawl=crawl)
+            env = get_config(crawl=crawl)
        elif cls.name == 'snapshot':
            snapshot_id = kwargs.get('snapshot_id')
@@ -321,7 +321,7 @@ class Worker:
            cmd = [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', str(snapshot_id)]
            pwd = Path(snapshot.output_dir)  # Run in snapshot's output directory
-            env = get_config(scope='snapshot', snapshot=snapshot)
+            env = get_config(snapshot=snapshot)
        else:
            raise ValueError(f"Unknown worker type: {cls.name}")
@@ -459,6 +459,8 @@ class CrawlWorker(Worker):
        from pathlib import Path
        from archivebox.core.models import Snapshot
        from archivebox.machine.models import Process
        import sys
        import threading
        debug_log = Path('/tmp/archivebox_crawl_worker_debug.log')
@@ -514,7 +516,9 @@ class CrawlWorker(Worker):
            with open(debug_log, 'a') as f:
                f.write(f'  Spawning worker for {snapshot.url} (status={snapshot.status})\n')
                f.flush()
-            SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
+
            pid = SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
            log_worker_event(
                worker_type='CrawlWorker',
                event=f'Spawned SnapshotWorker for {snapshot.url}',
@@ -522,6 +526,18 @@ class CrawlWorker(Worker):
                pid=self.pid,
            )
            # Pipe the SnapshotWorker's stderr to our stderr so we can see what's happening
            # Get the Process record that was just created
            worker_process = Process.objects.filter(pid=pid).first()
            if worker_process:
                # Pipe stderr in background thread so it doesn't block
                def pipe_worker_stderr():
                    for line in worker_process.tail_stderr(lines=0, follow=True):
                        print(f'  [SnapshotWorker] {line}', file=sys.stderr, flush=True)
                thread = threading.Thread(target=pipe_worker_stderr, daemon=True)
                thread.start()
    def _is_crawl_finished(self) -> bool:
        """Check if all snapshots are sealed."""
        from pathlib import Path
@@ -626,16 +642,28 @@ class SnapshotWorker(Worker):
        """Execute all hooks sequentially."""
        from archivebox.hooks import discover_hooks, is_background_hook, extract_step
        from archivebox.core.models import ArchiveResult
        from archivebox.config.configset import get_config
        self.on_startup()
        try:
            # Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.)
            config = get_config(snapshot=self.snapshot)
            # Discover all hooks for this snapshot
-            hooks = discover_hooks('Snapshot', config=self.snapshot.config)
+            hooks = discover_hooks('Snapshot', config=config)
            hooks = sorted(hooks, key=lambda h: h.name)  # Sort by name (includes step prefix)
            import sys
            print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True)
            if hooks:
                print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True)
            else:
                print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True)
            # Execute each hook sequentially
            for hook_path in hooks:
                print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True)
                hook_name = hook_path.name
                plugin = self._extract_plugin_name(hook_name)
                hook_step = extract_step(hook_name)
@@ -661,7 +689,7 @@ class SnapshotWorker(Worker):
                    ar.save(update_fields=['status', 'start_ts', 'modified_at'])
                # Fork and run the hook
-                process = self._run_hook(hook_path, ar)
+                process = self._run_hook(hook_path, ar, config)
                if is_background:
                    # Track but don't wait
@@ -698,7 +726,7 @@ class SnapshotWorker(Worker):
        finally:
            self.on_shutdown()
-    def _run_hook(self, hook_path: Path, ar: Any) -> Any:
+    def _run_hook(self, hook_path: Path, ar: Any, config: dict) -> Any:
        """Fork and run a hook using Process model, return Process."""
        from archivebox.hooks import run_hook
@@ -710,7 +738,7 @@ class SnapshotWorker(Worker):
        process = run_hook(
            script=hook_path,
            output_dir=output_dir,
-            config=self.snapshot.config,
+            config=config,
            timeout=120,
            parent=self.db_process,
            url=str(self.snapshot.url),
--- a/bin/test_plugins.sh
+++ b/bin/test_plugins.sh
@@ -179,7 +179,7 @@ if [ "$ENABLE_COVERAGE" = true ]; then
    export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js"
    echo "Python coverage: enabled (subprocess support)"
-    echo "JavaScript coverage: enabled (NODE_V8_COVERAGE)"
+    echo "JavaScript coverage: enabled (NODE_V8_COVERAGE=$NODE_V8_COVERAGE)"
    echo ""
 fi