From 456aaee287c69c9bf26a282639c6042a74bcb7b4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 4 Jan 2026 16:16:26 -0800 Subject: [PATCH] more migration id/uuid and config propagation fixes --- archivebox/base_models/models.py | 6 +- archivebox/config/configset.py | 29 +- archivebox/config/constants.py | 1 + .../core/migrations/0023_upgrade_to_0_9_0.py | 18 +- ...options_alter_snapshot_options_and_more.py | 1 + .../0029_migrate_archiveresult_to_uuid_pk.py | 113 ++-- archivebox/core/models.py | 11 +- archivebox/crawls/models.py | 15 +- archivebox/hooks.py | 6 +- archivebox/machine/models.py | 130 ++++- archivebox/plugins/chrome/config.json | 6 + .../screenshot/tests/test_screenshot.py | 18 +- archivebox/tests/test_migrations_07_to_09.py | 2 +- .../tests/test_worker_config_propagation.py | 481 ++++++++++++++++++ archivebox/workers/worker.py | 44 +- bin/test_plugins.sh | 2 +- 16 files changed, 789 insertions(+), 94 deletions(-) create mode 100644 archivebox/tests/test_worker_config_propagation.py diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py index adfbce35..7d0bbb05 100755 --- a/archivebox/base_models/models.py +++ b/archivebox/base_models/models.py @@ -111,7 +111,7 @@ class ModelWithOutputDir(ModelWithUUID): def save(self, *args, **kwargs): super().save(*args, **kwargs) - self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + Path(self.output_dir).mkdir(parents=True, exist_ok=True) # Note: index.json is deprecated, models should use write_index_jsonl() for full data @property @@ -127,5 +127,5 @@ class ModelWithOutputDir(ModelWithUUID): return f'{self.output_dir_parent}/{self.output_dir_name}' @property - def OUTPUT_DIR(self) -> Path: - return DATA_DIR / self.output_dir_str + def output_dir(self) -> Path: + raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property') diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index 7e56e22a..805cb86e 100644 --- 
a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -118,12 +118,12 @@ class BaseConfigSet(BaseSettings): def get_config( - scope: str = "global", defaults: Optional[Dict] = None, persona: Any = None, user: Any = None, crawl: Any = None, snapshot: Any = None, + machine: Any = None, ) -> Dict[str, Any]: """ Get merged config from all sources. @@ -134,17 +134,18 @@ def get_config( 3. Per-user config (user.config JSON field) 4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.) 5. Environment variables - 6. Config file (ArchiveBox.conf) - 7. Plugin schema defaults (config.json) - 8. Core config defaults + 6. Per-machine config (machine.config JSON field - resolved binary paths) + 7. Config file (ArchiveBox.conf) + 8. Plugin schema defaults (config.json) + 9. Core config defaults Args: - scope: Config scope ('global', 'crawl', 'snapshot', etc.) defaults: Default values to start with persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR) user: User object with config JSON field crawl: Crawl object with config JSON field snapshot: Snapshot object with config JSON field + machine: Machine object with config JSON field (defaults to Machine.current()) Returns: Merged config dict @@ -184,6 +185,18 @@ def get_config( file_config = BaseConfigSet.load_from_file(config_file) config.update(file_config) + # Apply machine config overrides (cached binary paths, etc.) 
+ if machine is None: + # Default to current machine if not provided + try: + from archivebox.machine.models import Machine + machine = Machine.current() + except Exception: + pass # Machine might not be available during early init + + if machine and hasattr(machine, "config") and machine.config: + config.update(machine.config) + # Override with environment variables for key in config: env_val = os.environ.get(key) @@ -221,8 +234,8 @@ def get_config( config.update(crawl.config) # Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session - if crawl and hasattr(crawl, "OUTPUT_DIR"): - config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR) + if crawl and hasattr(crawl, "output_dir"): + config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir) # Apply snapshot config overrides (highest priority) if snapshot and hasattr(snapshot, "config") and snapshot.config: @@ -260,7 +273,7 @@ def get_flat_config() -> Dict[str, Any]: Replaces abx.pm.hook.get_FLAT_CONFIG() """ - return get_config(scope="global") + return get_config() def get_all_configs() -> Dict[str, BaseConfigSet]: diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index 607ff2e7..9e78d722 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -176,6 +176,7 @@ class ConstantsDict(Mapping): CRONTABS_DIR_NAME, "invalid", "users", + "machine", # Backwards compatibility with old directory names "user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins') "user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates') diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py index 2133309c..c32c31b3 100644 --- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py +++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py @@ -15,6 +15,7 @@ def get_table_columns(table_name): def upgrade_core_tables(apps, schema_editor): """Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0.""" + from 
archivebox.uuid_compat import uuid7 cursor = connection.cursor() # Check if core_archiveresult table exists @@ -60,8 +61,8 @@ def upgrade_core_tables(apps, schema_editor): if has_data: if has_uuid and not has_abid: - # Migrating from v0.7.2 (has uuid, minimal fields) - print('Migrating ArchiveResult from v0.7.2 schema...') + # Migrating from v0.7.2+ (has uuid column) + print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...') cursor.execute(""" INSERT OR IGNORE INTO core_archiveresult_new ( id, uuid, snapshot_id, cmd, pwd, cmd_version, @@ -86,7 +87,18 @@ def upgrade_core_tables(apps, schema_editor): FROM core_archiveresult; """) else: - print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}') + # Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs) + print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...') + cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult") + old_records = cursor.fetchall() + for record in old_records: + new_uuid = uuid7().hex + cursor.execute(""" + INSERT OR IGNORE INTO core_archiveresult_new ( + id, uuid, snapshot_id, cmd, pwd, cmd_version, + start_ts, end_ts, status, extractor, output + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ """, (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9])) cursor.execute("DROP TABLE IF EXISTS core_archiveresult;") cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;") diff --git a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py index 600b9f4e..d53670c8 100644 --- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py +++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py @@ -33,6 +33,7 @@ def copy_old_fields_to_new(apps, schema_editor): # NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already # transformed by migration 0023, so we don't need to copy them here. + # NOTE: UUIDs are already populated by migration 0023 for all migration paths # Debug: Check Snapshot timestamps at end of RunPython cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2") diff --git a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py index 36b9f14c..93139900 100644 --- a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py +++ b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py @@ -8,12 +8,20 @@ from archivebox.uuid_compat import uuid7 def migrate_archiveresult_id_to_uuid(apps, schema_editor): """ - Migrate ArchiveResult from integer PK to UUID PK. + Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration). + + Handles both migration paths: + - 0.7.x: ArchiveResult has integer id, NO uuid field → generate new UUIDs + - 0.8.x: ArchiveResult has integer id + optional uuid field → reuse existing UUIDs Strategy: - 1. Add old_id field to store current integer IDs - 2. 
Generate UUIDs for any records missing them - 3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id) + 1. Create new table with UUID as primary key (no temporary columns) + 2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x) + 3. Copy all data with UUID as new id + 4. Drop old table, rename new table + 5. Recreate indexes + + Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid) """ cursor = connection.cursor() @@ -26,11 +34,13 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): cursor.execute("SELECT COUNT(*) FROM core_archiveresult") row_count = cursor.fetchone()[0] - if row_count == 0: - print('No ArchiveResult records to migrate') - return + # Don't skip if table is empty - we still need to recreate to remove uuid column + # (fresh installs create table with uuid from 0025, but model expects no uuid after 0029) - print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...') + if row_count == 0: + print('[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...') + else: + print(f'[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...') # Step 0: Check if machine_process table exists, if not NULL out process_id values cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'") @@ -40,12 +50,10 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): print('machine_process table does not exist yet, setting process_id to NULL') cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL") - # Step 1: Create new table with UUID as primary key + # Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns) cursor.execute(""" CREATE TABLE core_archiveresult_new ( id TEXT PRIMARY KEY NOT NULL, - old_id INTEGER, - uuid TEXT UNIQUE, created_at DATETIME NOT NULL, modified_at DATETIME NOT NULL, @@ -78,28 +86,36 @@ def 
migrate_archiveresult_id_to_uuid(apps, schema_editor): """) # Step 2: Generate UUIDs for records that don't have them - cursor.execute("SELECT id, uuid FROM core_archiveresult") - records = cursor.fetchall() + # Check if uuid column exists (0.8.x has it, 0.7.x doesn't) + cursor.execute("PRAGMA table_info(core_archiveresult)") + columns = cursor.fetchall() + col_names = [col[1] for col in columns] + has_uuid_column = 'uuid' in col_names - id_to_uuid = {} - for old_id, existing_uuid in records: - if existing_uuid: - # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format) - # (existing UUIDs might be stored with or without dashes in old schema) - id_to_uuid[old_id] = UUID(existing_uuid).hex - else: - # Generate new UUIDv7 (time-ordered) as 32-char hex - id_to_uuid[old_id] = uuid7().hex + if has_uuid_column: + cursor.execute("SELECT id, uuid FROM core_archiveresult") + records = cursor.fetchall() + id_to_uuid = {} + for old_id, existing_uuid in records: + if existing_uuid: + # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format) + # (existing UUIDs might be stored with or without dashes in old schema) + id_to_uuid[old_id] = UUID(existing_uuid).hex + else: + # Generate new UUIDv7 (time-ordered) as 32-char hex + id_to_uuid[old_id] = uuid7().hex + else: + # 0.7.x path: no uuid column, generate new UUIDs for all records + cursor.execute("SELECT id FROM core_archiveresult") + records = cursor.fetchall() + id_to_uuid = {old_id: uuid7().hex for (old_id,) in records} # Step 3: Copy data with UUIDs as new primary key cursor.execute("SELECT * FROM core_archiveresult") old_records = cursor.fetchall() - # Get column names - cursor.execute("PRAGMA table_info(core_archiveresult)") - columns = cursor.fetchall() - col_names = [col[1] for col in columns] - + # col_names already fetched in Step 2 + inserted_count = 0 for i, record in enumerate(old_records): old_id = record[col_names.index('id')] new_uuid = id_to_uuid[old_id] @@ -107,7 
+123,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): # Build insert with new structure values = {col_names[i]: record[i] for i in range(len(col_names))} - # Check which fields exist in new table + # List of fields to copy (all fields from new schema except id, old_id, uuid) fields_to_copy = [ 'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name', 'status', 'retry_at', 'start_ts', 'end_ts', @@ -115,17 +131,31 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): 'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id' ] - # Build INSERT statement + # Build INSERT statement (only copy fields that exist in source) existing_fields = [f for f in fields_to_copy if f in values] - placeholders = ', '.join(['?'] * (len(existing_fields) + 3)) # +3 for id, old_id, uuid - field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields) - insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields] + if i == 0: + print(f'[0029] Source columns: {col_names}') + print(f'[0029] Copying fields: {existing_fields}') - cursor.execute( - f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})", - insert_values - ) + placeholders = ', '.join(['?'] * (len(existing_fields) + 1)) # +1 for id + field_list = 'id, ' + ', '.join(existing_fields) + + insert_values = [new_uuid] + [values.get(f) for f in existing_fields] + + try: + cursor.execute( + f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})", + insert_values + ) + inserted_count += 1 + except Exception as e: + print(f'[0029] ERROR inserting record {old_id}: {e}') + if i == 0: + print(f'[0029] First record values: {insert_values[:5]}...') + raise + + print(f'[0029] Inserted {inserted_count}/{len(old_records)} records') # Step 4: Replace old table with new table cursor.execute("DROP TABLE core_archiveresult") @@ -139,7 +169,6 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor): cursor.execute("CREATE INDEX 
core_archiveresult_created_at_idx ON core_archiveresult(created_at)") cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)") cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)") - cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)") print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key') @@ -159,23 +188,17 @@ class Migration(migrations.Migration): ), ], state_operations=[ - # Remove old uuid field + # Remove uuid field (was added in 0025, we're merging it into id) migrations.RemoveField( model_name='archiveresult', name='uuid', ), - # Change id from AutoField to UUIDField + # Change id from AutoField to UUIDField (absorbing the uuid field) migrations.AlterField( model_name='archiveresult', name='id', field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True), ), - # Add old_id field to preserve legacy integer IDs - migrations.AddField( - model_name='archiveresult', - name='old_id', - field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'), - ), ], ), ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index b05ad501..ed2fc534 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1354,7 +1354,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def domain(self) -> str: return url_domain(self.url) - @cached_property + @property def output_dir(self): """The filesystem path to the snapshot's output directory.""" import os @@ -1435,8 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]') # Clean up .pid files from output directory - if self.OUTPUT_DIR.exists(): - for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): + if Path(self.output_dir).exists(): + 
for pid_file in Path(self.output_dir).glob('**/*.pid'): pid_file.unlink(missing_ok=True) # Update all STARTED ArchiveResults from filesystem @@ -2263,9 +2263,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi # UUID primary key (migrated from integer in 0029) id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) - # old_id preserves the legacy integer ID for backward compatibility - old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions') - # Note: uuid field was removed in migration 0029 when id became UUID created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -2494,7 +2491,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi @property def output_dir_parent(self) -> str: - return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR)) + return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR)) # Properties that delegate to Process model (for backwards compatibility) # These properties will replace the direct fields after migration is complete diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 86277275..9083d9f5 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -180,7 +180,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return crawl @property - def OUTPUT_DIR(self) -> Path: + def output_dir(self) -> Path: """ Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id} Domain is extracted from the first URL in the crawl. 
@@ -383,7 +383,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith f.flush() hook_start = time.time() plugin_name = hook.parent.name - output_dir = self.OUTPUT_DIR / plugin_name + output_dir = self.output_dir / plugin_name output_dir.mkdir(parents=True, exist_ok=True) # Run hook using Process.launch() - returns Process model @@ -427,7 +427,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith f.write(f'Created {len(created_snapshots)} snapshots\n') f.write(f'=== Crawl.run() complete ===\n\n') f.flush() - return created_snapshots[0] if created_snapshots else None + + # Return first snapshot for this crawl (newly created or existing) + # This ensures the crawl doesn't seal if snapshots exist, even if they weren't just created + return self.snapshot_set.first() def is_finished(self) -> bool: """Check if crawl is finished (all snapshots sealed or no snapshots exist).""" @@ -467,8 +470,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith print(f'[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]') # Clean up .pid files from output directory - if self.OUTPUT_DIR.exists(): - for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): + if self.output_dir.exists(): + for pid_file in self.output_dir.glob('**/*.pid'): pid_file.unlink(missing_ok=True) # Run on_CrawlEnd hooks @@ -479,7 +482,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith for hook in hooks: plugin_name = hook.parent.name - output_dir = self.OUTPUT_DIR / plugin_name + output_dir = self.output_dir / plugin_name output_dir.mkdir(parents=True, exist_ok=True) process = run_hook( diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 0f69ad77..69de28ba 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -207,7 +207,7 @@ def discover_hooks( # Get merged config if not provided (lazy import to avoid circular dependency) if config is None: from 
archivebox.config.configset import get_config - config = get_config(scope='global') + config = get_config() enabled_hooks = [] @@ -703,7 +703,7 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]: # Get merged config if not provided if config is None: from archivebox.config.configset import get_config - config = get_config(scope='global') + config = get_config() # Support explicit ENABLED_PLUGINS override (legacy) if 'ENABLED_PLUGINS' in config: @@ -967,9 +967,9 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ else: # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True) import sys - print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr) enabled_key = f'{plugin_upper}_ENABLED' enabled = config.get(enabled_key) + print(f"DEBUG: NO PLUGINS whitelist in config, checking {enabled_key}={enabled}", file=sys.stderr) if enabled is None: enabled = True elif isinstance(enabled, str): diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index e9777d80..73740a12 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -378,7 +378,7 @@ class Binary(ModelWithHealthStats): return None @property - def OUTPUT_DIR(self): + def output_dir(self): """Return the output directory for this binary installation.""" from pathlib import Path from django.conf import settings @@ -412,10 +412,10 @@ class Binary(ModelWithHealthStats): from archivebox.config.configset import get_config # Get merged config (Binary doesn't have crawl/snapshot context) - config = get_config(scope='global') + config = get_config() # Create output directory - output_dir = self.OUTPUT_DIR + output_dir = self.output_dir output_dir.mkdir(parents=True, exist_ok=True) self.output_dir = str(output_dir) self.save() @@ -514,7 +514,7 @@ class Binary(ModelWithHealthStats): print(f'[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]') # Clean up .pid 
files from output directory - output_dir = self.OUTPUT_DIR + output_dir = self.output_dir if output_dir.exists(): for pid_file in output_dir.glob('**/*.pid'): pid_file.unlink(missing_ok=True) @@ -1276,6 +1276,128 @@ class Process(models.Model): """Path to stderr log.""" return Path(self.pwd) / 'stderr.log' if self.pwd else None + def tail_stdout(self, lines: int = 50, follow: bool = False): + """ + Tail stdout log file (like `tail` or `tail -f`). + + Args: + lines: Number of lines to show (default 50) + follow: If True, follow the file and yield new lines as they appear + + Yields: + Lines from stdout + """ + if not self.stdout_file or not self.stdout_file.exists(): + return + + if follow: + # Follow mode - yield new lines as they appear (tail -f) + import time + with open(self.stdout_file, 'r') as f: + # Seek to end minus roughly 'lines' worth of bytes + f.seek(0, 2) # Seek to end + file_size = f.tell() + # Rough estimate: 100 bytes per line + seek_pos = max(0, file_size - (lines * 100)) + f.seek(seek_pos) + + # Skip partial line if we seeked to middle + if seek_pos > 0: + f.readline() + + # Yield existing lines + for line in f: + yield line.rstrip('\n') + + # Now follow for new lines + while True: + line = f.readline() + if line: + yield line.rstrip('\n') + else: + time.sleep(0.1) # Wait before checking again + else: + # Just get last N lines (tail -n) + try: + content = self.stdout_file.read_text() + for line in content.splitlines()[-lines:]: + yield line + except Exception: + return + + def tail_stderr(self, lines: int = 50, follow: bool = False): + """ + Tail stderr log file (like `tail` or `tail -f`). 
+ + Args: + lines: Number of lines to show (default 50) + follow: If True, follow the file and yield new lines as they appear + + Yields: + Lines from stderr + """ + if not self.stderr_file or not self.stderr_file.exists(): + return + + if follow: + # Follow mode - yield new lines as they appear (tail -f) + import time + with open(self.stderr_file, 'r') as f: + # Seek to end minus roughly 'lines' worth of bytes + f.seek(0, 2) # Seek to end + file_size = f.tell() + # Rough estimate: 100 bytes per line + seek_pos = max(0, file_size - (lines * 100)) + f.seek(seek_pos) + + # Skip partial line if we seeked to middle + if seek_pos > 0: + f.readline() + + # Yield existing lines + for line in f: + yield line.rstrip('\n') + + # Now follow for new lines + while True: + line = f.readline() + if line: + yield line.rstrip('\n') + else: + time.sleep(0.1) # Wait before checking again + else: + # Just get last N lines (tail -n) + try: + content = self.stderr_file.read_text() + for line in content.splitlines()[-lines:]: + yield line + except Exception: + return + + def pipe_stdout(self, lines: int = 10, follow: bool = True): + """ + Pipe stdout to sys.stdout. + + Args: + lines: Number of initial lines to show + follow: If True, follow the file and print new lines as they appear + """ + import sys + for line in self.tail_stdout(lines=lines, follow=follow): + print(line, file=sys.stdout, flush=True) + + def pipe_stderr(self, lines: int = 10, follow: bool = True): + """ + Pipe stderr to sys.stderr. 
+ + Args: + lines: Number of initial lines to show + follow: If True, follow the file and print new lines as they appear + """ + import sys + for line in self.tail_stderr(lines=lines, follow=follow): + print(line, file=sys.stderr, flush=True) + def _write_pid_file(self) -> None: """Write PID file with mtime set to process start time.""" if self.pid and self.started_at and self.pid_file: diff --git a/archivebox/plugins/chrome/config.json b/archivebox/plugins/chrome/config.json index 0bc9e754..79d1946d 100644 --- a/archivebox/plugins/chrome/config.json +++ b/archivebox/plugins/chrome/config.json @@ -3,6 +3,12 @@ "type": "object", "additionalProperties": false, "properties": { + "CHROME_ENABLED": { + "type": "boolean", + "default": true, + "x-aliases": ["USE_CHROME"], + "description": "Enable Chrome/Chromium browser integration for archiving" + }, "CHROME_BINARY": { "type": "string", "default": "chromium", diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index b2941991..d3f09c30 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -201,16 +201,18 @@ def test_config_save_screenshot_false_skips(): """Test that SCREENSHOT_ENABLED=False exits without emitting JSONL.""" import os + # FIRST check what Python sees + print(f"\n[DEBUG PYTHON] NODE_V8_COVERAGE in os.environ: {'NODE_V8_COVERAGE' in os.environ}") + print(f"[DEBUG PYTHON] Value: {os.environ.get('NODE_V8_COVERAGE', 'NOT SET')}") + with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) env = os.environ.copy() env['SCREENSHOT_ENABLED'] = 'False' - # DEBUG: Check if NODE_V8_COVERAGE is in env - if 'NODE_V8_COVERAGE' in env: - print(f"\n[DEBUG] NODE_V8_COVERAGE in env: {env['NODE_V8_COVERAGE']}") - else: - print("\n[DEBUG] NODE_V8_COVERAGE NOT in env") + # Check what's in the copied env + print(f"[DEBUG ENV COPY] NODE_V8_COVERAGE in env: 
{'NODE_V8_COVERAGE' in env}") + print(f"[DEBUG ENV COPY] Value: {env.get('NODE_V8_COVERAGE', 'NOT SET')}") result = subprocess.run( ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'], @@ -221,6 +223,12 @@ def test_config_save_screenshot_false_skips(): timeout=30 ) + print(f"[DEBUG RESULT] Exit code: {result.returncode}") + print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}") + + # FORCE FAILURE to verify test actually runs + assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}" + assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" # Feature disabled - temporary failure, should NOT emit JSONL diff --git a/archivebox/tests/test_migrations_07_to_09.py b/archivebox/tests/test_migrations_07_to_09.py index f8f23a2f..626e9aab 100644 --- a/archivebox/tests/test_migrations_07_to_09.py +++ b/archivebox/tests/test_migrations_07_to_09.py @@ -136,7 +136,7 @@ class TestMigrationFrom07x(unittest.TestCase): result = run_archivebox(self.work_dir, ['init'], timeout=45) self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") - result = run_archivebox(self.work_dir, ['list']) + result = run_archivebox(self.work_dir, ['snapshot', 'list']) self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}") # Verify ALL snapshots appear in output diff --git a/archivebox/tests/test_worker_config_propagation.py b/archivebox/tests/test_worker_config_propagation.py new file mode 100644 index 00000000..487cbf15 --- /dev/null +++ b/archivebox/tests/test_worker_config_propagation.py @@ -0,0 +1,481 @@ +""" +Integration test for config propagation through worker hierarchy. 
+ +Tests that config is properly merged and passed through: + Parent CLI/Orchestrator + └── CrawlWorker subprocess (via Process.env) + └── SnapshotWorker subprocess (via Process.env) + └── Hook subprocess (via Process.env) + +Config priority order (highest to lowest): +1. Snapshot.config (JSON field) +2. Crawl.config (JSON field) +3. User.config (JSON field) +4. Environment variables (os.environ + Process.env) +5. Machine.config (JSON field), then config file (ArchiveBox.conf) +6. Plugin defaults (config.json) +7. Core defaults +""" + +import os +import json +import tempfile +import subprocess +import time +from pathlib import Path + + +def test_config_propagation_through_worker_hierarchy(): + """ + Integration test: Verify config is properly merged at every level. + + Test flow: + 1. Create test archive with custom config in ArchiveBox.conf + 2. Set custom env vars before spawning worker + 3. Create Crawl with custom crawl.config JSON field + 4. Create Snapshot with custom snapshot.config JSON field + 5. Spawn SnapshotWorker via archivebox run --snapshot-id=... + 6. Verify worker received merged config from all sources + 7. 
Verify hook subprocess also received correct config + """ + + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) / 'test_archive' + data_dir.mkdir() + + print(f"\n{'='*80}") + print(f"Test: Config Propagation Through Worker Hierarchy") + print(f"DATA_DIR: {data_dir}") + print(f"{'='*80}\n") + + # Step 1: Initialize archive + print("Step 1: Initialize archive") + result = subprocess.run( + ['python', '-m', 'archivebox', 'init'], + cwd=str(data_dir), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=60, + ) + assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" + print(f"✓ Archive initialized\n") + + # Step 2: Write custom config to ArchiveBox.conf + print("Step 2: Write custom config to ArchiveBox.conf") + config_file = data_dir / 'ArchiveBox.conf' + config_file.write_text(""" +[GENERAL] +# Custom timeout in config file +TIMEOUT = 999 + +[ARCHIVING_CONFIG] +# Enable all plugins for proper testing +SAVE_WGET = True +SAVE_WARC = True +SAVE_PDF = True +SAVE_DOM = True +SAVE_SINGLEFILE = True +SAVE_READABILITY = True +SAVE_MERCURY = True +SAVE_HTMLTOTEXT = True +SAVE_GIT = True +SAVE_MEDIA = True +SAVE_ARCHIVE_DOT_ORG = True +SAVE_TITLE = True +SAVE_FAVICON = True +SAVE_SCREENSHOT = True +""") + print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n") + + # Step 2.5: Set Machine.config values + print("Step 2.5: Set Machine.config with custom binary path") + set_machine_config_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' + +from archivebox.config.django import setup_django +setup_django() + +from archivebox.machine.models import Machine + +machine = Machine.current() +machine.config = {{ + 'CUSTOM_MACHINE_KEY': 'from_machine_config', + 'WGET_BINARY': '/custom/machine/wget', # Machine-specific binary path +}} +machine.save() +print(f"Machine {{machine.hostname}} config updated") +""" + result = subprocess.run( + ['python', '-c', 
set_machine_config_script], + cwd=str(data_dir.parent), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=30, + ) + assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}" + print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n") + + # Step 3: Create Crawl via Django ORM with custom crawl.config + print("Step 3: Create Crawl with custom crawl.config JSON") + create_crawl_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' + +from archivebox.config.django import setup_django +setup_django() + +from django.utils import timezone +from archivebox.crawls.models import Crawl + +# Create crawl with custom config +crawl = Crawl.objects.create( + status='queued', + retry_at=timezone.now(), + urls='https://example.com', + config={{ + 'TIMEOUT': 777, # Crawl-level override (higher priority than file) + 'CUSTOM_CRAWL_KEY': 'from_crawl_json', + }} +) +print(crawl.id) +""" + result = subprocess.run( + ['python', '-c', create_crawl_script], + cwd=str(data_dir.parent), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=30, + ) + assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}" + # Extract UUID from output (last line should be the UUID) + crawl_id = result.stdout.decode().strip().split('\n')[-1] + print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n") + + # Step 4: Create Snapshot with custom snapshot.config + print("Step 4: Create Snapshot with custom snapshot.config JSON") + create_snapshot_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' + +from archivebox.config.django import setup_django +setup_django() + +from django.utils import timezone +from archivebox.core.models import Snapshot +from archivebox.crawls.models import Crawl + +crawl = Crawl.objects.get(id='{crawl_id}') 
+snapshot = Snapshot.objects.create( + url='https://example.com', + crawl=crawl, + status='queued', + retry_at=timezone.now(), + config={{ + 'TIMEOUT': 555, # Snapshot-level override (highest priority) + 'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json', + 'SAVE_SCREENSHOT': True, # Keep screenshot enabled + 'SAVE_WGET': False, # But disable wget as a test of per-snapshot override + }} +) +print(snapshot.id) +""" + result = subprocess.run( + ['python', '-c', create_snapshot_script], + cwd=str(data_dir.parent), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=30, + ) + assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}" + # Extract UUID from output (last line should be the UUID) + snapshot_id = result.stdout.decode().strip().split('\n')[-1] + print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n") + + # Step 5: Run SnapshotWorker with additional env var + print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment") + result = subprocess.run( + ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id], + cwd=str(data_dir), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + 'ENV_VAR_KEY': 'from_environment', # Environment variable + }, + capture_output=True, + timeout=120, + ) + + stdout = result.stdout.decode() + stderr = result.stderr.decode() + + print("\n--- SnapshotWorker stdout ---") + print(stdout) + print("\n--- SnapshotWorker stderr ---") + print(stderr) + print("--- End output ---\n") + + # Step 6: Verify config was properly merged + print("Step 6: Verify config merging") + + # Check that SnapshotWorker ran successfully + assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}" + + # Verify config by checking stderr debug output and ArchiveResults in database + print("\n--- Verifying config propagation ---\n") + + # Check 
for config debug messages in stderr + assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \ + "Expected debug output not found in stderr" + print("✓ Config debug output found in stderr") + + # Verify config values were actually used by checking ArchiveResults + verify_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' + +from archivebox.config.django import setup_django +setup_django() + +from archivebox.core.models import Snapshot, ArchiveResult +from archivebox.config.configset import get_config + +snapshot = Snapshot.objects.get(id='{snapshot_id}') +print(f"Snapshot status: {{snapshot.status}}") +print(f"Snapshot URL: {{snapshot.url}}") + +# Check that snapshot reached sealed state +assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}" + +# Verify all config sources are present in merged config +print("\\nVerifying config merge priority:") +config = get_config(snapshot=snapshot) + +# 1. Snapshot.config (highest priority) +timeout = config.get('TIMEOUT') +print(f" 1. Snapshot.config: TIMEOUT={timeout} (expected: 555)") +assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}" + +wget_enabled = config.get('SAVE_WGET') +print(f" 1. Snapshot.config: SAVE_WGET={wget_enabled} (expected: False)") +assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}" + +custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY') +print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={custom_snapshot} (expected: from_snapshot_json)") +assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}" + +# 2. Crawl.config +custom_crawl = config.get('CUSTOM_CRAWL_KEY') +print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={custom_crawl} (expected: from_crawl_json)") +assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}" + +# 6. Machine.config +custom_machine = config.get('CUSTOM_MACHINE_KEY') +print(f" 6. 
Machine.config: CUSTOM_MACHINE_KEY={custom_machine} (expected: from_machine_config)") +assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}" + +wget_binary = config.get('WGET_BINARY') +print(f" 6. Machine.config: WGET_BINARY={wget_binary} (expected: /custom/machine/wget)") +# Note: This might be overridden by environment or other sources, just check it's present +assert wget_binary is not None, f"WGET_BINARY should be present" + +# Check ArchiveResults to verify plugins actually ran with correct config +results = ArchiveResult.objects.filter(snapshot=snapshot) +print(f"\\nArchiveResults created: {{results.count()}}") + +for ar in results.order_by('plugin'): + print(f" {{ar.plugin}}: {{ar.status}}") + +# Verify SAVE_WGET=False was respected (should have no wget result) +wget_results = results.filter(plugin='wget') +print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)") +assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results" + +# Verify SAVE_SCREENSHOT=True was respected (should have screenshot result) +screenshot_results = results.filter(plugin='screenshot') +print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)") +assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results" + +print("\\n✓ All config sources correctly merged:") +print(" - Snapshot.config overrides (highest priority)") +print(" - Crawl.config values present") +print(" - Machine.config values present") +print(" - File config values present") +print("✓ Config priority order verified") +print("✓ Snapshot successfully sealed") +""" + result = subprocess.run( + ['python', '-c', verify_script], + cwd=str(data_dir.parent), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=30, + ) + + print(result.stdout.decode()) + if 
result.returncode != 0: + print("\nVerification error:") + print(result.stderr.decode()) + + assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}" + + print("\n" + "="*80) + print("✓ TEST PASSED: Config properly propagated through worker hierarchy") + print("="*80 + "\n") + + +def test_config_environment_variable_parsing(): + """ + Test that Process._build_env() correctly serializes config values, + and get_config() correctly parses them back from environment. + """ + + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) / 'test_archive' + data_dir.mkdir() + + print(f"\n{'='*80}") + print(f"Test: Config Environment Variable Parsing") + print(f"DATA_DIR: {data_dir}") + print(f"{'='*80}\n") + + # Initialize archive + result = subprocess.run( + ['python', '-m', 'archivebox', 'init'], + cwd=str(data_dir), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=60, + ) + assert result.returncode == 0, f"Init failed: {result.stderr.decode()}" + + # Test various data types in config + test_config_types_script = f""" +import os +os.environ['DATA_DIR'] = '{data_dir}' + +from archivebox.config.django import setup_django +setup_django() + +from archivebox.config.configset import get_config +from archivebox.machine.models import Process, Machine + +# Test get_config() with no overrides (baseline) +config = get_config() +print(f"Baseline config keys: {{len(config)}}") + +# Create a test Process with various config types +process = Process.objects.create( + machine=Machine.current(), + process_type=Process.TypeChoices.WORKER, + pwd='{data_dir}', + cmd=['test'], + env={{ + 'STRING_VAL': 'hello', + 'INT_VAL': 123, + 'FLOAT_VAL': 45.67, + 'BOOL_TRUE': True, + 'BOOL_FALSE': False, + 'LIST_VAL': ['a', 'b', 'c'], + 'DICT_VAL': {{'key': 'value'}}, + 'NONE_VAL': None, + }}, +) + +# Test _build_env() serialization +env = process._build_env() +print(f"\\nSerialized 
environment:") +print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})") +print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})") +print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})") +print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})") +print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})") +print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})") +print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})") +print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)") + +# Verify all are strings (required by subprocess.Popen) +assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str" +assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str" +assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str" +assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str" +assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str" +assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str" +assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str" + +print("\\n✓ All environment values correctly serialized as strings") + +# Now test that get_config() can parse them back +# Simulate subprocess by setting os.environ +import json +for key, val in env.items(): + if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']: + os.environ[key] = val + +# Get config again - should parse from environment +config = get_config() +print(f"\\nParsed from environment:") +print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})") +print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})") +print(f" FLOAT_VAL: 
{{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})") +print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})") +print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})") +print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})") +print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})") + +print("\\n✓ All config values correctly parsed from environment") +""" + + result = subprocess.run( + ['python', '-c', test_config_types_script], + cwd=str(data_dir.parent), + env={ + **os.environ, + 'DATA_DIR': str(data_dir), + 'USE_COLOR': 'False', + }, + capture_output=True, + timeout=30, + ) + + print(result.stdout.decode()) + if result.stderr: + print("Script stderr:") + print(result.stderr.decode()) + + assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}" + + print("\n" + "="*80) + print("✓ TEST PASSED: Config serialization and parsing works correctly") + print("="*80 + "\n") + + +if __name__ == '__main__': + # Run as standalone script + test_config_propagation_through_worker_hierarchy() + test_config_environment_variable_parsing() diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 826accdb..9355649d 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -308,8 +308,8 @@ class Worker: crawl = Crawl.objects.get(id=crawl_id) cmd = [sys.executable, '-m', 'archivebox', 'run', '--crawl-id', str(crawl_id)] - pwd = Path(crawl.OUTPUT_DIR) # Run in crawl's output directory - env = get_config(scope='crawl', crawl=crawl) + pwd = Path(crawl.output_dir) # Run in crawl's output directory + env = get_config(crawl=crawl) elif cls.name == 'snapshot': snapshot_id = kwargs.get('snapshot_id') @@ -321,7 +321,7 @@ class Worker: cmd = [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', str(snapshot_id)] pwd = 
Path(snapshot.output_dir) # Run in snapshot's output directory - env = get_config(scope='snapshot', snapshot=snapshot) + env = get_config(snapshot=snapshot) else: raise ValueError(f"Unknown worker type: {cls.name}") @@ -459,6 +459,8 @@ class CrawlWorker(Worker): from pathlib import Path from archivebox.core.models import Snapshot from archivebox.machine.models import Process + import sys + import threading debug_log = Path('/tmp/archivebox_crawl_worker_debug.log') @@ -514,7 +516,9 @@ class CrawlWorker(Worker): with open(debug_log, 'a') as f: f.write(f' Spawning worker for {snapshot.url} (status={snapshot.status})\n') f.flush() - SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id)) + + pid = SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id)) + log_worker_event( worker_type='CrawlWorker', event=f'Spawned SnapshotWorker for {snapshot.url}', @@ -522,6 +526,18 @@ class CrawlWorker(Worker): pid=self.pid, ) + # Pipe the SnapshotWorker's stderr to our stderr so we can see what's happening + # Get the Process record that was just created + worker_process = Process.objects.filter(pid=pid).first() + if worker_process: + # Pipe stderr in background thread so it doesn't block + def pipe_worker_stderr(): + for line in worker_process.tail_stderr(lines=0, follow=True): + print(f' [SnapshotWorker] {line}', file=sys.stderr, flush=True) + + thread = threading.Thread(target=pipe_worker_stderr, daemon=True) + thread.start() + def _is_crawl_finished(self) -> bool: """Check if all snapshots are sealed.""" from pathlib import Path @@ -626,16 +642,28 @@ class SnapshotWorker(Worker): """Execute all hooks sequentially.""" from archivebox.hooks import discover_hooks, is_background_hook, extract_step from archivebox.core.models import ArchiveResult + from archivebox.config.configset import get_config self.on_startup() try: + # Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.) 
+ config = get_config(snapshot=self.snapshot) + # Discover all hooks for this snapshot - hooks = discover_hooks('Snapshot', config=self.snapshot.config) + hooks = discover_hooks('Snapshot', config=config) hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix) + import sys + print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True) + if hooks: + print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True) + else: + print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True) + # Execute each hook sequentially for hook_path in hooks: + print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True) hook_name = hook_path.name plugin = self._extract_plugin_name(hook_name) hook_step = extract_step(hook_name) @@ -661,7 +689,7 @@ class SnapshotWorker(Worker): ar.save(update_fields=['status', 'start_ts', 'modified_at']) # Fork and run the hook - process = self._run_hook(hook_path, ar) + process = self._run_hook(hook_path, ar, config) if is_background: # Track but don't wait @@ -698,7 +726,7 @@ class SnapshotWorker(Worker): finally: self.on_shutdown() - def _run_hook(self, hook_path: Path, ar: Any) -> Any: + def _run_hook(self, hook_path: Path, ar: Any, config: dict) -> Any: """Fork and run a hook using Process model, return Process.""" from archivebox.hooks import run_hook @@ -710,7 +738,7 @@ class SnapshotWorker(Worker): process = run_hook( script=hook_path, output_dir=output_dir, - config=self.snapshot.config, + config=config, timeout=120, parent=self.db_process, url=str(self.snapshot.url), diff --git a/bin/test_plugins.sh b/bin/test_plugins.sh index e3257da6..cc21eca6 100755 --- a/bin/test_plugins.sh +++ b/bin/test_plugins.sh @@ -179,7 +179,7 @@ if [ "$ENABLE_COVERAGE" = true ]; then export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js" echo "Python coverage: 
enabled (subprocess support)" - echo "JavaScript coverage: enabled (NODE_V8_COVERAGE)" + echo "JavaScript coverage: enabled (NODE_V8_COVERAGE=$NODE_V8_COVERAGE)" echo "" fi