mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
more migration id/uuid and config propagation fixes
This commit is contained in:
@@ -111,7 +111,7 @@ class ModelWithOutputDir(ModelWithUUID):
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
super().save(*args, **kwargs)
|
||||
self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
Path(self.output_dir).mkdir(parents=True, exist_ok=True)
|
||||
# Note: index.json is deprecated, models should use write_index_jsonl() for full data
|
||||
|
||||
@property
|
||||
@@ -127,5 +127,5 @@ class ModelWithOutputDir(ModelWithUUID):
|
||||
return f'{self.output_dir_parent}/{self.output_dir_name}'
|
||||
|
||||
@property
|
||||
def OUTPUT_DIR(self) -> Path:
|
||||
return DATA_DIR / self.output_dir_str
|
||||
def output_dir(self) -> Path:
|
||||
raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')
|
||||
|
||||
@@ -118,12 +118,12 @@ class BaseConfigSet(BaseSettings):
|
||||
|
||||
|
||||
def get_config(
|
||||
scope: str = "global",
|
||||
defaults: Optional[Dict] = None,
|
||||
persona: Any = None,
|
||||
user: Any = None,
|
||||
crawl: Any = None,
|
||||
snapshot: Any = None,
|
||||
machine: Any = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Get merged config from all sources.
|
||||
@@ -134,17 +134,18 @@ def get_config(
|
||||
3. Per-user config (user.config JSON field)
|
||||
4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
|
||||
5. Environment variables
|
||||
6. Config file (ArchiveBox.conf)
|
||||
7. Plugin schema defaults (config.json)
|
||||
8. Core config defaults
|
||||
6. Per-machine config (machine.config JSON field - resolved binary paths)
|
||||
7. Config file (ArchiveBox.conf)
|
||||
8. Plugin schema defaults (config.json)
|
||||
9. Core config defaults
|
||||
|
||||
Args:
|
||||
scope: Config scope ('global', 'crawl', 'snapshot', etc.)
|
||||
defaults: Default values to start with
|
||||
persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
|
||||
user: User object with config JSON field
|
||||
crawl: Crawl object with config JSON field
|
||||
snapshot: Snapshot object with config JSON field
|
||||
machine: Machine object with config JSON field (defaults to Machine.current())
|
||||
|
||||
Returns:
|
||||
Merged config dict
|
||||
@@ -184,6 +185,18 @@ def get_config(
|
||||
file_config = BaseConfigSet.load_from_file(config_file)
|
||||
config.update(file_config)
|
||||
|
||||
# Apply machine config overrides (cached binary paths, etc.)
|
||||
if machine is None:
|
||||
# Default to current machine if not provided
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
machine = Machine.current()
|
||||
except Exception:
|
||||
pass # Machine might not be available during early init
|
||||
|
||||
if machine and hasattr(machine, "config") and machine.config:
|
||||
config.update(machine.config)
|
||||
|
||||
# Override with environment variables
|
||||
for key in config:
|
||||
env_val = os.environ.get(key)
|
||||
@@ -221,8 +234,8 @@ def get_config(
|
||||
config.update(crawl.config)
|
||||
|
||||
# Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
|
||||
if crawl and hasattr(crawl, "OUTPUT_DIR"):
|
||||
config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR)
|
||||
if crawl and hasattr(crawl, "output_dir"):
|
||||
config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
|
||||
|
||||
# Apply snapshot config overrides (highest priority)
|
||||
if snapshot and hasattr(snapshot, "config") and snapshot.config:
|
||||
@@ -260,7 +273,7 @@ def get_flat_config() -> Dict[str, Any]:
|
||||
|
||||
Replaces abx.pm.hook.get_FLAT_CONFIG()
|
||||
"""
|
||||
return get_config(scope="global")
|
||||
return get_config()
|
||||
|
||||
|
||||
def get_all_configs() -> Dict[str, BaseConfigSet]:
|
||||
|
||||
@@ -176,6 +176,7 @@ class ConstantsDict(Mapping):
|
||||
CRONTABS_DIR_NAME,
|
||||
"invalid",
|
||||
"users",
|
||||
"machine",
|
||||
# Backwards compatibility with old directory names
|
||||
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
|
||||
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
|
||||
|
||||
@@ -15,6 +15,7 @@ def get_table_columns(table_name):
|
||||
|
||||
def upgrade_core_tables(apps, schema_editor):
|
||||
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
|
||||
from archivebox.uuid_compat import uuid7
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Check if core_archiveresult table exists
|
||||
@@ -60,8 +61,8 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
|
||||
if has_data:
|
||||
if has_uuid and not has_abid:
|
||||
# Migrating from v0.7.2 (has uuid, minimal fields)
|
||||
print('Migrating ArchiveResult from v0.7.2 schema...')
|
||||
# Migrating from v0.7.2+ (has uuid column)
|
||||
print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...')
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
id, uuid, snapshot_id, cmd, pwd, cmd_version,
|
||||
@@ -86,7 +87,18 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
FROM core_archiveresult;
|
||||
""")
|
||||
else:
|
||||
print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
|
||||
# Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
|
||||
print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...')
|
||||
cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult")
|
||||
old_records = cursor.fetchall()
|
||||
for record in old_records:
|
||||
new_uuid = uuid7().hex
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
id, uuid, snapshot_id, cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status, extractor, output
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9]))
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
|
||||
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
|
||||
|
||||
@@ -33,6 +33,7 @@ def copy_old_fields_to_new(apps, schema_editor):
|
||||
|
||||
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
|
||||
# transformed by migration 0023, so we don't need to copy them here.
|
||||
# NOTE: UUIDs are already populated by migration 0023 for all migration paths
|
||||
|
||||
# Debug: Check Snapshot timestamps at end of RunPython
|
||||
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
|
||||
|
||||
@@ -8,12 +8,20 @@ from archivebox.uuid_compat import uuid7
|
||||
|
||||
def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
||||
"""
|
||||
Migrate ArchiveResult from integer PK to UUID PK.
|
||||
Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration).
|
||||
|
||||
Handles both migration paths:
|
||||
- 0.7.x: ArchiveResult has integer id, NO uuid field → generate new UUIDs
|
||||
- 0.8.x: ArchiveResult has integer id + optional uuid field → reuse existing UUIDs
|
||||
|
||||
Strategy:
|
||||
1. Add old_id field to store current integer IDs
|
||||
2. Generate UUIDs for any records missing them
|
||||
3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
|
||||
1. Create new table with UUID as primary key (no temporary columns)
|
||||
2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x)
|
||||
3. Copy all data with UUID as new id
|
||||
4. Drop old table, rename new table
|
||||
5. Recreate indexes
|
||||
|
||||
Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid)
|
||||
"""
|
||||
cursor = connection.cursor()
|
||||
|
||||
@@ -26,11 +34,13 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
||||
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
|
||||
row_count = cursor.fetchone()[0]
|
||||
|
||||
if row_count == 0:
|
||||
print('No ArchiveResult records to migrate')
|
||||
return
|
||||
# Don't skip if table is empty - we still need to recreate to remove uuid column
|
||||
# (fresh installs create table with uuid from 0025, but model expects no uuid after 0029)
|
||||
|
||||
print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
|
||||
if row_count == 0:
|
||||
print('[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...')
|
||||
else:
|
||||
print(f'[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
|
||||
|
||||
# Step 0: Check if machine_process table exists, if not NULL out process_id values
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
|
||||
@@ -40,12 +50,10 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
||||
print('machine_process table does not exist yet, setting process_id to NULL')
|
||||
cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
|
||||
|
||||
# Step 1: Create new table with UUID as primary key
|
||||
# Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns)
|
||||
cursor.execute("""
|
||||
CREATE TABLE core_archiveresult_new (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
old_id INTEGER,
|
||||
uuid TEXT UNIQUE,
|
||||
created_at DATETIME NOT NULL,
|
||||
modified_at DATETIME NOT NULL,
|
||||
|
||||
@@ -78,28 +86,36 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
||||
""")
|
||||
|
||||
# Step 2: Generate UUIDs for records that don't have them
|
||||
cursor.execute("SELECT id, uuid FROM core_archiveresult")
|
||||
records = cursor.fetchall()
|
||||
# Check if uuid column exists (0.8.x has it, 0.7.x doesn't)
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
columns = cursor.fetchall()
|
||||
col_names = [col[1] for col in columns]
|
||||
has_uuid_column = 'uuid' in col_names
|
||||
|
||||
id_to_uuid = {}
|
||||
for old_id, existing_uuid in records:
|
||||
if existing_uuid:
|
||||
# Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
|
||||
# (existing UUIDs might be stored with or without dashes in old schema)
|
||||
id_to_uuid[old_id] = UUID(existing_uuid).hex
|
||||
else:
|
||||
# Generate new UUIDv7 (time-ordered) as 32-char hex
|
||||
id_to_uuid[old_id] = uuid7().hex
|
||||
if has_uuid_column:
|
||||
cursor.execute("SELECT id, uuid FROM core_archiveresult")
|
||||
records = cursor.fetchall()
|
||||
id_to_uuid = {}
|
||||
for old_id, existing_uuid in records:
|
||||
if existing_uuid:
|
||||
# Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
|
||||
# (existing UUIDs might be stored with or without dashes in old schema)
|
||||
id_to_uuid[old_id] = UUID(existing_uuid).hex
|
||||
else:
|
||||
# Generate new UUIDv7 (time-ordered) as 32-char hex
|
||||
id_to_uuid[old_id] = uuid7().hex
|
||||
else:
|
||||
# 0.7.x path: no uuid column, generate new UUIDs for all records
|
||||
cursor.execute("SELECT id FROM core_archiveresult")
|
||||
records = cursor.fetchall()
|
||||
id_to_uuid = {old_id: uuid7().hex for (old_id,) in records}
|
||||
|
||||
# Step 3: Copy data with UUIDs as new primary key
|
||||
cursor.execute("SELECT * FROM core_archiveresult")
|
||||
old_records = cursor.fetchall()
|
||||
|
||||
# Get column names
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
columns = cursor.fetchall()
|
||||
col_names = [col[1] for col in columns]
|
||||
|
||||
# col_names already fetched in Step 2
|
||||
inserted_count = 0
|
||||
for i, record in enumerate(old_records):
|
||||
old_id = record[col_names.index('id')]
|
||||
new_uuid = id_to_uuid[old_id]
|
||||
@@ -107,7 +123,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
||||
# Build insert with new structure
|
||||
values = {col_names[i]: record[i] for i in range(len(col_names))}
|
||||
|
||||
# Check which fields exist in new table
|
||||
# List of fields to copy (all fields from new schema except id, old_id, uuid)
|
||||
fields_to_copy = [
|
||||
'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
|
||||
'status', 'retry_at', 'start_ts', 'end_ts',
|
||||
@@ -115,17 +131,31 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
||||
'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
|
||||
]
|
||||
|
||||
# Build INSERT statement
|
||||
# Build INSERT statement (only copy fields that exist in source)
|
||||
existing_fields = [f for f in fields_to_copy if f in values]
|
||||
placeholders = ', '.join(['?'] * (len(existing_fields) + 3)) # +3 for id, old_id, uuid
|
||||
field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
|
||||
|
||||
insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
|
||||
if i == 0:
|
||||
print(f'[0029] Source columns: {col_names}')
|
||||
print(f'[0029] Copying fields: {existing_fields}')
|
||||
|
||||
cursor.execute(
|
||||
f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
|
||||
insert_values
|
||||
)
|
||||
placeholders = ', '.join(['?'] * (len(existing_fields) + 1)) # +1 for id
|
||||
field_list = 'id, ' + ', '.join(existing_fields)
|
||||
|
||||
insert_values = [new_uuid] + [values.get(f) for f in existing_fields]
|
||||
|
||||
try:
|
||||
cursor.execute(
|
||||
f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
|
||||
insert_values
|
||||
)
|
||||
inserted_count += 1
|
||||
except Exception as e:
|
||||
print(f'[0029] ERROR inserting record {old_id}: {e}')
|
||||
if i == 0:
|
||||
print(f'[0029] First record values: {insert_values[:5]}...')
|
||||
raise
|
||||
|
||||
print(f'[0029] Inserted {inserted_count}/{len(old_records)} records')
|
||||
|
||||
# Step 4: Replace old table with new table
|
||||
cursor.execute("DROP TABLE core_archiveresult")
|
||||
@@ -139,7 +169,6 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
||||
cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")
|
||||
|
||||
print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
|
||||
|
||||
@@ -159,23 +188,17 @@ class Migration(migrations.Migration):
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
# Remove old uuid field
|
||||
# Remove uuid field (was added in 0025, we're merging it into id)
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
),
|
||||
# Change id from AutoField to UUIDField
|
||||
# Change id from AutoField to UUIDField (absorbing the uuid field)
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
|
||||
),
|
||||
# Add old_id field to preserve legacy integer IDs
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='old_id',
|
||||
field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -1354,7 +1354,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
def domain(self) -> str:
|
||||
return url_domain(self.url)
|
||||
|
||||
@cached_property
|
||||
@property
|
||||
def output_dir(self):
|
||||
"""The filesystem path to the snapshot's output directory."""
|
||||
import os
|
||||
@@ -1435,8 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]')
|
||||
|
||||
# Clean up .pid files from output directory
|
||||
if self.OUTPUT_DIR.exists():
|
||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||
if Path(self.output_dir).exists():
|
||||
for pid_file in Path(self.output_dir).glob('**/*.pid'):
|
||||
pid_file.unlink(missing_ok=True)
|
||||
|
||||
# Update all STARTED ArchiveResults from filesystem
|
||||
@@ -2263,9 +2263,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# UUID primary key (migrated from integer in 0029)
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
# old_id preserves the legacy integer ID for backward compatibility
|
||||
old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
|
||||
# Note: uuid field was removed in migration 0029 when id became UUID
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
@@ -2494,7 +2491,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
@property
|
||||
def output_dir_parent(self) -> str:
|
||||
return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
|
||||
return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR))
|
||||
|
||||
# Properties that delegate to Process model (for backwards compatibility)
|
||||
# These properties will replace the direct fields after migration is complete
|
||||
|
||||
@@ -180,7 +180,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
return crawl
|
||||
|
||||
@property
|
||||
def OUTPUT_DIR(self) -> Path:
|
||||
def output_dir(self) -> Path:
|
||||
"""
|
||||
Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
|
||||
Domain is extracted from the first URL in the crawl.
|
||||
@@ -383,7 +383,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
f.flush()
|
||||
hook_start = time.time()
|
||||
plugin_name = hook.parent.name
|
||||
output_dir = self.OUTPUT_DIR / plugin_name
|
||||
output_dir = self.output_dir / plugin_name
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Run hook using Process.launch() - returns Process model
|
||||
@@ -427,7 +427,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
f.write(f'Created {len(created_snapshots)} snapshots\n')
|
||||
f.write(f'=== Crawl.run() complete ===\n\n')
|
||||
f.flush()
|
||||
return created_snapshots[0] if created_snapshots else None
|
||||
|
||||
# Return first snapshot for this crawl (newly created or existing)
|
||||
# This ensures the crawl doesn't seal if snapshots exist, even if they weren't just created
|
||||
return self.snapshot_set.first()
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
|
||||
@@ -467,8 +470,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
print(f'[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]')
|
||||
|
||||
# Clean up .pid files from output directory
|
||||
if self.OUTPUT_DIR.exists():
|
||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||
if self.output_dir.exists():
|
||||
for pid_file in self.output_dir.glob('**/*.pid'):
|
||||
pid_file.unlink(missing_ok=True)
|
||||
|
||||
# Run on_CrawlEnd hooks
|
||||
@@ -479,7 +482,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
|
||||
for hook in hooks:
|
||||
plugin_name = hook.parent.name
|
||||
output_dir = self.OUTPUT_DIR / plugin_name
|
||||
output_dir = self.output_dir / plugin_name
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
process = run_hook(
|
||||
|
||||
@@ -207,7 +207,7 @@ def discover_hooks(
|
||||
# Get merged config if not provided (lazy import to avoid circular dependency)
|
||||
if config is None:
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(scope='global')
|
||||
config = get_config()
|
||||
|
||||
enabled_hooks = []
|
||||
|
||||
@@ -703,7 +703,7 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
|
||||
# Get merged config if not provided
|
||||
if config is None:
|
||||
from archivebox.config.configset import get_config
|
||||
config = get_config(scope='global')
|
||||
config = get_config()
|
||||
|
||||
# Support explicit ENABLED_PLUGINS override (legacy)
|
||||
if 'ENABLED_PLUGINS' in config:
|
||||
@@ -967,9 +967,9 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
else:
|
||||
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
|
||||
import sys
|
||||
print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr)
|
||||
enabled_key = f'{plugin_upper}_ENABLED'
|
||||
enabled = config.get(enabled_key)
|
||||
print(f"DEBUG: NO PLUGINS whitelist in config, checking {enabled_key}={enabled}", file=sys.stderr)
|
||||
if enabled is None:
|
||||
enabled = True
|
||||
elif isinstance(enabled, str):
|
||||
|
||||
@@ -378,7 +378,7 @@ class Binary(ModelWithHealthStats):
|
||||
return None
|
||||
|
||||
@property
|
||||
def OUTPUT_DIR(self):
|
||||
def output_dir(self):
|
||||
"""Return the output directory for this binary installation."""
|
||||
from pathlib import Path
|
||||
from django.conf import settings
|
||||
@@ -412,10 +412,10 @@ class Binary(ModelWithHealthStats):
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
# Get merged config (Binary doesn't have crawl/snapshot context)
|
||||
config = get_config(scope='global')
|
||||
config = get_config()
|
||||
|
||||
# Create output directory
|
||||
output_dir = self.OUTPUT_DIR
|
||||
output_dir = self.output_dir
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.output_dir = str(output_dir)
|
||||
self.save()
|
||||
@@ -514,7 +514,7 @@ class Binary(ModelWithHealthStats):
|
||||
print(f'[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]')
|
||||
|
||||
# Clean up .pid files from output directory
|
||||
output_dir = self.OUTPUT_DIR
|
||||
output_dir = self.output_dir
|
||||
if output_dir.exists():
|
||||
for pid_file in output_dir.glob('**/*.pid'):
|
||||
pid_file.unlink(missing_ok=True)
|
||||
@@ -1276,6 +1276,128 @@ class Process(models.Model):
|
||||
"""Path to stderr log."""
|
||||
return Path(self.pwd) / 'stderr.log' if self.pwd else None
|
||||
|
||||
def tail_stdout(self, lines: int = 50, follow: bool = False):
|
||||
"""
|
||||
Tail stdout log file (like `tail` or `tail -f`).
|
||||
|
||||
Args:
|
||||
lines: Number of lines to show (default 50)
|
||||
follow: If True, follow the file and yield new lines as they appear
|
||||
|
||||
Yields:
|
||||
Lines from stdout
|
||||
"""
|
||||
if not self.stdout_file or not self.stdout_file.exists():
|
||||
return
|
||||
|
||||
if follow:
|
||||
# Follow mode - yield new lines as they appear (tail -f)
|
||||
import time
|
||||
with open(self.stdout_file, 'r') as f:
|
||||
# Seek to end minus roughly 'lines' worth of bytes
|
||||
f.seek(0, 2) # Seek to end
|
||||
file_size = f.tell()
|
||||
# Rough estimate: 100 bytes per line
|
||||
seek_pos = max(0, file_size - (lines * 100))
|
||||
f.seek(seek_pos)
|
||||
|
||||
# Skip partial line if we seeked to middle
|
||||
if seek_pos > 0:
|
||||
f.readline()
|
||||
|
||||
# Yield existing lines
|
||||
for line in f:
|
||||
yield line.rstrip('\n')
|
||||
|
||||
# Now follow for new lines
|
||||
while True:
|
||||
line = f.readline()
|
||||
if line:
|
||||
yield line.rstrip('\n')
|
||||
else:
|
||||
time.sleep(0.1) # Wait before checking again
|
||||
else:
|
||||
# Just get last N lines (tail -n)
|
||||
try:
|
||||
content = self.stdout_file.read_text()
|
||||
for line in content.splitlines()[-lines:]:
|
||||
yield line
|
||||
except Exception:
|
||||
return
|
||||
|
||||
def tail_stderr(self, lines: int = 50, follow: bool = False):
|
||||
"""
|
||||
Tail stderr log file (like `tail` or `tail -f`).
|
||||
|
||||
Args:
|
||||
lines: Number of lines to show (default 50)
|
||||
follow: If True, follow the file and yield new lines as they appear
|
||||
|
||||
Yields:
|
||||
Lines from stderr
|
||||
"""
|
||||
if not self.stderr_file or not self.stderr_file.exists():
|
||||
return
|
||||
|
||||
if follow:
|
||||
# Follow mode - yield new lines as they appear (tail -f)
|
||||
import time
|
||||
with open(self.stderr_file, 'r') as f:
|
||||
# Seek to end minus roughly 'lines' worth of bytes
|
||||
f.seek(0, 2) # Seek to end
|
||||
file_size = f.tell()
|
||||
# Rough estimate: 100 bytes per line
|
||||
seek_pos = max(0, file_size - (lines * 100))
|
||||
f.seek(seek_pos)
|
||||
|
||||
# Skip partial line if we seeked to middle
|
||||
if seek_pos > 0:
|
||||
f.readline()
|
||||
|
||||
# Yield existing lines
|
||||
for line in f:
|
||||
yield line.rstrip('\n')
|
||||
|
||||
# Now follow for new lines
|
||||
while True:
|
||||
line = f.readline()
|
||||
if line:
|
||||
yield line.rstrip('\n')
|
||||
else:
|
||||
time.sleep(0.1) # Wait before checking again
|
||||
else:
|
||||
# Just get last N lines (tail -n)
|
||||
try:
|
||||
content = self.stderr_file.read_text()
|
||||
for line in content.splitlines()[-lines:]:
|
||||
yield line
|
||||
except Exception:
|
||||
return
|
||||
|
||||
def pipe_stdout(self, lines: int = 10, follow: bool = True):
|
||||
"""
|
||||
Pipe stdout to sys.stdout.
|
||||
|
||||
Args:
|
||||
lines: Number of initial lines to show
|
||||
follow: If True, follow the file and print new lines as they appear
|
||||
"""
|
||||
import sys
|
||||
for line in self.tail_stdout(lines=lines, follow=follow):
|
||||
print(line, file=sys.stdout, flush=True)
|
||||
|
||||
def pipe_stderr(self, lines: int = 10, follow: bool = True):
|
||||
"""
|
||||
Pipe stderr to sys.stderr.
|
||||
|
||||
Args:
|
||||
lines: Number of initial lines to show
|
||||
follow: If True, follow the file and print new lines as they appear
|
||||
"""
|
||||
import sys
|
||||
for line in self.tail_stderr(lines=lines, follow=follow):
|
||||
print(line, file=sys.stderr, flush=True)
|
||||
|
||||
def _write_pid_file(self) -> None:
|
||||
"""Write PID file with mtime set to process start time."""
|
||||
if self.pid and self.started_at and self.pid_file:
|
||||
|
||||
@@ -3,6 +3,12 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"CHROME_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["USE_CHROME"],
|
||||
"description": "Enable Chrome/Chromium browser integration for archiving"
|
||||
},
|
||||
"CHROME_BINARY": {
|
||||
"type": "string",
|
||||
"default": "chromium",
|
||||
|
||||
@@ -201,16 +201,18 @@ def test_config_save_screenshot_false_skips():
|
||||
"""Test that SCREENSHOT_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
# FIRST check what Python sees
|
||||
print(f"\n[DEBUG PYTHON] NODE_V8_COVERAGE in os.environ: {'NODE_V8_COVERAGE' in os.environ}")
|
||||
print(f"[DEBUG PYTHON] Value: {os.environ.get('NODE_V8_COVERAGE', 'NOT SET')}")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
env = os.environ.copy()
|
||||
env['SCREENSHOT_ENABLED'] = 'False'
|
||||
|
||||
# DEBUG: Check if NODE_V8_COVERAGE is in env
|
||||
if 'NODE_V8_COVERAGE' in env:
|
||||
print(f"\n[DEBUG] NODE_V8_COVERAGE in env: {env['NODE_V8_COVERAGE']}")
|
||||
else:
|
||||
print("\n[DEBUG] NODE_V8_COVERAGE NOT in env")
|
||||
# Check what's in the copied env
|
||||
print(f"[DEBUG ENV COPY] NODE_V8_COVERAGE in env: {'NODE_V8_COVERAGE' in env}")
|
||||
print(f"[DEBUG ENV COPY] Value: {env.get('NODE_V8_COVERAGE', 'NOT SET')}")
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
|
||||
@@ -221,6 +223,12 @@ def test_config_save_screenshot_false_skips():
|
||||
timeout=30
|
||||
)
|
||||
|
||||
print(f"[DEBUG RESULT] Exit code: {result.returncode}")
|
||||
print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")
|
||||
|
||||
# FORCE FAILURE to verify test actually runs
|
||||
assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}"
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
|
||||
@@ -136,7 +136,7 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['list'])
|
||||
result = run_archivebox(self.work_dir, ['snapshot', 'list'])
|
||||
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
|
||||
|
||||
# Verify ALL snapshots appear in output
|
||||
|
||||
481
archivebox/tests/test_worker_config_propagation.py
Normal file
481
archivebox/tests/test_worker_config_propagation.py
Normal file
@@ -0,0 +1,481 @@
|
||||
"""
|
||||
Integration test for config propagation through worker hierarchy.
|
||||
|
||||
Tests that config is properly merged and passed through:
|
||||
Parent CLI/Orchestrator
|
||||
└── CrawlWorker subprocess (via Process.env)
|
||||
└── SnapshotWorker subprocess (via Process.env)
|
||||
└── Hook subprocess (via Process.env)
|
||||
|
||||
Config priority order (highest to lowest):
|
||||
1. Snapshot.config (JSON field)
|
||||
2. Crawl.config (JSON field)
|
||||
3. User.config (JSON field)
|
||||
4. Environment variables (os.environ + Process.env)
|
||||
5. Config file (ArchiveBox.conf)
|
||||
6. Plugin defaults (config.json)
|
||||
7. Core defaults
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import tempfile
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def test_config_propagation_through_worker_hierarchy():
|
||||
"""
|
||||
Integration test: Verify config is properly merged at every level.
|
||||
|
||||
Test flow:
|
||||
1. Create test archive with custom config in ArchiveBox.conf
|
||||
2. Set custom env vars before spawning worker
|
||||
3. Create Crawl with custom crawl.config JSON field
|
||||
4. Create Snapshot with custom snapshot.config JSON field
|
||||
5. Spawn SnapshotWorker via archivebox run --snapshot-id=...
|
||||
6. Verify worker received merged config from all sources
|
||||
7. Verify hook subprocess also received correct config
|
||||
"""
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir) / 'test_archive'
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: Config Propagation Through Worker Hierarchy")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
# Step 1: Initialize archive
|
||||
print("Step 1: Initialize archive")
|
||||
result = subprocess.run(
|
||||
['python', '-m', 'archivebox', 'init'],
|
||||
cwd=str(data_dir),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=60,
|
||||
)
|
||||
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
|
||||
print(f"✓ Archive initialized\n")
|
||||
|
||||
# Step 2: Write custom config to ArchiveBox.conf
|
||||
print("Step 2: Write custom config to ArchiveBox.conf")
|
||||
config_file = data_dir / 'ArchiveBox.conf'
|
||||
config_file.write_text("""
|
||||
[GENERAL]
|
||||
# Custom timeout in config file
|
||||
TIMEOUT = 999
|
||||
|
||||
[ARCHIVING_CONFIG]
|
||||
# Enable all plugins for proper testing
|
||||
SAVE_WGET = True
|
||||
SAVE_WARC = True
|
||||
SAVE_PDF = True
|
||||
SAVE_DOM = True
|
||||
SAVE_SINGLEFILE = True
|
||||
SAVE_READABILITY = True
|
||||
SAVE_MERCURY = True
|
||||
SAVE_HTMLTOTEXT = True
|
||||
SAVE_GIT = True
|
||||
SAVE_MEDIA = True
|
||||
SAVE_ARCHIVE_DOT_ORG = True
|
||||
SAVE_TITLE = True
|
||||
SAVE_FAVICON = True
|
||||
SAVE_SCREENSHOT = True
|
||||
""")
|
||||
print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
|
||||
|
||||
# Step 2.5: Set Machine.config values
|
||||
print("Step 2.5: Set Machine.config with custom binary path")
|
||||
set_machine_config_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
machine = Machine.current()
|
||||
machine.config = {{
|
||||
'CUSTOM_MACHINE_KEY': 'from_machine_config',
|
||||
'WGET_BINARY': '/custom/machine/wget', # Machine-specific binary path
|
||||
}}
|
||||
machine.save()
|
||||
print(f"Machine {{machine.hostname}} config updated")
|
||||
"""
|
||||
result = subprocess.run(
|
||||
['python', '-c', set_machine_config_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
|
||||
print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
|
||||
|
||||
# Step 3: Create Crawl via Django ORM with custom crawl.config
|
||||
print("Step 3: Create Crawl with custom crawl.config JSON")
|
||||
create_crawl_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from django.utils import timezone
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
# Create crawl with custom config
|
||||
crawl = Crawl.objects.create(
|
||||
status='queued',
|
||||
retry_at=timezone.now(),
|
||||
urls='https://example.com',
|
||||
config={{
|
||||
'TIMEOUT': 777, # Crawl-level override (higher priority than file)
|
||||
'CUSTOM_CRAWL_KEY': 'from_crawl_json',
|
||||
}}
|
||||
)
|
||||
print(crawl.id)
|
||||
"""
|
||||
result = subprocess.run(
|
||||
['python', '-c', create_crawl_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
|
||||
# Extract UUID from output (last line should be the UUID)
|
||||
crawl_id = result.stdout.decode().strip().split('\n')[-1]
|
||||
print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")
|
||||
|
||||
# Step 4: Create Snapshot with custom snapshot.config
|
||||
print("Step 4: Create Snapshot with custom snapshot.config JSON")
|
||||
create_snapshot_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from django.utils import timezone
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id='{crawl_id}')
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status='queued',
|
||||
retry_at=timezone.now(),
|
||||
config={{
|
||||
'TIMEOUT': 555, # Snapshot-level override (highest priority)
|
||||
'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
|
||||
'SAVE_SCREENSHOT': True, # Keep screenshot enabled
|
||||
'SAVE_WGET': False, # But disable wget as a test of per-snapshot override
|
||||
}}
|
||||
)
|
||||
print(snapshot.id)
|
||||
"""
|
||||
result = subprocess.run(
|
||||
['python', '-c', create_snapshot_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
|
||||
# Extract UUID from output (last line should be the UUID)
|
||||
snapshot_id = result.stdout.decode().strip().split('\n')[-1]
|
||||
print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")
|
||||
|
||||
# Step 5: Run SnapshotWorker with additional env var
|
||||
print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
|
||||
result = subprocess.run(
|
||||
['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
|
||||
cwd=str(data_dir),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
'ENV_VAR_KEY': 'from_environment', # Environment variable
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
stdout = result.stdout.decode()
|
||||
stderr = result.stderr.decode()
|
||||
|
||||
print("\n--- SnapshotWorker stdout ---")
|
||||
print(stdout)
|
||||
print("\n--- SnapshotWorker stderr ---")
|
||||
print(stderr)
|
||||
print("--- End output ---\n")
|
||||
|
||||
# Step 6: Verify config was properly merged
|
||||
print("Step 6: Verify config merging")
|
||||
|
||||
# Check that SnapshotWorker ran successfully
|
||||
assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"
|
||||
|
||||
# Verify config by checking stderr debug output and ArchiveResults in database
|
||||
print("\n--- Verifying config propagation ---\n")
|
||||
|
||||
# Check for config debug messages in stderr
|
||||
assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
|
||||
"Expected debug output not found in stderr"
|
||||
print("✓ Config debug output found in stderr")
|
||||
|
||||
# Verify config values were actually used by checking ArchiveResults
|
||||
verify_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
snapshot = Snapshot.objects.get(id='{snapshot_id}')
|
||||
print(f"Snapshot status: {{snapshot.status}}")
|
||||
print(f"Snapshot URL: {{snapshot.url}}")
|
||||
|
||||
# Check that snapshot reached sealed state
|
||||
assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"
|
||||
|
||||
# Verify all config sources are present in merged config
|
||||
print("\\nVerifying config merge priority:")
|
||||
config = get_config(snapshot=snapshot)
|
||||
|
||||
# 1. Snapshot.config (highest priority)
|
||||
timeout = config.get('TIMEOUT')
|
||||
print(f" 1. Snapshot.config: TIMEOUT={timeout} (expected: 555)")
|
||||
assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"
|
||||
|
||||
wget_enabled = config.get('SAVE_WGET')
|
||||
print(f" 1. Snapshot.config: SAVE_WGET={wget_enabled} (expected: False)")
|
||||
assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"
|
||||
|
||||
custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
|
||||
print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={custom_snapshot} (expected: from_snapshot_json)")
|
||||
assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"
|
||||
|
||||
# 2. Crawl.config
|
||||
custom_crawl = config.get('CUSTOM_CRAWL_KEY')
|
||||
print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={custom_crawl} (expected: from_crawl_json)")
|
||||
assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"
|
||||
|
||||
# 6. Machine.config
|
||||
custom_machine = config.get('CUSTOM_MACHINE_KEY')
|
||||
print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={custom_machine} (expected: from_machine_config)")
|
||||
assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"
|
||||
|
||||
wget_binary = config.get('WGET_BINARY')
|
||||
print(f" 6. Machine.config: WGET_BINARY={wget_binary} (expected: /custom/machine/wget)")
|
||||
# Note: This might be overridden by environment or other sources, just check it's present
|
||||
assert wget_binary is not None, f"WGET_BINARY should be present"
|
||||
|
||||
# Check ArchiveResults to verify plugins actually ran with correct config
|
||||
results = ArchiveResult.objects.filter(snapshot=snapshot)
|
||||
print(f"\\nArchiveResults created: {{results.count()}}")
|
||||
|
||||
for ar in results.order_by('plugin'):
|
||||
print(f" {{ar.plugin}}: {{ar.status}}")
|
||||
|
||||
# Verify SAVE_WGET=False was respected (should have no wget result)
|
||||
wget_results = results.filter(plugin='wget')
|
||||
print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
|
||||
assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"
|
||||
|
||||
# Verify SAVE_SCREENSHOT=True was respected (should have screenshot result)
|
||||
screenshot_results = results.filter(plugin='screenshot')
|
||||
print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)")
|
||||
assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results"
|
||||
|
||||
print("\\n✓ All config sources correctly merged:")
|
||||
print(" - Snapshot.config overrides (highest priority)")
|
||||
print(" - Crawl.config values present")
|
||||
print(" - Machine.config values present")
|
||||
print(" - File config values present")
|
||||
print("✓ Config priority order verified")
|
||||
print("✓ Snapshot successfully sealed")
|
||||
"""
|
||||
result = subprocess.run(
|
||||
['python', '-c', verify_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
print(result.stdout.decode())
|
||||
if result.returncode != 0:
|
||||
print("\nVerification error:")
|
||||
print(result.stderr.decode())
|
||||
|
||||
assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
|
||||
print("="*80 + "\n")
|
||||
|
||||
|
||||
def test_config_environment_variable_parsing():
|
||||
"""
|
||||
Test that Process._build_env() correctly serializes config values,
|
||||
and get_config() correctly parses them back from environment.
|
||||
"""
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir) / 'test_archive'
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: Config Environment Variable Parsing")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
# Initialize archive
|
||||
result = subprocess.run(
|
||||
['python', '-m', 'archivebox', 'init'],
|
||||
cwd=str(data_dir),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=60,
|
||||
)
|
||||
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
|
||||
|
||||
# Test various data types in config
|
||||
test_config_types_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
# Test get_config() with no overrides (baseline)
|
||||
config = get_config()
|
||||
print(f"Baseline config keys: {{len(config)}}")
|
||||
|
||||
# Create a test Process with various config types
|
||||
process = Process.objects.create(
|
||||
machine=Machine.current(),
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
pwd='{data_dir}',
|
||||
cmd=['test'],
|
||||
env={{
|
||||
'STRING_VAL': 'hello',
|
||||
'INT_VAL': 123,
|
||||
'FLOAT_VAL': 45.67,
|
||||
'BOOL_TRUE': True,
|
||||
'BOOL_FALSE': False,
|
||||
'LIST_VAL': ['a', 'b', 'c'],
|
||||
'DICT_VAL': {{'key': 'value'}},
|
||||
'NONE_VAL': None,
|
||||
}},
|
||||
)
|
||||
|
||||
# Test _build_env() serialization
|
||||
env = process._build_env()
|
||||
print(f"\\nSerialized environment:")
|
||||
print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
|
||||
print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
|
||||
print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
|
||||
print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
|
||||
print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
|
||||
print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
|
||||
print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
|
||||
print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")
|
||||
|
||||
# Verify all are strings (required by subprocess.Popen)
|
||||
assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
|
||||
assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
|
||||
assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
|
||||
assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
|
||||
assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
|
||||
assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
|
||||
assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"
|
||||
|
||||
print("\\n✓ All environment values correctly serialized as strings")
|
||||
|
||||
# Now test that get_config() can parse them back
|
||||
# Simulate subprocess by setting os.environ
|
||||
import json
|
||||
for key, val in env.items():
|
||||
if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
|
||||
os.environ[key] = val
|
||||
|
||||
# Get config again - should parse from environment
|
||||
config = get_config()
|
||||
print(f"\\nParsed from environment:")
|
||||
print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
|
||||
print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
|
||||
print(f" FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
|
||||
print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
|
||||
print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
|
||||
print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
|
||||
print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")
|
||||
|
||||
print("\\n✓ All config values correctly parsed from environment")
|
||||
"""
|
||||
|
||||
result = subprocess.run(
|
||||
['python', '-c', test_config_types_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
print(result.stdout.decode())
|
||||
if result.stderr:
|
||||
print("Script stderr:")
|
||||
print(result.stderr.decode())
|
||||
|
||||
assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}"
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("✓ TEST PASSED: Config serialization and parsing works correctly")
|
||||
print("="*80 + "\n")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Run as standalone script
|
||||
test_config_propagation_through_worker_hierarchy()
|
||||
test_config_environment_variable_parsing()
|
||||
@@ -308,8 +308,8 @@ class Worker:
|
||||
crawl = Crawl.objects.get(id=crawl_id)
|
||||
|
||||
cmd = [sys.executable, '-m', 'archivebox', 'run', '--crawl-id', str(crawl_id)]
|
||||
pwd = Path(crawl.OUTPUT_DIR) # Run in crawl's output directory
|
||||
env = get_config(scope='crawl', crawl=crawl)
|
||||
pwd = Path(crawl.output_dir) # Run in crawl's output directory
|
||||
env = get_config(crawl=crawl)
|
||||
|
||||
elif cls.name == 'snapshot':
|
||||
snapshot_id = kwargs.get('snapshot_id')
|
||||
@@ -321,7 +321,7 @@ class Worker:
|
||||
|
||||
cmd = [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', str(snapshot_id)]
|
||||
pwd = Path(snapshot.output_dir) # Run in snapshot's output directory
|
||||
env = get_config(scope='snapshot', snapshot=snapshot)
|
||||
env = get_config(snapshot=snapshot)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown worker type: {cls.name}")
|
||||
@@ -459,6 +459,8 @@ class CrawlWorker(Worker):
|
||||
from pathlib import Path
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
import sys
|
||||
import threading
|
||||
|
||||
debug_log = Path('/tmp/archivebox_crawl_worker_debug.log')
|
||||
|
||||
@@ -514,7 +516,9 @@ class CrawlWorker(Worker):
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f' Spawning worker for {snapshot.url} (status={snapshot.status})\n')
|
||||
f.flush()
|
||||
SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
|
||||
|
||||
pid = SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
|
||||
|
||||
log_worker_event(
|
||||
worker_type='CrawlWorker',
|
||||
event=f'Spawned SnapshotWorker for {snapshot.url}',
|
||||
@@ -522,6 +526,18 @@ class CrawlWorker(Worker):
|
||||
pid=self.pid,
|
||||
)
|
||||
|
||||
# Pipe the SnapshotWorker's stderr to our stderr so we can see what's happening
|
||||
# Get the Process record that was just created
|
||||
worker_process = Process.objects.filter(pid=pid).first()
|
||||
if worker_process:
|
||||
# Pipe stderr in background thread so it doesn't block
|
||||
def pipe_worker_stderr():
|
||||
for line in worker_process.tail_stderr(lines=0, follow=True):
|
||||
print(f' [SnapshotWorker] {line}', file=sys.stderr, flush=True)
|
||||
|
||||
thread = threading.Thread(target=pipe_worker_stderr, daemon=True)
|
||||
thread.start()
|
||||
|
||||
def _is_crawl_finished(self) -> bool:
|
||||
"""Check if all snapshots are sealed."""
|
||||
from pathlib import Path
|
||||
@@ -626,16 +642,28 @@ class SnapshotWorker(Worker):
|
||||
"""Execute all hooks sequentially."""
|
||||
from archivebox.hooks import discover_hooks, is_background_hook, extract_step
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
self.on_startup()
|
||||
|
||||
try:
|
||||
# Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.)
|
||||
config = get_config(snapshot=self.snapshot)
|
||||
|
||||
# Discover all hooks for this snapshot
|
||||
hooks = discover_hooks('Snapshot', config=self.snapshot.config)
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix)
|
||||
|
||||
import sys
|
||||
print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True)
|
||||
if hooks:
|
||||
print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True)
|
||||
else:
|
||||
print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True)
|
||||
|
||||
# Execute each hook sequentially
|
||||
for hook_path in hooks:
|
||||
print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True)
|
||||
hook_name = hook_path.name
|
||||
plugin = self._extract_plugin_name(hook_name)
|
||||
hook_step = extract_step(hook_name)
|
||||
@@ -661,7 +689,7 @@ class SnapshotWorker(Worker):
|
||||
ar.save(update_fields=['status', 'start_ts', 'modified_at'])
|
||||
|
||||
# Fork and run the hook
|
||||
process = self._run_hook(hook_path, ar)
|
||||
process = self._run_hook(hook_path, ar, config)
|
||||
|
||||
if is_background:
|
||||
# Track but don't wait
|
||||
@@ -698,7 +726,7 @@ class SnapshotWorker(Worker):
|
||||
finally:
|
||||
self.on_shutdown()
|
||||
|
||||
def _run_hook(self, hook_path: Path, ar: Any) -> Any:
|
||||
def _run_hook(self, hook_path: Path, ar: Any, config: dict) -> Any:
|
||||
"""Fork and run a hook using Process model, return Process."""
|
||||
from archivebox.hooks import run_hook
|
||||
|
||||
@@ -710,7 +738,7 @@ class SnapshotWorker(Worker):
|
||||
process = run_hook(
|
||||
script=hook_path,
|
||||
output_dir=output_dir,
|
||||
config=self.snapshot.config,
|
||||
config=config,
|
||||
timeout=120,
|
||||
parent=self.db_process,
|
||||
url=str(self.snapshot.url),
|
||||
|
||||
@@ -179,7 +179,7 @@ if [ "$ENABLE_COVERAGE" = true ]; then
|
||||
export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js"
|
||||
|
||||
echo "Python coverage: enabled (subprocess support)"
|
||||
echo "JavaScript coverage: enabled (NODE_V8_COVERAGE)"
|
||||
echo "JavaScript coverage: enabled (NODE_V8_COVERAGE=$NODE_V8_COVERAGE)"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
|
||||
Reference in New Issue
Block a user