mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
more migration id/uuid and config propagation fixes
This commit is contained in:
@@ -111,7 +111,7 @@ class ModelWithOutputDir(ModelWithUUID):
|
|||||||
|
|
||||||
def save(self, *args, **kwargs):
|
def save(self, *args, **kwargs):
|
||||||
super().save(*args, **kwargs)
|
super().save(*args, **kwargs)
|
||||||
self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
Path(self.output_dir).mkdir(parents=True, exist_ok=True)
|
||||||
# Note: index.json is deprecated, models should use write_index_jsonl() for full data
|
# Note: index.json is deprecated, models should use write_index_jsonl() for full data
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -127,5 +127,5 @@ class ModelWithOutputDir(ModelWithUUID):
|
|||||||
return f'{self.output_dir_parent}/{self.output_dir_name}'
|
return f'{self.output_dir_parent}/{self.output_dir_name}'
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def OUTPUT_DIR(self) -> Path:
|
def output_dir(self) -> Path:
|
||||||
return DATA_DIR / self.output_dir_str
|
raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')
|
||||||
|
|||||||
@@ -118,12 +118,12 @@ class BaseConfigSet(BaseSettings):
|
|||||||
|
|
||||||
|
|
||||||
def get_config(
|
def get_config(
|
||||||
scope: str = "global",
|
|
||||||
defaults: Optional[Dict] = None,
|
defaults: Optional[Dict] = None,
|
||||||
persona: Any = None,
|
persona: Any = None,
|
||||||
user: Any = None,
|
user: Any = None,
|
||||||
crawl: Any = None,
|
crawl: Any = None,
|
||||||
snapshot: Any = None,
|
snapshot: Any = None,
|
||||||
|
machine: Any = None,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Get merged config from all sources.
|
Get merged config from all sources.
|
||||||
@@ -134,17 +134,18 @@ def get_config(
|
|||||||
3. Per-user config (user.config JSON field)
|
3. Per-user config (user.config JSON field)
|
||||||
4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
|
4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
|
||||||
5. Environment variables
|
5. Environment variables
|
||||||
6. Config file (ArchiveBox.conf)
|
6. Per-machine config (machine.config JSON field - resolved binary paths)
|
||||||
7. Plugin schema defaults (config.json)
|
7. Config file (ArchiveBox.conf)
|
||||||
8. Core config defaults
|
8. Plugin schema defaults (config.json)
|
||||||
|
9. Core config defaults
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
scope: Config scope ('global', 'crawl', 'snapshot', etc.)
|
|
||||||
defaults: Default values to start with
|
defaults: Default values to start with
|
||||||
persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
|
persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
|
||||||
user: User object with config JSON field
|
user: User object with config JSON field
|
||||||
crawl: Crawl object with config JSON field
|
crawl: Crawl object with config JSON field
|
||||||
snapshot: Snapshot object with config JSON field
|
snapshot: Snapshot object with config JSON field
|
||||||
|
machine: Machine object with config JSON field (defaults to Machine.current())
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Merged config dict
|
Merged config dict
|
||||||
@@ -184,6 +185,18 @@ def get_config(
|
|||||||
file_config = BaseConfigSet.load_from_file(config_file)
|
file_config = BaseConfigSet.load_from_file(config_file)
|
||||||
config.update(file_config)
|
config.update(file_config)
|
||||||
|
|
||||||
|
# Apply machine config overrides (cached binary paths, etc.)
|
||||||
|
if machine is None:
|
||||||
|
# Default to current machine if not provided
|
||||||
|
try:
|
||||||
|
from archivebox.machine.models import Machine
|
||||||
|
machine = Machine.current()
|
||||||
|
except Exception:
|
||||||
|
pass # Machine might not be available during early init
|
||||||
|
|
||||||
|
if machine and hasattr(machine, "config") and machine.config:
|
||||||
|
config.update(machine.config)
|
||||||
|
|
||||||
# Override with environment variables
|
# Override with environment variables
|
||||||
for key in config:
|
for key in config:
|
||||||
env_val = os.environ.get(key)
|
env_val = os.environ.get(key)
|
||||||
@@ -221,8 +234,8 @@ def get_config(
|
|||||||
config.update(crawl.config)
|
config.update(crawl.config)
|
||||||
|
|
||||||
# Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
|
# Add CRAWL_OUTPUT_DIR for snapshot hooks to find shared Chrome session
|
||||||
if crawl and hasattr(crawl, "OUTPUT_DIR"):
|
if crawl and hasattr(crawl, "output_dir"):
|
||||||
config['CRAWL_OUTPUT_DIR'] = str(crawl.OUTPUT_DIR)
|
config['CRAWL_OUTPUT_DIR'] = str(crawl.output_dir)
|
||||||
|
|
||||||
# Apply snapshot config overrides (highest priority)
|
# Apply snapshot config overrides (highest priority)
|
||||||
if snapshot and hasattr(snapshot, "config") and snapshot.config:
|
if snapshot and hasattr(snapshot, "config") and snapshot.config:
|
||||||
@@ -260,7 +273,7 @@ def get_flat_config() -> Dict[str, Any]:
|
|||||||
|
|
||||||
Replaces abx.pm.hook.get_FLAT_CONFIG()
|
Replaces abx.pm.hook.get_FLAT_CONFIG()
|
||||||
"""
|
"""
|
||||||
return get_config(scope="global")
|
return get_config()
|
||||||
|
|
||||||
|
|
||||||
def get_all_configs() -> Dict[str, BaseConfigSet]:
|
def get_all_configs() -> Dict[str, BaseConfigSet]:
|
||||||
|
|||||||
@@ -176,6 +176,7 @@ class ConstantsDict(Mapping):
|
|||||||
CRONTABS_DIR_NAME,
|
CRONTABS_DIR_NAME,
|
||||||
"invalid",
|
"invalid",
|
||||||
"users",
|
"users",
|
||||||
|
"machine",
|
||||||
# Backwards compatibility with old directory names
|
# Backwards compatibility with old directory names
|
||||||
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
|
"user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
|
||||||
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
|
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ def get_table_columns(table_name):
|
|||||||
|
|
||||||
def upgrade_core_tables(apps, schema_editor):
|
def upgrade_core_tables(apps, schema_editor):
|
||||||
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
|
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
|
||||||
|
from archivebox.uuid_compat import uuid7
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
|
|
||||||
# Check if core_archiveresult table exists
|
# Check if core_archiveresult table exists
|
||||||
@@ -60,8 +61,8 @@ def upgrade_core_tables(apps, schema_editor):
|
|||||||
|
|
||||||
if has_data:
|
if has_data:
|
||||||
if has_uuid and not has_abid:
|
if has_uuid and not has_abid:
|
||||||
# Migrating from v0.7.2 (has uuid, minimal fields)
|
# Migrating from v0.7.2+ (has uuid column)
|
||||||
print('Migrating ArchiveResult from v0.7.2 schema...')
|
print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...')
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||||
id, uuid, snapshot_id, cmd, pwd, cmd_version,
|
id, uuid, snapshot_id, cmd, pwd, cmd_version,
|
||||||
@@ -86,7 +87,18 @@ def upgrade_core_tables(apps, schema_editor):
|
|||||||
FROM core_archiveresult;
|
FROM core_archiveresult;
|
||||||
""")
|
""")
|
||||||
else:
|
else:
|
||||||
print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
|
# Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
|
||||||
|
print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...')
|
||||||
|
cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult")
|
||||||
|
old_records = cursor.fetchall()
|
||||||
|
for record in old_records:
|
||||||
|
new_uuid = uuid7().hex
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||||
|
id, uuid, snapshot_id, cmd, pwd, cmd_version,
|
||||||
|
start_ts, end_ts, status, extractor, output
|
||||||
|
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""", (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9]))
|
||||||
|
|
||||||
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
|
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
|
||||||
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
|
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ def copy_old_fields_to_new(apps, schema_editor):
|
|||||||
|
|
||||||
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
|
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
|
||||||
# transformed by migration 0023, so we don't need to copy them here.
|
# transformed by migration 0023, so we don't need to copy them here.
|
||||||
|
# NOTE: UUIDs are already populated by migration 0023 for all migration paths
|
||||||
|
|
||||||
# Debug: Check Snapshot timestamps at end of RunPython
|
# Debug: Check Snapshot timestamps at end of RunPython
|
||||||
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
|
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
|
||||||
|
|||||||
@@ -8,12 +8,20 @@ from archivebox.uuid_compat import uuid7
|
|||||||
|
|
||||||
def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
||||||
"""
|
"""
|
||||||
Migrate ArchiveResult from integer PK to UUID PK.
|
Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration).
|
||||||
|
|
||||||
|
Handles both migration paths:
|
||||||
|
- 0.7.x: ArchiveResult has integer id, NO uuid field → generate new UUIDs
|
||||||
|
- 0.8.x: ArchiveResult has integer id + optional uuid field → reuse existing UUIDs
|
||||||
|
|
||||||
Strategy:
|
Strategy:
|
||||||
1. Add old_id field to store current integer IDs
|
1. Create new table with UUID as primary key (no temporary columns)
|
||||||
2. Generate UUIDs for any records missing them
|
2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x)
|
||||||
3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
|
3. Copy all data with UUID as new id
|
||||||
|
4. Drop old table, rename new table
|
||||||
|
5. Recreate indexes
|
||||||
|
|
||||||
|
Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid)
|
||||||
"""
|
"""
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
|
|
||||||
@@ -26,11 +34,13 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
|||||||
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
|
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
|
||||||
row_count = cursor.fetchone()[0]
|
row_count = cursor.fetchone()[0]
|
||||||
|
|
||||||
if row_count == 0:
|
# Don't skip if table is empty - we still need to recreate to remove uuid column
|
||||||
print('No ArchiveResult records to migrate')
|
# (fresh installs create table with uuid from 0025, but model expects no uuid after 0029)
|
||||||
return
|
|
||||||
|
|
||||||
print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
|
if row_count == 0:
|
||||||
|
print('[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...')
|
||||||
|
else:
|
||||||
|
print(f'[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
|
||||||
|
|
||||||
# Step 0: Check if machine_process table exists, if not NULL out process_id values
|
# Step 0: Check if machine_process table exists, if not NULL out process_id values
|
||||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
|
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
|
||||||
@@ -40,12 +50,10 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
|||||||
print('machine_process table does not exist yet, setting process_id to NULL')
|
print('machine_process table does not exist yet, setting process_id to NULL')
|
||||||
cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
|
cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
|
||||||
|
|
||||||
# Step 1: Create new table with UUID as primary key
|
# Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns)
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE TABLE core_archiveresult_new (
|
CREATE TABLE core_archiveresult_new (
|
||||||
id TEXT PRIMARY KEY NOT NULL,
|
id TEXT PRIMARY KEY NOT NULL,
|
||||||
old_id INTEGER,
|
|
||||||
uuid TEXT UNIQUE,
|
|
||||||
created_at DATETIME NOT NULL,
|
created_at DATETIME NOT NULL,
|
||||||
modified_at DATETIME NOT NULL,
|
modified_at DATETIME NOT NULL,
|
||||||
|
|
||||||
@@ -78,28 +86,36 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
|||||||
""")
|
""")
|
||||||
|
|
||||||
# Step 2: Generate UUIDs for records that don't have them
|
# Step 2: Generate UUIDs for records that don't have them
|
||||||
cursor.execute("SELECT id, uuid FROM core_archiveresult")
|
# Check if uuid column exists (0.8.x has it, 0.7.x doesn't)
|
||||||
records = cursor.fetchall()
|
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||||
|
columns = cursor.fetchall()
|
||||||
|
col_names = [col[1] for col in columns]
|
||||||
|
has_uuid_column = 'uuid' in col_names
|
||||||
|
|
||||||
id_to_uuid = {}
|
if has_uuid_column:
|
||||||
for old_id, existing_uuid in records:
|
cursor.execute("SELECT id, uuid FROM core_archiveresult")
|
||||||
if existing_uuid:
|
records = cursor.fetchall()
|
||||||
# Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
|
id_to_uuid = {}
|
||||||
# (existing UUIDs might be stored with or without dashes in old schema)
|
for old_id, existing_uuid in records:
|
||||||
id_to_uuid[old_id] = UUID(existing_uuid).hex
|
if existing_uuid:
|
||||||
else:
|
# Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
|
||||||
# Generate new UUIDv7 (time-ordered) as 32-char hex
|
# (existing UUIDs might be stored with or without dashes in old schema)
|
||||||
id_to_uuid[old_id] = uuid7().hex
|
id_to_uuid[old_id] = UUID(existing_uuid).hex
|
||||||
|
else:
|
||||||
|
# Generate new UUIDv7 (time-ordered) as 32-char hex
|
||||||
|
id_to_uuid[old_id] = uuid7().hex
|
||||||
|
else:
|
||||||
|
# 0.7.x path: no uuid column, generate new UUIDs for all records
|
||||||
|
cursor.execute("SELECT id FROM core_archiveresult")
|
||||||
|
records = cursor.fetchall()
|
||||||
|
id_to_uuid = {old_id: uuid7().hex for (old_id,) in records}
|
||||||
|
|
||||||
# Step 3: Copy data with UUIDs as new primary key
|
# Step 3: Copy data with UUIDs as new primary key
|
||||||
cursor.execute("SELECT * FROM core_archiveresult")
|
cursor.execute("SELECT * FROM core_archiveresult")
|
||||||
old_records = cursor.fetchall()
|
old_records = cursor.fetchall()
|
||||||
|
|
||||||
# Get column names
|
# col_names already fetched in Step 2
|
||||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
inserted_count = 0
|
||||||
columns = cursor.fetchall()
|
|
||||||
col_names = [col[1] for col in columns]
|
|
||||||
|
|
||||||
for i, record in enumerate(old_records):
|
for i, record in enumerate(old_records):
|
||||||
old_id = record[col_names.index('id')]
|
old_id = record[col_names.index('id')]
|
||||||
new_uuid = id_to_uuid[old_id]
|
new_uuid = id_to_uuid[old_id]
|
||||||
@@ -107,7 +123,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
|||||||
# Build insert with new structure
|
# Build insert with new structure
|
||||||
values = {col_names[i]: record[i] for i in range(len(col_names))}
|
values = {col_names[i]: record[i] for i in range(len(col_names))}
|
||||||
|
|
||||||
# Check which fields exist in new table
|
# List of fields to copy (all fields from new schema except id, old_id, uuid)
|
||||||
fields_to_copy = [
|
fields_to_copy = [
|
||||||
'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
|
'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
|
||||||
'status', 'retry_at', 'start_ts', 'end_ts',
|
'status', 'retry_at', 'start_ts', 'end_ts',
|
||||||
@@ -115,17 +131,31 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
|||||||
'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
|
'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
|
||||||
]
|
]
|
||||||
|
|
||||||
# Build INSERT statement
|
# Build INSERT statement (only copy fields that exist in source)
|
||||||
existing_fields = [f for f in fields_to_copy if f in values]
|
existing_fields = [f for f in fields_to_copy if f in values]
|
||||||
placeholders = ', '.join(['?'] * (len(existing_fields) + 3)) # +3 for id, old_id, uuid
|
|
||||||
field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
|
|
||||||
|
|
||||||
insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
|
if i == 0:
|
||||||
|
print(f'[0029] Source columns: {col_names}')
|
||||||
|
print(f'[0029] Copying fields: {existing_fields}')
|
||||||
|
|
||||||
cursor.execute(
|
placeholders = ', '.join(['?'] * (len(existing_fields) + 1)) # +1 for id
|
||||||
f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
|
field_list = 'id, ' + ', '.join(existing_fields)
|
||||||
insert_values
|
|
||||||
)
|
insert_values = [new_uuid] + [values.get(f) for f in existing_fields]
|
||||||
|
|
||||||
|
try:
|
||||||
|
cursor.execute(
|
||||||
|
f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
|
||||||
|
insert_values
|
||||||
|
)
|
||||||
|
inserted_count += 1
|
||||||
|
except Exception as e:
|
||||||
|
print(f'[0029] ERROR inserting record {old_id}: {e}')
|
||||||
|
if i == 0:
|
||||||
|
print(f'[0029] First record values: {insert_values[:5]}...')
|
||||||
|
raise
|
||||||
|
|
||||||
|
print(f'[0029] Inserted {inserted_count}/{len(old_records)} records')
|
||||||
|
|
||||||
# Step 4: Replace old table with new table
|
# Step 4: Replace old table with new table
|
||||||
cursor.execute("DROP TABLE core_archiveresult")
|
cursor.execute("DROP TABLE core_archiveresult")
|
||||||
@@ -139,7 +169,6 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
|
|||||||
cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
|
cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
|
||||||
cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
|
cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
|
||||||
cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
|
cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
|
||||||
cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")
|
|
||||||
|
|
||||||
print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
|
print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
|
||||||
|
|
||||||
@@ -159,23 +188,17 @@ class Migration(migrations.Migration):
|
|||||||
),
|
),
|
||||||
],
|
],
|
||||||
state_operations=[
|
state_operations=[
|
||||||
# Remove old uuid field
|
# Remove uuid field (was added in 0025, we're merging it into id)
|
||||||
migrations.RemoveField(
|
migrations.RemoveField(
|
||||||
model_name='archiveresult',
|
model_name='archiveresult',
|
||||||
name='uuid',
|
name='uuid',
|
||||||
),
|
),
|
||||||
# Change id from AutoField to UUIDField
|
# Change id from AutoField to UUIDField (absorbing the uuid field)
|
||||||
migrations.AlterField(
|
migrations.AlterField(
|
||||||
model_name='archiveresult',
|
model_name='archiveresult',
|
||||||
name='id',
|
name='id',
|
||||||
field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
|
field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
|
||||||
),
|
),
|
||||||
# Add old_id field to preserve legacy integer IDs
|
|
||||||
migrations.AddField(
|
|
||||||
model_name='archiveresult',
|
|
||||||
name='old_id',
|
|
||||||
field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
|
|
||||||
),
|
|
||||||
],
|
],
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1354,7 +1354,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
def domain(self) -> str:
|
def domain(self) -> str:
|
||||||
return url_domain(self.url)
|
return url_domain(self.url)
|
||||||
|
|
||||||
@cached_property
|
@property
|
||||||
def output_dir(self):
|
def output_dir(self):
|
||||||
"""The filesystem path to the snapshot's output directory."""
|
"""The filesystem path to the snapshot's output directory."""
|
||||||
import os
|
import os
|
||||||
@@ -1435,8 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]')
|
print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]')
|
||||||
|
|
||||||
# Clean up .pid files from output directory
|
# Clean up .pid files from output directory
|
||||||
if self.OUTPUT_DIR.exists():
|
if Path(self.output_dir).exists():
|
||||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
for pid_file in Path(self.output_dir).glob('**/*.pid'):
|
||||||
pid_file.unlink(missing_ok=True)
|
pid_file.unlink(missing_ok=True)
|
||||||
|
|
||||||
# Update all STARTED ArchiveResults from filesystem
|
# Update all STARTED ArchiveResults from filesystem
|
||||||
@@ -2263,9 +2263,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
|
|
||||||
# UUID primary key (migrated from integer in 0029)
|
# UUID primary key (migrated from integer in 0029)
|
||||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||||
# old_id preserves the legacy integer ID for backward compatibility
|
|
||||||
old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
|
|
||||||
# Note: uuid field was removed in migration 0029 when id became UUID
|
|
||||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||||
modified_at = models.DateTimeField(auto_now=True)
|
modified_at = models.DateTimeField(auto_now=True)
|
||||||
|
|
||||||
@@ -2494,7 +2491,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def output_dir_parent(self) -> str:
|
def output_dir_parent(self) -> str:
|
||||||
return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
|
return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR))
|
||||||
|
|
||||||
# Properties that delegate to Process model (for backwards compatibility)
|
# Properties that delegate to Process model (for backwards compatibility)
|
||||||
# These properties will replace the direct fields after migration is complete
|
# These properties will replace the direct fields after migration is complete
|
||||||
|
|||||||
@@ -180,7 +180,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
return crawl
|
return crawl
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def OUTPUT_DIR(self) -> Path:
|
def output_dir(self) -> Path:
|
||||||
"""
|
"""
|
||||||
Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
|
Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
|
||||||
Domain is extracted from the first URL in the crawl.
|
Domain is extracted from the first URL in the crawl.
|
||||||
@@ -383,7 +383,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
f.flush()
|
f.flush()
|
||||||
hook_start = time.time()
|
hook_start = time.time()
|
||||||
plugin_name = hook.parent.name
|
plugin_name = hook.parent.name
|
||||||
output_dir = self.OUTPUT_DIR / plugin_name
|
output_dir = self.output_dir / plugin_name
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Run hook using Process.launch() - returns Process model
|
# Run hook using Process.launch() - returns Process model
|
||||||
@@ -427,7 +427,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
f.write(f'Created {len(created_snapshots)} snapshots\n')
|
f.write(f'Created {len(created_snapshots)} snapshots\n')
|
||||||
f.write(f'=== Crawl.run() complete ===\n\n')
|
f.write(f'=== Crawl.run() complete ===\n\n')
|
||||||
f.flush()
|
f.flush()
|
||||||
return created_snapshots[0] if created_snapshots else None
|
|
||||||
|
# Return first snapshot for this crawl (newly created or existing)
|
||||||
|
# This ensures the crawl doesn't seal if snapshots exist, even if they weren't just created
|
||||||
|
return self.snapshot_set.first()
|
||||||
|
|
||||||
def is_finished(self) -> bool:
|
def is_finished(self) -> bool:
|
||||||
"""Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
|
"""Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
|
||||||
@@ -467,8 +470,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
print(f'[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]')
|
print(f'[yellow]🔪 Killed {killed_count} orphaned crawl hook process(es)[/yellow]')
|
||||||
|
|
||||||
# Clean up .pid files from output directory
|
# Clean up .pid files from output directory
|
||||||
if self.OUTPUT_DIR.exists():
|
if self.output_dir.exists():
|
||||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
for pid_file in self.output_dir.glob('**/*.pid'):
|
||||||
pid_file.unlink(missing_ok=True)
|
pid_file.unlink(missing_ok=True)
|
||||||
|
|
||||||
# Run on_CrawlEnd hooks
|
# Run on_CrawlEnd hooks
|
||||||
@@ -479,7 +482,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
|
|
||||||
for hook in hooks:
|
for hook in hooks:
|
||||||
plugin_name = hook.parent.name
|
plugin_name = hook.parent.name
|
||||||
output_dir = self.OUTPUT_DIR / plugin_name
|
output_dir = self.output_dir / plugin_name
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
process = run_hook(
|
process = run_hook(
|
||||||
|
|||||||
@@ -207,7 +207,7 @@ def discover_hooks(
|
|||||||
# Get merged config if not provided (lazy import to avoid circular dependency)
|
# Get merged config if not provided (lazy import to avoid circular dependency)
|
||||||
if config is None:
|
if config is None:
|
||||||
from archivebox.config.configset import get_config
|
from archivebox.config.configset import get_config
|
||||||
config = get_config(scope='global')
|
config = get_config()
|
||||||
|
|
||||||
enabled_hooks = []
|
enabled_hooks = []
|
||||||
|
|
||||||
@@ -703,7 +703,7 @@ def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
|
|||||||
# Get merged config if not provided
|
# Get merged config if not provided
|
||||||
if config is None:
|
if config is None:
|
||||||
from archivebox.config.configset import get_config
|
from archivebox.config.configset import get_config
|
||||||
config = get_config(scope='global')
|
config = get_config()
|
||||||
|
|
||||||
# Support explicit ENABLED_PLUGINS override (legacy)
|
# Support explicit ENABLED_PLUGINS override (legacy)
|
||||||
if 'ENABLED_PLUGINS' in config:
|
if 'ENABLED_PLUGINS' in config:
|
||||||
@@ -967,9 +967,9 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
|||||||
else:
|
else:
|
||||||
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
|
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
|
||||||
import sys
|
import sys
|
||||||
print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_upper}_ENABLED", file=sys.stderr)
|
|
||||||
enabled_key = f'{plugin_upper}_ENABLED'
|
enabled_key = f'{plugin_upper}_ENABLED'
|
||||||
enabled = config.get(enabled_key)
|
enabled = config.get(enabled_key)
|
||||||
|
print(f"DEBUG: NO PLUGINS whitelist in config, checking {enabled_key}={enabled}", file=sys.stderr)
|
||||||
if enabled is None:
|
if enabled is None:
|
||||||
enabled = True
|
enabled = True
|
||||||
elif isinstance(enabled, str):
|
elif isinstance(enabled, str):
|
||||||
|
|||||||
@@ -378,7 +378,7 @@ class Binary(ModelWithHealthStats):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def OUTPUT_DIR(self):
|
def output_dir(self):
|
||||||
"""Return the output directory for this binary installation."""
|
"""Return the output directory for this binary installation."""
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
@@ -412,10 +412,10 @@ class Binary(ModelWithHealthStats):
|
|||||||
from archivebox.config.configset import get_config
|
from archivebox.config.configset import get_config
|
||||||
|
|
||||||
# Get merged config (Binary doesn't have crawl/snapshot context)
|
# Get merged config (Binary doesn't have crawl/snapshot context)
|
||||||
config = get_config(scope='global')
|
config = get_config()
|
||||||
|
|
||||||
# Create output directory
|
# Create output directory
|
||||||
output_dir = self.OUTPUT_DIR
|
output_dir = self.output_dir
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
self.output_dir = str(output_dir)
|
self.output_dir = str(output_dir)
|
||||||
self.save()
|
self.save()
|
||||||
@@ -514,7 +514,7 @@ class Binary(ModelWithHealthStats):
|
|||||||
print(f'[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]')
|
print(f'[yellow]🔪 Killed {killed_count} binary installation hook process(es)[/yellow]')
|
||||||
|
|
||||||
# Clean up .pid files from output directory
|
# Clean up .pid files from output directory
|
||||||
output_dir = self.OUTPUT_DIR
|
output_dir = self.output_dir
|
||||||
if output_dir.exists():
|
if output_dir.exists():
|
||||||
for pid_file in output_dir.glob('**/*.pid'):
|
for pid_file in output_dir.glob('**/*.pid'):
|
||||||
pid_file.unlink(missing_ok=True)
|
pid_file.unlink(missing_ok=True)
|
||||||
@@ -1276,6 +1276,128 @@ class Process(models.Model):
|
|||||||
"""Path to stderr log."""
|
"""Path to stderr log."""
|
||||||
return Path(self.pwd) / 'stderr.log' if self.pwd else None
|
return Path(self.pwd) / 'stderr.log' if self.pwd else None
|
||||||
|
|
||||||
|
def tail_stdout(self, lines: int = 50, follow: bool = False):
    """
    Tail stdout log file (like `tail` or `tail -f`).

    Args:
        lines: Number of trailing lines to show (default 50); 0 or less shows no backlog
        follow: If True, keep polling the file and yield new lines as they appear
                (the generator then only ends when the caller closes it)

    Yields:
        Lines from stdout, without their trailing newline
    """
    import time
    from collections import deque

    log_path = self.stdout_file
    if not log_path or not log_path.exists():
        return

    try:
        # errors='replace' so a stray undecodable byte in a hook's output
        # cannot abort the tail part-way through.
        with open(log_path, 'r', errors='replace') as log:
            # Stream the file through a bounded deque: memory stays O(lines),
            # the backlog is exactly the last `lines` lines (no byte-offset
            # guessing / mid-character seeks), and lines <= 0 yields nothing
            # instead of the whole file (the old `[-0:]` slice pitfall).
            for line in deque(log, maxlen=max(lines, 0)):
                yield line.rstrip('\n')

            # The handle is now positioned at EOF; poll for appended data.
            while follow:
                line = log.readline()
                if line:
                    yield line.rstrip('\n')
                else:
                    time.sleep(0.1)  # Wait before checking again
    except OSError:
        # Best-effort: the log can disappear or become unreadable between
        # the exists() check and the read; treat that as "nothing to show".
        return
|
||||||
|
|
||||||
|
def tail_stderr(self, lines: int = 50, follow: bool = False):
    """
    Tail stderr log file (like `tail` or `tail -f`).

    Args:
        lines: Maximum number of already-written lines to show (default 50).
            Zero or a negative number shows none of the existing content.
        follow: If True, follow the file and yield new lines as they appear.
            In this mode the generator never finishes on its own; the caller
            has to stop iterating (or close the generator).

    Yields:
        Lines from stderr, with the trailing newline removed.
    """
    import time
    from collections import deque

    log_file = self.stderr_file
    if not log_file or not log_file.exists():
        return

    # deque(maxlen=N) keeps only the last N items fed to it, and maxlen=0
    # keeps nothing. Slicing with [-lines:] instead would return *every*
    # line for lines=0, because -0 == 0.
    backlog = max(0, lines)

    if not follow:
        # Just get last N lines (tail -n), streaming the file so only the
        # requested number of lines is ever held in memory.
        try:
            # errors='replace': a log may contain bytes that are not valid in
            # the default encoding; show a placeholder instead of failing.
            with open(log_file, 'r', errors='replace') as f:
                last_lines = deque(f, maxlen=backlog)
        except OSError:
            # Best effort: the log disappeared or is unreadable.
            return
        for line in last_lines:
            yield line.rstrip('\n')
        return

    # Follow mode - yield new lines as they appear (tail -f)
    with open(log_file, 'r', errors='replace') as f:
        # Seek to end minus roughly 'lines' worth of bytes
        f.seek(0, 2)  # Seek to end
        file_size = f.tell()
        # Rough estimate: 100 bytes per line
        seek_pos = max(0, file_size - (backlog * 100))
        f.seek(seek_pos)

        # Skip partial line if we seeked to middle. The offset is a byte
        # estimate, so it may even fall inside a multi-byte character;
        # errors='replace' keeps that from raising UnicodeDecodeError.
        if seek_pos > 0:
            f.readline()

        # Yield existing lines, capped to the requested amount because the
        # byte estimate above can cover more than `lines` lines.
        for line in deque(f, maxlen=backlog):
            yield line.rstrip('\n')

        # Now follow for new lines
        while True:
            line = f.readline()
            if line:
                yield line.rstrip('\n')
            else:
                time.sleep(0.1)  # Wait before checking again
|
||||||
|
|
||||||
|
def pipe_stdout(self, lines: int = 10, follow: bool = True):
    """
    Copy the tail of the stdout log to this process's sys.stdout.

    Args:
        lines: How many initial lines to request from tail_stdout()
        follow: Passed through to tail_stdout(); if True, new lines are
            printed as they appear
    """
    import sys

    tailed = self.tail_stdout(lines=lines, follow=follow)
    for text in tailed:
        # Flush after each line so the output is not held back in a buffer.
        print(text, file=sys.stdout, flush=True)
|
||||||
|
|
||||||
|
def pipe_stderr(self, lines: int = 10, follow: bool = True):
    """
    Copy the tail of the stderr log to this process's sys.stderr.

    Args:
        lines: How many initial lines to request from tail_stderr()
        follow: Passed through to tail_stderr(); if True, new lines are
            printed as they appear
    """
    import sys

    tailed = self.tail_stderr(lines=lines, follow=follow)
    for text in tailed:
        # Flush after each line so the output is not held back in a buffer.
        print(text, file=sys.stderr, flush=True)
|
||||||
|
|
||||||
def _write_pid_file(self) -> None:
|
def _write_pid_file(self) -> None:
|
||||||
"""Write PID file with mtime set to process start time."""
|
"""Write PID file with mtime set to process start time."""
|
||||||
if self.pid and self.started_at and self.pid_file:
|
if self.pid and self.started_at and self.pid_file:
|
||||||
|
|||||||
@@ -3,6 +3,12 @@
|
|||||||
"type": "object",
|
"type": "object",
|
||||||
"additionalProperties": false,
|
"additionalProperties": false,
|
||||||
"properties": {
|
"properties": {
|
||||||
|
"CHROME_ENABLED": {
|
||||||
|
"type": "boolean",
|
||||||
|
"default": true,
|
||||||
|
"x-aliases": ["USE_CHROME"],
|
||||||
|
"description": "Enable Chrome/Chromium browser integration for archiving"
|
||||||
|
},
|
||||||
"CHROME_BINARY": {
|
"CHROME_BINARY": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"default": "chromium",
|
"default": "chromium",
|
||||||
|
|||||||
@@ -201,16 +201,18 @@ def test_config_save_screenshot_false_skips():
|
|||||||
"""Test that SCREENSHOT_ENABLED=False exits without emitting JSONL."""
|
"""Test that SCREENSHOT_ENABLED=False exits without emitting JSONL."""
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
# FIRST check what Python sees
|
||||||
|
print(f"\n[DEBUG PYTHON] NODE_V8_COVERAGE in os.environ: {'NODE_V8_COVERAGE' in os.environ}")
|
||||||
|
print(f"[DEBUG PYTHON] Value: {os.environ.get('NODE_V8_COVERAGE', 'NOT SET')}")
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
tmpdir = Path(tmpdir)
|
tmpdir = Path(tmpdir)
|
||||||
env = os.environ.copy()
|
env = os.environ.copy()
|
||||||
env['SCREENSHOT_ENABLED'] = 'False'
|
env['SCREENSHOT_ENABLED'] = 'False'
|
||||||
|
|
||||||
# DEBUG: Check if NODE_V8_COVERAGE is in env
|
# Check what's in the copied env
|
||||||
if 'NODE_V8_COVERAGE' in env:
|
print(f"[DEBUG ENV COPY] NODE_V8_COVERAGE in env: {'NODE_V8_COVERAGE' in env}")
|
||||||
print(f"\n[DEBUG] NODE_V8_COVERAGE in env: {env['NODE_V8_COVERAGE']}")
|
print(f"[DEBUG ENV COPY] Value: {env.get('NODE_V8_COVERAGE', 'NOT SET')}")
|
||||||
else:
|
|
||||||
print("\n[DEBUG] NODE_V8_COVERAGE NOT in env")
|
|
||||||
|
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
|
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
|
||||||
@@ -221,6 +223,12 @@ def test_config_save_screenshot_false_skips():
|
|||||||
timeout=30
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
|
print(f"[DEBUG RESULT] Exit code: {result.returncode}")
|
||||||
|
print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")
|
||||||
|
|
||||||
|
# FORCE FAILURE to verify test actually runs
|
||||||
|
assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}"
|
||||||
|
|
||||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||||
|
|
||||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||||
|
|||||||
@@ -136,7 +136,7 @@ class TestMigrationFrom07x(unittest.TestCase):
|
|||||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||||
|
|
||||||
result = run_archivebox(self.work_dir, ['list'])
|
result = run_archivebox(self.work_dir, ['snapshot', 'list'])
|
||||||
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
|
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
|
||||||
|
|
||||||
# Verify ALL snapshots appear in output
|
# Verify ALL snapshots appear in output
|
||||||
|
|||||||
481
archivebox/tests/test_worker_config_propagation.py
Normal file
481
archivebox/tests/test_worker_config_propagation.py
Normal file
@@ -0,0 +1,481 @@
|
|||||||
|
"""
|
||||||
|
Integration test for config propagation through worker hierarchy.
|
||||||
|
|
||||||
|
Tests that config is properly merged and passed through:
|
||||||
|
Parent CLI/Orchestrator
|
||||||
|
└── CrawlWorker subprocess (via Process.env)
|
||||||
|
└── SnapshotWorker subprocess (via Process.env)
|
||||||
|
└── Hook subprocess (via Process.env)
|
||||||
|
|
||||||
|
Config priority order (highest to lowest):
|
||||||
|
1. Snapshot.config (JSON field)
|
||||||
|
2. Crawl.config (JSON field)
|
||||||
|
3. User.config (JSON field)
|
||||||
|
4. Environment variables (os.environ + Process.env)
|
||||||
|
5. Config file (ArchiveBox.conf)
|
||||||
|
6. Plugin defaults (config.json)
|
||||||
|
7. Core defaults
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_propagation_through_worker_hierarchy():
    """
    Integration test: Verify config is properly merged at every level.

    Test flow:
    1. Create test archive with custom config in ArchiveBox.conf
    2. Set custom env vars before spawning worker
    3. Create Crawl with custom crawl.config JSON field
    4. Create Snapshot with custom snapshot.config JSON field
    5. Spawn SnapshotWorker via archivebox run --snapshot-id=...
    6. Verify worker received merged config from all sources
    7. Verify hook subprocess also received correct config

    Every step runs in a subprocess of the interpreter executing this test,
    so the helper scripts import the same archivebox installation.
    """
    import sys  # sys.executable: a bare 'python' may resolve to another interpreter

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        # Environment shared by every subprocess spawned below.
        base_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }

        def run_python(args, cwd, timeout, extra_env=None):
            """Run the current interpreter with *args* and capture its output."""
            return subprocess.run(
                [sys.executable, *args],
                cwd=str(cwd),
                env={**base_env, **(extra_env or {})},
                capture_output=True,
                timeout=timeout,
            )

        def run_script(script):
            """Run an inline helper script from the parent of the data dir."""
            return run_python(['-c', script], cwd=data_dir.parent, timeout=30)

        # repr() yields a correctly quoted and escaped Python string literal,
        # so the path can be pasted into generated script source safely.
        data_dir_literal = repr(str(data_dir))

        print(f"\n{'='*80}")
        print("Test: Config Propagation Through Worker Hierarchy")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Step 1: Initialize archive
        print("Step 1: Initialize archive")
        result = run_python(['-m', 'archivebox', 'init'], cwd=data_dir, timeout=60)
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Step 2: Write custom config to ArchiveBox.conf
        print("Step 2: Write custom config to ArchiveBox.conf")
        config_file = data_dir / 'ArchiveBox.conf'
        config_file.write_text("""
[GENERAL]
# Custom timeout in config file
TIMEOUT = 999

[ARCHIVING_CONFIG]
# Enable all plugins for proper testing
SAVE_WGET = True
SAVE_WARC = True
SAVE_PDF = True
SAVE_DOM = True
SAVE_SINGLEFILE = True
SAVE_READABILITY = True
SAVE_MERCURY = True
SAVE_HTMLTOTEXT = True
SAVE_GIT = True
SAVE_MEDIA = True
SAVE_ARCHIVE_DOT_ORG = True
SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_SCREENSHOT = True
""")
        print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")

        # NOTE: the helper scripts below are f-strings. Single braces are
        # filled in *here* by this test; doubled braces ({{ }}) survive as
        # single braces in the generated script and are evaluated there.

        # Step 2.5: Set Machine.config values
        print("Step 2.5: Set Machine.config with custom binary path")
        set_machine_config_script = f"""
import os
os.environ['DATA_DIR'] = {data_dir_literal}

from archivebox.config.django import setup_django
setup_django()

from archivebox.machine.models import Machine

machine = Machine.current()
machine.config = {{
    'CUSTOM_MACHINE_KEY': 'from_machine_config',
    'WGET_BINARY': '/custom/machine/wget',  # Machine-specific binary path
}}
machine.save()
print(f"Machine {{machine.hostname}} config updated")
"""
        result = run_script(set_machine_config_script)
        assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
        print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")

        # Step 3: Create Crawl via Django ORM with custom crawl.config
        print("Step 3: Create Crawl with custom crawl.config JSON")
        create_crawl_script = f"""
import os
os.environ['DATA_DIR'] = {data_dir_literal}

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl

# Create crawl with custom config
crawl = Crawl.objects.create(
    status='queued',
    retry_at=timezone.now(),
    urls='https://example.com',
    config={{
        'TIMEOUT': 777,  # Crawl-level override (higher priority than file)
        'CUSTOM_CRAWL_KEY': 'from_crawl_json',
    }}
)
print(crawl.id)
"""
        result = run_script(create_crawl_script)
        assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        crawl_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")

        # Step 4: Create Snapshot with custom snapshot.config
        print("Step 4: Create Snapshot with custom snapshot.config JSON")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = {data_dir_literal}

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl

crawl = Crawl.objects.get(id='{crawl_id}')
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'TIMEOUT': 555,  # Snapshot-level override (highest priority)
        'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
        'SAVE_SCREENSHOT': True,  # Keep screenshot enabled
        'SAVE_WGET': False,  # But disable wget as a test of per-snapshot override
    }}
)
print(snapshot.id)
"""
        result = run_script(create_snapshot_script)
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")

        # Step 5: Run SnapshotWorker with additional env var
        print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
        result = run_python(
            ['-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=data_dir,
            timeout=120,
            extra_env={'ENV_VAR_KEY': 'from_environment'},  # Environment variable
        )

        stdout = result.stdout.decode()
        stderr = result.stderr.decode()

        print("\n--- SnapshotWorker stdout ---")
        print(stdout)
        print("\n--- SnapshotWorker stderr ---")
        print(stderr)
        print("--- End output ---\n")

        # Step 6: Verify config was properly merged
        print("Step 6: Verify config merging")

        # Check that SnapshotWorker ran successfully
        assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"

        # Verify config by checking stderr debug output and ArchiveResults in database
        print("\n--- Verifying config propagation ---\n")

        # Check for config debug messages in stderr
        assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
            "Expected debug output not found in stderr"
        print("✓ Config debug output found in stderr")

        # Verify config values were actually used by checking ArchiveResults.
        # The value placeholders in the inner print() calls are doubled so
        # they are evaluated by the generated script; with single braces this
        # outer f-string would look the names up in this test function and
        # fail with NameError before the verification could run.
        verify_script = f"""
import os
os.environ['DATA_DIR'] = {data_dir_literal}

from archivebox.config.django import setup_django
setup_django()

from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config

snapshot = Snapshot.objects.get(id='{snapshot_id}')
print(f"Snapshot status: {{snapshot.status}}")
print(f"Snapshot URL: {{snapshot.url}}")

# Check that snapshot reached sealed state
assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"

# Verify all config sources are present in merged config
print("\\nVerifying config merge priority:")
config = get_config(snapshot=snapshot)

# 1. Snapshot.config (highest priority)
timeout = config.get('TIMEOUT')
print(f" 1. Snapshot.config: TIMEOUT={{timeout}} (expected: 555)")
assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"

wget_enabled = config.get('SAVE_WGET')
print(f" 1. Snapshot.config: SAVE_WGET={{wget_enabled}} (expected: False)")
assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"

custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={{custom_snapshot}} (expected: from_snapshot_json)")
assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"

# 2. Crawl.config
custom_crawl = config.get('CUSTOM_CRAWL_KEY')
print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={{custom_crawl}} (expected: from_crawl_json)")
assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"

# 6. Machine.config
custom_machine = config.get('CUSTOM_MACHINE_KEY')
print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={{custom_machine}} (expected: from_machine_config)")
assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"

wget_binary = config.get('WGET_BINARY')
print(f" 6. Machine.config: WGET_BINARY={{wget_binary}} (expected: /custom/machine/wget)")
# Note: This might be overridden by environment or other sources, just check it's present
assert wget_binary is not None, f"WGET_BINARY should be present"

# Check ArchiveResults to verify plugins actually ran with correct config
results = ArchiveResult.objects.filter(snapshot=snapshot)
print(f"\\nArchiveResults created: {{results.count()}}")

for ar in results.order_by('plugin'):
    print(f" {{ar.plugin}}: {{ar.status}}")

# Verify SAVE_WGET=False was respected (should have no wget result)
wget_results = results.filter(plugin='wget')
print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"

# Verify SAVE_SCREENSHOT=True was respected (should have screenshot result)
screenshot_results = results.filter(plugin='screenshot')
print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)")
assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results"

print("\\n✓ All config sources correctly merged:")
print(" - Snapshot.config overrides (highest priority)")
print(" - Crawl.config values present")
print(" - Machine.config values present")
print(" - File config values present")
print("✓ Config priority order verified")
print("✓ Snapshot successfully sealed")
"""
        result = run_script(verify_script)

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
        print("="*80 + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_environment_variable_parsing():
    """
    Test that Process._build_env() correctly serializes config values,
    and get_config() correctly parses them back from environment.

    Both steps run in subprocesses of the interpreter executing this test,
    so they import the same archivebox installation.
    """
    import sys  # sys.executable: a bare 'python' may resolve to another interpreter

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        # Environment shared by both subprocesses spawned below.
        subprocess_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }

        # repr() yields a correctly quoted and escaped Python string literal,
        # so the path can be pasted into generated script source safely.
        data_dir_literal = repr(str(data_dir))

        print(f"\n{'='*80}")
        print("Test: Config Environment Variable Parsing")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=subprocess_env,
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"

        # Test various data types in config.
        # NOTE: the script is an f-string. Single braces are filled in here
        # by this test; doubled braces ({{ }}) survive as single braces in
        # the generated script and are evaluated there.
        test_config_types_script = f"""
import os
os.environ['DATA_DIR'] = {data_dir_literal}

from archivebox.config.django import setup_django
setup_django()

from archivebox.config.configset import get_config
from archivebox.machine.models import Process, Machine

# Test get_config() with no overrides (baseline)
config = get_config()
print(f"Baseline config keys: {{len(config)}}")

# Create a test Process with various config types
process = Process.objects.create(
    machine=Machine.current(),
    process_type=Process.TypeChoices.WORKER,
    pwd={data_dir_literal},
    cmd=['test'],
    env={{
        'STRING_VAL': 'hello',
        'INT_VAL': 123,
        'FLOAT_VAL': 45.67,
        'BOOL_TRUE': True,
        'BOOL_FALSE': False,
        'LIST_VAL': ['a', 'b', 'c'],
        'DICT_VAL': {{'key': 'value'}},
        'NONE_VAL': None,
    }},
)

# Test _build_env() serialization
env = process._build_env()
print(f"\\nSerialized environment:")
print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")

# Verify all are strings (required by subprocess.Popen)
assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"

print("\\n✓ All environment values correctly serialized as strings")

# Now test that get_config() can parse them back
# Simulate subprocess by setting os.environ
import json
for key, val in env.items():
    if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
        os.environ[key] = val

# Get config again - should parse from environment
config = get_config()
print(f"\\nParsed from environment:")
print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")

print("\\n✓ All config values correctly parsed from environment")
"""

        result = subprocess.run(
            [sys.executable, '-c', test_config_types_script],
            cwd=str(data_dir.parent),
            env=subprocess_env,
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.stderr:
            print("Script stderr:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config serialization and parsing works correctly")
        print("="*80 + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Allow running this file directly, without pytest: execute both
    # integration tests in order, stopping at the first failure.
    for standalone_test in (
        test_config_propagation_through_worker_hierarchy,
        test_config_environment_variable_parsing,
    ):
        standalone_test()
|
||||||
@@ -308,8 +308,8 @@ class Worker:
|
|||||||
crawl = Crawl.objects.get(id=crawl_id)
|
crawl = Crawl.objects.get(id=crawl_id)
|
||||||
|
|
||||||
cmd = [sys.executable, '-m', 'archivebox', 'run', '--crawl-id', str(crawl_id)]
|
cmd = [sys.executable, '-m', 'archivebox', 'run', '--crawl-id', str(crawl_id)]
|
||||||
pwd = Path(crawl.OUTPUT_DIR) # Run in crawl's output directory
|
pwd = Path(crawl.output_dir) # Run in crawl's output directory
|
||||||
env = get_config(scope='crawl', crawl=crawl)
|
env = get_config(crawl=crawl)
|
||||||
|
|
||||||
elif cls.name == 'snapshot':
|
elif cls.name == 'snapshot':
|
||||||
snapshot_id = kwargs.get('snapshot_id')
|
snapshot_id = kwargs.get('snapshot_id')
|
||||||
@@ -321,7 +321,7 @@ class Worker:
|
|||||||
|
|
||||||
cmd = [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', str(snapshot_id)]
|
cmd = [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', str(snapshot_id)]
|
||||||
pwd = Path(snapshot.output_dir) # Run in snapshot's output directory
|
pwd = Path(snapshot.output_dir) # Run in snapshot's output directory
|
||||||
env = get_config(scope='snapshot', snapshot=snapshot)
|
env = get_config(snapshot=snapshot)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown worker type: {cls.name}")
|
raise ValueError(f"Unknown worker type: {cls.name}")
|
||||||
@@ -459,6 +459,8 @@ class CrawlWorker(Worker):
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from archivebox.core.models import Snapshot
|
from archivebox.core.models import Snapshot
|
||||||
from archivebox.machine.models import Process
|
from archivebox.machine.models import Process
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
|
||||||
debug_log = Path('/tmp/archivebox_crawl_worker_debug.log')
|
debug_log = Path('/tmp/archivebox_crawl_worker_debug.log')
|
||||||
|
|
||||||
@@ -514,7 +516,9 @@ class CrawlWorker(Worker):
|
|||||||
with open(debug_log, 'a') as f:
|
with open(debug_log, 'a') as f:
|
||||||
f.write(f' Spawning worker for {snapshot.url} (status={snapshot.status})\n')
|
f.write(f' Spawning worker for {snapshot.url} (status={snapshot.status})\n')
|
||||||
f.flush()
|
f.flush()
|
||||||
SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
|
|
||||||
|
pid = SnapshotWorker.start(parent=self.db_process, snapshot_id=str(snapshot.id))
|
||||||
|
|
||||||
log_worker_event(
|
log_worker_event(
|
||||||
worker_type='CrawlWorker',
|
worker_type='CrawlWorker',
|
||||||
event=f'Spawned SnapshotWorker for {snapshot.url}',
|
event=f'Spawned SnapshotWorker for {snapshot.url}',
|
||||||
@@ -522,6 +526,18 @@ class CrawlWorker(Worker):
|
|||||||
pid=self.pid,
|
pid=self.pid,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Pipe the SnapshotWorker's stderr to our stderr so we can see what's happening
|
||||||
|
# Get the Process record that was just created
|
||||||
|
worker_process = Process.objects.filter(pid=pid).first()
|
||||||
|
if worker_process:
|
||||||
|
# Pipe stderr in background thread so it doesn't block
|
||||||
|
def pipe_worker_stderr():
|
||||||
|
for line in worker_process.tail_stderr(lines=0, follow=True):
|
||||||
|
print(f' [SnapshotWorker] {line}', file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
thread = threading.Thread(target=pipe_worker_stderr, daemon=True)
|
||||||
|
thread.start()
|
||||||
|
|
||||||
def _is_crawl_finished(self) -> bool:
|
def _is_crawl_finished(self) -> bool:
|
||||||
"""Check if all snapshots are sealed."""
|
"""Check if all snapshots are sealed."""
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -626,16 +642,28 @@ class SnapshotWorker(Worker):
|
|||||||
"""Execute all hooks sequentially."""
|
"""Execute all hooks sequentially."""
|
||||||
from archivebox.hooks import discover_hooks, is_background_hook, extract_step
|
from archivebox.hooks import discover_hooks, is_background_hook, extract_step
|
||||||
from archivebox.core.models import ArchiveResult
|
from archivebox.core.models import ArchiveResult
|
||||||
|
from archivebox.config.configset import get_config
|
||||||
|
|
||||||
self.on_startup()
|
self.on_startup()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.)
|
||||||
|
config = get_config(snapshot=self.snapshot)
|
||||||
|
|
||||||
# Discover all hooks for this snapshot
|
# Discover all hooks for this snapshot
|
||||||
hooks = discover_hooks('Snapshot', config=self.snapshot.config)
|
hooks = discover_hooks('Snapshot', config=config)
|
||||||
hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix)
|
hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix)
|
||||||
|
|
||||||
|
import sys
|
||||||
|
print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True)
|
||||||
|
if hooks:
|
||||||
|
print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True)
|
||||||
|
else:
|
||||||
|
print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True)
|
||||||
|
|
||||||
# Execute each hook sequentially
|
# Execute each hook sequentially
|
||||||
for hook_path in hooks:
|
for hook_path in hooks:
|
||||||
|
print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True)
|
||||||
hook_name = hook_path.name
|
hook_name = hook_path.name
|
||||||
plugin = self._extract_plugin_name(hook_name)
|
plugin = self._extract_plugin_name(hook_name)
|
||||||
hook_step = extract_step(hook_name)
|
hook_step = extract_step(hook_name)
|
||||||
@@ -661,7 +689,7 @@ class SnapshotWorker(Worker):
|
|||||||
ar.save(update_fields=['status', 'start_ts', 'modified_at'])
|
ar.save(update_fields=['status', 'start_ts', 'modified_at'])
|
||||||
|
|
||||||
# Fork and run the hook
|
# Fork and run the hook
|
||||||
process = self._run_hook(hook_path, ar)
|
process = self._run_hook(hook_path, ar, config)
|
||||||
|
|
||||||
if is_background:
|
if is_background:
|
||||||
# Track but don't wait
|
# Track but don't wait
|
||||||
@@ -698,7 +726,7 @@ class SnapshotWorker(Worker):
|
|||||||
finally:
|
finally:
|
||||||
self.on_shutdown()
|
self.on_shutdown()
|
||||||
|
|
||||||
def _run_hook(self, hook_path: Path, ar: Any) -> Any:
|
def _run_hook(self, hook_path: Path, ar: Any, config: dict) -> Any:
|
||||||
"""Fork and run a hook using Process model, return Process."""
|
"""Fork and run a hook using Process model, return Process."""
|
||||||
from archivebox.hooks import run_hook
|
from archivebox.hooks import run_hook
|
||||||
|
|
||||||
@@ -710,7 +738,7 @@ class SnapshotWorker(Worker):
|
|||||||
process = run_hook(
|
process = run_hook(
|
||||||
script=hook_path,
|
script=hook_path,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
config=self.snapshot.config,
|
config=config,
|
||||||
timeout=120,
|
timeout=120,
|
||||||
parent=self.db_process,
|
parent=self.db_process,
|
||||||
url=str(self.snapshot.url),
|
url=str(self.snapshot.url),
|
||||||
|
|||||||
@@ -179,7 +179,7 @@ if [ "$ENABLE_COVERAGE" = true ]; then
|
|||||||
export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js"
|
export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js"
|
||||||
|
|
||||||
echo "Python coverage: enabled (subprocess support)"
|
echo "Python coverage: enabled (subprocess support)"
|
||||||
echo "JavaScript coverage: enabled (NODE_V8_COVERAGE)"
|
echo "JavaScript coverage: enabled (NODE_V8_COVERAGE=$NODE_V8_COVERAGE)"
|
||||||
echo ""
|
echo ""
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user