Files
ArchiveBox/archivebox/tests/test_worker_config_propagation.py
Nick Sweeting 934e02695b fix lint
2026-03-15 18:45:29 -07:00

1079 lines
38 KiB
Python

"""
Integration test for config propagation through worker hierarchy.
Tests that config is properly merged and passed through:
Parent CLI/Orchestrator
└── CrawlWorker subprocess (via Process.env)
└── SnapshotWorker subprocess (via Process.env)
└── Hook subprocess (via Process.env)
Config priority order (highest to lowest):
1. Snapshot.config (JSON field)
2. Crawl.config (JSON field)
3. User.config (JSON field)
4. Environment variables (os.environ + Process.env)
5. Config file (ArchiveBox.conf)
6. Plugin defaults (config.json)
7. Core defaults
"""
import os
import sys
import tempfile
import subprocess
from pathlib import Path
def test_config_propagation_through_worker_hierarchy():
    """
    Integration test: verify config is properly merged at every level.

    Test flow:
    1. Initialize a throwaway archive and write custom values to ArchiveBox.conf
    2. Store custom values in Machine.config
    3. Create a Crawl with a custom crawl.config JSON field
    4. Create a Snapshot with a custom snapshot.config JSON field
    5. Run `archivebox run --snapshot-id=...` with an extra environment variable
    6. Verify get_config() returns the crawl / snapshot / machine values with
       snapshot.config overriding crawl.config
    7. Verify the sealed snapshot's ArchiveResults and discovered hooks reflect
       the merged config (wget disabled, screenshot enabled)

    Every step runs in a subprocess; the inline scripts call setup_django()
    against the temporary DATA_DIR.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        # Environment shared by every subprocess spawned below.
        base_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }

        def run_script(script):
            """Run an inline Python script from the archive's parent directory."""
            return subprocess.run(
                [sys.executable, '-c', script],
                cwd=str(data_dir.parent),
                env=base_env,
                capture_output=True,
                timeout=30,
            )

        print(f"\n{'='*80}")
        print("Test: Config Propagation Through Worker Hierarchy")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Step 1: Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=base_env,
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Step 2: Write custom config to ArchiveBox.conf
        print("Step 2: Write custom config to ArchiveBox.conf")
        config_file = data_dir / 'ArchiveBox.conf'
        config_file.write_text("""
[GENERAL]
# Custom timeout in config file
TIMEOUT = 999
[ARCHIVING_CONFIG]
# Enable all plugins for proper testing
SAVE_WGET = True
SAVE_WARC = True
SAVE_PDF = True
SAVE_DOM = True
SAVE_SINGLEFILE = True
SAVE_READABILITY = True
SAVE_MERCURY = True
SAVE_HTMLTOTEXT = True
SAVE_GIT = True
SAVE_MEDIA = True
SAVE_ARCHIVE_DOT_ORG = True
SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_SCREENSHOT = True
""")
        print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")

        # Step 2.5: Set Machine.config values
        print("Step 2.5: Set Machine.config with custom binary path")
        set_machine_config_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Machine
machine = Machine.current()
machine.config = {{
    'CUSTOM_MACHINE_KEY': 'from_machine_config',
    'WGET_BINARY': '/custom/machine/wget', # Machine-specific binary path
}}
machine.save()
print(f"Machine {{machine.hostname}} config updated")
"""
        result = run_script(set_machine_config_script)
        assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
        print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")

        # Step 3: Create Crawl via Django ORM with custom crawl.config
        print("Step 3: Create Crawl with custom crawl.config JSON")
        create_crawl_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.crawls.models import Crawl
# Create crawl with custom config
crawl = Crawl.objects.create(
    status='queued',
    retry_at=timezone.now(),
    urls='https://example.com',
    config={{
        'TIMEOUT': 777, # Crawl-level override (higher priority than file)
        'CUSTOM_CRAWL_KEY': 'from_crawl_json',
    }}
)
print(crawl.id)
"""
        result = run_script(create_crawl_script)
        assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        crawl_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")

        # Step 4: Create Snapshot with custom snapshot.config
        print("Step 4: Create Snapshot with custom snapshot.config JSON")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id='{crawl_id}')
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'TIMEOUT': 555, # Snapshot-level override (highest priority)
        'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
        'SAVE_SCREENSHOT': True, # Keep screenshot enabled
        'SAVE_WGET': False, # But disable wget as a test of per-snapshot override
    }}
)
print(snapshot.id)
"""
        result = run_script(create_snapshot_script)
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")

        # Step 5: Run SnapshotWorker with additional env var
        print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **base_env,
                'ENV_VAR_KEY': 'from_environment',  # Environment variable
            },
            capture_output=True,
            timeout=120,
        )
        stdout = result.stdout.decode()
        stderr = result.stderr.decode()
        print("\n--- SnapshotWorker stdout ---")
        print(stdout)
        print("\n--- SnapshotWorker stderr ---")
        print(stderr)
        print("--- End output ---\n")

        # Step 6: Verify config was properly merged
        print("Step 6: Verify config merging")
        # Check that SnapshotWorker ran successfully
        assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"

        # Verify config by checking ArchiveResults and merged config state
        print("\n--- Verifying config propagation ---\n")
        # Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults
        verify_precedence_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config
snapshot = Snapshot.objects.get(id='{snapshot_id}')
# Test precedence by getting config at different levels
print("\\nTesting config precedence order:")
# 1. Just defaults (lowest priority)
config_defaults = get_config()
print(f" Defaults only: TIMEOUT={{config_defaults.get('TIMEOUT')}}")
# 2. With machine config
from archivebox.machine.models import Machine
machine = Machine.current()
config_machine = get_config(machine=machine)
custom_machine = config_machine.get('CUSTOM_MACHINE_KEY')
print(f" + Machine: CUSTOM_MACHINE_KEY={{custom_machine}}")
# 3. With crawl config
config_crawl = get_config(crawl=snapshot.crawl)
print(f" + Crawl: TIMEOUT={{config_crawl.get('TIMEOUT')}} (should be 777 from crawl.config)")
assert config_crawl.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl.get('TIMEOUT')}}"
# 4. With snapshot config (highest priority)
config_snapshot = get_config(snapshot=snapshot)
print(f" + Snapshot: TIMEOUT={{config_snapshot.get('TIMEOUT')}} (should be 555 from snapshot.config)")
assert config_snapshot.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config_snapshot.get('TIMEOUT')}}"
# Verify snapshot config overrides crawl config
assert config_snapshot.get('CUSTOM_CRAWL_KEY') == 'from_crawl_json', "Crawl config should be present"
assert config_snapshot.get('CUSTOM_SNAPSHOT_KEY') == 'from_snapshot_json', "Snapshot config should be present"
assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Machine config should be present"
print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults")
"""
        result = run_script(verify_precedence_script)
        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nPrecedence verification error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Precedence verification failed: {result.stderr.decode()}"

        # Verify config values were actually used by checking ArchiveResults.
        # The `for` body inside the script must stay indented: the script is
        # executed verbatim by `python -c`.
        verify_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config
from archivebox.hooks import discover_hooks
snapshot = Snapshot.objects.get(id='{snapshot_id}')
print(f"Snapshot status: {{snapshot.status}}")
print(f"Snapshot URL: {{snapshot.url}}")
# Check that snapshot reached sealed state
assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"
# Verify all config sources are present in merged config
print("\\nVerifying config merge priority:")
config = get_config(snapshot=snapshot)
# 1. Snapshot.config (highest priority)
timeout = config.get('TIMEOUT')
print(f" 1. Snapshot.config: TIMEOUT={{timeout}} (expected: 555)")
assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"
wget_enabled = config.get('WGET_ENABLED')
print(f" 1. Snapshot.config: WGET_ENABLED={{wget_enabled}} (expected: False)")
assert wget_enabled == False, f"WGET_ENABLED should be False from snapshot.config, got {{wget_enabled}}"
custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={{custom_snapshot}} (expected: from_snapshot_json)")
assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"
# 2. Crawl.config
custom_crawl = config.get('CUSTOM_CRAWL_KEY')
print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={{custom_crawl}} (expected: from_crawl_json)")
assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"
# 6. Machine.config
custom_machine = config.get('CUSTOM_MACHINE_KEY')
print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={{custom_machine}} (expected: from_machine_config)")
assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"
wget_binary = config.get('WGET_BINARY')
print(f" 6. Machine.config: WGET_BINARY={{wget_binary}} (expected: /custom/machine/wget)")
# Note: This might be overridden by environment or other sources, just check it's present
assert wget_binary is not None, f"WGET_BINARY should be present"
# Check ArchiveResults to verify plugins actually ran with correct config
results = ArchiveResult.objects.filter(snapshot=snapshot)
print(f"\\nArchiveResults created: {{results.count()}}")
for ar in results.order_by('plugin'):
    print(f" {{ar.plugin}}: {{ar.status}}")
# Verify SAVE_WGET=False was respected (should have no wget result)
wget_results = results.filter(plugin='wget')
print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"
# Verify hook selection respected merged SAVE_* config, without depending on
# a browser extractor finishing successfully in CI.
snapshot_hooks = [hook.name for hook in discover_hooks('Snapshot', config=config)]
print(f"Enabled snapshot hooks: {{snapshot_hooks}}")
assert any('screenshot' in hook_name for hook_name in snapshot_hooks), (
    f"SCREENSHOT should remain enabled in hook discovery, got {{snapshot_hooks}}"
)
assert not any('wget' in hook_name for hook_name in snapshot_hooks), (
    f"WGET should be filtered out by snapshot.config, got {{snapshot_hooks}}"
)
print("\\n✓ All config sources correctly merged:")
print(" - Snapshot.config overrides (highest priority)")
print(" - Crawl.config values present")
print(" - Machine.config values present")
print(" - File config values present")
print("✓ Config priority order verified")
print("✓ Snapshot successfully sealed")
"""
        result = run_script(verify_script)
        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
        print("="*80 + "\n")
def test_config_environment_variable_parsing():
    """
    Test that Process._build_env() serializes config values of various types.

    A Process row is created with str / int / float / bool / list / dict / None
    values in its ``env`` field, and the result of ``_build_env()`` is asserted
    to contain only ``str`` values for the non-None keys.

    The serialized values are then copied into ``os.environ`` and read back via
    ``get_config()``; that second half only prints the parsed values and their
    types for inspection, it does not assert on them.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        # Environment shared by both subprocesses spawned below.
        base_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }

        print(f"\n{'='*80}")
        print("Test: Config Environment Variable Parsing")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=base_env,
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"

        # Test various data types in config.
        # The nested for/if body inside the script must stay indented: the
        # script is executed verbatim by `python -c`.
        test_config_types_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.config.configset import get_config
from archivebox.machine.models import Process, Machine
# Test get_config() with no overrides (baseline)
config = get_config()
print(f"Baseline config keys: {{len(config)}}")
# Create a test Process with various config types
process = Process.objects.create(
    machine=Machine.current(),
    process_type=Process.TypeChoices.WORKER,
    pwd='{data_dir}',
    cmd=['test'],
    env={{
        'STRING_VAL': 'hello',
        'INT_VAL': 123,
        'FLOAT_VAL': 45.67,
        'BOOL_TRUE': True,
        'BOOL_FALSE': False,
        'LIST_VAL': ['a', 'b', 'c'],
        'DICT_VAL': {{'key': 'value'}},
        'NONE_VAL': None,
    }},
)
# Test _build_env() serialization
env = process._build_env()
print(f"\\nSerialized environment:")
print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")
# Verify all are strings (required by subprocess.Popen)
assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"
print("\\n✓ All environment values correctly serialized as strings")
# Now test that get_config() can parse them back
# Simulate subprocess by setting os.environ
for key, val in env.items():
    if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
        os.environ[key] = val
# Get config again - should parse from environment
config = get_config()
print(f"\\nParsed from environment:")
print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")
print("\\n✓ All config values correctly parsed from environment")
"""
        result = subprocess.run(
            [sys.executable, '-c', test_config_types_script],
            cwd=str(data_dir.parent),
            env=base_env,
            capture_output=True,
            timeout=30,
        )
        print(result.stdout.decode())
        # stderr is echoed whenever the script wrote anything to it, even on success.
        if result.stderr:
            print("Script stderr:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config serialization and parsing works correctly")
        print("="*80 + "\n")
def test_parent_environment_preserved_in_hooks():
    """
    Test that parent environment variables are preserved in hook execution.

    This test catches the bug where we built env=os.environ.copy() but then
    clobbered it with process.env={}, losing all parent environment.

    The most recent hook Process row recorded for the snapshot is inspected
    and its ``env`` is checked for:
    - TEST_PARENT_ENV_VAR passed through from the parent process
    - LIB_DIR set, LIB_BIN_DIR set and ending in /bin, LIB_BIN_DIR on PATH
    - CRAWL_DIR and SNAP_DIR set, SNAP_DIR containing the snapshot id
    - NODE_MODULES_DIR equal to NODE_PATH whenever NODE_PATH is set
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        # Environment shared by every subprocess spawned below.
        base_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }

        def run_script(script):
            """Run an inline Python script from the archive's parent directory."""
            return subprocess.run(
                [sys.executable, '-c', script],
                cwd=str(data_dir.parent),
                env=base_env,
                capture_output=True,
                timeout=30,
            )

        print(f"\n{'='*80}")
        print("Test: Parent Environment Preserved in Hooks")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=base_env,
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Create snapshot
        print("Step 2: Create Snapshot")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now()
)
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now()
)
print(snapshot.id)
"""
        result = run_script(create_snapshot_script)
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id}\n")

        # Run SnapshotWorker with custom parent environment variable
        print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process")
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **base_env,
                'TEST_PARENT_ENV_VAR': 'preserved_from_parent',  # This should reach the hook
                'PLUGINS': 'favicon',  # Use existing plugin (favicon is simple and fast)
            },
            capture_output=True,
            timeout=120,
        )
        # NOTE(review): the worker's exit status is not asserted here; only the
        # hook Process rows it recorded are checked below — confirm that is intended.
        stderr = result.stderr.decode()
        print("\n--- SnapshotWorker stderr (first 50 lines) ---")
        print('\n'.join(stderr.split('\n')[:50]))
        print("--- End stderr ---\n")

        # Verify hooks ran by checking Process records.
        # The `if` bodies inside the script must stay indented: the script is
        # executed verbatim by `python -c`.
        print("Step 4: Verify environment variables in hook Process records")
        verify_env_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Process
from archivebox.core.models import Snapshot
import json
snapshot = Snapshot.objects.get(id='{snapshot_id}')
# Find hook processes for this snapshot
hook_processes = Process.objects.filter(
    process_type=Process.TypeChoices.HOOK,
    pwd__contains=str(snapshot.id)
).order_by('-created_at')
print(f"Found {{hook_processes.count()}} hook processes")
if hook_processes.count() == 0:
    print("ERROR: No hook processes found!")
    import sys
    sys.exit(1)
# Check the first hook process environment
hook_process = hook_processes.first()
print(f"\\nChecking hook: {{hook_process.cmd}}")
print(f"Hook env keys: {{len(hook_process.env)}} total")
# Verify TEST_PARENT_ENV_VAR was preserved
test_parent = hook_process.env.get('TEST_PARENT_ENV_VAR')
print(f" TEST_PARENT_ENV_VAR: {{test_parent}}")
assert test_parent == 'preserved_from_parent', f"Expected 'preserved_from_parent', got {{test_parent}}"
# Verify LIB_DIR is set
lib_dir = hook_process.env.get('LIB_DIR')
print(f" LIB_DIR: {{lib_dir}}")
assert lib_dir is not None, "LIB_DIR not set"
# Verify LIB_BIN_DIR is derived
lib_bin_dir = hook_process.env.get('LIB_BIN_DIR')
print(f" LIB_BIN_DIR: {{lib_bin_dir}}")
if lib_dir:
    assert lib_bin_dir is not None, "LIB_BIN_DIR not derived from LIB_DIR"
    assert lib_bin_dir.endswith('/bin'), f"LIB_BIN_DIR should end with /bin, got {{lib_bin_dir}}"
# Verify LIB_BIN_DIR is in PATH (a missing PATH is treated as empty so the
# assertion fails with its message instead of raising TypeError)
path = hook_process.env.get('PATH') or ''
if lib_bin_dir:
    assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..."
# Verify canonical crawl/snapshot directories are exported for plugins
crawl_dir = hook_process.env.get('CRAWL_DIR')
snap_dir = hook_process.env.get('SNAP_DIR')
print(f" CRAWL_DIR: {{crawl_dir}}")
print(f" SNAP_DIR: {{snap_dir}}")
assert crawl_dir is not None, "CRAWL_DIR not set"
assert snap_dir is not None, "SNAP_DIR not set"
assert str(snapshot.id) in snap_dir, f"SNAP_DIR should contain snapshot id, got {{snap_dir}}"
# Verify NODE_PATH is set
node_path = hook_process.env.get('NODE_PATH')
node_modules_dir = hook_process.env.get('NODE_MODULES_DIR')
print(f" NODE_PATH: {{node_path}}")
print(f" NODE_MODULES_DIR: {{node_modules_dir}}")
if node_path:
    # Should also have NODE_MODULES_DIR for backwards compatibility
    assert node_modules_dir == node_path, f"NODE_MODULES_DIR should match NODE_PATH"
print("\\n✓ All environment checks passed")
"""
        result = run_script(verify_env_script)
        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Environment verification failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Parent environment preserved in hooks")
        print(" - Custom parent env vars reach hooks")
        print(" - LIB_DIR propagated correctly")
        print(" - LIB_BIN_DIR derived and added to PATH")
        print(" - NODE_PATH/NODE_MODULES_DIR set when available")
        print("="*80 + "\n")
def test_config_auto_fetch_relationships():
    """
    Check that get_config() pulls in config from related objects on its own.

    Inside a subprocess, a Crawl -> Snapshot -> ArchiveResult chain is created
    with distinct config on the crawl and the snapshot, then:
    - get_config(snapshot=...) must expose both snapshot and crawl values,
      with the snapshot's TIMEOUT winning
    - get_config(archiveresult=...) must expose the same merged values
    - get_config(crawl=...) must expose crawl values only

    The user hop (crawl.created_by) is mentioned in the output banner but is
    not asserted by this test.
    """
    with tempfile.TemporaryDirectory() as scratch:
        data_dir = Path(scratch) / 'test_archive'
        data_dir.mkdir()

        # One environment mapping reused for both child processes.
        child_env = dict(os.environ)
        child_env['DATA_DIR'] = str(data_dir)
        child_env['USE_COLOR'] = 'False'

        print(f"\n{'='*80}")
        print("Test: Config Auto-Fetch Relationships")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # --- archive setup ---
        print("Step 1: Initialize archive")
        init_cmd = [sys.executable, '-m', 'archivebox', 'init']
        init_proc = subprocess.run(
            init_cmd,
            cwd=str(data_dir),
            env=child_env,
            capture_output=True,
            timeout=60,
        )
        assert init_proc.returncode == 0, f"Init failed: {init_proc.stderr.decode()}"
        print("✓ Archive initialized\n")

        # --- object creation + assertions, all inside one inline script ---
        print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
        create_objects_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config
# Create crawl with config
crawl = Crawl.objects.create(
urls='https://example.com',
status='queued',
retry_at=timezone.now(),
config={{
'CRAWL_KEY': 'from_crawl',
'TIMEOUT': 777,
}}
)
# Create snapshot with config
snapshot = Snapshot.objects.create(
url='https://example.com',
crawl=crawl,
status='queued',
retry_at=timezone.now(),
config={{
'SNAPSHOT_KEY': 'from_snapshot',
'TIMEOUT': 555,
}}
)
# Create ArchiveResult
ar = ArchiveResult.objects.create(
snapshot=snapshot,
plugin='test',
hook_name='test_hook',
status=ArchiveResult.StatusChoices.STARTED
)
print(f"Created: crawl={{crawl.id}}, snapshot={{snapshot.id}}, ar={{ar.id}}")
# Test 1: Auto-fetch crawl from snapshot
print("\\nTest 1: get_config(snapshot=snapshot) auto-fetches crawl")
config = get_config(snapshot=snapshot)
assert config.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config.get('TIMEOUT')}}"
assert config.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot, got {{config.get('SNAPSHOT_KEY')}}"
assert config.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl, got {{config.get('CRAWL_KEY')}}"
print("✓ Snapshot config (TIMEOUT=555) overrides crawl config (TIMEOUT=777)")
print("✓ Both snapshot.config and crawl.config values present")
# Test 2: Auto-fetch snapshot from archiveresult
print("\\nTest 2: get_config(archiveresult=ar) auto-fetches snapshot and crawl")
config_from_ar = get_config(archiveresult=ar)
assert config_from_ar.get('TIMEOUT') == 555, f"Expected 555, got {{config_from_ar.get('TIMEOUT')}}"
assert config_from_ar.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot"
assert config_from_ar.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl"
print("✓ Auto-fetched snapshot from ar.snapshot")
print("✓ Auto-fetched crawl from snapshot.crawl")
# Test 3: Precedence without auto-fetch (explicit crawl only)
print("\\nTest 3: get_config(crawl=crawl) without snapshot")
config_crawl_only = get_config(crawl=crawl)
assert config_crawl_only.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl_only.get('TIMEOUT')}}"
assert config_crawl_only.get('CRAWL_KEY') == 'from_crawl'
assert config_crawl_only.get('SNAPSHOT_KEY') is None, "Should not have snapshot config"
print("✓ Crawl-only config has TIMEOUT=777")
print("✓ No snapshot config values present")
print("\\n✓ All auto-fetch tests passed")
"""
        script_cmd = [sys.executable, '-c', create_objects_script]
        script_proc = subprocess.run(
            script_cmd,
            cwd=str(data_dir.parent),
            env=child_env,
            capture_output=True,
            timeout=30,
        )
        print(script_proc.stdout.decode())
        script_failed = script_proc.returncode != 0
        if script_failed:
            print("\nAuto-fetch test error:")
            print(script_proc.stderr.decode())
        assert not script_failed, f"Auto-fetch test failed: {script_proc.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config auto-fetches related objects correctly")
        print(" - archiveresult → snapshot → crawl → user")
        print(" - Precedence preserved during auto-fetch")
        print("="*80 + "\n")
def test_config_precedence_with_environment_vars():
    """
    Test that config precedence order is correct when environment vars are set.

    Documented order (highest to lowest):
    1. snapshot.config
    2. crawl.config
    3. user.config
    4. persona config
    5. environment variables  <-- LOWER priority than snapshot/crawl
    6. machine.config
    7. config file
    8. plugin defaults
    9. core defaults

    This test verifies snapshot.config overrides environment variables: with
    TIMEOUT=999 in os.environ, 777 on the crawl and 555 on the snapshot,
    get_config(snapshot=...) must report 555.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: Config Precedence with Environment Variables")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
            timeout=60,
        )
        # Include stderr so an init failure is diagnosable from the test report.
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Test with environment variable set.
        # The `if` body inside the script must stay indented: the script is
        # executed verbatim by `python -c`.
        print("Step 1: Test with TIMEOUT=999 in environment")
        test_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
os.environ['TIMEOUT'] = '999' # Set env var
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config
# Create crawl with TIMEOUT=777
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 777}}
)
# Create snapshot with TIMEOUT=555
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 555}}
)
# Get config with all sources
config = get_config(snapshot=snapshot)
print(f"Environment: TIMEOUT={{os.environ.get('TIMEOUT')}}")
print(f"Crawl config: TIMEOUT={{crawl.config.get('TIMEOUT')}}")
print(f"Snapshot config: TIMEOUT={{snapshot.config.get('TIMEOUT')}}")
print(f"Merged config: TIMEOUT={{config.get('TIMEOUT')}}")
# Snapshot should override both crawl AND environment
expected = 555
actual = config.get('TIMEOUT')
if actual != expected:
    print(f"\\n❌ PRECEDENCE BUG: Expected {{expected}}, got {{actual}}")
    print(f" Snapshot.config should have highest priority!")
    import sys
    sys.exit(1)
print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999) and crawl.config (777)")
"""
        # No env= here: the child inherits this process's environment and the
        # script itself sets DATA_DIR and TIMEOUT before calling setup_django().
        result = subprocess.run(
            [sys.executable, '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,
        )
        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nPrecedence bug detected:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Precedence test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Snapshot config correctly overrides environment variables")
        print("="*80 + "\n")
def test_new_environment_variables_added():
    """
    Test that NEW environment variables (not in defaults) are added to config.

    This is important for worker subprocesses that receive config via Process.env.
    When Worker.start() creates a subprocess, it serializes config to Process.env.
    The subprocess must be able to read those values back via get_config().

    The inline script sets two uppercase variables and one lowercase variable
    in os.environ, then asserts that get_config() exposes the uppercase ones
    and returns None for the lowercase one.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: New Environment Variables Added to Config")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
            timeout=60,
        )
        # Include stderr so an init failure is diagnosable from the test report.
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        print("Step 1: Test that new uppercase env vars are added to config")
        test_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
os.environ['NEW_CUSTOM_VAR'] = 'custom_value' # Not in defaults
os.environ['ANOTHER_VAR'] = 'another_value'
os.environ['lowercase_var'] = 'should_be_ignored' # Lowercase should be ignored
from archivebox.config.django import setup_django
setup_django()
from archivebox.config.configset import get_config
config = get_config()
# Check uppercase vars are added
new_var = config.get('NEW_CUSTOM_VAR')
another_var = config.get('ANOTHER_VAR')
lowercase_var = config.get('lowercase_var')
print(f"NEW_CUSTOM_VAR: {{new_var}}")
print(f"ANOTHER_VAR: {{another_var}}")
print(f"lowercase_var: {{lowercase_var}}")
assert new_var == 'custom_value', f"Expected 'custom_value', got {{new_var}}"
assert another_var == 'another_value', f"Expected 'another_value', got {{another_var}}"
assert lowercase_var is None, f"Lowercase vars should be ignored, got {{lowercase_var}}"
print("\\n✓ New uppercase environment variables added to config")
print("✓ Lowercase environment variables ignored")
"""
        # No env= here: the child inherits this process's environment and the
        # script itself sets every variable under test before setup_django().
        result = subprocess.run(
            [sys.executable, '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,
        )
        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nTest error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: New environment variables correctly added to config")
        print("="*80 + "\n")
if __name__ == '__main__':
    # Standalone mode: execute every test once, in file order, without pytest.
    for standalone_test in (
        test_config_propagation_through_worker_hierarchy,
        test_config_environment_variable_parsing,
        test_parent_environment_preserved_in_hooks,
        test_config_auto_fetch_relationships,
        test_config_precedence_with_environment_vars,
        test_new_environment_variables_added,
    ):
        standalone_test()