mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
1079 lines
38 KiB
Python
1079 lines
38 KiB
Python
"""
|
|
Integration test for config propagation through worker hierarchy.
|
|
|
|
Tests that config is properly merged and passed through:
|
|
Parent CLI/Orchestrator
|
|
└── CrawlWorker subprocess (via Process.env)
|
|
└── SnapshotWorker subprocess (via Process.env)
|
|
└── Hook subprocess (via Process.env)
|
|
|
|
Config priority order (highest to lowest):
|
|
1. Snapshot.config (JSON field)
|
|
2. Crawl.config (JSON field)
|
|
3. User.config (JSON field)
|
|
4. Environment variables (os.environ + Process.env)
|
|
5. Config file (ArchiveBox.conf)
|
|
6. Plugin defaults (config.json)
|
|
7. Core defaults
|
|
"""
|
|
|
|
import os
import sys
import tempfile
import subprocess
from pathlib import Path
|
|
|
|
|
|
def test_config_propagation_through_worker_hierarchy():
    """
    Integration test: Verify config is properly merged at every level.

    Test flow:
    1. Create test archive with custom config in ArchiveBox.conf
    2. Set custom env vars before spawning worker
    3. Create Crawl with custom crawl.config JSON field
    4. Create Snapshot with custom snapshot.config JSON field
    5. Spawn SnapshotWorker via archivebox run --snapshot-id=...
    6. Verify worker received merged config from all sources
    7. Verify hook subprocess also received correct config
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: Config Propagation Through Worker Hierarchy")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Step 1: Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Step 2: Write custom config to ArchiveBox.conf
        print("Step 2: Write custom config to ArchiveBox.conf")
        config_file = data_dir / 'ArchiveBox.conf'
        config_file.write_text("""
[GENERAL]
# Custom timeout in config file
TIMEOUT = 999

[ARCHIVING_CONFIG]
# Enable all plugins for proper testing
SAVE_WGET = True
SAVE_WARC = True
SAVE_PDF = True
SAVE_DOM = True
SAVE_SINGLEFILE = True
SAVE_READABILITY = True
SAVE_MERCURY = True
SAVE_HTMLTOTEXT = True
SAVE_GIT = True
SAVE_MEDIA = True
SAVE_ARCHIVE_DOT_ORG = True
SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_SCREENSHOT = True
""")
        print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")

        # Step 2.5: Set Machine.config values
        print("Step 2.5: Set Machine.config with custom binary path")
        set_machine_config_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.machine.models import Machine

machine = Machine.current()
machine.config = {{
    'CUSTOM_MACHINE_KEY': 'from_machine_config',
    'WGET_BINARY': '/custom/machine/wget',  # Machine-specific binary path
}}
machine.save()
print(f"Machine {{machine.hostname}} config updated")
"""
        result = subprocess.run(
            [sys.executable, '-c', set_machine_config_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
        print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")

        # Step 3: Create Crawl via Django ORM with custom crawl.config
        print("Step 3: Create Crawl with custom crawl.config JSON")
        create_crawl_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl

# Create crawl with custom config
crawl = Crawl.objects.create(
    status='queued',
    retry_at=timezone.now(),
    urls='https://example.com',
    config={{
        'TIMEOUT': 777,  # Crawl-level override (higher priority than file)
        'CUSTOM_CRAWL_KEY': 'from_crawl_json',
    }}
)
print(crawl.id)
"""
        result = subprocess.run(
            [sys.executable, '-c', create_crawl_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        crawl_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")

        # Step 4: Create Snapshot with custom snapshot.config
        print("Step 4: Create Snapshot with custom snapshot.config JSON")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl

crawl = Crawl.objects.get(id='{crawl_id}')
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'TIMEOUT': 555,  # Snapshot-level override (highest priority)
        'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
        'SAVE_SCREENSHOT': True,  # Keep screenshot enabled
        'SAVE_WGET': False,  # But disable wget as a test of per-snapshot override
    }}
)
print(snapshot.id)
"""
        result = subprocess.run(
            [sys.executable, '-c', create_snapshot_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")

        # Step 5: Run SnapshotWorker with additional env var
        print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
                'ENV_VAR_KEY': 'from_environment',  # Environment variable
            },
            capture_output=True,
            timeout=120,
        )

        stdout = result.stdout.decode()
        stderr = result.stderr.decode()

        print("\n--- SnapshotWorker stdout ---")
        print(stdout)
        print("\n--- SnapshotWorker stderr ---")
        print(stderr)
        print("--- End output ---\n")

        # Step 6: Verify config was properly merged
        print("Step 6: Verify config merging")

        # Check that SnapshotWorker ran successfully
        assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"

        # Verify config by checking ArchiveResults and merged config state
        print("\n--- Verifying config propagation ---\n")

        # Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults
        verify_precedence_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config

snapshot = Snapshot.objects.get(id='{snapshot_id}')

# Test precedence by getting config at different levels
print("\\nTesting config precedence order:")

# 1. Just defaults (lowest priority)
config_defaults = get_config()
print(f" Defaults only: TIMEOUT={{config_defaults.get('TIMEOUT')}}")

# 2. With machine config
from archivebox.machine.models import Machine
machine = Machine.current()
config_machine = get_config(machine=machine)
custom_machine = config_machine.get('CUSTOM_MACHINE_KEY')
print(f" + Machine: CUSTOM_MACHINE_KEY={{custom_machine}}")

# 3. With crawl config
config_crawl = get_config(crawl=snapshot.crawl)
print(f" + Crawl: TIMEOUT={{config_crawl.get('TIMEOUT')}} (should be 777 from crawl.config)")
assert config_crawl.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl.get('TIMEOUT')}}"

# 4. With snapshot config (highest priority)
config_snapshot = get_config(snapshot=snapshot)
print(f" + Snapshot: TIMEOUT={{config_snapshot.get('TIMEOUT')}} (should be 555 from snapshot.config)")
assert config_snapshot.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config_snapshot.get('TIMEOUT')}}"

# Verify snapshot config overrides crawl config
assert config_snapshot.get('CUSTOM_CRAWL_KEY') == 'from_crawl_json', "Crawl config should be present"
assert config_snapshot.get('CUSTOM_SNAPSHOT_KEY') == 'from_snapshot_json', "Snapshot config should be present"
assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Machine config should be present"

print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults")
"""
        result = subprocess.run(
            [sys.executable, '-c', verify_precedence_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nPrecedence verification error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Precedence verification failed: {result.stderr.decode()}"

        # Verify config values were actually used by checking ArchiveResults
        verify_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config
from archivebox.hooks import discover_hooks

snapshot = Snapshot.objects.get(id='{snapshot_id}')
print(f"Snapshot status: {{snapshot.status}}")
print(f"Snapshot URL: {{snapshot.url}}")

# Check that snapshot reached sealed state
assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"

# Verify all config sources are present in merged config
print("\\nVerifying config merge priority:")
config = get_config(snapshot=snapshot)

# 1. Snapshot.config (highest priority)
timeout = config.get('TIMEOUT')
print(f" 1. Snapshot.config: TIMEOUT={{timeout}} (expected: 555)")
assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"

wget_enabled = config.get('WGET_ENABLED')
print(f" 1. Snapshot.config: WGET_ENABLED={{wget_enabled}} (expected: False)")
assert wget_enabled == False, f"WGET_ENABLED should be False from snapshot.config, got {{wget_enabled}}"

custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={{custom_snapshot}} (expected: from_snapshot_json)")
assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"

# 2. Crawl.config
custom_crawl = config.get('CUSTOM_CRAWL_KEY')
print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={{custom_crawl}} (expected: from_crawl_json)")
assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"

# 6. Machine.config
custom_machine = config.get('CUSTOM_MACHINE_KEY')
print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={{custom_machine}} (expected: from_machine_config)")
assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"

wget_binary = config.get('WGET_BINARY')
print(f" 6. Machine.config: WGET_BINARY={{wget_binary}} (expected: /custom/machine/wget)")
# Note: This might be overridden by environment or other sources, just check it's present
assert wget_binary is not None, f"WGET_BINARY should be present"

# Check ArchiveResults to verify plugins actually ran with correct config
results = ArchiveResult.objects.filter(snapshot=snapshot)
print(f"\\nArchiveResults created: {{results.count()}}")

for ar in results.order_by('plugin'):
    print(f" {{ar.plugin}}: {{ar.status}}")

# Verify SAVE_WGET=False was respected (should have no wget result)
wget_results = results.filter(plugin='wget')
print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"

# Verify hook selection respected merged SAVE_* config, without depending on
# a browser extractor finishing successfully in CI.
snapshot_hooks = [hook.name for hook in discover_hooks('Snapshot', config=config)]
print(f"Enabled snapshot hooks: {{snapshot_hooks}}")
assert any('screenshot' in hook_name for hook_name in snapshot_hooks), (
    f"SCREENSHOT should remain enabled in hook discovery, got {{snapshot_hooks}}"
)
assert not any('wget' in hook_name for hook_name in snapshot_hooks), (
    f"WGET should be filtered out by snapshot.config, got {{snapshot_hooks}}"
)

print("\\n✓ All config sources correctly merged:")
print(" - Snapshot.config overrides (highest priority)")
print(" - Crawl.config values present")
print(" - Machine.config values present")
print(" - File config values present")
print("✓ Config priority order verified")
print("✓ Snapshot successfully sealed")
"""
        result = subprocess.run(
            [sys.executable, '-c', verify_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
        print("="*80 + "\n")
|
|
|
|
|
|
def test_config_environment_variable_parsing():
    """
    Test that Process._build_env() correctly serializes config values,
    and get_config() correctly parses them back from environment.
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: Config Environment Variable Parsing")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"

        # Test various data types in config
        test_config_types_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.config.configset import get_config
from archivebox.machine.models import Process, Machine

# Test get_config() with no overrides (baseline)
config = get_config()
print(f"Baseline config keys: {{len(config)}}")

# Create a test Process with various config types
process = Process.objects.create(
    machine=Machine.current(),
    process_type=Process.TypeChoices.WORKER,
    pwd='{data_dir}',
    cmd=['test'],
    env={{
        'STRING_VAL': 'hello',
        'INT_VAL': 123,
        'FLOAT_VAL': 45.67,
        'BOOL_TRUE': True,
        'BOOL_FALSE': False,
        'LIST_VAL': ['a', 'b', 'c'],
        'DICT_VAL': {{'key': 'value'}},
        'NONE_VAL': None,
    }},
)

# Test _build_env() serialization
env = process._build_env()
print(f"\\nSerialized environment:")
print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")

# Verify all are strings (required by subprocess.Popen)
assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"

print("\\n✓ All environment values correctly serialized as strings")

# Now test that get_config() can parse them back
# Simulate subprocess by setting os.environ
import json
for key, val in env.items():
    if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
        os.environ[key] = val

# Get config again - should parse from environment
config = get_config()
print(f"\\nParsed from environment:")
print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")

print("\\n✓ All config values correctly parsed from environment")
"""

        result = subprocess.run(
            [sys.executable, '-c', test_config_types_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.stderr:
            print("Script stderr:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config serialization and parsing works correctly")
        print("="*80 + "\n")
|
|
|
|
|
|
def test_parent_environment_preserved_in_hooks():
    """
    Test that parent environment variables are preserved in hook execution.

    This test catches the bug where we built env=os.environ.copy() but then
    clobbered it with process.env={}, losing all parent environment.

    Also verifies:
    - NODE_PATH is correctly derived from LIB_DIR/npm/node_modules
    - LIB_BIN_DIR is correctly derived and added to PATH
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: Parent Environment Preserved in Hooks")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Create snapshot
        print("Step 2: Create Snapshot")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl

crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now()
)

snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now()
)
print(snapshot.id)
"""
        result = subprocess.run(
            [sys.executable, '-c', create_snapshot_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id}\n")

        # Run SnapshotWorker with custom parent environment variable
        print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process")
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
                'TEST_PARENT_ENV_VAR': 'preserved_from_parent',  # This should reach the hook
                'PLUGINS': 'favicon',  # Use existing plugin (favicon is simple and fast)
            },
            capture_output=True,
            timeout=120,
        )

        stderr = result.stderr.decode()

        print("\n--- SnapshotWorker stderr (first 50 lines) ---")
        print('\n'.join(stderr.split('\n')[:50]))
        print("--- End stderr ---\n")

        # Verify hooks ran by checking Process records
        print("Step 4: Verify environment variables in hook Process records")
        verify_env_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.machine.models import Process
from archivebox.core.models import Snapshot
import json

snapshot = Snapshot.objects.get(id='{snapshot_id}')

# Find hook processes for this snapshot
hook_processes = Process.objects.filter(
    process_type=Process.TypeChoices.HOOK,
    pwd__contains=str(snapshot.id)
).order_by('-created_at')

print(f"Found {{hook_processes.count()}} hook processes")

if hook_processes.count() == 0:
    print("ERROR: No hook processes found!")
    import sys
    sys.exit(1)

# Check the first hook process environment
hook_process = hook_processes.first()
print(f"\\nChecking hook: {{hook_process.cmd}}")
print(f"Hook env keys: {{len(hook_process.env)}} total")

# Verify TEST_PARENT_ENV_VAR was preserved
test_parent = hook_process.env.get('TEST_PARENT_ENV_VAR')
print(f" TEST_PARENT_ENV_VAR: {{test_parent}}")
assert test_parent == 'preserved_from_parent', f"Expected 'preserved_from_parent', got {{test_parent}}"

# Verify LIB_DIR is set
lib_dir = hook_process.env.get('LIB_DIR')
print(f" LIB_DIR: {{lib_dir}}")
assert lib_dir is not None, "LIB_DIR not set"

# Verify LIB_BIN_DIR is derived
lib_bin_dir = hook_process.env.get('LIB_BIN_DIR')
print(f" LIB_BIN_DIR: {{lib_bin_dir}}")
if lib_dir:
    assert lib_bin_dir is not None, "LIB_BIN_DIR not derived from LIB_DIR"
    assert lib_bin_dir.endswith('/bin'), f"LIB_BIN_DIR should end with /bin, got {{lib_bin_dir}}"

# Verify LIB_BIN_DIR is in PATH
path = hook_process.env.get('PATH')
if lib_bin_dir:
    assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..."

# Verify canonical crawl/snapshot directories are exported for plugins
crawl_dir = hook_process.env.get('CRAWL_DIR')
snap_dir = hook_process.env.get('SNAP_DIR')
print(f" CRAWL_DIR: {{crawl_dir}}")
print(f" SNAP_DIR: {{snap_dir}}")
assert crawl_dir is not None, "CRAWL_DIR not set"
assert snap_dir is not None, "SNAP_DIR not set"
assert str(snapshot.id) in snap_dir, f"SNAP_DIR should contain snapshot id, got {{snap_dir}}"

# Verify NODE_PATH is set
node_path = hook_process.env.get('NODE_PATH')
node_modules_dir = hook_process.env.get('NODE_MODULES_DIR')
print(f" NODE_PATH: {{node_path}}")
print(f" NODE_MODULES_DIR: {{node_modules_dir}}")
if node_path:
    # Should also have NODE_MODULES_DIR for backwards compatibility
    assert node_modules_dir == node_path, f"NODE_MODULES_DIR should match NODE_PATH"

print("\\n✓ All environment checks passed")
"""
        result = subprocess.run(
            [sys.executable, '-c', verify_env_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Environment verification failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Parent environment preserved in hooks")
        print(" - Custom parent env vars reach hooks")
        print(" - LIB_DIR propagated correctly")
        print(" - LIB_BIN_DIR derived and added to PATH")
        print(" - NODE_PATH/NODE_MODULES_DIR set when available")
        print("="*80 + "\n")
|
|
|
|
|
|
def test_config_auto_fetch_relationships():
    """
    Test that get_config() auto-fetches related objects from relationships.

    Verifies:
    - snapshot auto-fetched from archiveresult.snapshot
    - crawl auto-fetched from snapshot.crawl
    - user auto-fetched from crawl.created_by
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: Config Auto-Fetch Relationships")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Create objects with config at each level
        print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
        create_objects_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config

# Create crawl with config
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now(),
    config={{
        'CRAWL_KEY': 'from_crawl',
        'TIMEOUT': 777,
    }}
)

# Create snapshot with config
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'SNAPSHOT_KEY': 'from_snapshot',
        'TIMEOUT': 555,
    }}
)

# Create ArchiveResult
ar = ArchiveResult.objects.create(
    snapshot=snapshot,
    plugin='test',
    hook_name='test_hook',
    status=ArchiveResult.StatusChoices.STARTED
)

print(f"Created: crawl={{crawl.id}}, snapshot={{snapshot.id}}, ar={{ar.id}}")

# Test 1: Auto-fetch crawl from snapshot
print("\\nTest 1: get_config(snapshot=snapshot) auto-fetches crawl")
config = get_config(snapshot=snapshot)
assert config.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config.get('TIMEOUT')}}"
assert config.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot, got {{config.get('SNAPSHOT_KEY')}}"
assert config.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl, got {{config.get('CRAWL_KEY')}}"
print("✓ Snapshot config (TIMEOUT=555) overrides crawl config (TIMEOUT=777)")
print("✓ Both snapshot.config and crawl.config values present")

# Test 2: Auto-fetch snapshot from archiveresult
print("\\nTest 2: get_config(archiveresult=ar) auto-fetches snapshot and crawl")
config_from_ar = get_config(archiveresult=ar)
assert config_from_ar.get('TIMEOUT') == 555, f"Expected 555, got {{config_from_ar.get('TIMEOUT')}}"
assert config_from_ar.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot"
assert config_from_ar.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl"
print("✓ Auto-fetched snapshot from ar.snapshot")
print("✓ Auto-fetched crawl from snapshot.crawl")

# Test 3: Precedence without auto-fetch (explicit crawl only)
print("\\nTest 3: get_config(crawl=crawl) without snapshot")
config_crawl_only = get_config(crawl=crawl)
assert config_crawl_only.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl_only.get('TIMEOUT')}}"
assert config_crawl_only.get('CRAWL_KEY') == 'from_crawl'
assert config_crawl_only.get('SNAPSHOT_KEY') is None, "Should not have snapshot config"
print("✓ Crawl-only config has TIMEOUT=777")
print("✓ No snapshot config values present")

print("\\n✓ All auto-fetch tests passed")
"""

        result = subprocess.run(
            [sys.executable, '-c', create_objects_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nAuto-fetch test error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Auto-fetch test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config auto-fetches related objects correctly")
        print(" - archiveresult → snapshot → crawl → user")
        print(" - Precedence preserved during auto-fetch")
        print("="*80 + "\n")
|
|
|
|
|
|
def test_config_precedence_with_environment_vars():
    """
    Test that config precedence order is correct when environment vars are set.

    Documented order (highest to lowest):
    1. snapshot.config
    2. crawl.config
    3. user.config
    4. persona config
    5. environment variables  <-- LOWER priority than snapshot/crawl
    6. machine.config
    7. config file
    8. plugin defaults
    9. core defaults

    This test verifies snapshot.config overrides environment variables.

    Raises:
        AssertionError: if `archivebox init` fails or the merged config does not
            give snapshot.config the highest priority.
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: Config Precedence with Environment Variables")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize a fresh archive for the child script to operate on.
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
            timeout=60,
        )
        # Include stderr in the failure message so init problems are debuggable
        # (consistent with the other returncode assertions in this file).
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Run the precedence check in a subprocess so TIMEOUT is set in the
        # environment BEFORE Django / the config system is loaded.
        print("Step 1: Test with TIMEOUT=999 in environment")
        test_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
os.environ['TIMEOUT'] = '999' # Set env var

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config

# Create crawl with TIMEOUT=777
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 777}}
)

# Create snapshot with TIMEOUT=555
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 555}}
)

# Get config with all sources
config = get_config(snapshot=snapshot)

print(f"Environment: TIMEOUT={{os.environ.get('TIMEOUT')}}")
print(f"Crawl config: TIMEOUT={{crawl.config.get('TIMEOUT')}}")
print(f"Snapshot config: TIMEOUT={{snapshot.config.get('TIMEOUT')}}")
print(f"Merged config: TIMEOUT={{config.get('TIMEOUT')}}")

# Snapshot should override both crawl AND environment
expected = 555
actual = config.get('TIMEOUT')
if actual != expected:
    print(f"\\n❌ PRECEDENCE BUG: Expected {{expected}}, got {{actual}}")
    print(f" Snapshot.config should have highest priority!")
    import sys
    sys.exit(1)

print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999) and crawl.config (777)")
"""

        result = subprocess.run(
            [sys.executable, '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nPrecedence bug detected:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Precedence test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Snapshot config correctly overrides environment variables")
        print("="*80 + "\n")
|
|
|
|
|
|
def test_new_environment_variables_added():
    """
    Test that NEW environment variables (not in defaults) are added to config.

    This is important for worker subprocesses that receive config via Process.env.
    When Worker.start() creates a subprocess, it serializes config to Process.env.
    The subprocess must be able to read those values back via get_config().

    Raises:
        AssertionError: if `archivebox init` fails, if uppercase env vars are
            missing from the merged config, or if lowercase env vars leak in.
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print("Test: New Environment Variables Added to Config")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize a fresh archive for the child script to operate on.
        result = subprocess.run(
            [sys.executable, '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
            timeout=60,
        )
        # Include stderr in the failure message so init problems are debuggable
        # (consistent with the other returncode assertions in this file).
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Child script sets the env vars before Django/config loads, then checks
        # that uppercase vars surface through get_config() and lowercase do not.
        print("Step 1: Test that new uppercase env vars are added to config")
        test_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
os.environ['NEW_CUSTOM_VAR'] = 'custom_value' # Not in defaults
os.environ['ANOTHER_VAR'] = 'another_value'
os.environ['lowercase_var'] = 'should_be_ignored' # Lowercase should be ignored

from archivebox.config.django import setup_django
setup_django()
from archivebox.config.configset import get_config

config = get_config()

# Check uppercase vars are added
new_var = config.get('NEW_CUSTOM_VAR')
another_var = config.get('ANOTHER_VAR')
lowercase_var = config.get('lowercase_var')

print(f"NEW_CUSTOM_VAR: {{new_var}}")
print(f"ANOTHER_VAR: {{another_var}}")
print(f"lowercase_var: {{lowercase_var}}")

assert new_var == 'custom_value', f"Expected 'custom_value', got {{new_var}}"
assert another_var == 'another_value', f"Expected 'another_value', got {{another_var}}"
assert lowercase_var is None, f"Lowercase vars should be ignored, got {{lowercase_var}}"

print("\\n✓ New uppercase environment variables added to config")
print("✓ Lowercase environment variables ignored")
"""

        result = subprocess.run(
            [sys.executable, '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nTest error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: New environment variables correctly added to config")
        print("="*80 + "\n")
|
|
|
|
|
|
if __name__ == '__main__':
    # Allow running this module directly (outside pytest): execute every
    # integration test in order, stopping at the first failure.
    _all_tests = (
        test_config_propagation_through_worker_hierarchy,
        test_config_environment_variable_parsing,
        test_parent_environment_preserved_in_hooks,
        test_config_auto_fetch_relationships,
        test_config_precedence_with_environment_vars,
        test_new_environment_variables_added,
    )
    for _test_fn in _all_tests:
        _test_fn()
|