mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
482 lines
17 KiB
Python
482 lines
17 KiB
Python
"""
|
|
Integration test for config propagation through worker hierarchy.

Tests that config is properly merged and passed through:

    Parent CLI/Orchestrator
    └── CrawlWorker subprocess (via Process.env)
        └── SnapshotWorker subprocess (via Process.env)
            └── Hook subprocess (via Process.env)

Config priority order (highest to lowest):

    1. Snapshot.config (JSON field)
    2. Crawl.config (JSON field)
    3. User.config (JSON field)
    4. Environment variables (os.environ + Process.env)
    5. Config file (ArchiveBox.conf)
    6. Plugin defaults (config.json)
    7. Core defaults
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import tempfile
|
|
import subprocess
|
|
import time
|
|
from pathlib import Path
|
|
|
|
|
|
def test_config_propagation_through_worker_hierarchy():
    """
    Integration test: verify config is properly merged at every level.

    Test flow:
    1. Initialize a test archive in a temporary directory
    2. Write custom config to ArchiveBox.conf and store values in Machine.config
    3. Create Crawl with custom crawl.config JSON field
    4. Create Snapshot with custom snapshot.config JSON field
    5. Spawn SnapshotWorker via archivebox run --snapshot-id=... with an
       extra environment variable
    6. Verify the worker exited cleanly and emitted its config debug output
    7. Verify, in a fresh subprocess, the merged config returned by
       get_config(snapshot=...) and the ArchiveResults that were created

    Every step runs in a child process (``python -m archivebox ...`` or
    ``python -c <script>``) so that config is loaded from scratch each time.

    Raises:
        AssertionError: if any child process exits non-zero, if a creation
            script prints no id, or if the expected debug line is missing
            from the worker's stderr.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        # Every child process gets the parent environment plus the archive
        # location; built once so all steps share the same baseline.
        base_env = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'USE_COLOR': 'False',
        }

        print(f"\n{'='*80}")
        print("Test: Config Propagation Through Worker Hierarchy")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Step 1: Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=base_env,
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print("✓ Archive initialized\n")

        # Step 2: Write custom config to ArchiveBox.conf
        print("Step 2: Write custom config to ArchiveBox.conf")
        config_file = data_dir / 'ArchiveBox.conf'
        # The INI text is kept at column 0 on purpose: it is written verbatim.
        config_file.write_text("""
[GENERAL]
# Custom timeout in config file
TIMEOUT = 999

[ARCHIVING_CONFIG]
# Enable all plugins for proper testing
SAVE_WGET = True
SAVE_WARC = True
SAVE_PDF = True
SAVE_DOM = True
SAVE_SINGLEFILE = True
SAVE_READABILITY = True
SAVE_MERCURY = True
SAVE_HTMLTOTEXT = True
SAVE_GIT = True
SAVE_MEDIA = True
SAVE_ARCHIVE_DOT_ORG = True
SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_SCREENSHOT = True
""")
        print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")

        # Step 2.5: Set Machine.config values
        print("Step 2.5: Set Machine.config with custom binary path")
        # NOTE: the scripts below are f-strings. Only data_dir / crawl_id /
        # snapshot_id are interpolated here; braces that belong to the child
        # script (dict literals, its own f-strings) must be doubled.
        set_machine_config_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.machine.models import Machine

machine = Machine.current()
machine.config = {{
    'CUSTOM_MACHINE_KEY': 'from_machine_config',
    'WGET_BINARY': '/custom/machine/wget',  # Machine-specific binary path
}}
machine.save()
print(f"Machine {{machine.hostname}} config updated")
"""
        result = subprocess.run(
            ['python', '-c', set_machine_config_script],
            cwd=str(data_dir.parent),
            env=base_env,
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
        print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")

        # Step 3: Create Crawl via Django ORM with custom crawl.config
        print("Step 3: Create Crawl with custom crawl.config JSON")
        create_crawl_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl

# Create crawl with custom config
crawl = Crawl.objects.create(
    status='queued',
    retry_at=timezone.now(),
    urls='https://example.com',
    config={{
        'TIMEOUT': 777,  # Crawl-level override (higher priority than file)
        'CUSTOM_CRAWL_KEY': 'from_crawl_json',
    }}
)
print(crawl.id)
"""
        result = subprocess.run(
            ['python', '-c', create_crawl_script],
            cwd=str(data_dir.parent),
            env=base_env,
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        crawl_id = result.stdout.decode().strip().split('\n')[-1]
        # Fail here rather than with an obscure lookup error in the next step.
        assert crawl_id, "Create crawl script printed no crawl id"
        print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")

        # Step 4: Create Snapshot with custom snapshot.config
        print("Step 4: Create Snapshot with custom snapshot.config JSON")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl

crawl = Crawl.objects.get(id='{crawl_id}')
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'TIMEOUT': 555,  # Snapshot-level override (highest priority)
        'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
        'SAVE_SCREENSHOT': True,  # Keep screenshot enabled
        'SAVE_WGET': False,  # But disable wget as a test of per-snapshot override
    }}
)
print(snapshot.id)
"""
        result = subprocess.run(
            ['python', '-c', create_snapshot_script],
            cwd=str(data_dir.parent),
            env=base_env,
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        # Extract UUID from output (last line should be the UUID)
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        assert snapshot_id, "Create snapshot script printed no snapshot id"
        print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")

        # Step 5: Run SnapshotWorker with additional env var
        print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **base_env,
                'ENV_VAR_KEY': 'from_environment',  # Environment variable
            },
            capture_output=True,
            timeout=120,
        )

        stdout = result.stdout.decode()
        stderr = result.stderr.decode()

        print("\n--- SnapshotWorker stdout ---")
        print(stdout)
        print("\n--- SnapshotWorker stderr ---")
        print(stderr)
        print("--- End output ---\n")

        # Step 6: Verify config was properly merged
        print("Step 6: Verify config merging")

        # Check that SnapshotWorker ran successfully
        assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"

        # Verify config by checking stderr debug output and ArchiveResults in database
        print("\n--- Verifying config propagation ---\n")

        # Check for config debug messages in stderr
        assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
            "Expected debug output not found in stderr"
        print("✓ Config debug output found in stderr")

        # Verify config values were actually used by checking ArchiveResults.
        # All braces used by the child script's own f-strings are doubled so
        # they are formatted in the child, not in this function (where names
        # such as `timeout` do not exist).
        verify_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config

snapshot = Snapshot.objects.get(id='{snapshot_id}')
print(f"Snapshot status: {{snapshot.status}}")
print(f"Snapshot URL: {{snapshot.url}}")

# Check that snapshot reached sealed state
assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"

# Verify all config sources are present in merged config
print("\\nVerifying config merge priority:")
config = get_config(snapshot=snapshot)

# 1. Snapshot.config (highest priority)
timeout = config.get('TIMEOUT')
print(f" 1. Snapshot.config: TIMEOUT={{timeout}} (expected: 555)")
assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"

wget_enabled = config.get('SAVE_WGET')
print(f" 1. Snapshot.config: SAVE_WGET={{wget_enabled}} (expected: False)")
assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"

custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={{custom_snapshot}} (expected: from_snapshot_json)")
assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"

# 2. Crawl.config
custom_crawl = config.get('CUSTOM_CRAWL_KEY')
print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={{custom_crawl}} (expected: from_crawl_json)")
assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"

# 6. Machine.config
custom_machine = config.get('CUSTOM_MACHINE_KEY')
print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={{custom_machine}} (expected: from_machine_config)")
assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"

wget_binary = config.get('WGET_BINARY')
print(f" 6. Machine.config: WGET_BINARY={{wget_binary}} (expected: /custom/machine/wget)")
# Note: This might be overridden by environment or other sources, just check it's present
assert wget_binary is not None, f"WGET_BINARY should be present"

# Check ArchiveResults to verify plugins actually ran with correct config
results = ArchiveResult.objects.filter(snapshot=snapshot)
print(f"\\nArchiveResults created: {{results.count()}}")

for ar in results.order_by('plugin'):
    print(f" {{ar.plugin}}: {{ar.status}}")

# Verify SAVE_WGET=False was respected (should have no wget result)
wget_results = results.filter(plugin='wget')
print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"

# Verify SAVE_SCREENSHOT=True was respected (should have screenshot result)
screenshot_results = results.filter(plugin='screenshot')
print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)")
assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results"

print("\\n✓ All config sources correctly merged:")
print(" - Snapshot.config overrides (highest priority)")
print(" - Crawl.config values present")
print(" - Machine.config values present")
print(" - File config values present")
print("✓ Config priority order verified")
print("✓ Snapshot successfully sealed")
"""
        result = subprocess.run(
            ['python', '-c', verify_script],
            cwd=str(data_dir.parent),
            env=base_env,
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
        print("="*80 + "\n")
|
|
|
|
|
|
def test_config_environment_variable_parsing():
    """
    Round-trip config values through a Process environment.

    In a child interpreter, a Process row is created whose ``env`` holds a
    string, int, float, both booleans, a list, a dict and None. The child
    asserts that ``Process._build_env()`` returns ``str`` for every non-None
    value, copies those strings into ``os.environ`` and calls ``get_config()``
    again, printing what it reads back (the read-back values are printed,
    not asserted).

    Raises:
        AssertionError: if ``archivebox init`` or the child script exits
            with a non-zero status.
    """
    banner = '=' * 80

    with tempfile.TemporaryDirectory() as scratch:
        data_dir = Path(scratch) / 'test_archive'
        data_dir.mkdir()

        # Shared by both child processes: parent environment + archive location.
        child_env = dict(os.environ)
        child_env['DATA_DIR'] = str(data_dir)
        child_env['USE_COLOR'] = 'False'

        print(f"\n{banner}")
        print("Test: Config Environment Variable Parsing")
        print(f"DATA_DIR: {data_dir}")
        print(f"{banner}\n")

        # Initialize archive
        init_proc = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env=child_env,
            capture_output=True,
            timeout=60,
        )
        assert init_proc.returncode == 0, f"Init failed: {init_proc.stderr.decode()}"

        # Script executed in the child interpreter. This is an f-string:
        # only data_dir is interpolated here, every doubled brace belongs to
        # the child script itself.
        test_config_types_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.config.configset import get_config
from archivebox.machine.models import Process, Machine

# Test get_config() with no overrides (baseline)
config = get_config()
print(f"Baseline config keys: {{len(config)}}")

# Create a test Process with various config types
process = Process.objects.create(
    machine=Machine.current(),
    process_type=Process.TypeChoices.WORKER,
    pwd='{data_dir}',
    cmd=['test'],
    env={{
        'STRING_VAL': 'hello',
        'INT_VAL': 123,
        'FLOAT_VAL': 45.67,
        'BOOL_TRUE': True,
        'BOOL_FALSE': False,
        'LIST_VAL': ['a', 'b', 'c'],
        'DICT_VAL': {{'key': 'value'}},
        'NONE_VAL': None,
    }},
)

# Test _build_env() serialization
env = process._build_env()
print(f"\\nSerialized environment:")
print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")

# Verify all are strings (required by subprocess.Popen)
assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"

print("\\n✓ All environment values correctly serialized as strings")

# Now test that get_config() can parse them back
# Simulate subprocess by setting os.environ
import json
for key, val in env.items():
    if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
        os.environ[key] = val

# Get config again - should parse from environment
config = get_config()
print(f"\\nParsed from environment:")
print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
print(f" FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")

print("\\n✓ All config values correctly parsed from environment")
"""

        script_proc = subprocess.run(
            ['python', '-c', test_config_types_script],
            cwd=str(data_dir.parent),
            env=child_env,
            capture_output=True,
            timeout=30,
        )

        # Echo the child's output so failures are diagnosable from the log.
        print(script_proc.stdout.decode())
        if script_proc.stderr:
            print("Script stderr:")
            print(script_proc.stderr.decode())

        assert script_proc.returncode == 0, f"Type parsing test failed: {script_proc.stderr.decode()}"

        print("\n" + banner)
        print("✓ TEST PASSED: Config serialization and parsing works correctly")
        print(banner + "\n")
|
|
|
|
|
|
if __name__ == '__main__':
    # Standalone mode (no pytest): run each integration test in order.
    # The first failing assertion propagates and aborts the run.
    for _standalone_test in (
        test_config_propagation_through_worker_hierarchy,
        test_config_environment_variable_parsing,
    ):
        _standalone_test()
|