mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
more migration id/uuid and config propagation fixes
This commit is contained in:
@@ -136,7 +136,7 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['list'])
|
||||
result = run_archivebox(self.work_dir, ['snapshot', 'list'])
|
||||
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
|
||||
|
||||
# Verify ALL snapshots appear in output
|
||||
|
||||
481
archivebox/tests/test_worker_config_propagation.py
Normal file
481
archivebox/tests/test_worker_config_propagation.py
Normal file
@@ -0,0 +1,481 @@
|
||||
"""
|
||||
Integration test for config propagation through worker hierarchy.
|
||||
|
||||
Tests that config is properly merged and passed through:
|
||||
Parent CLI/Orchestrator
|
||||
└── CrawlWorker subprocess (via Process.env)
|
||||
└── SnapshotWorker subprocess (via Process.env)
|
||||
└── Hook subprocess (via Process.env)
|
||||
|
||||
Config priority order (highest to lowest):
|
||||
1. Snapshot.config (JSON field)
|
||||
2. Crawl.config (JSON field)
|
||||
3. User.config (JSON field)
|
||||
4. Environment variables (os.environ + Process.env)
|
||||
5. Config file (ArchiveBox.conf)
|
||||
6. Plugin defaults (config.json)
|
||||
7. Core defaults
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import tempfile
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def test_config_propagation_through_worker_hierarchy():
|
||||
"""
|
||||
Integration test: Verify config is properly merged at every level.
|
||||
|
||||
Test flow:
|
||||
1. Create test archive with custom config in ArchiveBox.conf
|
||||
2. Set custom env vars before spawning worker
|
||||
3. Create Crawl with custom crawl.config JSON field
|
||||
4. Create Snapshot with custom snapshot.config JSON field
|
||||
5. Spawn SnapshotWorker via archivebox run --snapshot-id=...
|
||||
6. Verify worker received merged config from all sources
|
||||
7. Verify hook subprocess also received correct config
|
||||
"""
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir) / 'test_archive'
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: Config Propagation Through Worker Hierarchy")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
# Step 1: Initialize archive
|
||||
print("Step 1: Initialize archive")
|
||||
result = subprocess.run(
|
||||
['python', '-m', 'archivebox', 'init'],
|
||||
cwd=str(data_dir),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=60,
|
||||
)
|
||||
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
|
||||
print(f"✓ Archive initialized\n")
|
||||
|
||||
# Step 2: Write custom config to ArchiveBox.conf
|
||||
print("Step 2: Write custom config to ArchiveBox.conf")
|
||||
config_file = data_dir / 'ArchiveBox.conf'
|
||||
config_file.write_text("""
|
||||
[GENERAL]
|
||||
# Custom timeout in config file
|
||||
TIMEOUT = 999
|
||||
|
||||
[ARCHIVING_CONFIG]
|
||||
# Enable all plugins for proper testing
|
||||
SAVE_WGET = True
|
||||
SAVE_WARC = True
|
||||
SAVE_PDF = True
|
||||
SAVE_DOM = True
|
||||
SAVE_SINGLEFILE = True
|
||||
SAVE_READABILITY = True
|
||||
SAVE_MERCURY = True
|
||||
SAVE_HTMLTOTEXT = True
|
||||
SAVE_GIT = True
|
||||
SAVE_MEDIA = True
|
||||
SAVE_ARCHIVE_DOT_ORG = True
|
||||
SAVE_TITLE = True
|
||||
SAVE_FAVICON = True
|
||||
SAVE_SCREENSHOT = True
|
||||
""")
|
||||
print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
|
||||
|
||||
# Step 2.5: Set Machine.config values
|
||||
print("Step 2.5: Set Machine.config with custom binary path")
|
||||
set_machine_config_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
machine = Machine.current()
|
||||
machine.config = {{
|
||||
'CUSTOM_MACHINE_KEY': 'from_machine_config',
|
||||
'WGET_BINARY': '/custom/machine/wget', # Machine-specific binary path
|
||||
}}
|
||||
machine.save()
|
||||
print(f"Machine {{machine.hostname}} config updated")
|
||||
"""
|
||||
result = subprocess.run(
|
||||
['python', '-c', set_machine_config_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
|
||||
print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
|
||||
|
||||
# Step 3: Create Crawl via Django ORM with custom crawl.config
|
||||
print("Step 3: Create Crawl with custom crawl.config JSON")
|
||||
create_crawl_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from django.utils import timezone
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
# Create crawl with custom config
|
||||
crawl = Crawl.objects.create(
|
||||
status='queued',
|
||||
retry_at=timezone.now(),
|
||||
urls='https://example.com',
|
||||
config={{
|
||||
'TIMEOUT': 777, # Crawl-level override (higher priority than file)
|
||||
'CUSTOM_CRAWL_KEY': 'from_crawl_json',
|
||||
}}
|
||||
)
|
||||
print(crawl.id)
|
||||
"""
|
||||
result = subprocess.run(
|
||||
['python', '-c', create_crawl_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
assert result.returncode == 0, f"Create crawl failed: {result.stderr.decode()}"
|
||||
# Extract UUID from output (last line should be the UUID)
|
||||
crawl_id = result.stdout.decode().strip().split('\n')[-1]
|
||||
print(f"✓ Created crawl {crawl_id} with TIMEOUT=777, CUSTOM_CRAWL_KEY=from_crawl_json\n")
|
||||
|
||||
# Step 4: Create Snapshot with custom snapshot.config
|
||||
print("Step 4: Create Snapshot with custom snapshot.config JSON")
|
||||
create_snapshot_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from django.utils import timezone
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id='{crawl_id}')
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
crawl=crawl,
|
||||
status='queued',
|
||||
retry_at=timezone.now(),
|
||||
config={{
|
||||
'TIMEOUT': 555, # Snapshot-level override (highest priority)
|
||||
'CUSTOM_SNAPSHOT_KEY': 'from_snapshot_json',
|
||||
'SAVE_SCREENSHOT': True, # Keep screenshot enabled
|
||||
'SAVE_WGET': False, # But disable wget as a test of per-snapshot override
|
||||
}}
|
||||
)
|
||||
print(snapshot.id)
|
||||
"""
|
||||
result = subprocess.run(
|
||||
['python', '-c', create_snapshot_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
|
||||
# Extract UUID from output (last line should be the UUID)
|
||||
snapshot_id = result.stdout.decode().strip().split('\n')[-1]
|
||||
print(f"✓ Created snapshot {snapshot_id} with TIMEOUT=555, SAVE_WGET=False (override), SAVE_SCREENSHOT=True\n")
|
||||
|
||||
# Step 5: Run SnapshotWorker with additional env var
|
||||
print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
|
||||
result = subprocess.run(
|
||||
['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
|
||||
cwd=str(data_dir),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
'ENV_VAR_KEY': 'from_environment', # Environment variable
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
stdout = result.stdout.decode()
|
||||
stderr = result.stderr.decode()
|
||||
|
||||
print("\n--- SnapshotWorker stdout ---")
|
||||
print(stdout)
|
||||
print("\n--- SnapshotWorker stderr ---")
|
||||
print(stderr)
|
||||
print("--- End output ---\n")
|
||||
|
||||
# Step 6: Verify config was properly merged
|
||||
print("Step 6: Verify config merging")
|
||||
|
||||
# Check that SnapshotWorker ran successfully
|
||||
assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"
|
||||
|
||||
# Verify config by checking stderr debug output and ArchiveResults in database
|
||||
print("\n--- Verifying config propagation ---\n")
|
||||
|
||||
# Check for config debug messages in stderr
|
||||
assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
|
||||
"Expected debug output not found in stderr"
|
||||
print("✓ Config debug output found in stderr")
|
||||
|
||||
# Verify config values were actually used by checking ArchiveResults
|
||||
verify_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
snapshot = Snapshot.objects.get(id='{snapshot_id}')
|
||||
print(f"Snapshot status: {{snapshot.status}}")
|
||||
print(f"Snapshot URL: {{snapshot.url}}")
|
||||
|
||||
# Check that snapshot reached sealed state
|
||||
assert snapshot.status == 'sealed', f"Expected sealed, got {{snapshot.status}}"
|
||||
|
||||
# Verify all config sources are present in merged config
|
||||
print("\\nVerifying config merge priority:")
|
||||
config = get_config(snapshot=snapshot)
|
||||
|
||||
# 1. Snapshot.config (highest priority)
|
||||
timeout = config.get('TIMEOUT')
|
||||
print(f" 1. Snapshot.config: TIMEOUT={timeout} (expected: 555)")
|
||||
assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"
|
||||
|
||||
wget_enabled = config.get('SAVE_WGET')
|
||||
print(f" 1. Snapshot.config: SAVE_WGET={wget_enabled} (expected: False)")
|
||||
assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"
|
||||
|
||||
custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
|
||||
print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={custom_snapshot} (expected: from_snapshot_json)")
|
||||
assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"
|
||||
|
||||
# 2. Crawl.config
|
||||
custom_crawl = config.get('CUSTOM_CRAWL_KEY')
|
||||
print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={custom_crawl} (expected: from_crawl_json)")
|
||||
assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"
|
||||
|
||||
# 6. Machine.config
|
||||
custom_machine = config.get('CUSTOM_MACHINE_KEY')
|
||||
print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={custom_machine} (expected: from_machine_config)")
|
||||
assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"
|
||||
|
||||
wget_binary = config.get('WGET_BINARY')
|
||||
print(f" 6. Machine.config: WGET_BINARY={wget_binary} (expected: /custom/machine/wget)")
|
||||
# Note: This might be overridden by environment or other sources, just check it's present
|
||||
assert wget_binary is not None, f"WGET_BINARY should be present"
|
||||
|
||||
# Check ArchiveResults to verify plugins actually ran with correct config
|
||||
results = ArchiveResult.objects.filter(snapshot=snapshot)
|
||||
print(f"\\nArchiveResults created: {{results.count()}}")
|
||||
|
||||
for ar in results.order_by('plugin'):
|
||||
print(f" {{ar.plugin}}: {{ar.status}}")
|
||||
|
||||
# Verify SAVE_WGET=False was respected (should have no wget result)
|
||||
wget_results = results.filter(plugin='wget')
|
||||
print(f"\\nWGET results: {{wget_results.count()}} (expected: 0, disabled in snapshot.config)")
|
||||
assert wget_results.count() == 0, f"WGET should be disabled, found {{wget_results.count()}} results"
|
||||
|
||||
# Verify SAVE_SCREENSHOT=True was respected (should have screenshot result)
|
||||
screenshot_results = results.filter(plugin='screenshot')
|
||||
print(f"SCREENSHOT results: {{screenshot_results.count()}} (expected: >0, enabled globally)")
|
||||
assert screenshot_results.count() > 0, f"SCREENSHOT should be enabled, found {{screenshot_results.count()}} results"
|
||||
|
||||
print("\\n✓ All config sources correctly merged:")
|
||||
print(" - Snapshot.config overrides (highest priority)")
|
||||
print(" - Crawl.config values present")
|
||||
print(" - Machine.config values present")
|
||||
print(" - File config values present")
|
||||
print("✓ Config priority order verified")
|
||||
print("✓ Snapshot successfully sealed")
|
||||
"""
|
||||
result = subprocess.run(
|
||||
['python', '-c', verify_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
print(result.stdout.decode())
|
||||
if result.returncode != 0:
|
||||
print("\nVerification error:")
|
||||
print(result.stderr.decode())
|
||||
|
||||
assert result.returncode == 0, f"Config verification failed: {result.stderr.decode()}"
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("✓ TEST PASSED: Config properly propagated through worker hierarchy")
|
||||
print("="*80 + "\n")
|
||||
|
||||
|
||||
def test_config_environment_variable_parsing():
|
||||
"""
|
||||
Test that Process._build_env() correctly serializes config values,
|
||||
and get_config() correctly parses them back from environment.
|
||||
"""
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir) / 'test_archive'
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: Config Environment Variable Parsing")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
# Initialize archive
|
||||
result = subprocess.run(
|
||||
['python', '-m', 'archivebox', 'init'],
|
||||
cwd=str(data_dir),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=60,
|
||||
)
|
||||
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
|
||||
|
||||
# Test various data types in config
|
||||
test_config_types_script = f"""
|
||||
import os
|
||||
os.environ['DATA_DIR'] = '{data_dir}'
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
# Test get_config() with no overrides (baseline)
|
||||
config = get_config()
|
||||
print(f"Baseline config keys: {{len(config)}}")
|
||||
|
||||
# Create a test Process with various config types
|
||||
process = Process.objects.create(
|
||||
machine=Machine.current(),
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
pwd='{data_dir}',
|
||||
cmd=['test'],
|
||||
env={{
|
||||
'STRING_VAL': 'hello',
|
||||
'INT_VAL': 123,
|
||||
'FLOAT_VAL': 45.67,
|
||||
'BOOL_TRUE': True,
|
||||
'BOOL_FALSE': False,
|
||||
'LIST_VAL': ['a', 'b', 'c'],
|
||||
'DICT_VAL': {{'key': 'value'}},
|
||||
'NONE_VAL': None,
|
||||
}},
|
||||
)
|
||||
|
||||
# Test _build_env() serialization
|
||||
env = process._build_env()
|
||||
print(f"\\nSerialized environment:")
|
||||
print(f" STRING_VAL: {{env.get('STRING_VAL')}} (type: {{type(env.get('STRING_VAL')).__name__}})")
|
||||
print(f" INT_VAL: {{env.get('INT_VAL')}} (type: {{type(env.get('INT_VAL')).__name__}})")
|
||||
print(f" FLOAT_VAL: {{env.get('FLOAT_VAL')}} (type: {{type(env.get('FLOAT_VAL')).__name__}})")
|
||||
print(f" BOOL_TRUE: {{env.get('BOOL_TRUE')}} (type: {{type(env.get('BOOL_TRUE')).__name__}})")
|
||||
print(f" BOOL_FALSE: {{env.get('BOOL_FALSE')}} (type: {{type(env.get('BOOL_FALSE')).__name__}})")
|
||||
print(f" LIST_VAL: {{env.get('LIST_VAL')}} (type: {{type(env.get('LIST_VAL')).__name__}})")
|
||||
print(f" DICT_VAL: {{env.get('DICT_VAL')}} (type: {{type(env.get('DICT_VAL')).__name__}})")
|
||||
print(f" NONE_VAL: {{env.get('NONE_VAL')}} (should be None/missing)")
|
||||
|
||||
# Verify all are strings (required by subprocess.Popen)
|
||||
assert isinstance(env.get('STRING_VAL'), str), "STRING_VAL should be str"
|
||||
assert isinstance(env.get('INT_VAL'), str), "INT_VAL should be str"
|
||||
assert isinstance(env.get('FLOAT_VAL'), str), "FLOAT_VAL should be str"
|
||||
assert isinstance(env.get('BOOL_TRUE'), str), "BOOL_TRUE should be str"
|
||||
assert isinstance(env.get('BOOL_FALSE'), str), "BOOL_FALSE should be str"
|
||||
assert isinstance(env.get('LIST_VAL'), str), "LIST_VAL should be str"
|
||||
assert isinstance(env.get('DICT_VAL'), str), "DICT_VAL should be str"
|
||||
|
||||
print("\\n✓ All environment values correctly serialized as strings")
|
||||
|
||||
# Now test that get_config() can parse them back
|
||||
# Simulate subprocess by setting os.environ
|
||||
import json
|
||||
for key, val in env.items():
|
||||
if key in ['STRING_VAL', 'INT_VAL', 'FLOAT_VAL', 'BOOL_TRUE', 'BOOL_FALSE', 'LIST_VAL', 'DICT_VAL']:
|
||||
os.environ[key] = val
|
||||
|
||||
# Get config again - should parse from environment
|
||||
config = get_config()
|
||||
print(f"\\nParsed from environment:")
|
||||
print(f" STRING_VAL: {{config.get('STRING_VAL')}} (type: {{type(config.get('STRING_VAL')).__name__}})")
|
||||
print(f" INT_VAL: {{config.get('INT_VAL')}} (type: {{type(config.get('INT_VAL')).__name__}})")
|
||||
print(f" FLOAT_VAL: {{config.get('FLOAT_VAL')}} (type: {{type(config.get('FLOAT_VAL')).__name__}})")
|
||||
print(f" BOOL_TRUE: {{config.get('BOOL_TRUE')}} (type: {{type(config.get('BOOL_TRUE')).__name__}})")
|
||||
print(f" BOOL_FALSE: {{config.get('BOOL_FALSE')}} (type: {{type(config.get('BOOL_FALSE')).__name__}})")
|
||||
print(f" LIST_VAL: {{config.get('LIST_VAL')}} (type: {{type(config.get('LIST_VAL')).__name__}})")
|
||||
print(f" DICT_VAL: {{config.get('DICT_VAL')}} (type: {{type(config.get('DICT_VAL')).__name__}})")
|
||||
|
||||
print("\\n✓ All config values correctly parsed from environment")
|
||||
"""
|
||||
|
||||
result = subprocess.run(
|
||||
['python', '-c', test_config_types_script],
|
||||
cwd=str(data_dir.parent),
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': str(data_dir),
|
||||
'USE_COLOR': 'False',
|
||||
},
|
||||
capture_output=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
print(result.stdout.decode())
|
||||
if result.stderr:
|
||||
print("Script stderr:")
|
||||
print(result.stderr.decode())
|
||||
|
||||
assert result.returncode == 0, f"Type parsing test failed: {result.stderr.decode()}"
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("✓ TEST PASSED: Config serialization and parsing works correctly")
|
||||
print("="*80 + "\n")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Run as standalone script
|
||||
test_config_propagation_through_worker_hierarchy()
|
||||
test_config_environment_variable_parsing()
|
||||
Reference in New Issue
Block a user