Stabilize CI against expanded plugin surface

This commit is contained in:
Nick Sweeting
2026-03-15 06:31:41 -07:00
parent 1f792d7199
commit 760cf9d6b2
11 changed files with 75 additions and 68 deletions

View File

@@ -24,10 +24,10 @@ def _make_env(data_dir: Path) -> dict:
env["USE_COLOR"] = "False"
env["SHOW_PROGRESS"] = "False"
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
env["PLUGINS"] = "title,favicon"
env["PLUGINS"] = "favicon"
# Keep it fast while still exercising real hooks
env["SAVE_TITLE"] = "True"
env["SAVE_FAVICON"] = "True"
env["SAVE_TITLE"] = "False"
env["SAVE_WGET"] = "False"
env["SAVE_WARC"] = "False"
env["SAVE_PDF"] = "False"
@@ -75,7 +75,7 @@ def test_add_parents_workers_to_orchestrator(tmp_path):
init = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env)
assert init.returncode == 0, init.stderr
add = _run([sys.executable, "-m", "archivebox", "add", "https://example.com"], data_dir, env, timeout=120)
add = _run([sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"], data_dir, env, timeout=120)
assert add.returncode == 0, add.stderr
conn = sqlite3.connect(data_dir / "index.sqlite3")
@@ -105,7 +105,7 @@ def test_add_interrupt_cleans_orphaned_processes(tmp_path):
assert init.returncode == 0, init.stderr
proc = subprocess.Popen(
[sys.executable, "-m", "archivebox", "add", "https://example.com"],
[sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"],
cwd=data_dir,
env=env,
stdout=subprocess.PIPE,

View File

@@ -18,6 +18,11 @@ from archivebox.tests.conftest import (
create_test_snapshot_json,
)
# Shared environment overrides passed as env= to the run_archivebox_cmd() calls in these tests.
# NOTE(review): presumably restricts runs to the favicon plugin so CI stays fast — confirm
# against how run_archivebox_cmd() merges env and how PLUGINS is interpreted.
RUN_TEST_ENV = {
'PLUGINS': 'favicon',
'SAVE_FAVICON': 'True',
}
class TestRunWithCrawl:
"""Tests for `archivebox run` with Crawl input."""
@@ -31,6 +36,7 @@ class TestRunWithCrawl:
stdin=json.dumps(crawl_record),
data_dir=initialized_archive,
timeout=120,
env=RUN_TEST_ENV,
)
assert code == 0, f"Command failed: {stderr}"
@@ -46,7 +52,7 @@ class TestRunWithCrawl:
url = create_test_url()
# First create a crawl
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
crawl = parse_jsonl_output(stdout1)[0]
# Run with the existing crawl
@@ -55,6 +61,7 @@ class TestRunWithCrawl:
stdin=json.dumps(crawl),
data_dir=initialized_archive,
timeout=120,
env=RUN_TEST_ENV,
)
assert code == 0
@@ -74,6 +81,7 @@ class TestRunWithSnapshot:
stdin=json.dumps(snapshot_record),
data_dir=initialized_archive,
timeout=120,
env=RUN_TEST_ENV,
)
assert code == 0, f"Command failed: {stderr}"
@@ -88,7 +96,7 @@ class TestRunWithSnapshot:
url = create_test_url()
# First create a snapshot
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
snapshot = parse_jsonl_output(stdout1)[0]
# Run with the existing snapshot
@@ -97,6 +105,7 @@ class TestRunWithSnapshot:
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
timeout=120,
env=RUN_TEST_ENV,
)
assert code == 0
@@ -113,6 +122,7 @@ class TestRunWithSnapshot:
stdin=json.dumps(url_record),
data_dir=initialized_archive,
timeout=120,
env=RUN_TEST_ENV,
)
assert code == 0
@@ -128,13 +138,14 @@ class TestRunWithArchiveResult:
url = create_test_url()
# Create snapshot and archive result
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
env=RUN_TEST_ENV,
)
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
@@ -144,6 +155,7 @@ class TestRunWithArchiveResult:
['archiveresult', 'update', '--status=failed'],
stdin=json.dumps(ar),
data_dir=initialized_archive,
env=RUN_TEST_ENV,
)
# Now run should re-queue it
@@ -152,6 +164,7 @@ class TestRunWithArchiveResult:
stdin=json.dumps(ar),
data_dir=initialized_archive,
timeout=120,
env=RUN_TEST_ENV,
)
assert code == 0
@@ -189,6 +202,7 @@ class TestRunPassThrough:
stdin=json.dumps(crawl_record),
data_dir=initialized_archive,
timeout=120,
env=RUN_TEST_ENV,
)
assert code == 0
@@ -217,6 +231,7 @@ class TestRunMixedInput:
stdin=stdin,
data_dir=initialized_archive,
timeout=120,
env=RUN_TEST_ENV,
)
assert code == 0

View File

@@ -53,13 +53,13 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor
# Add multiple snapshots
subprocess.run(
['archivebox', 'add', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=90,
)
subprocess.run(
['archivebox', 'add', '--depth=0', 'https://example.org'],
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.org'],
capture_output=True,
env=disable_extractors_dict,
timeout=90,
@@ -83,7 +83,7 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
# Add snapshots
subprocess.run(
['archivebox', 'add', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=90,
@@ -120,7 +120,7 @@ def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extrac
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=90,

View File

@@ -13,7 +13,7 @@ import tempfile
import unittest
from pathlib import Path
from .test_migrations_helpers import (
from .migrations_helpers import (
SCHEMA_0_4,
seed_0_4_data,
run_archivebox,

View File

@@ -14,7 +14,7 @@ import tempfile
import unittest
from pathlib import Path
from .test_migrations_helpers import (
from .migrations_helpers import (
SCHEMA_0_7,
seed_0_7_data,
run_archivebox,

View File

@@ -18,7 +18,7 @@ import tempfile
import unittest
from pathlib import Path
from .test_migrations_helpers import (
from .migrations_helpers import (
SCHEMA_0_8,
seed_0_8_data,
run_archivebox,

View File

@@ -11,7 +11,7 @@ import tempfile
import unittest
from pathlib import Path
from .test_migrations_helpers import run_archivebox
from .migrations_helpers import run_archivebox
class TestFreshInstall(unittest.TestCase):

View File

@@ -35,25 +35,21 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
"SAVE_YTDLP": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
"SAVE_FAVICON": "false",
# Enable chrome session (required for background hooks to start)
"USE_CHROME": "true",
# Parser extractors enabled by default
"SAVE_FAVICON": "true",
"SAVE_WGET": "true",
})
# Start a crawl with depth=1
proc = subprocess.Popen(
['archivebox', 'add', '--depth=1', 'https://monadical.com'],
['archivebox', 'add', '--depth=1', '--plugins=favicon,wget,parse_html_urls', 'https://monadical.com'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env,
)
# Give orchestrator time to run all Crawl hooks and create snapshot
# First crawl in a new data dir: ~10-20s (install hooks do full binary lookups)
# Subsequent crawls: ~3-5s (Machine config cached, hooks exit early)
time.sleep(25)
# Give the background hook + parser enough time to create and process the root snapshot.
time.sleep(20)
# Kill the process
proc.kill()
@@ -141,7 +137,7 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
# Add a URL with depth=0 (no recursion yet)
proc = subprocess.Popen(
['archivebox', 'add', '--depth=0', 'https://monadical.com'],
['archivebox', 'add', '--depth=0', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -297,7 +293,7 @@ def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extract
# Start a crawl with depth=1
proc = subprocess.Popen(
['archivebox', 'add', '--depth=1', 'https://monadical.com'],
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -371,7 +367,7 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--depth=1', 'https://monadical.com'],
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', 'https://monadical.com'],
capture_output=True,
text=True,
env=disable_extractors_dict,
@@ -403,15 +399,15 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p
# Start a crawl
env = os.environ.copy()
env.update({
"USE_WGET": "false",
"USE_SINGLEFILE": "false",
"SAVE_WGET": "true",
"SAVE_SINGLEFILE": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"USE_CHROME": "true", # Enables background hooks
"SAVE_FAVICON": "true",
})
proc = subprocess.Popen(
['archivebox', 'add', 'https://monadical.com'],
['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', 'https://monadical.com'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -430,7 +426,7 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p
# Get background hooks that are started
bg_started = c.execute(
"SELECT plugin FROM core_archiveresult WHERE plugin IN ('consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status = 'started'"
"SELECT plugin FROM core_archiveresult WHERE plugin IN ('favicon') AND status = 'started'"
).fetchall()
# Get parser extractors that should be queued or better

View File

@@ -19,6 +19,7 @@ Config priority order (highest to lowest):
import os
import json
import sys
import tempfile
import subprocess
import time
@@ -51,7 +52,7 @@ def test_config_propagation_through_worker_hierarchy():
# Step 1: Initialize archive
print("Step 1: Initialize archive")
result = subprocess.run(
['python', '-m', 'archivebox', 'init'],
[sys.executable, '-m', 'archivebox', 'init'],
cwd=str(data_dir),
env={
**os.environ,
@@ -111,7 +112,7 @@ machine.save()
print(f"Machine {{machine.hostname}} config updated")
"""
result = subprocess.run(
['python', '-c', set_machine_config_script],
[sys.executable, '-c', set_machine_config_script],
cwd=str(data_dir.parent),
env={
**os.environ,
@@ -149,7 +150,7 @@ crawl = Crawl.objects.create(
print(crawl.id)
"""
result = subprocess.run(
['python', '-c', create_crawl_script],
[sys.executable, '-c', create_crawl_script],
cwd=str(data_dir.parent),
env={
**os.environ,
@@ -193,7 +194,7 @@ snapshot = Snapshot.objects.create(
print(snapshot.id)
"""
result = subprocess.run(
['python', '-c', create_snapshot_script],
[sys.executable, '-c', create_snapshot_script],
cwd=str(data_dir.parent),
env={
**os.environ,
@@ -211,7 +212,7 @@ print(snapshot.id)
# Step 5: Run SnapshotWorker with additional env var
print("Step 5: Run SnapshotWorker with ENV_VAR_KEY=from_environment")
result = subprocess.run(
['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
[sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
cwd=str(data_dir),
env={
**os.environ,
@@ -238,14 +239,9 @@ print(snapshot.id)
# Check that SnapshotWorker ran successfully
assert result.returncode == 0, f"SnapshotWorker failed with exit code {result.returncode}\n{stderr}"
# Verify config by checking stderr debug output and ArchiveResults in database
# Verify config by checking ArchiveResults and merged config state
print("\n--- Verifying config propagation ---\n")
# Check for config debug messages in stderr
assert "DEBUG: NO PLUGINS whitelist in config" in stderr, \
"Expected debug output not found in stderr"
print("✓ Config debug output found in stderr")
# Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults
verify_precedence_script = f"""
import os
@@ -291,7 +287,7 @@ assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Mach
print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults")
"""
result = subprocess.run(
['python', '-c', verify_precedence_script],
[sys.executable, '-c', verify_precedence_script],
cwd=str(data_dir.parent),
env={
**os.environ,
@@ -332,29 +328,29 @@ config = get_config(snapshot=snapshot)
# 1. Snapshot.config (highest priority)
timeout = config.get('TIMEOUT')
print(f" 1. Snapshot.config: TIMEOUT={timeout} (expected: 555)")
print(f" 1. Snapshot.config: TIMEOUT={{timeout}} (expected: 555)")
assert timeout == 555, f"TIMEOUT should be 555 from snapshot.config, got {{timeout}}"
wget_enabled = config.get('SAVE_WGET')
print(f" 1. Snapshot.config: SAVE_WGET={wget_enabled} (expected: False)")
assert wget_enabled == False, f"SAVE_WGET should be False from snapshot.config, got {{wget_enabled}}"
wget_enabled = config.get('WGET_ENABLED')
print(f" 1. Snapshot.config: WGET_ENABLED={{wget_enabled}} (expected: False)")
assert wget_enabled == False, f"WGET_ENABLED should be False from snapshot.config, got {{wget_enabled}}"
custom_snapshot = config.get('CUSTOM_SNAPSHOT_KEY')
print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={custom_snapshot} (expected: from_snapshot_json)")
print(f" 1. Snapshot.config: CUSTOM_SNAPSHOT_KEY={{custom_snapshot}} (expected: from_snapshot_json)")
assert custom_snapshot == 'from_snapshot_json', f"Expected from_snapshot_json, got {{custom_snapshot}}"
# 2. Crawl.config
custom_crawl = config.get('CUSTOM_CRAWL_KEY')
print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={custom_crawl} (expected: from_crawl_json)")
print(f" 2. Crawl.config: CUSTOM_CRAWL_KEY={{custom_crawl}} (expected: from_crawl_json)")
assert custom_crawl == 'from_crawl_json', f"Expected from_crawl_json, got {{custom_crawl}}"
# 6. Machine.config
custom_machine = config.get('CUSTOM_MACHINE_KEY')
print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={custom_machine} (expected: from_machine_config)")
print(f" 6. Machine.config: CUSTOM_MACHINE_KEY={{custom_machine}} (expected: from_machine_config)")
assert custom_machine == 'from_machine_config', f"Expected from_machine_config, got {{custom_machine}}"
wget_binary = config.get('WGET_BINARY')
print(f" 6. Machine.config: WGET_BINARY={wget_binary} (expected: /custom/machine/wget)")
print(f" 6. Machine.config: WGET_BINARY={{wget_binary}} (expected: /custom/machine/wget)")
# Note: This might be overridden by environment or other sources, just check it's present
assert wget_binary is not None, f"WGET_BINARY should be present"
@@ -384,7 +380,7 @@ print("✓ Config priority order verified")
print("✓ Snapshot successfully sealed")
"""
result = subprocess.run(
['python', '-c', verify_script],
[sys.executable, '-c', verify_script],
cwd=str(data_dir.parent),
env={
**os.environ,
@@ -424,7 +420,7 @@ def test_config_environment_variable_parsing():
# Initialize archive
result = subprocess.run(
['python', '-m', 'archivebox', 'init'],
[sys.executable, '-m', 'archivebox', 'init'],
cwd=str(data_dir),
env={
**os.environ,
@@ -514,7 +510,7 @@ print("\\n✓ All config values correctly parsed from environment")
"""
result = subprocess.run(
['python', '-c', test_config_types_script],
[sys.executable, '-c', test_config_types_script],
cwd=str(data_dir.parent),
env={
**os.environ,
@@ -561,7 +557,7 @@ def test_parent_environment_preserved_in_hooks():
# Initialize archive
print("Step 1: Initialize archive")
result = subprocess.run(
['python', '-m', 'archivebox', 'init'],
[sys.executable, '-m', 'archivebox', 'init'],
cwd=str(data_dir),
env={
**os.environ,
@@ -602,7 +598,7 @@ snapshot = Snapshot.objects.create(
print(snapshot.id)
"""
result = subprocess.run(
['python', '-c', create_snapshot_script],
[sys.executable, '-c', create_snapshot_script],
cwd=str(data_dir.parent),
env={
**os.environ,
@@ -619,7 +615,7 @@ print(snapshot.id)
# Run SnapshotWorker with custom parent environment variable
print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process")
result = subprocess.run(
['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
[sys.executable, '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
cwd=str(data_dir),
env={
**os.environ,
@@ -706,7 +702,7 @@ if node_path:
print("\\n✓ All environment checks passed")
"""
result = subprocess.run(
['python', '-c', verify_env_script],
[sys.executable, '-c', verify_env_script],
cwd=str(data_dir.parent),
env={
**os.environ,
@@ -755,7 +751,7 @@ def test_config_auto_fetch_relationships():
# Initialize archive
print("Step 1: Initialize archive")
result = subprocess.run(
['python', '-m', 'archivebox', 'init'],
[sys.executable, '-m', 'archivebox', 'init'],
cwd=str(data_dir),
env={
**os.environ,
@@ -846,7 +842,7 @@ print("\\n✓ All auto-fetch tests passed")
"""
result = subprocess.run(
['python', '-c', create_objects_script],
[sys.executable, '-c', create_objects_script],
cwd=str(data_dir.parent),
env={
**os.environ,
@@ -900,7 +896,7 @@ def test_config_precedence_with_environment_vars():
# Initialize
result = subprocess.run(
['python', '-m', 'archivebox', 'init'],
[sys.executable, '-m', 'archivebox', 'init'],
cwd=str(data_dir),
env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
capture_output=True,
@@ -962,7 +958,7 @@ print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999)
"""
result = subprocess.run(
['python', '-c', test_script],
[sys.executable, '-c', test_script],
cwd=str(data_dir.parent),
capture_output=True,
timeout=30,
@@ -1000,7 +996,7 @@ def test_new_environment_variables_added():
# Initialize
result = subprocess.run(
['python', '-m', 'archivebox', 'init'],
[sys.executable, '-m', 'archivebox', 'init'],
cwd=str(data_dir),
env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
capture_output=True,
@@ -1041,7 +1037,7 @@ print("✓ Lowercase environment variables ignored")
"""
result = subprocess.run(
['python', '-c', test_script],
[sys.executable, '-c', test_script],
cwd=str(data_dir.parent),
capture_output=True,
timeout=30,