Reuse cached binaries in archivebox runtime

This commit is contained in:
Nick Sweeting
2026-03-24 11:03:43 -07:00
parent 39450111dd
commit 50286d3c38
19 changed files with 714 additions and 564 deletions

View File

@@ -518,7 +518,6 @@ def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
event = BinaryRequestEvent(
name="wget",
plugin_name="wget",
hook_name="on_Install__10_wget.finite.bg",
output_dir="/tmp/wget",
binproviders="provider",
)

View File

@@ -133,7 +133,13 @@ def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch):
"description": "Example config used to verify plugin metadata rendering.",
"type": "object",
"required_plugins": ["chrome"],
"required_binaries": ["example-cli"],
"required_binaries": [
{
"name": "example-cli",
"binproviders": "env,apt,brew",
"min_version": None,
},
],
"output_mimetypes": ["text/plain", "application/json"],
"properties": {
"EXAMPLE_ENABLED": {

View File

@@ -3,7 +3,7 @@
Unit tests for the ArchiveBox hook architecture.
Tests hook discovery, execution, JSONL parsing, background hook detection,
binary lookup, and install hook XYZ_BINARY env var handling.
binary lookup, and required_binaries XYZ_BINARY passthrough handling.
Run with:
sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_hooks.py -v'
@@ -126,8 +126,8 @@ not json at all
self.assertEqual(records[0]["type"], "ArchiveResult")
class TestInstallHookEnvVarHandling(unittest.TestCase):
"""Test that install hooks respect XYZ_BINARY env vars."""
class TestRequiredBinaryConfigHandling(unittest.TestCase):
"""Test that required_binaries keep configured XYZ_BINARY values intact."""
def setUp(self):
"""Set up test environment."""
@@ -139,39 +139,28 @@ class TestInstallHookEnvVarHandling(unittest.TestCase):
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_binary_env_var_absolute_path_handling(self):
"""Install hooks should handle absolute paths in XYZ_BINARY."""
# Test the logic that install hooks use
"""Absolute binary paths should pass through unchanged."""
configured_binary = "/custom/path/to/wget2"
if "/" in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
binary_name = configured_binary
self.assertEqual(bin_name, "wget2")
self.assertEqual(binary_name, "/custom/path/to/wget2")
def test_binary_env_var_name_only_handling(self):
"""Install hooks should handle binary names in XYZ_BINARY."""
# Test the logic that install hooks use
"""Binary command names should pass through unchanged."""
configured_binary = "wget2"
if "/" in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
binary_name = configured_binary
self.assertEqual(bin_name, "wget2")
self.assertEqual(binary_name, "wget2")
def test_binary_env_var_empty_default(self):
"""Install hooks should use default when XYZ_BINARY is empty."""
"""Empty configured values should fall back to config defaults."""
configured_binary = ""
if configured_binary:
if "/" in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
binary_name = configured_binary
else:
bin_name = "wget" # default
binary_name = "wget"
self.assertEqual(bin_name, "wget")
self.assertEqual(binary_name, "wget")
class TestHookDiscovery(unittest.TestCase):
@@ -187,7 +176,7 @@ class TestHookDiscovery(unittest.TestCase):
wget_dir = self.plugins_dir / "wget"
wget_dir.mkdir()
(wget_dir / "on_Snapshot__50_wget.py").write_text("# test hook")
(wget_dir / "on_Install__10_wget.finite.bg.py").write_text("# install hook")
(wget_dir / "on_BinaryRequest__10_wget.py").write_text("# binary request hook")
chrome_dir = self.plugins_dir / "chrome"
chrome_dir.mkdir(exist_ok=True)
@@ -299,7 +288,7 @@ class TestHookDiscovery(unittest.TestCase):
self.assertIn("on_BinaryRequest__10_npm.py", hook_names)
def test_discover_hooks_accepts_event_class_names(self):
"""discover_hooks should accept InstallEvent / SnapshotEvent class names."""
"""discover_hooks should accept BinaryRequestEvent / SnapshotEvent class names."""
from archivebox import hooks as hooks_module
hooks_module.get_plugins.cache_clear()
@@ -307,10 +296,10 @@ class TestHookDiscovery(unittest.TestCase):
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
):
install_hooks = hooks_module.discover_hooks("InstallEvent", filter_disabled=False)
binary_hooks = hooks_module.discover_hooks("BinaryRequestEvent", filter_disabled=False)
snapshot_hooks = hooks_module.discover_hooks("SnapshotEvent", filter_disabled=False)
self.assertIn("on_Install__10_wget.finite.bg.py", [hook.name for hook in install_hooks])
self.assertIn("on_BinaryRequest__10_wget.py", [hook.name for hook in binary_hooks])
self.assertIn("on_Snapshot__50_wget.py", [hook.name for hook in snapshot_hooks])
def test_discover_hooks_returns_empty_for_non_hook_lifecycle_events(self):
@@ -325,44 +314,6 @@ class TestHookDiscovery(unittest.TestCase):
self.assertEqual(hooks_module.discover_hooks("BinaryEvent", filter_disabled=False), [])
self.assertEqual(hooks_module.discover_hooks("CrawlCleanupEvent", filter_disabled=False), [])
def test_discover_install_hooks_only_include_declared_plugin_dependencies(self):
"""Install hook discovery should include required_plugins without broadening to provider plugins."""
responses_dir = self.plugins_dir / "responses"
responses_dir.mkdir()
(responses_dir / "config.json").write_text(
json.dumps(
{
"type": "object",
"required_plugins": ["chrome"],
"properties": {},
},
),
)
chrome_dir = self.plugins_dir / "chrome"
chrome_dir.mkdir(exist_ok=True)
(chrome_dir / "config.json").write_text('{"type": "object", "properties": {}}')
(chrome_dir / "on_Install__70_chrome.finite.bg.py").write_text("# chrome install hook")
npm_dir = self.plugins_dir / "npm"
npm_dir.mkdir()
(npm_dir / "on_BinaryRequest__10_npm.py").write_text("# npm binary hook")
(npm_dir / "on_Install__00_npm.py").write_text("# npm install hook")
(npm_dir / "config.json").write_text('{"type": "object", "properties": {}}')
from archivebox import hooks as hooks_module
hooks_module.get_plugins.cache_clear()
with (
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
):
hooks = hooks_module.discover_hooks("Install", config={"PLUGINS": "responses"})
hook_names = [hook.name for hook in hooks]
self.assertIn("on_Install__70_chrome.finite.bg.py", hook_names)
self.assertNotIn("on_Install__00_npm.py", hook_names)
class TestGetExtractorName(unittest.TestCase):
"""Test get_extractor_name() function."""
@@ -484,8 +435,8 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.ge
self.assertEqual(records[0]["url"], "https://example.com")
class TestInstallHookOutput(unittest.TestCase):
"""Test install hook output format compliance."""
class TestDependencyRecordOutput(unittest.TestCase):
"""Test dependency record output format compliance."""
def setUp(self):
"""Set up test environment."""
@@ -495,8 +446,8 @@ class TestInstallHookOutput(unittest.TestCase):
"""Clean up test environment."""
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_install_hook_outputs_binary(self):
"""Install hook should output Binary JSONL when binary found."""
def test_dependency_record_outputs_binary(self):
"""Dependency resolution should output Binary JSONL when binary is found."""
hook_output = json.dumps(
{
"type": "Binary",
@@ -515,8 +466,8 @@ class TestInstallHookOutput(unittest.TestCase):
self.assertEqual(data["name"], "wget")
self.assertTrue(data["abspath"].startswith("/"))
def test_install_hook_outputs_machine_config(self):
"""Install hook should output Machine config update JSONL."""
def test_dependency_record_outputs_machine_config(self):
"""Dependency resolution should output Machine config update JSONL."""
hook_output = json.dumps(
{
"type": "Machine",

View File

@@ -0,0 +1,69 @@
import asyncio
import json
import pytest
from abx_dl.events import ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.orchestrator import create_bus
pytestmark = pytest.mark.django_db
def test_process_service_emits_process_started_from_inline_process_event(monkeypatch):
# NOTE(review): this block is rendered from a commit-diff view; the viewer has
# stripped leading indentation, so tokens below are preserved verbatim and only
# comments are added. Do not treat the flat layout as runnable source.
# Purpose: a stdout line whose content is an inline {"type": "ProcessEvent"}
# JSON payload should cause ProcessService to publish a ProcessStartedEvent
# carrying the payload's fields (not the outer stdout event's fields).
from archivebox.services import process_service as process_service_module
from archivebox.services.process_service import ProcessService
# Fresh, uniquely-named bus per test; constructing ProcessService registers
# its handlers on that bus.
bus = create_bus(name="test_process_service_inline_process_event")
ProcessService(bus)
# Stub the worker-spawning helper so no real process is started; returns the
# minimal process-info dict shape the service reads (pid/start/state/exit).
monkeypatch.setattr(
process_service_module,
"_ensure_worker",
lambda event: {
"pid": 4321,
"start": 1711111111.0,
"statename": "RUNNING",
"exitstatus": 0,
},
)
async def run_test():
# Emit a ProcessStdoutEvent whose `line` is a serialized ProcessEvent
# describing a background sonic search worker.
await bus.emit(
ProcessStdoutEvent(
line=json.dumps(
{
"type": "ProcessEvent",
"plugin_name": "search_backend_sonic",
"hook_name": "worker_sonic",
"hook_path": "/usr/bin/sonic",
"hook_args": ["-c", "/tmp/sonic/config.cfg"],
"is_background": True,
"daemon": True,
"url": "tcp://127.0.0.1:1491",
"output_dir": "/tmp/sonic",
"env": {},
"process_type": "worker",
"worker_type": "sonic",
"process_id": "worker:sonic",
"output_str": "127.0.0.1:1491",
},
),
plugin_name="search_backend_sonic",
hook_name="on_CrawlSetup__55_sonic_start.py",
output_dir="/tmp/search_backend_sonic",
snapshot_id="snap-1",
process_id="proc-hook",
),
)
# Look up the resulting ProcessStartedEvent by the payload's process_id.
started = await bus.find(ProcessStartedEvent, process_id="worker:sonic")
await bus.stop()
return started
started = asyncio.run(run_test())
assert started is not None
# hook_name must come from the inline payload ("worker_sonic"), not from the
# emitting hook's filename on the outer stdout event.
assert started.hook_name == "worker_sonic"
assert started.process_type == "worker"
assert started.worker_type == "sonic"
# getattr with a default guards against event versions lacking these fields
# — presumably optional attributes on ProcessStartedEvent; verify in abx_dl.
assert getattr(started, "url", "") == "tcp://127.0.0.1:1491"
assert getattr(started, "output_str", "") == "127.0.0.1:1491"

View File

@@ -46,7 +46,7 @@ async def _call_sync(func, *args, **kwargs):
return func(*args, **kwargs)
def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
@@ -87,13 +87,13 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
download_calls = []
async def fake_download(*, url, bus, config_overrides, snapshot, **kwargs):
async def fake_download(*, url, bus, snapshot, **kwargs):
download_calls.append(
{
"url": url,
"bus": bus,
"snapshot_id": config_overrides["SNAPSHOT_ID"],
"source_url": config_overrides["SOURCE_URL"],
"snapshot_id": snapshot.id,
"source_url": snapshot.url,
"abx_snapshot_id": snapshot.id,
},
)
@@ -146,8 +146,8 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
assert len(download_calls) == 2
assert {call["snapshot_id"] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
assert {call["source_url"] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
assert len({id(call["bus"]) for call in download_calls}) == 2
assert len(created_buses) == 3 # 1 crawl bus + 2 isolated snapshot buses
assert len({id(call["bus"]) for call in download_calls}) == 1
assert len(created_buses) == 1
def test_ensure_background_runner_starts_when_none_running(monkeypatch):
@@ -353,6 +353,62 @@ def test_installed_binary_config_overrides_include_valid_installed_binaries(monk
assert overrides["NODE_PATH"] == "/tmp/shared-lib/npm/node_modules"
def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_configurable_binary_keys(monkeypatch):
# NOTE(review): rendered from a commit-diff view with indentation stripped;
# tokens are byte-identical to the hunk, only comments are added.
# Purpose: an installed Binary row whose name matches a plugin's *hardcoded*
# binary entry ("singlefile" via chromewebstore) must NOT be written back into
# the user-configurable key (SINGLEFILE_BINARY), which is declared separately
# via the "{SINGLEFILE_BINARY}" templated entry.
from archivebox.machine.models import Binary, Machine
from archivebox.services import runner as runner_module
from abx_dl.models import Plugin
# Minimal Machine row so Binary.machine FK resolves; field values are dummies.
machine = Machine.objects.create(
guid="test-guid-runner-singlefile-cache",
hostname="runner-host-singlefile",
hw_in_docker=False,
hw_in_vm=False,
hw_manufacturer="Test",
hw_product="Test Product",
hw_uuid="test-hw-runner-singlefile-cache",
os_arch="arm64",
os_family="darwin",
os_platform="macOS",
os_release="14.0",
os_kernel="Darwin",
stats={},
config={},
)
# Installed binary named "singlefile" from the chromewebstore provider —
# this matches the plugin's hardcoded (non-configurable) binary entry below.
singlefile_extension = Binary.objects.create(
machine=machine,
name="singlefile",
abspath="/tmp/shared-lib/bin/singlefile",
version="1.0.0",
binprovider="chromewebstore",
binproviders="chromewebstore",
status=Binary.StatusChoices.INSTALLED,
)
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
# Make only the cached abspath look like an existing executable file so the
# override logic considers exactly this one binary valid on disk.
monkeypatch.setattr(Path, "is_file", lambda self: str(self) == singlefile_extension.abspath)
monkeypatch.setattr(runner_module.os, "access", lambda path, mode: str(path) == singlefile_extension.abspath)
overrides = runner_module._installed_binary_config_overrides(
{
"singlefile": Plugin(
name="singlefile",
path=Path("."),
hooks=[],
config_schema={"SINGLEFILE_BINARY": {"type": "string", "default": "single-file"}},
binaries=[
# Configurable entry: name is templated from SINGLEFILE_BINARY.
{"name": "{SINGLEFILE_BINARY}", "binproviders": "env,npm"},
# Hardcoded artifact entry: fixed name, chromewebstore only.
{"name": "singlefile", "binproviders": "chromewebstore"},
],
),
},
config={"SINGLEFILE_BINARY": "single-file"},
)
# The hardcoded artifact must not hijack the configurable key…
assert "SINGLEFILE_BINARY" not in overrides
# …but the shared lib dirs derived from the cached abspath are still exposed.
assert overrides["LIB_DIR"] == "/tmp/shared-lib"
assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
import asgiref.sync
@@ -700,11 +756,9 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
"_run_crawl_cleanup",
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
)
monkeypatch.setattr(crawl, "cleanup", lambda: cleanup_calls.append("crawl_cleanup"))
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
assert cleanup_calls == ["crawl_cleanup", "abx_cleanup"]
assert cleanup_calls == ["abx_cleanup"]
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
@@ -765,6 +819,9 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
timeout=60,
snapshot_id="snap-1",
is_background=True,
url="https://example.org/",
process_type="hook",
worker_type="hook",
)
async def run_test():