mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Reuse cached binaries in archivebox runtime
This commit is contained in:
@@ -518,7 +518,6 @@ def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
|
||||
event = BinaryRequestEvent(
|
||||
name="wget",
|
||||
plugin_name="wget",
|
||||
hook_name="on_Install__10_wget.finite.bg",
|
||||
output_dir="/tmp/wget",
|
||||
binproviders="provider",
|
||||
)
|
||||
|
||||
@@ -133,7 +133,13 @@ def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch):
|
||||
"description": "Example config used to verify plugin metadata rendering.",
|
||||
"type": "object",
|
||||
"required_plugins": ["chrome"],
|
||||
"required_binaries": ["example-cli"],
|
||||
"required_binaries": [
|
||||
{
|
||||
"name": "example-cli",
|
||||
"binproviders": "env,apt,brew",
|
||||
"min_version": None,
|
||||
},
|
||||
],
|
||||
"output_mimetypes": ["text/plain", "application/json"],
|
||||
"properties": {
|
||||
"EXAMPLE_ENABLED": {
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Unit tests for the ArchiveBox hook architecture.
|
||||
|
||||
Tests hook discovery, execution, JSONL parsing, background hook detection,
|
||||
binary lookup, and install hook XYZ_BINARY env var handling.
|
||||
binary lookup, and required_binaries XYZ_BINARY passthrough handling.
|
||||
|
||||
Run with:
|
||||
sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_hooks.py -v'
|
||||
@@ -126,8 +126,8 @@ not json at all
|
||||
self.assertEqual(records[0]["type"], "ArchiveResult")
|
||||
|
||||
|
||||
class TestInstallHookEnvVarHandling(unittest.TestCase):
|
||||
"""Test that install hooks respect XYZ_BINARY env vars."""
|
||||
class TestRequiredBinaryConfigHandling(unittest.TestCase):
|
||||
"""Test that required_binaries keep configured XYZ_BINARY values intact."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
@@ -139,39 +139,28 @@ class TestInstallHookEnvVarHandling(unittest.TestCase):
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_binary_env_var_absolute_path_handling(self):
|
||||
"""Install hooks should handle absolute paths in XYZ_BINARY."""
|
||||
# Test the logic that install hooks use
|
||||
"""Absolute binary paths should pass through unchanged."""
|
||||
configured_binary = "/custom/path/to/wget2"
|
||||
if "/" in configured_binary:
|
||||
bin_name = Path(configured_binary).name
|
||||
else:
|
||||
bin_name = configured_binary
|
||||
binary_name = configured_binary
|
||||
|
||||
self.assertEqual(bin_name, "wget2")
|
||||
self.assertEqual(binary_name, "/custom/path/to/wget2")
|
||||
|
||||
def test_binary_env_var_name_only_handling(self):
|
||||
"""Install hooks should handle binary names in XYZ_BINARY."""
|
||||
# Test the logic that install hooks use
|
||||
"""Binary command names should pass through unchanged."""
|
||||
configured_binary = "wget2"
|
||||
if "/" in configured_binary:
|
||||
bin_name = Path(configured_binary).name
|
||||
else:
|
||||
bin_name = configured_binary
|
||||
binary_name = configured_binary
|
||||
|
||||
self.assertEqual(bin_name, "wget2")
|
||||
self.assertEqual(binary_name, "wget2")
|
||||
|
||||
def test_binary_env_var_empty_default(self):
|
||||
"""Install hooks should use default when XYZ_BINARY is empty."""
|
||||
"""Empty configured values should fall back to config defaults."""
|
||||
configured_binary = ""
|
||||
if configured_binary:
|
||||
if "/" in configured_binary:
|
||||
bin_name = Path(configured_binary).name
|
||||
else:
|
||||
bin_name = configured_binary
|
||||
binary_name = configured_binary
|
||||
else:
|
||||
bin_name = "wget" # default
|
||||
binary_name = "wget"
|
||||
|
||||
self.assertEqual(bin_name, "wget")
|
||||
self.assertEqual(binary_name, "wget")
|
||||
|
||||
|
||||
class TestHookDiscovery(unittest.TestCase):
|
||||
@@ -187,7 +176,7 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
wget_dir = self.plugins_dir / "wget"
|
||||
wget_dir.mkdir()
|
||||
(wget_dir / "on_Snapshot__50_wget.py").write_text("# test hook")
|
||||
(wget_dir / "on_Install__10_wget.finite.bg.py").write_text("# install hook")
|
||||
(wget_dir / "on_BinaryRequest__10_wget.py").write_text("# binary request hook")
|
||||
|
||||
chrome_dir = self.plugins_dir / "chrome"
|
||||
chrome_dir.mkdir(exist_ok=True)
|
||||
@@ -299,7 +288,7 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
self.assertIn("on_BinaryRequest__10_npm.py", hook_names)
|
||||
|
||||
def test_discover_hooks_accepts_event_class_names(self):
|
||||
"""discover_hooks should accept InstallEvent / SnapshotEvent class names."""
|
||||
"""discover_hooks should accept BinaryRequestEvent / SnapshotEvent class names."""
|
||||
from archivebox import hooks as hooks_module
|
||||
|
||||
hooks_module.get_plugins.cache_clear()
|
||||
@@ -307,10 +296,10 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
|
||||
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
|
||||
):
|
||||
install_hooks = hooks_module.discover_hooks("InstallEvent", filter_disabled=False)
|
||||
binary_hooks = hooks_module.discover_hooks("BinaryRequestEvent", filter_disabled=False)
|
||||
snapshot_hooks = hooks_module.discover_hooks("SnapshotEvent", filter_disabled=False)
|
||||
|
||||
self.assertIn("on_Install__10_wget.finite.bg.py", [hook.name for hook in install_hooks])
|
||||
self.assertIn("on_BinaryRequest__10_wget.py", [hook.name for hook in binary_hooks])
|
||||
self.assertIn("on_Snapshot__50_wget.py", [hook.name for hook in snapshot_hooks])
|
||||
|
||||
def test_discover_hooks_returns_empty_for_non_hook_lifecycle_events(self):
|
||||
@@ -325,44 +314,6 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
self.assertEqual(hooks_module.discover_hooks("BinaryEvent", filter_disabled=False), [])
|
||||
self.assertEqual(hooks_module.discover_hooks("CrawlCleanupEvent", filter_disabled=False), [])
|
||||
|
||||
def test_discover_install_hooks_only_include_declared_plugin_dependencies(self):
|
||||
"""Install hook discovery should include required_plugins without broadening to provider plugins."""
|
||||
responses_dir = self.plugins_dir / "responses"
|
||||
responses_dir.mkdir()
|
||||
(responses_dir / "config.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"type": "object",
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
chrome_dir = self.plugins_dir / "chrome"
|
||||
chrome_dir.mkdir(exist_ok=True)
|
||||
(chrome_dir / "config.json").write_text('{"type": "object", "properties": {}}')
|
||||
(chrome_dir / "on_Install__70_chrome.finite.bg.py").write_text("# chrome install hook")
|
||||
|
||||
npm_dir = self.plugins_dir / "npm"
|
||||
npm_dir.mkdir()
|
||||
(npm_dir / "on_BinaryRequest__10_npm.py").write_text("# npm binary hook")
|
||||
(npm_dir / "on_Install__00_npm.py").write_text("# npm install hook")
|
||||
(npm_dir / "config.json").write_text('{"type": "object", "properties": {}}')
|
||||
|
||||
from archivebox import hooks as hooks_module
|
||||
|
||||
hooks_module.get_plugins.cache_clear()
|
||||
with (
|
||||
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
|
||||
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
|
||||
):
|
||||
hooks = hooks_module.discover_hooks("Install", config={"PLUGINS": "responses"})
|
||||
|
||||
hook_names = [hook.name for hook in hooks]
|
||||
self.assertIn("on_Install__70_chrome.finite.bg.py", hook_names)
|
||||
self.assertNotIn("on_Install__00_npm.py", hook_names)
|
||||
|
||||
|
||||
class TestGetExtractorName(unittest.TestCase):
|
||||
"""Test get_extractor_name() function."""
|
||||
@@ -484,8 +435,8 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.ge
|
||||
self.assertEqual(records[0]["url"], "https://example.com")
|
||||
|
||||
|
||||
class TestInstallHookOutput(unittest.TestCase):
|
||||
"""Test install hook output format compliance."""
|
||||
class TestDependencyRecordOutput(unittest.TestCase):
|
||||
"""Test dependency record output format compliance."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
@@ -495,8 +446,8 @@ class TestInstallHookOutput(unittest.TestCase):
|
||||
"""Clean up test environment."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_install_hook_outputs_binary(self):
|
||||
"""Install hook should output Binary JSONL when binary found."""
|
||||
def test_dependency_record_outputs_binary(self):
|
||||
"""Dependency resolution should output Binary JSONL when binary is found."""
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "Binary",
|
||||
@@ -515,8 +466,8 @@ class TestInstallHookOutput(unittest.TestCase):
|
||||
self.assertEqual(data["name"], "wget")
|
||||
self.assertTrue(data["abspath"].startswith("/"))
|
||||
|
||||
def test_install_hook_outputs_machine_config(self):
|
||||
"""Install hook should output Machine config update JSONL."""
|
||||
def test_dependency_record_outputs_machine_config(self):
|
||||
"""Dependency resolution should output Machine config update JSONL."""
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "Machine",
|
||||
|
||||
69
archivebox/tests/test_process_service.py
Normal file
69
archivebox/tests/test_process_service.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from abx_dl.events import ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.orchestrator import create_bus
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
def test_process_service_emits_process_started_from_inline_process_event(monkeypatch):
|
||||
from archivebox.services import process_service as process_service_module
|
||||
from archivebox.services.process_service import ProcessService
|
||||
|
||||
bus = create_bus(name="test_process_service_inline_process_event")
|
||||
ProcessService(bus)
|
||||
|
||||
monkeypatch.setattr(
|
||||
process_service_module,
|
||||
"_ensure_worker",
|
||||
lambda event: {
|
||||
"pid": 4321,
|
||||
"start": 1711111111.0,
|
||||
"statename": "RUNNING",
|
||||
"exitstatus": 0,
|
||||
},
|
||||
)
|
||||
|
||||
async def run_test():
|
||||
await bus.emit(
|
||||
ProcessStdoutEvent(
|
||||
line=json.dumps(
|
||||
{
|
||||
"type": "ProcessEvent",
|
||||
"plugin_name": "search_backend_sonic",
|
||||
"hook_name": "worker_sonic",
|
||||
"hook_path": "/usr/bin/sonic",
|
||||
"hook_args": ["-c", "/tmp/sonic/config.cfg"],
|
||||
"is_background": True,
|
||||
"daemon": True,
|
||||
"url": "tcp://127.0.0.1:1491",
|
||||
"output_dir": "/tmp/sonic",
|
||||
"env": {},
|
||||
"process_type": "worker",
|
||||
"worker_type": "sonic",
|
||||
"process_id": "worker:sonic",
|
||||
"output_str": "127.0.0.1:1491",
|
||||
},
|
||||
),
|
||||
plugin_name="search_backend_sonic",
|
||||
hook_name="on_CrawlSetup__55_sonic_start.py",
|
||||
output_dir="/tmp/search_backend_sonic",
|
||||
snapshot_id="snap-1",
|
||||
process_id="proc-hook",
|
||||
),
|
||||
)
|
||||
started = await bus.find(ProcessStartedEvent, process_id="worker:sonic")
|
||||
await bus.stop()
|
||||
return started
|
||||
|
||||
started = asyncio.run(run_test())
|
||||
assert started is not None
|
||||
assert started.hook_name == "worker_sonic"
|
||||
assert started.process_type == "worker"
|
||||
assert started.worker_type == "sonic"
|
||||
assert getattr(started, "url", "") == "tcp://127.0.0.1:1491"
|
||||
assert getattr(started, "output_str", "") == "127.0.0.1:1491"
|
||||
@@ -46,7 +46,7 @@ async def _call_sync(func, *args, **kwargs):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -87,13 +87,13 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
|
||||
download_calls = []
|
||||
|
||||
async def fake_download(*, url, bus, config_overrides, snapshot, **kwargs):
|
||||
async def fake_download(*, url, bus, snapshot, **kwargs):
|
||||
download_calls.append(
|
||||
{
|
||||
"url": url,
|
||||
"bus": bus,
|
||||
"snapshot_id": config_overrides["SNAPSHOT_ID"],
|
||||
"source_url": config_overrides["SOURCE_URL"],
|
||||
"snapshot_id": snapshot.id,
|
||||
"source_url": snapshot.url,
|
||||
"abx_snapshot_id": snapshot.id,
|
||||
},
|
||||
)
|
||||
@@ -146,8 +146,8 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
assert len(download_calls) == 2
|
||||
assert {call["snapshot_id"] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
|
||||
assert {call["source_url"] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
|
||||
assert len({id(call["bus"]) for call in download_calls}) == 2
|
||||
assert len(created_buses) == 3 # 1 crawl bus + 2 isolated snapshot buses
|
||||
assert len({id(call["bus"]) for call in download_calls}) == 1
|
||||
assert len(created_buses) == 1
|
||||
|
||||
|
||||
def test_ensure_background_runner_starts_when_none_running(monkeypatch):
|
||||
@@ -353,6 +353,62 @@ def test_installed_binary_config_overrides_include_valid_installed_binaries(monk
|
||||
assert overrides["NODE_PATH"] == "/tmp/shared-lib/npm/node_modules"
|
||||
|
||||
|
||||
def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_configurable_binary_keys(monkeypatch):
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
from archivebox.services import runner as runner_module
|
||||
from abx_dl.models import Plugin
|
||||
|
||||
machine = Machine.objects.create(
|
||||
guid="test-guid-runner-singlefile-cache",
|
||||
hostname="runner-host-singlefile",
|
||||
hw_in_docker=False,
|
||||
hw_in_vm=False,
|
||||
hw_manufacturer="Test",
|
||||
hw_product="Test Product",
|
||||
hw_uuid="test-hw-runner-singlefile-cache",
|
||||
os_arch="arm64",
|
||||
os_family="darwin",
|
||||
os_platform="macOS",
|
||||
os_release="14.0",
|
||||
os_kernel="Darwin",
|
||||
stats={},
|
||||
config={},
|
||||
)
|
||||
singlefile_extension = Binary.objects.create(
|
||||
machine=machine,
|
||||
name="singlefile",
|
||||
abspath="/tmp/shared-lib/bin/singlefile",
|
||||
version="1.0.0",
|
||||
binprovider="chromewebstore",
|
||||
binproviders="chromewebstore",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
|
||||
monkeypatch.setattr(Path, "is_file", lambda self: str(self) == singlefile_extension.abspath)
|
||||
monkeypatch.setattr(runner_module.os, "access", lambda path, mode: str(path) == singlefile_extension.abspath)
|
||||
|
||||
overrides = runner_module._installed_binary_config_overrides(
|
||||
{
|
||||
"singlefile": Plugin(
|
||||
name="singlefile",
|
||||
path=Path("."),
|
||||
hooks=[],
|
||||
config_schema={"SINGLEFILE_BINARY": {"type": "string", "default": "single-file"}},
|
||||
binaries=[
|
||||
{"name": "{SINGLEFILE_BINARY}", "binproviders": "env,npm"},
|
||||
{"name": "singlefile", "binproviders": "chromewebstore"},
|
||||
],
|
||||
),
|
||||
},
|
||||
config={"SINGLEFILE_BINARY": "single-file"},
|
||||
)
|
||||
|
||||
assert "SINGLEFILE_BINARY" not in overrides
|
||||
assert overrides["LIB_DIR"] == "/tmp/shared-lib"
|
||||
assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
|
||||
|
||||
|
||||
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
|
||||
import asgiref.sync
|
||||
|
||||
@@ -700,11 +756,9 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
||||
"_run_crawl_cleanup",
|
||||
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
|
||||
)
|
||||
monkeypatch.setattr(crawl, "cleanup", lambda: cleanup_calls.append("crawl_cleanup"))
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
assert cleanup_calls == ["crawl_cleanup", "abx_cleanup"]
|
||||
assert cleanup_calls == ["abx_cleanup"]
|
||||
|
||||
|
||||
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
|
||||
@@ -765,6 +819,9 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
|
||||
timeout=60,
|
||||
snapshot_id="snap-1",
|
||||
is_background=True,
|
||||
url="https://example.org/",
|
||||
process_type="hook",
|
||||
worker_type="hook",
|
||||
)
|
||||
|
||||
async def run_test():
|
||||
|
||||
Reference in New Issue
Block a user