Reuse cached binaries in archivebox runtime

This commit is contained in:
Nick Sweeting
2026-03-24 11:03:43 -07:00
parent 39450111dd
commit 50286d3c38
19 changed files with 714 additions and 564 deletions

View File

@@ -2,7 +2,6 @@ from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .machine_service import MachineService
from .process_request_service import ProcessRequestService
from .process_service import ProcessService
from .runner import run_binary, run_crawl, run_install, run_pending_crawls
from .snapshot_service import SnapshotService
@@ -13,7 +12,6 @@ __all__ = [
"BinaryService",
"CrawlService",
"MachineService",
"ProcessRequestService",
"ProcessService",
"SnapshotService",
"TagService",

View File

@@ -14,6 +14,23 @@ class BinaryService(BaseService):
async def on_BinaryRequestEvent__Outer(self, event: BinaryRequestEvent) -> None:
    """Project the request into the DB, then short-circuit with a cached binary if available."""
    await run_db_op(self._project_binary, event)
    cached = await run_db_op(self._load_cached_binary, event)
    if cached is None:
        return
    # Request-supplied values win; the cached DB row fills in the rest.
    resolved_event = BinaryEvent(
        name=event.name,
        plugin_name=event.plugin_name,
        hook_name=event.hook_name,
        abspath=cached["abspath"],
        version=cached["version"],
        sha256=cached["sha256"],
        binproviders=event.binproviders or cached["binproviders"],
        binprovider=cached["binprovider"],
        overrides=event.overrides or cached["overrides"],
        binary_id=event.binary_id,
        machine_id=event.machine_id or cached["machine_id"],
    )
    await self.bus.emit(resolved_event)
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
@@ -44,6 +61,29 @@ class BinaryService(BaseService):
},
)
def _load_cached_binary(self, event: BinaryRequestEvent) -> dict[str, str] | None:
    """Return metadata for the newest INSTALLED binary named *event.name* on this machine.

    Returns None when no installed row with a non-empty abspath exists.
    """
    from archivebox.machine.models import Binary, Machine

    candidates = (
        Binary.objects
        .filter(
            machine=Machine.current(),
            name=event.name,
            status=Binary.StatusChoices.INSTALLED,
        )
        .exclude(abspath="")
        .exclude(abspath__isnull=True)
        .order_by("-modified_at")
    )
    row = candidates.first()
    if row is None:
        return None
    return {
        "abspath": row.abspath,
        "version": row.version or "",
        "sha256": row.sha256 or "",
        "binproviders": row.binproviders or "",
        "binprovider": row.binprovider or "",
        "machine_id": str(row.machine_id),
        "overrides": row.overrides or {},
    }
def _resolve_installed_binary_metadata(self, event: BinaryEvent) -> dict[str, str]:
resolved = {
"abspath": event.abspath or "",
@@ -77,12 +117,11 @@ class BinaryService(BaseService):
"overrides": event.overrides or {},
}
binary = load_binary(spec)
resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
if provider_name:
resolved["binprovider"] = str(provider_name)
resolved["abspath"] = str(binary.abspath or resolved["abspath"] or "")
resolved["version"] = str(binary.version or resolved["version"] or "")
resolved["sha256"] = str(binary.sha256 or resolved["sha256"] or "")
if binary.loaded_binprovider is not None and binary.loaded_binprovider.name:
resolved["binprovider"] = str(binary.loaded_binprovider.name)
except Exception:
pass

View File

@@ -14,13 +14,13 @@ class MachineService(BaseService):
await run_db_op(self._project, event)
def _project(self, event: MachineEvent) -> None:
from archivebox.machine.models import Machine
from archivebox.machine.models import Machine, _sanitize_machine_config
machine = Machine.current()
config = dict(machine.config or {})
if event.config is not None:
config.update(event.config)
config.update(_sanitize_machine_config(event.config))
elif event.method == "update":
key = event.key.replace("config/", "", 1).strip()
if key:
@@ -28,5 +28,5 @@ class MachineService(BaseService):
else:
return
machine.config = config
machine.config = _sanitize_machine_config(config)
machine.save(update_fields=["config", "modified_at"])

View File

@@ -1,179 +0,0 @@
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
import json
from pathlib import Path
import shlex
import socket
import time
from typing import ClassVar
from abxbus import BaseEvent
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.services.base import BaseService
def _is_port_listening(host: str, port: int) -> bool:
if not host or not port:
return False
try:
with socket.create_connection((host, port), timeout=0.5):
return True
except OSError:
return False
def _supervisor_env(env: dict[str, str]) -> str:
pairs = []
for key, value in env.items():
escaped = value.replace('"', '\\"')
pairs.append(f'{key}="{escaped}"')
return ",".join(pairs)
def _iso_from_epoch(value: object) -> str:
if not isinstance(value, (int, float)) or value <= 0:
return ""
return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Ensure a supervisord-managed worker for *process_event* is running.

    Returns the supervisord process-info dict for the worker ({} if none).
    """
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    output_dir = Path(process_event.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    worker_name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)
    existing = get_worker(supervisor, worker_name)
    # Reuse a RUNNING worker, but only if its startup socket (when configured)
    # is actually accepting connections.
    if (
        isinstance(existing, dict)
        and existing.get("statename") == "RUNNING"
        and (
            not process_event.daemon_startup_host
            or not process_event.daemon_startup_port
            or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
        )
    ):
        return existing
    # supervisord program definition; autostart stays off so start_worker controls launch.
    daemon = {
        "name": worker_name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(output_dir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        daemon["environment"] = _supervisor_env(process_event.env)
    proc = start_worker(supervisor, daemon)
    # Poll until RUNNING (and the socket answers, when configured); minimum wait 0.5s.
    deadline = time.monotonic() + max(float(process_event.daemon_startup_timeout), 0.5)
    while time.monotonic() < deadline:
        current = get_worker(supervisor, worker_name)
        if isinstance(current, dict) and current.get("statename") == "RUNNING":
            if (
                not process_event.daemon_startup_host
                or not process_event.daemon_startup_port
                or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
            ):
                return current
        time.sleep(0.1)
    # Timed out: fall back to whatever start_worker returned (may not be RUNNING).
    return proc if isinstance(proc, dict) else {}
class ProcessRequestService(BaseService):
    """Turns ProcessEvent JSON lines found on a child's stdout into process/worker lifecycle events."""

    LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent]
    EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]

    async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
        """Parse one stdout line; re-emit plain ProcessEvents, or start a supervisord worker for daemons.

        Raises:
            RuntimeError: if a requested daemon worker failed to start.
        """
        # Only well-formed JSON objects tagged type=ProcessEvent are handled.
        try:
            record = json.loads(event.line)
        except (json.JSONDecodeError, ValueError):
            return
        if not isinstance(record, dict) or record.pop("type", "") != "ProcessEvent":
            return
        # Rebuild the event, defaulting missing fields from the carrying stdout event.
        process_event = ProcessEvent(
            plugin_name=record.get("plugin_name") or event.plugin_name,
            hook_name=record.get("hook_name") or "process_request",
            hook_path=record["hook_path"],
            hook_args=[str(arg) for arg in record.get("hook_args", [])],
            is_background=bool(record.get("is_background", True)),
            output_dir=record.get("output_dir") or event.output_dir,
            env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
            snapshot_id=record.get("snapshot_id") or event.snapshot_id,
            timeout=int(record.get("timeout") or 60),
            daemon=bool(record.get("daemon", False)),
            daemon_startup_host=str(record.get("daemon_startup_host") or ""),
            daemon_startup_port=int(record.get("daemon_startup_port") or 0),
            daemon_startup_timeout=float(record.get("daemon_startup_timeout") or 0.0),
            process_type=str(record.get("process_type") or ""),
            worker_type=str(record.get("worker_type") or ""),
            event_timeout=float(record.get("event_timeout") or 360.0),
            event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
        )
        # Non-daemon requests just go back on the bus for normal handling.
        if not process_event.daemon:
            await self.bus.emit(process_event)
            return
        # Daemon request: ensure the supervisord worker exists, off the event loop.
        proc = await asyncio.to_thread(_ensure_worker, process_event)
        process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
        start_ts = _iso_from_epoch(proc.get("start"))
        pid = int(proc.get("pid") or 0)
        statename = str(proc.get("statename") or "")
        exitstatus = int(proc.get("exitstatus") or 0)
        process_type = process_event.process_type or "worker"
        worker_type = process_event.worker_type or process_event.plugin_name
        if statename == "RUNNING" and pid:
            # Worker is up: report it as a started background process.
            await self.bus.emit(
                ProcessStartedEvent(
                    plugin_name=process_event.plugin_name,
                    hook_name=process_event.hook_name,
                    hook_path=process_event.hook_path,
                    hook_args=process_event.hook_args,
                    output_dir=process_event.output_dir,
                    env=process_event.env,
                    timeout=process_event.timeout,
                    pid=pid,
                    process_id=process_id,
                    snapshot_id=process_event.snapshot_id,
                    is_background=True,
                    process_type=process_type,
                    worker_type=worker_type,
                    start_ts=start_ts,
                ),
            )
            return
        # Startup failed: record a completed (failed) process, then surface the failure.
        stderr = (
            f"Worker {process_event.hook_name} failed to start"
            if not statename
            else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
        )
        await self.bus.emit(
            ProcessCompletedEvent(
                plugin_name=process_event.plugin_name,
                hook_name=process_event.hook_name,
                hook_path=process_event.hook_path,
                hook_args=process_event.hook_args,
                env=process_event.env,
                stdout="",
                stderr=stderr,
                exit_code=exitstatus or 1,
                output_dir=process_event.output_dir,
                is_background=True,
                process_id=process_id,
                snapshot_id=process_event.snapshot_id,
                pid=pid,
                process_type=process_type,
                worker_type=worker_type,
                start_ts=start_ts,
                end_ts=datetime.now(tz=timezone.utc).isoformat(),
            ),
        )
        raise RuntimeError(stderr)

View File

@@ -1,11 +1,19 @@
from __future__ import annotations
from datetime import datetime
from typing import TYPE_CHECKING
import asyncio
from datetime import datetime, timezone as datetime_timezone
import json
from pathlib import Path
import shlex
import socket
import time
from typing import TYPE_CHECKING, Any, ClassVar
from urllib.parse import urlparse
from django.utils import timezone
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abxbus import BaseEvent
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
@@ -14,6 +22,9 @@ if TYPE_CHECKING:
from archivebox.machine.models import Process
WORKER_READY_TIMEOUT = 10.0
def parse_event_datetime(value: str | None):
if not value:
return None
@@ -26,14 +37,218 @@ def parse_event_datetime(value: str | None):
return dt
def _is_port_listening(host: str, port: int) -> bool:
if not host or not port:
return False
try:
with socket.create_connection((host, port), timeout=0.5):
return True
except OSError:
return False
def _worker_socket_from_url(url: str) -> tuple[str, int] | None:
if not url:
return None
parsed = urlparse(url)
if parsed.scheme != "tcp" or not parsed.hostname or not parsed.port:
return None
return parsed.hostname, parsed.port
def _supervisor_env(env: dict[str, str]) -> str:
pairs = []
for key, value in env.items():
escaped = value.replace('"', '\\"')
pairs.append(f'{key}="{escaped}"')
return ",".join(pairs)
def _iso_from_epoch(value: object) -> str:
if not isinstance(value, (int, float)) or value <= 0:
return ""
return datetime.fromtimestamp(value, tz=datetime_timezone.utc).isoformat()
def _int_from_object(value: object) -> int:
if isinstance(value, bool):
return int(value)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
if isinstance(value, str):
try:
return int(value)
except ValueError:
return 0
return 0
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Ensure a supervisord-managed worker for *process_event* is running.

    Returns the supervisord process-info dict for the worker ({} if none).
    """
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    output_dir = Path(process_event.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    worker_name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)
    # Optional tcp://host:port readiness probe derived from the event's url.
    worker_socket = _worker_socket_from_url(getattr(process_event, "url", ""))
    existing = get_worker(supervisor, worker_name)
    # Reuse a RUNNING worker, but only if its socket (when known) is accepting connections.
    if (
        isinstance(existing, dict)
        and existing.get("statename") == "RUNNING"
        and (worker_socket is None or _is_port_listening(*worker_socket))
    ):
        return existing
    # supervisord program definition; autostart stays off so start_worker controls launch.
    daemon = {
        "name": worker_name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(output_dir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        daemon["environment"] = _supervisor_env(process_event.env)
    proc = start_worker(supervisor, daemon)
    # Poll until the worker is RUNNING (and its socket answers), up to WORKER_READY_TIMEOUT seconds.
    deadline = time.monotonic() + WORKER_READY_TIMEOUT
    while time.monotonic() < deadline:
        current = get_worker(supervisor, worker_name)
        if isinstance(current, dict) and current.get("statename") == "RUNNING":
            if worker_socket is None or _is_port_listening(*worker_socket):
                return current
        time.sleep(0.1)
    # Timed out: fall back to whatever start_worker returned (may not be RUNNING).
    return proc if isinstance(proc, dict) else {}
class ProcessService(BaseService):
LISTENS_TO = [ProcessStartedEvent, ProcessCompletedEvent]
EMITS = []
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent, ProcessStartedEvent, ProcessCompletedEvent]
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]
def __init__(self, bus):
    # Maps process keys to database Process ids for later projection lookups.
    # NOTE(review): initialized before super().__init__ so it exists when the
    # base class wires up event handlers — confirm against BaseService.
    self.process_ids: dict[str, str] = {}
    super().__init__(bus)
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
    """Decode a ProcessEvent JSON line from child stdout and act on it.

    Non-daemon events are re-emitted as-is; daemon events cause a
    supervisord worker to be (re)started, after which a
    ProcessStartedEvent or ProcessCompletedEvent is emitted.

    Raises:
        RuntimeError: if a requested daemon worker failed to start.
    """
    # Ignore lines that are not JSON objects tagged as ProcessEvent.
    try:
        record = json.loads(event.line)
    except (json.JSONDecodeError, ValueError):
        return
    if not isinstance(record, dict) or record.get("type") != "ProcessEvent":
        return
    # Any fields beyond the known set are forwarded verbatim onto the re-emitted events.
    passthrough_fields: dict[str, Any] = {
        key: value
        for key, value in record.items()
        if key
        not in {
            "type",
            "plugin_name",
            "hook_name",
            "hook_path",
            "hook_args",
            "is_background",
            "output_dir",
            "env",
            "snapshot_id",
            "process_id",
            "url",
            "timeout",
            "daemon",
            "process_type",
            "worker_type",
            "event_timeout",
            "event_handler_timeout",
        }
    }
    # Rebuild the ProcessEvent, defaulting missing fields from the carrying stdout event.
    process_event = ProcessEvent(
        plugin_name=record.get("plugin_name") or event.plugin_name,
        hook_name=record.get("hook_name") or "process",
        hook_path=record["hook_path"],
        hook_args=[str(arg) for arg in record.get("hook_args", [])],
        is_background=bool(record.get("is_background", True)),
        output_dir=record.get("output_dir") or event.output_dir,
        env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
        snapshot_id=record.get("snapshot_id") or event.snapshot_id,
        timeout=int(record.get("timeout") or 60),
        daemon=bool(record.get("daemon", False)),
        url=str(record.get("url") or ""),
        process_type=str(record.get("process_type") or ""),
        worker_type=str(record.get("worker_type") or ""),
        event_timeout=float(record.get("event_timeout") or 360.0),
        event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
        **passthrough_fields,
    )
    # Plain (non-daemon) process requests just go back on the bus.
    if not process_event.daemon:
        await self.bus.emit(process_event)
        return
    # Daemon request: ensure the supervisord worker exists, off the event loop.
    proc = await asyncio.to_thread(_ensure_worker, process_event)
    process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
    start_ts = _iso_from_epoch(proc.get("start"))
    pid = _int_from_object(proc.get("pid"))
    statename = str(proc.get("statename") or "")
    exitstatus = _int_from_object(proc.get("exitstatus"))
    process_type = process_event.process_type or "worker"
    worker_type = process_event.worker_type or process_event.plugin_name
    if statename == "RUNNING" and pid:
        # Worker is up: report it as a started background process.
        await self.bus.emit(
            ProcessStartedEvent(
                plugin_name=process_event.plugin_name,
                hook_name=process_event.hook_name,
                hook_path=process_event.hook_path,
                hook_args=process_event.hook_args,
                output_dir=process_event.output_dir,
                env=process_event.env,
                timeout=process_event.timeout,
                pid=pid,
                process_id=process_id,
                snapshot_id=process_event.snapshot_id,
                is_background=True,
                url=process_event.url,
                process_type=process_type,
                worker_type=worker_type,
                start_ts=start_ts,
                **passthrough_fields,
            ),
        )
        return
    # Startup failed: record a completed (failed) process, then surface the failure.
    stderr = (
        f"Worker {process_event.hook_name} failed to start"
        if not statename
        else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
    )
    await self.bus.emit(
        ProcessCompletedEvent(
            plugin_name=process_event.plugin_name,
            hook_name=process_event.hook_name,
            hook_path=process_event.hook_path,
            hook_args=process_event.hook_args,
            env=process_event.env,
            stdout="",
            stderr=stderr,
            exit_code=exitstatus or 1,
            output_dir=process_event.output_dir,
            is_background=True,
            process_id=process_id,
            snapshot_id=process_event.snapshot_id,
            pid=pid,
            url=process_event.url,
            process_type=process_type,
            worker_type=worker_type,
            start_ts=start_ts,
            end_ts=datetime.now(tz=datetime_timezone.utc).isoformat(),
            **passthrough_fields,
        ),
    )
    raise RuntimeError(stderr)
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
    """Project a started process into the database off the event loop."""
    await run_db_op(self._project_started, event)
@@ -51,7 +266,7 @@ class ProcessService(BaseService):
if db_process_id:
process = Process.objects.filter(id=db_process_id).first()
if process is not None:
if process.iface_id != iface.id or process.machine_id != iface.machine_id:
if getattr(process, "iface_id", None) != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
process.save(update_fields=["iface", "machine", "modified_at"])
@@ -84,6 +299,7 @@ class ProcessService(BaseService):
env=event.env,
timeout=getattr(event, "timeout", 60),
pid=event.pid or None,
url=getattr(event, "url", "") or None,
started_at=parse_event_datetime(getattr(event, "start_ts", "")),
status=Process.StatusChoices.RUNNING,
retry_at=None,
@@ -98,6 +314,7 @@ class ProcessService(BaseService):
process.env = event.env
process.timeout = event.timeout
process.pid = event.pid or None
process.url = getattr(event, "url", "") or process.url
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
@@ -113,6 +330,7 @@ class ProcessService(BaseService):
process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.pid = event.pid or process.pid
process.url = getattr(event, "url", "") or process.url
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import asyncio
import json
import os
import re
import shutil
import subprocess
import sys
@@ -28,8 +29,6 @@ from abx_dl.orchestrator import (
from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .machine_service import MachineService
from .process_request_service import ProcessRequestService
from .process_service import ProcessService
from .snapshot_service import SnapshotService
from .tag_service import TagService
@@ -58,28 +57,34 @@ def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str
)
def _binary_env_key(name: str) -> str:
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
return f"{normalized}_BINARY"
_TEMPLATE_NAME_RE = re.compile(r"^\{([A-Z0-9_]+)\}$")
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str, config: dict[str, Any]) -> list[str]:
    """Collect config keys that should point at *binary_name*'s abspath.

    Includes the canonical <NAME>_BINARY key (except for postlight-parser),
    any ``{TEMPLATE}`` binary-spec names whose configured value selects this
    binary, and plugin config-schema ``*_BINARY`` keys defaulting to it.
    The result is order-preserving and de-duplicated.

    NOTE(review): this span contained interleaved pre/post-diff lines (a
    superseded two-arg ``def`` and a stale ``keys.insert(0, key)``); this is
    the reconstructed post-change version.
    """
    keys: list[str] = []
    if binary_name != "postlight-parser":
        keys.append(_binary_env_key(binary_name))
    for plugin in plugins.values():
        # Binary specs may name a config template like "{WGET_BINARY}";
        # honor it only when the configured value actually selects this binary.
        for spec in plugin.binaries:
            template_name = str(spec.get("name") or "").strip()
            match = _TEMPLATE_NAME_RE.fullmatch(template_name)
            if match is None:
                continue
            key = match.group(1)
            configured_value = config.get(key)
            if configured_value is not None and str(configured_value).strip() == binary_name:
                keys.append(key)
        for key, prop in plugin.config_schema.items():
            if key.endswith("_BINARY") and prop.get("default") == binary_name:
                keys.append(key)
    return list(dict.fromkeys(keys))
def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str, str]:
def _installed_binary_config_overrides(plugins: dict[str, Plugin], config: dict[str, Any] | None = None) -> dict[str, str]:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
active_config = dict(config or {})
overrides: dict[str, str] = {}
shared_lib_dir: Path | None = None
pip_home: Path | None = None
@@ -98,7 +103,7 @@ def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str,
continue
if not resolved_path.is_file() or not os.access(resolved_path, os.X_OK):
continue
for key in _binary_config_keys_for_plugins(plugins, binary.name):
for key in _binary_config_keys_for_plugins(plugins, binary.name, active_config):
overrides[key] = binary.abspath
if resolved_path.parent.name == ".bin" and resolved_path.parent.parent.name == "node_modules":
@@ -231,10 +236,8 @@ class CrawlRunner:
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
self.plugins = discover_plugins()
self.process_service = ProcessService(self.bus)
self.machine_service = MachineService(self.bus)
self.binary_service = BinaryService(self.bus)
self.tag_service = TagService(self.bus)
self.process_request_service = ProcessRequestService(self.bus)
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
self.snapshot_service = SnapshotService(
@@ -250,32 +253,10 @@ class CrawlRunner:
self.abx_services = None
self.persona = None
self.base_config: dict[str, Any] = {}
self.derived_config: dict[str, Any] = {}
self.primary_url = ""
self._live_stream = None
def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
CrawlService(bus, crawl_id=str(self.crawl.id))
SnapshotService(
bus,
crawl_id=str(self.crawl.id),
schedule_snapshot=self.enqueue_snapshot if self.process_discovered_snapshots_inline else self.leave_snapshot_queued,
)
ArchiveResultService(bus, process_service=process_service)
abx_services = setup_abx_services(
bus,
plugins=self.plugins,
config_overrides=config_overrides,
auto_install=True,
emit_jsonl=False,
)
return bus, abx_services
async def run(self) -> None:
from asgiref.sync import sync_to_async
from archivebox.crawls.models import Crawl
@@ -292,6 +273,8 @@ class CrawlRunner:
**self.base_config,
"ABX_RUNTIME": "archivebox",
},
derived_config_overrides=self.derived_config,
persist_derived=False,
auto_install=True,
emit_jsonl=False,
)
@@ -369,7 +352,7 @@ class CrawlRunner:
current_process.save(update_fields=["iface", "machine", "modified_at"])
self.persona = self.crawl.resolve_persona()
self.base_config = get_config(crawl=self.crawl)
self.base_config.update(_installed_binary_config_overrides(self.plugins))
self.derived_config = _installed_binary_config_overrides(self.plugins, self.base_config)
self.base_config["ABX_RUNTIME"] = "archivebox"
if self.selected_plugins is None:
self.selected_plugins = _selected_plugins_from_config(self.base_config)
@@ -473,7 +456,6 @@ class CrawlRunner:
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
bus=self.bus,
emit_jsonl=False,
snapshot=setup_snapshot,
@@ -501,7 +483,6 @@ class CrawlRunner:
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
bus=self.bus,
emit_jsonl=False,
snapshot=cleanup_snapshot,
@@ -530,31 +511,22 @@ class CrawlRunner:
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
snapshot_bus, snapshot_services = self._create_projector_bus(
identifier=f"{self.crawl.id}_{snapshot['id']}",
config_overrides=snapshot["config"],
)
try:
_attach_bus_trace(snapshot_bus)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
bus=snapshot_bus,
bus=self.bus,
emit_jsonl=False,
snapshot=abx_snapshot,
skip_crawl_setup=True,
skip_crawl_cleanup=True,
)
await snapshot_services.process.wait_for_background_monitors()
finally:
current_task = asyncio.current_task()
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
self.snapshot_tasks.pop(snapshot_id, None)
await _stop_bus_trace(snapshot_bus)
await snapshot_bus.stop()
def _load_snapshot_run_data(self, snapshot_id: str):
from archivebox.core.models import Snapshot
@@ -615,19 +587,19 @@ async def _run_binary(binary_id: str) -> None:
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
plugins = discover_plugins()
config = get_config()
config.update(await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins))
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
ArchiveResultService(bus, process_service=process_service)
setup_abx_services(
bus,
plugins=plugins,
config_overrides=config,
derived_config_overrides=derived_config,
persist_derived=False,
auto_install=True,
emit_jsonl=False,
)
@@ -662,19 +634,19 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
plugins = discover_plugins()
config = get_config()
config.update(await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins))
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
ArchiveResultService(bus, process_service=process_service)
abx_services = setup_abx_services(
bus,
plugins=plugins,
config_overrides=config,
derived_config_overrides=derived_config,
persist_derived=False,
auto_install=True,
emit_jsonl=False,
)