mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Reuse cached binaries in archivebox runtime
This commit is contained in:
@@ -2,7 +2,6 @@ from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
from .crawl_service import CrawlService
|
||||
from .machine_service import MachineService
|
||||
from .process_request_service import ProcessRequestService
|
||||
from .process_service import ProcessService
|
||||
from .runner import run_binary, run_crawl, run_install, run_pending_crawls
|
||||
from .snapshot_service import SnapshotService
|
||||
@@ -13,7 +12,6 @@ __all__ = [
|
||||
"BinaryService",
|
||||
"CrawlService",
|
||||
"MachineService",
|
||||
"ProcessRequestService",
|
||||
"ProcessService",
|
||||
"SnapshotService",
|
||||
"TagService",
|
||||
|
||||
@@ -14,6 +14,23 @@ class BinaryService(BaseService):
|
||||
|
||||
async def on_BinaryRequestEvent__Outer(self, event: BinaryRequestEvent) -> None:
    """Project the binary request into the DB, then short-circuit via cache.

    If a previously INSTALLED binary matching the request exists on this
    machine, emit a BinaryEvent immediately so downstream consumers can
    reuse it instead of re-resolving/installing.
    """
    await run_db_op(self._project_binary, event)
    cached = await run_db_op(self._load_cached_binary, event)
    if cached is not None:
        await self.bus.emit(
            BinaryEvent(
                name=event.name,
                plugin_name=event.plugin_name,
                hook_name=event.hook_name,
                abspath=cached["abspath"],
                version=cached["version"],
                sha256=cached["sha256"],
                # values supplied on the request win over cached ones
                binproviders=event.binproviders or cached["binproviders"],
                binprovider=cached["binprovider"],
                overrides=event.overrides or cached["overrides"],
                binary_id=event.binary_id,
                machine_id=event.machine_id or cached["machine_id"],
            ),
        )
|
||||
|
||||
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
|
||||
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
|
||||
@@ -44,6 +61,29 @@ class BinaryService(BaseService):
|
||||
},
|
||||
)
|
||||
|
||||
def _load_cached_binary(self, event: BinaryRequestEvent) -> dict[str, str] | None:
    """Return metadata of the newest INSTALLED binary named ``event.name`` on this machine.

    Runs synchronously (intended to be called via run_db_op). Returns None
    when no installed binary with a non-empty abspath exists.
    """
    from archivebox.machine.models import Binary, Machine

    machine = Machine.current()
    installed = (
        Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
        .exclude(abspath="")
        .exclude(abspath__isnull=True)
        .order_by("-modified_at")  # most recently modified install wins
        .first()
    )
    if installed is None:
        return None
    return {
        "abspath": installed.abspath,
        "version": installed.version or "",
        "sha256": installed.sha256 or "",
        "binproviders": installed.binproviders or "",
        "binprovider": installed.binprovider or "",
        "machine_id": str(installed.machine_id),
        # NOTE(review): overrides is a dict, despite the dict[str, str] return annotation
        "overrides": installed.overrides or {},
    }
|
||||
|
||||
def _resolve_installed_binary_metadata(self, event: BinaryEvent) -> dict[str, str]:
|
||||
resolved = {
|
||||
"abspath": event.abspath or "",
|
||||
@@ -77,12 +117,11 @@ class BinaryService(BaseService):
|
||||
"overrides": event.overrides or {},
|
||||
}
|
||||
binary = load_binary(spec)
|
||||
resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
|
||||
resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
|
||||
resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
|
||||
provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
|
||||
if provider_name:
|
||||
resolved["binprovider"] = str(provider_name)
|
||||
resolved["abspath"] = str(binary.abspath or resolved["abspath"] or "")
|
||||
resolved["version"] = str(binary.version or resolved["version"] or "")
|
||||
resolved["sha256"] = str(binary.sha256 or resolved["sha256"] or "")
|
||||
if binary.loaded_binprovider is not None and binary.loaded_binprovider.name:
|
||||
resolved["binprovider"] = str(binary.loaded_binprovider.name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
@@ -14,13 +14,13 @@ class MachineService(BaseService):
|
||||
await run_db_op(self._project, event)
|
||||
|
||||
def _project(self, event: MachineEvent) -> None:
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.machine.models import Machine, _sanitize_machine_config
|
||||
|
||||
machine = Machine.current()
|
||||
config = dict(machine.config or {})
|
||||
|
||||
if event.config is not None:
|
||||
config.update(event.config)
|
||||
config.update(_sanitize_machine_config(event.config))
|
||||
elif event.method == "update":
|
||||
key = event.key.replace("config/", "", 1).strip()
|
||||
if key:
|
||||
@@ -28,5 +28,5 @@ class MachineService(BaseService):
|
||||
else:
|
||||
return
|
||||
|
||||
machine.config = config
|
||||
machine.config = _sanitize_machine_config(config)
|
||||
machine.save(update_fields=["config", "modified_at"])
|
||||
|
||||
@@ -1,179 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
import json
|
||||
from pathlib import Path
|
||||
import shlex
|
||||
import socket
|
||||
import time
|
||||
from typing import ClassVar
|
||||
|
||||
from abxbus import BaseEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
|
||||
def _is_port_listening(host: str, port: int) -> bool:
|
||||
if not host or not port:
|
||||
return False
|
||||
try:
|
||||
with socket.create_connection((host, port), timeout=0.5):
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
def _supervisor_env(env: dict[str, str]) -> str:
|
||||
pairs = []
|
||||
for key, value in env.items():
|
||||
escaped = value.replace('"', '\\"')
|
||||
pairs.append(f'{key}="{escaped}"')
|
||||
return ",".join(pairs)
|
||||
|
||||
|
||||
def _iso_from_epoch(value: object) -> str:
|
||||
if not isinstance(value, (int, float)) or value <= 0:
|
||||
return ""
|
||||
return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Start (or reuse) a supervisord-managed daemon worker for *process_event*.

    Returns the supervisord process-info dict for the running worker, or the
    raw start_worker() result (possibly {}) if it never reached RUNNING and
    port-readiness within the startup deadline.
    """
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    output_dir = Path(process_event.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    worker_name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)

    # Reuse an already-RUNNING worker, but only if its startup port (when
    # configured) is actually accepting connections.
    existing = get_worker(supervisor, worker_name)
    if (
        isinstance(existing, dict)
        and existing.get("statename") == "RUNNING"
        and (
            not process_event.daemon_startup_host
            or not process_event.daemon_startup_port
            or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
        )
    ):
        return existing

    daemon = {
        "name": worker_name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(output_dir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        daemon["environment"] = _supervisor_env(process_event.env)

    proc = start_worker(supervisor, daemon)
    # Poll until RUNNING (and listening, when a startup port is configured) or
    # the deadline passes; deadline is at least 0.5s even for a zero timeout.
    deadline = time.monotonic() + max(float(process_event.daemon_startup_timeout), 0.5)
    while time.monotonic() < deadline:
        current = get_worker(supervisor, worker_name)
        if isinstance(current, dict) and current.get("statename") == "RUNNING":
            if (
                not process_event.daemon_startup_host
                or not process_event.daemon_startup_port
                or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
            ):
                return current
        time.sleep(0.1)
    return proc if isinstance(proc, dict) else {}
|
||||
|
||||
|
||||
class ProcessRequestService(BaseService):
    """Turns ProcessEvent JSON lines seen on stdout into real processes.

    Non-daemon requests are re-emitted as ProcessEvents for normal handling;
    daemon requests are handed to supervisord via _ensure_worker and reported
    back as ProcessStartedEvent / ProcessCompletedEvent.
    """

    LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent]
    EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]

    async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
        """Parse one stdout line; ignore anything that is not a ProcessEvent record."""
        try:
            record = json.loads(event.line)
        except (json.JSONDecodeError, ValueError):
            return
        # only dict records explicitly tagged "type": "ProcessEvent" are handled
        if not isinstance(record, dict) or record.pop("type", "") != "ProcessEvent":
            return

        # Rebuild a ProcessEvent from the JSON record, falling back to the
        # stdout event's own context for missing fields.
        process_event = ProcessEvent(
            plugin_name=record.get("plugin_name") or event.plugin_name,
            hook_name=record.get("hook_name") or "process_request",
            hook_path=record["hook_path"],
            hook_args=[str(arg) for arg in record.get("hook_args", [])],
            is_background=bool(record.get("is_background", True)),
            output_dir=record.get("output_dir") or event.output_dir,
            env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
            snapshot_id=record.get("snapshot_id") or event.snapshot_id,
            timeout=int(record.get("timeout") or 60),
            daemon=bool(record.get("daemon", False)),
            daemon_startup_host=str(record.get("daemon_startup_host") or ""),
            daemon_startup_port=int(record.get("daemon_startup_port") or 0),
            daemon_startup_timeout=float(record.get("daemon_startup_timeout") or 0.0),
            process_type=str(record.get("process_type") or ""),
            worker_type=str(record.get("worker_type") or ""),
            event_timeout=float(record.get("event_timeout") or 360.0),
            event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
        )
        if not process_event.daemon:
            # non-daemon requests go through the regular process pipeline
            await self.bus.emit(process_event)
            return

        # Daemon path: start/reuse a supervisord worker off the event loop.
        proc = await asyncio.to_thread(_ensure_worker, process_event)
        process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
        start_ts = _iso_from_epoch(proc.get("start"))
        pid = int(proc.get("pid") or 0)
        statename = str(proc.get("statename") or "")
        exitstatus = int(proc.get("exitstatus") or 0)
        process_type = process_event.process_type or "worker"
        worker_type = process_event.worker_type or process_event.plugin_name

        if statename == "RUNNING" and pid:
            # Worker came up: announce it as a started background process.
            await self.bus.emit(
                ProcessStartedEvent(
                    plugin_name=process_event.plugin_name,
                    hook_name=process_event.hook_name,
                    hook_path=process_event.hook_path,
                    hook_args=process_event.hook_args,
                    output_dir=process_event.output_dir,
                    env=process_event.env,
                    timeout=process_event.timeout,
                    pid=pid,
                    process_id=process_id,
                    snapshot_id=process_event.snapshot_id,
                    is_background=True,
                    process_type=process_type,
                    worker_type=worker_type,
                    start_ts=start_ts,
                ),
            )
            return

        # Worker failed to reach RUNNING: report completion with failure
        # details, then raise so the caller sees the startup error.
        stderr = (
            f"Worker {process_event.hook_name} failed to start"
            if not statename
            else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
        )
        await self.bus.emit(
            ProcessCompletedEvent(
                plugin_name=process_event.plugin_name,
                hook_name=process_event.hook_name,
                hook_path=process_event.hook_path,
                hook_args=process_event.hook_args,
                env=process_event.env,
                stdout="",
                stderr=stderr,
                exit_code=exitstatus or 1,  # never report 0 for a failed start
                output_dir=process_event.output_dir,
                is_background=True,
                process_id=process_id,
                snapshot_id=process_event.snapshot_id,
                pid=pid,
                process_type=process_type,
                worker_type=worker_type,
                start_ts=start_ts,
                end_ts=datetime.now(tz=timezone.utc).isoformat(),
            ),
        )
        raise RuntimeError(stderr)
|
||||
@@ -1,11 +1,19 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING
|
||||
import asyncio
|
||||
from datetime import datetime, timezone as datetime_timezone
|
||||
import json
|
||||
from pathlib import Path
|
||||
import shlex
|
||||
import socket
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, ClassVar
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abxbus import BaseEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
@@ -14,6 +22,9 @@ if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
|
||||
WORKER_READY_TIMEOUT = 10.0
|
||||
|
||||
|
||||
def parse_event_datetime(value: str | None):
|
||||
if not value:
|
||||
return None
|
||||
@@ -26,14 +37,218 @@ def parse_event_datetime(value: str | None):
|
||||
return dt
|
||||
|
||||
|
||||
def _is_port_listening(host: str, port: int) -> bool:
|
||||
if not host or not port:
|
||||
return False
|
||||
try:
|
||||
with socket.create_connection((host, port), timeout=0.5):
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
def _worker_socket_from_url(url: str) -> tuple[str, int] | None:
|
||||
if not url:
|
||||
return None
|
||||
parsed = urlparse(url)
|
||||
if parsed.scheme != "tcp" or not parsed.hostname or not parsed.port:
|
||||
return None
|
||||
return parsed.hostname, parsed.port
|
||||
|
||||
|
||||
def _supervisor_env(env: dict[str, str]) -> str:
|
||||
pairs = []
|
||||
for key, value in env.items():
|
||||
escaped = value.replace('"', '\\"')
|
||||
pairs.append(f'{key}="{escaped}"')
|
||||
return ",".join(pairs)
|
||||
|
||||
|
||||
def _iso_from_epoch(value: object) -> str:
|
||||
if not isinstance(value, (int, float)) or value <= 0:
|
||||
return ""
|
||||
return datetime.fromtimestamp(value, tz=datetime_timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _int_from_object(value: object) -> int:
|
||||
if isinstance(value, bool):
|
||||
return int(value)
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
if isinstance(value, float):
|
||||
return int(value)
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
return 0
|
||||
return 0
|
||||
|
||||
|
||||
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Start (or reuse) the supervisord worker described by *process_event*.

    Readiness is judged by supervisord state RUNNING plus, when the event's
    tcp:// url names a socket, that socket actually accepting connections.
    Returns the supervisord process-info dict, or {} if startup never completed.
    """
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    output_dir = Path(process_event.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    worker_name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)
    # optional readiness-probe target parsed from a tcp://host:port url
    worker_socket = _worker_socket_from_url(getattr(process_event, "url", ""))

    # Reuse an existing RUNNING worker whose socket (if any) is listening.
    existing = get_worker(supervisor, worker_name)
    if (
        isinstance(existing, dict)
        and existing.get("statename") == "RUNNING"
        and (worker_socket is None or _is_port_listening(*worker_socket))
    ):
        return existing

    daemon = {
        "name": worker_name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(output_dir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        daemon["environment"] = _supervisor_env(process_event.env)

    proc = start_worker(supervisor, daemon)
    # Poll for readiness up to WORKER_READY_TIMEOUT seconds.
    deadline = time.monotonic() + WORKER_READY_TIMEOUT
    while time.monotonic() < deadline:
        current = get_worker(supervisor, worker_name)
        if isinstance(current, dict) and current.get("statename") == "RUNNING":
            if worker_socket is None or _is_port_listening(*worker_socket):
                return current
        time.sleep(0.1)
    return proc if isinstance(proc, dict) else {}
|
||||
|
||||
|
||||
class ProcessService(BaseService):
|
||||
LISTENS_TO = [ProcessStartedEvent, ProcessCompletedEvent]
|
||||
EMITS = []
|
||||
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
||||
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
||||
|
||||
def __init__(self, bus):
|
||||
self.process_ids: dict[str, str] = {}
|
||||
super().__init__(bus)
|
||||
|
||||
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
|
||||
try:
|
||||
record = json.loads(event.line)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return
|
||||
if not isinstance(record, dict) or record.get("type") != "ProcessEvent":
|
||||
return
|
||||
|
||||
passthrough_fields: dict[str, Any] = {
|
||||
key: value
|
||||
for key, value in record.items()
|
||||
if key
|
||||
not in {
|
||||
"type",
|
||||
"plugin_name",
|
||||
"hook_name",
|
||||
"hook_path",
|
||||
"hook_args",
|
||||
"is_background",
|
||||
"output_dir",
|
||||
"env",
|
||||
"snapshot_id",
|
||||
"process_id",
|
||||
"url",
|
||||
"timeout",
|
||||
"daemon",
|
||||
"process_type",
|
||||
"worker_type",
|
||||
"event_timeout",
|
||||
"event_handler_timeout",
|
||||
}
|
||||
}
|
||||
process_event = ProcessEvent(
|
||||
plugin_name=record.get("plugin_name") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or "process",
|
||||
hook_path=record["hook_path"],
|
||||
hook_args=[str(arg) for arg in record.get("hook_args", [])],
|
||||
is_background=bool(record.get("is_background", True)),
|
||||
output_dir=record.get("output_dir") or event.output_dir,
|
||||
env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
|
||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
||||
timeout=int(record.get("timeout") or 60),
|
||||
daemon=bool(record.get("daemon", False)),
|
||||
url=str(record.get("url") or ""),
|
||||
process_type=str(record.get("process_type") or ""),
|
||||
worker_type=str(record.get("worker_type") or ""),
|
||||
event_timeout=float(record.get("event_timeout") or 360.0),
|
||||
event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
|
||||
**passthrough_fields,
|
||||
)
|
||||
if not process_event.daemon:
|
||||
await self.bus.emit(process_event)
|
||||
return
|
||||
|
||||
proc = await asyncio.to_thread(_ensure_worker, process_event)
|
||||
process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
|
||||
start_ts = _iso_from_epoch(proc.get("start"))
|
||||
pid = _int_from_object(proc.get("pid"))
|
||||
statename = str(proc.get("statename") or "")
|
||||
exitstatus = _int_from_object(proc.get("exitstatus"))
|
||||
process_type = process_event.process_type or "worker"
|
||||
worker_type = process_event.worker_type or process_event.plugin_name
|
||||
|
||||
if statename == "RUNNING" and pid:
|
||||
await self.bus.emit(
|
||||
ProcessStartedEvent(
|
||||
plugin_name=process_event.plugin_name,
|
||||
hook_name=process_event.hook_name,
|
||||
hook_path=process_event.hook_path,
|
||||
hook_args=process_event.hook_args,
|
||||
output_dir=process_event.output_dir,
|
||||
env=process_event.env,
|
||||
timeout=process_event.timeout,
|
||||
pid=pid,
|
||||
process_id=process_id,
|
||||
snapshot_id=process_event.snapshot_id,
|
||||
is_background=True,
|
||||
url=process_event.url,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
start_ts=start_ts,
|
||||
**passthrough_fields,
|
||||
),
|
||||
)
|
||||
return
|
||||
|
||||
stderr = (
|
||||
f"Worker {process_event.hook_name} failed to start"
|
||||
if not statename
|
||||
else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
|
||||
)
|
||||
await self.bus.emit(
|
||||
ProcessCompletedEvent(
|
||||
plugin_name=process_event.plugin_name,
|
||||
hook_name=process_event.hook_name,
|
||||
hook_path=process_event.hook_path,
|
||||
hook_args=process_event.hook_args,
|
||||
env=process_event.env,
|
||||
stdout="",
|
||||
stderr=stderr,
|
||||
exit_code=exitstatus or 1,
|
||||
output_dir=process_event.output_dir,
|
||||
is_background=True,
|
||||
process_id=process_id,
|
||||
snapshot_id=process_event.snapshot_id,
|
||||
pid=pid,
|
||||
url=process_event.url,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
start_ts=start_ts,
|
||||
end_ts=datetime.now(tz=datetime_timezone.utc).isoformat(),
|
||||
**passthrough_fields,
|
||||
),
|
||||
)
|
||||
raise RuntimeError(stderr)
|
||||
|
||||
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
|
||||
await run_db_op(self._project_started, event)
|
||||
|
||||
@@ -51,7 +266,7 @@ class ProcessService(BaseService):
|
||||
if db_process_id:
|
||||
process = Process.objects.filter(id=db_process_id).first()
|
||||
if process is not None:
|
||||
if process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||
if getattr(process, "iface_id", None) != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
@@ -84,6 +299,7 @@ class ProcessService(BaseService):
|
||||
env=event.env,
|
||||
timeout=getattr(event, "timeout", 60),
|
||||
pid=event.pid or None,
|
||||
url=getattr(event, "url", "") or None,
|
||||
started_at=parse_event_datetime(getattr(event, "start_ts", "")),
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
retry_at=None,
|
||||
@@ -98,6 +314,7 @@ class ProcessService(BaseService):
|
||||
process.env = event.env
|
||||
process.timeout = event.timeout
|
||||
process.pid = event.pid or None
|
||||
process.url = getattr(event, "url", "") or process.url
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
|
||||
@@ -113,6 +330,7 @@ class ProcessService(BaseService):
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.pid = event.pid or process.pid
|
||||
process.url = getattr(event, "url", "") or process.url
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
@@ -28,8 +29,6 @@ from abx_dl.orchestrator import (
|
||||
from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
from .crawl_service import CrawlService
|
||||
from .machine_service import MachineService
|
||||
from .process_request_service import ProcessRequestService
|
||||
from .process_service import ProcessService
|
||||
from .snapshot_service import SnapshotService
|
||||
from .tag_service import TagService
|
||||
@@ -58,28 +57,34 @@ def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str
|
||||
)
|
||||
|
||||
|
||||
def _binary_env_key(name: str) -> str:
|
||||
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
|
||||
return f"{normalized}_BINARY"
|
||||
_TEMPLATE_NAME_RE = re.compile(r"^\{([A-Z0-9_]+)\}$")
|
||||
|
||||
|
||||
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str) -> list[str]:
|
||||
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str, config: dict[str, Any]) -> list[str]:
|
||||
keys: list[str] = []
|
||||
if binary_name != "postlight-parser":
|
||||
keys.append(_binary_env_key(binary_name))
|
||||
|
||||
for plugin in plugins.values():
|
||||
for spec in plugin.binaries:
|
||||
template_name = str(spec.get("name") or "").strip()
|
||||
match = _TEMPLATE_NAME_RE.fullmatch(template_name)
|
||||
if match is None:
|
||||
continue
|
||||
key = match.group(1)
|
||||
configured_value = config.get(key)
|
||||
if configured_value is not None and str(configured_value).strip() == binary_name:
|
||||
keys.append(key)
|
||||
for key, prop in plugin.config_schema.items():
|
||||
if key.endswith("_BINARY") and prop.get("default") == binary_name:
|
||||
keys.insert(0, key)
|
||||
keys.append(key)
|
||||
|
||||
return list(dict.fromkeys(keys))
|
||||
|
||||
|
||||
def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str, str]:
|
||||
def _installed_binary_config_overrides(plugins: dict[str, Plugin], config: dict[str, Any] | None = None) -> dict[str, str]:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
active_config = dict(config or {})
|
||||
overrides: dict[str, str] = {}
|
||||
shared_lib_dir: Path | None = None
|
||||
pip_home: Path | None = None
|
||||
@@ -98,7 +103,7 @@ def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str,
|
||||
continue
|
||||
if not resolved_path.is_file() or not os.access(resolved_path, os.X_OK):
|
||||
continue
|
||||
for key in _binary_config_keys_for_plugins(plugins, binary.name):
|
||||
for key in _binary_config_keys_for_plugins(plugins, binary.name, active_config):
|
||||
overrides[key] = binary.abspath
|
||||
|
||||
if resolved_path.parent.name == ".bin" and resolved_path.parent.parent.name == "node_modules":
|
||||
@@ -231,10 +236,8 @@ class CrawlRunner:
|
||||
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
|
||||
self.plugins = discover_plugins()
|
||||
self.process_service = ProcessService(self.bus)
|
||||
self.machine_service = MachineService(self.bus)
|
||||
self.binary_service = BinaryService(self.bus)
|
||||
self.tag_service = TagService(self.bus)
|
||||
self.process_request_service = ProcessRequestService(self.bus)
|
||||
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
|
||||
self.snapshot_service = SnapshotService(
|
||||
@@ -250,32 +253,10 @@ class CrawlRunner:
|
||||
self.abx_services = None
|
||||
self.persona = None
|
||||
self.base_config: dict[str, Any] = {}
|
||||
self.derived_config: dict[str, Any] = {}
|
||||
self.primary_url = ""
|
||||
self._live_stream = None
|
||||
|
||||
def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
|
||||
bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ProcessRequestService(bus)
|
||||
CrawlService(bus, crawl_id=str(self.crawl.id))
|
||||
SnapshotService(
|
||||
bus,
|
||||
crawl_id=str(self.crawl.id),
|
||||
schedule_snapshot=self.enqueue_snapshot if self.process_discovered_snapshots_inline else self.leave_snapshot_queued,
|
||||
)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
abx_services = setup_abx_services(
|
||||
bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides=config_overrides,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
return bus, abx_services
|
||||
|
||||
async def run(self) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
from archivebox.crawls.models import Crawl
|
||||
@@ -292,6 +273,8 @@ class CrawlRunner:
|
||||
**self.base_config,
|
||||
"ABX_RUNTIME": "archivebox",
|
||||
},
|
||||
derived_config_overrides=self.derived_config,
|
||||
persist_derived=False,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
@@ -369,7 +352,7 @@ class CrawlRunner:
|
||||
current_process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
self.persona = self.crawl.resolve_persona()
|
||||
self.base_config = get_config(crawl=self.crawl)
|
||||
self.base_config.update(_installed_binary_config_overrides(self.plugins))
|
||||
self.derived_config = _installed_binary_config_overrides(self.plugins, self.base_config)
|
||||
self.base_config["ABX_RUNTIME"] = "archivebox"
|
||||
if self.selected_plugins is None:
|
||||
self.selected_plugins = _selected_plugins_from_config(self.base_config)
|
||||
@@ -473,7 +456,6 @@ class CrawlRunner:
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=setup_snapshot,
|
||||
@@ -501,7 +483,6 @@ class CrawlRunner:
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=cleanup_snapshot,
|
||||
@@ -530,31 +511,22 @@ class CrawlRunner:
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
snapshot_bus, snapshot_services = self._create_projector_bus(
|
||||
identifier=f"{self.crawl.id}_{snapshot['id']}",
|
||||
config_overrides=snapshot["config"],
|
||||
)
|
||||
try:
|
||||
_attach_bus_trace(snapshot_bus)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
bus=snapshot_bus,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=abx_snapshot,
|
||||
skip_crawl_setup=True,
|
||||
skip_crawl_cleanup=True,
|
||||
)
|
||||
await snapshot_services.process.wait_for_background_monitors()
|
||||
finally:
|
||||
current_task = asyncio.current_task()
|
||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
await _stop_bus_trace(snapshot_bus)
|
||||
await snapshot_bus.stop()
|
||||
|
||||
def _load_snapshot_run_data(self, snapshot_id: str):
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -615,19 +587,19 @@ async def _run_binary(binary_id: str) -> None:
|
||||
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
config.update(await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins))
|
||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ProcessRequestService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
config_overrides=config,
|
||||
derived_config_overrides=derived_config,
|
||||
persist_derived=False,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
@@ -662,19 +634,19 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
config.update(await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins))
|
||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ProcessRequestService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
abx_services = setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
config_overrides=config,
|
||||
derived_config_overrides=derived_config,
|
||||
persist_derived=False,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user