mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip
This commit is contained in:
@@ -2,6 +2,7 @@ from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
from .crawl_service import CrawlService
|
||||
from .machine_service import MachineService
|
||||
from .process_request_service import ProcessRequestService
|
||||
from .process_service import ProcessService
|
||||
from .runner import run_binary, run_crawl, run_install, run_pending_crawls
|
||||
from .snapshot_service import SnapshotService
|
||||
@@ -12,6 +13,7 @@ __all__ = [
|
||||
"BinaryService",
|
||||
"CrawlService",
|
||||
"MachineService",
|
||||
"ProcessRequestService",
|
||||
"ProcessService",
|
||||
"SnapshotService",
|
||||
"TagService",
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import mimetypes
|
||||
from collections import defaultdict
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
|
||||
from abx_dl.output_files import guess_mimetype
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
@@ -35,27 +37,157 @@ def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, st
|
||||
stat = file_path.stat()
|
||||
except OSError:
|
||||
continue
|
||||
mime_type, _ = mimetypes.guess_type(str(file_path))
|
||||
mime_type = mime_type or "application/octet-stream"
|
||||
mime_type = guess_mimetype(file_path) or "application/octet-stream"
|
||||
relative_path = str(file_path.relative_to(plugin_dir))
|
||||
output_files[relative_path] = {}
|
||||
output_files[relative_path] = {
|
||||
"extension": file_path.suffix.lower().lstrip("."),
|
||||
"mimetype": mime_type,
|
||||
"size": stat.st_size,
|
||||
}
|
||||
mime_sizes[mime_type] += stat.st_size
|
||||
total_size += stat.st_size
|
||||
|
||||
output_mimetypes = ",".join(
|
||||
mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True)
|
||||
)
|
||||
output_mimetypes = ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
|
||||
return output_files, total_size, output_mimetypes
|
||||
|
||||
|
||||
def _coerce_output_file_size(value: Any) -> int:
|
||||
try:
|
||||
return max(int(value or 0), 0)
|
||||
except (TypeError, ValueError):
|
||||
return 0
|
||||
|
||||
|
||||
def _normalize_output_files(raw_output_files: Any) -> dict[str, dict]:
|
||||
def _enrich_metadata(path: str, metadata: dict[str, Any]) -> dict[str, Any]:
|
||||
normalized = dict(metadata)
|
||||
if "extension" not in normalized:
|
||||
normalized["extension"] = Path(path).suffix.lower().lstrip(".")
|
||||
if "mimetype" not in normalized:
|
||||
guessed = guess_mimetype(path)
|
||||
if guessed:
|
||||
normalized["mimetype"] = guessed
|
||||
return normalized
|
||||
|
||||
if raw_output_files is None:
|
||||
return {}
|
||||
|
||||
if isinstance(raw_output_files, str):
|
||||
try:
|
||||
raw_output_files = json.loads(raw_output_files)
|
||||
except json.JSONDecodeError:
|
||||
return {}
|
||||
|
||||
if isinstance(raw_output_files, dict):
|
||||
normalized: dict[str, dict] = {}
|
||||
for path, metadata in raw_output_files.items():
|
||||
if not path:
|
||||
continue
|
||||
metadata_dict = dict(metadata) if isinstance(metadata, dict) else {}
|
||||
metadata_dict.pop("path", None)
|
||||
normalized[str(path)] = _enrich_metadata(str(path), metadata_dict)
|
||||
return normalized
|
||||
|
||||
if not isinstance(raw_output_files, Iterable):
|
||||
return {}
|
||||
|
||||
normalized: dict[str, dict] = {}
|
||||
for item in raw_output_files:
|
||||
if isinstance(item, str):
|
||||
normalized[item] = _enrich_metadata(item, {})
|
||||
continue
|
||||
if hasattr(item, "model_dump"):
|
||||
item = item.model_dump()
|
||||
elif hasattr(item, "path"):
|
||||
item = {
|
||||
"path": getattr(item, "path", ""),
|
||||
"extension": getattr(item, "extension", ""),
|
||||
"mimetype": getattr(item, "mimetype", ""),
|
||||
"size": getattr(item, "size", 0),
|
||||
}
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
path = str(item.get("path") or "").strip()
|
||||
if not path:
|
||||
continue
|
||||
normalized[path] = _enrich_metadata(path, {key: value for key, value in item.items() if key != "path" and value not in (None, "")})
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
def _has_structured_output_metadata(output_files: dict[str, dict]) -> bool:
|
||||
return any(any(key in metadata for key in ("extension", "mimetype", "size")) for metadata in output_files.values())
|
||||
|
||||
|
||||
def _summarize_output_files(output_files: dict[str, dict]) -> tuple[int, str]:
|
||||
mime_sizes: dict[str, int] = defaultdict(int)
|
||||
total_size = 0
|
||||
|
||||
for metadata in output_files.values():
|
||||
if not isinstance(metadata, dict):
|
||||
continue
|
||||
size = _coerce_output_file_size(metadata.get("size"))
|
||||
mimetype = str(metadata.get("mimetype") or "").strip()
|
||||
total_size += size
|
||||
if mimetype and size:
|
||||
mime_sizes[mimetype] += size
|
||||
|
||||
output_mimetypes = ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
|
||||
return total_size, output_mimetypes
|
||||
|
||||
|
||||
def _resolve_output_metadata(raw_output_files: Any, plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
|
||||
normalized_output_files = _normalize_output_files(raw_output_files)
|
||||
if normalized_output_files and _has_structured_output_metadata(normalized_output_files):
|
||||
output_size, output_mimetypes = _summarize_output_files(normalized_output_files)
|
||||
return normalized_output_files, output_size, output_mimetypes
|
||||
return _collect_output_metadata(plugin_dir)
|
||||
|
||||
|
||||
def _normalize_status(status: str) -> str:
|
||||
if status == "noresult":
|
||||
return "noresults"
|
||||
return status or "failed"
|
||||
|
||||
|
||||
def _has_content_files(output_files: list[str]) -> bool:
|
||||
return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in output_files)
|
||||
def _normalize_snapshot_title(candidate: str, *, snapshot_url: str) -> str:
|
||||
title = " ".join(line.strip() for line in str(candidate or "").splitlines() if line.strip()).strip()
|
||||
if not title:
|
||||
return ""
|
||||
if title.lower() in {"pending...", "no title found"}:
|
||||
return ""
|
||||
if title == snapshot_url:
|
||||
return ""
|
||||
if "/" in title and title.lower().endswith(".txt"):
|
||||
return ""
|
||||
return title
|
||||
|
||||
|
||||
def _extract_snapshot_title(snapshot_output_dir: str, plugin: str, output_str: str, *, snapshot_url: str) -> str:
|
||||
if plugin != "title":
|
||||
return ""
|
||||
|
||||
title_file = Path(snapshot_output_dir) / "title" / "title.txt"
|
||||
if title_file.exists():
|
||||
try:
|
||||
file_title = _normalize_snapshot_title(title_file.read_text(encoding="utf-8"), snapshot_url=snapshot_url)
|
||||
except OSError:
|
||||
file_title = ""
|
||||
if file_title:
|
||||
return file_title
|
||||
|
||||
return _normalize_snapshot_title(output_str, snapshot_url=snapshot_url)
|
||||
|
||||
|
||||
def _should_update_snapshot_title(current_title: str, next_title: str, *, snapshot_url: str) -> bool:
|
||||
current = (current_title or "").strip()
|
||||
if not current or current.lower() == "pending..." or current == snapshot_url:
|
||||
return True
|
||||
return len(next_title) > len(current)
|
||||
|
||||
|
||||
def _has_content_files(output_files: Any) -> bool:
|
||||
return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in _normalize_output_files(output_files))
|
||||
|
||||
|
||||
def _iter_archiveresult_records(stdout: str) -> list[dict]:
|
||||
@@ -86,7 +218,7 @@ class ArchiveResultService(BaseService):
|
||||
if snapshot_output_dir is None:
|
||||
return
|
||||
plugin_dir = Path(snapshot_output_dir) / event.plugin
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
||||
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
|
||||
|
||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
||||
@@ -94,7 +226,7 @@ class ArchiveResultService(BaseService):
|
||||
return
|
||||
|
||||
plugin_dir = Path(event.output_dir)
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
||||
records = _iter_archiveresult_records(event.stdout)
|
||||
if records:
|
||||
for record in records:
|
||||
@@ -172,6 +304,11 @@ class ArchiveResultService(BaseService):
|
||||
result.notes = event.error
|
||||
result.save()
|
||||
|
||||
next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url)
|
||||
if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
|
||||
snapshot.title = next_title
|
||||
snapshot.save(update_fields=["title", "modified_at"])
|
||||
|
||||
def _project_from_process_completed(
|
||||
self,
|
||||
event: ProcessCompletedEvent,
|
||||
@@ -188,6 +325,7 @@ class ArchiveResultService(BaseService):
|
||||
process_id=event.process_id,
|
||||
output_str=record.get("output_str") or "",
|
||||
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
|
||||
output_files=event.output_files,
|
||||
start_ts=event.start_ts,
|
||||
end_ts=event.end_ts,
|
||||
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
|
||||
|
||||
@@ -101,4 +101,6 @@ class BinaryService(BaseService):
|
||||
binary.overrides = event.overrides
|
||||
binary.status = Binary.StatusChoices.INSTALLED
|
||||
binary.retry_at = None
|
||||
binary.save(update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"])
|
||||
binary.save(
|
||||
update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"],
|
||||
)
|
||||
|
||||
@@ -1 +1,51 @@
|
||||
from abx_dl.cli import LiveBusUI
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from rich.console import Console
|
||||
|
||||
|
||||
class LiveBusUI:
|
||||
"""Small tty-only runner UI.
|
||||
|
||||
The runner only needs a context manager and a couple of print helpers here.
|
||||
Keeping this minimal avoids a hard dependency on a heavier live dashboard.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
bus: Any,
|
||||
*,
|
||||
total_hooks: int,
|
||||
timeout_seconds: int,
|
||||
ui_console: Console,
|
||||
interactive_tty: bool,
|
||||
) -> None:
|
||||
self.bus = bus
|
||||
self.total_hooks = total_hooks
|
||||
self.timeout_seconds = timeout_seconds
|
||||
self.ui_console = ui_console
|
||||
self.interactive_tty = interactive_tty
|
||||
|
||||
def __enter__(self) -> LiveBusUI:
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb) -> bool:
|
||||
return False
|
||||
|
||||
def print_intro(self, *, url: str, output_dir: Path, plugins_label: str) -> None:
|
||||
if not self.interactive_tty:
|
||||
return
|
||||
self.ui_console.print(
|
||||
f"[bold]ArchiveBox[/bold] {url} -> [dim]{output_dir}[/dim] "
|
||||
f"([cyan]{plugins_label}[/cyan], {self.total_hooks} hooks, {self.timeout_seconds}s timeout)",
|
||||
)
|
||||
|
||||
def print_summary(self, results: list[Any] | tuple[Any, ...] | None, *, output_dir: Path) -> None:
|
||||
if not self.interactive_tty:
|
||||
return
|
||||
total_results = len(results or [])
|
||||
self.ui_console.print(
|
||||
f"[green]Completed[/green] {total_results} result(s) in [dim]{output_dir}[/dim]",
|
||||
)
|
||||
|
||||
179
archivebox/services/process_request_service.py
Normal file
179
archivebox/services/process_request_service.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
import json
|
||||
from pathlib import Path
|
||||
import shlex
|
||||
import socket
|
||||
import time
|
||||
from typing import ClassVar
|
||||
|
||||
from abxbus import BaseEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
|
||||
def _is_port_listening(host: str, port: int) -> bool:
|
||||
if not host or not port:
|
||||
return False
|
||||
try:
|
||||
with socket.create_connection((host, port), timeout=0.5):
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
def _supervisor_env(env: dict[str, str]) -> str:
|
||||
pairs = []
|
||||
for key, value in env.items():
|
||||
escaped = value.replace('"', '\\"')
|
||||
pairs.append(f'{key}="{escaped}"')
|
||||
return ",".join(pairs)
|
||||
|
||||
|
||||
def _iso_from_epoch(value: object) -> str:
|
||||
if not isinstance(value, (int, float)) or value <= 0:
|
||||
return ""
|
||||
return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
|
||||
from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker
|
||||
|
||||
output_dir = Path(process_event.output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
worker_name = process_event.hook_name
|
||||
supervisor = get_or_create_supervisord_process(daemonize=True)
|
||||
|
||||
existing = get_worker(supervisor, worker_name)
|
||||
if (
|
||||
isinstance(existing, dict)
|
||||
and existing.get("statename") == "RUNNING"
|
||||
and (
|
||||
not process_event.daemon_startup_host
|
||||
or not process_event.daemon_startup_port
|
||||
or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
|
||||
)
|
||||
):
|
||||
return existing
|
||||
|
||||
daemon = {
|
||||
"name": worker_name,
|
||||
"command": shlex.join([process_event.hook_path, *process_event.hook_args]),
|
||||
"directory": str(output_dir),
|
||||
"autostart": "false",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
|
||||
"redirect_stderr": "true",
|
||||
}
|
||||
if process_event.env:
|
||||
daemon["environment"] = _supervisor_env(process_event.env)
|
||||
|
||||
proc = start_worker(supervisor, daemon)
|
||||
deadline = time.monotonic() + max(float(process_event.daemon_startup_timeout), 0.5)
|
||||
while time.monotonic() < deadline:
|
||||
current = get_worker(supervisor, worker_name)
|
||||
if isinstance(current, dict) and current.get("statename") == "RUNNING":
|
||||
if (
|
||||
not process_event.daemon_startup_host
|
||||
or not process_event.daemon_startup_port
|
||||
or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
|
||||
):
|
||||
return current
|
||||
time.sleep(0.1)
|
||||
return proc if isinstance(proc, dict) else {}
|
||||
|
||||
|
||||
class ProcessRequestService(BaseService):
|
||||
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent]
|
||||
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
||||
|
||||
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
|
||||
try:
|
||||
record = json.loads(event.line)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return
|
||||
if not isinstance(record, dict) or record.pop("type", "") != "ProcessEvent":
|
||||
return
|
||||
|
||||
process_event = ProcessEvent(
|
||||
plugin_name=record.get("plugin_name") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or "process_request",
|
||||
hook_path=record["hook_path"],
|
||||
hook_args=[str(arg) for arg in record.get("hook_args", [])],
|
||||
is_background=bool(record.get("is_background", True)),
|
||||
output_dir=record.get("output_dir") or event.output_dir,
|
||||
env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
|
||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
||||
timeout=int(record.get("timeout") or 60),
|
||||
daemon=bool(record.get("daemon", False)),
|
||||
daemon_startup_host=str(record.get("daemon_startup_host") or ""),
|
||||
daemon_startup_port=int(record.get("daemon_startup_port") or 0),
|
||||
daemon_startup_timeout=float(record.get("daemon_startup_timeout") or 0.0),
|
||||
process_type=str(record.get("process_type") or ""),
|
||||
worker_type=str(record.get("worker_type") or ""),
|
||||
event_timeout=float(record.get("event_timeout") or 360.0),
|
||||
event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
|
||||
)
|
||||
if not process_event.daemon:
|
||||
await self.bus.emit(process_event)
|
||||
return
|
||||
|
||||
proc = await asyncio.to_thread(_ensure_worker, process_event)
|
||||
process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
|
||||
start_ts = _iso_from_epoch(proc.get("start"))
|
||||
pid = int(proc.get("pid") or 0)
|
||||
statename = str(proc.get("statename") or "")
|
||||
exitstatus = int(proc.get("exitstatus") or 0)
|
||||
process_type = process_event.process_type or "worker"
|
||||
worker_type = process_event.worker_type or process_event.plugin_name
|
||||
|
||||
if statename == "RUNNING" and pid:
|
||||
await self.bus.emit(
|
||||
ProcessStartedEvent(
|
||||
plugin_name=process_event.plugin_name,
|
||||
hook_name=process_event.hook_name,
|
||||
hook_path=process_event.hook_path,
|
||||
hook_args=process_event.hook_args,
|
||||
output_dir=process_event.output_dir,
|
||||
env=process_event.env,
|
||||
timeout=process_event.timeout,
|
||||
pid=pid,
|
||||
process_id=process_id,
|
||||
snapshot_id=process_event.snapshot_id,
|
||||
is_background=True,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
start_ts=start_ts,
|
||||
),
|
||||
)
|
||||
return
|
||||
|
||||
stderr = (
|
||||
f"Worker {process_event.hook_name} failed to start"
|
||||
if not statename
|
||||
else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
|
||||
)
|
||||
await self.bus.emit(
|
||||
ProcessCompletedEvent(
|
||||
plugin_name=process_event.plugin_name,
|
||||
hook_name=process_event.hook_name,
|
||||
hook_path=process_event.hook_path,
|
||||
hook_args=process_event.hook_args,
|
||||
env=process_event.env,
|
||||
stdout="",
|
||||
stderr=stderr,
|
||||
exit_code=exitstatus or 1,
|
||||
output_dir=process_event.output_dir,
|
||||
is_background=True,
|
||||
process_id=process_id,
|
||||
snapshot_id=process_event.snapshot_id,
|
||||
pid=pid,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
start_ts=start_ts,
|
||||
end_ts=datetime.now(tz=timezone.utc).isoformat(),
|
||||
),
|
||||
)
|
||||
raise RuntimeError(stderr)
|
||||
@@ -43,7 +43,7 @@ class ProcessService(BaseService):
|
||||
def get_db_process_id(self, process_id: str) -> str | None:
|
||||
return self.process_ids.get(process_id)
|
||||
|
||||
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process":
|
||||
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> Process:
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
|
||||
db_process_id = self.process_ids.get(event.process_id)
|
||||
@@ -57,11 +57,28 @@ class ProcessService(BaseService):
|
||||
process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
return process
|
||||
|
||||
process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
|
||||
process_type = getattr(event, "process_type", "") or (
|
||||
Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
|
||||
)
|
||||
worker_type = getattr(event, "worker_type", "") or ""
|
||||
if process_type == Process.TypeChoices.WORKER and worker_type:
|
||||
existing = (
|
||||
Process.objects.filter(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
)
|
||||
.order_by("-modified_at")
|
||||
.first()
|
||||
)
|
||||
if existing is not None:
|
||||
self.process_ids[event.process_id] = str(existing.id)
|
||||
return existing
|
||||
process = Process.objects.create(
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
env=event.env,
|
||||
@@ -81,6 +98,8 @@ class ProcessService(BaseService):
|
||||
process.env = event.env
|
||||
process.timeout = event.timeout
|
||||
process.pid = event.pid or None
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
|
||||
process.status = process.StatusChoices.RUNNING
|
||||
process.retry_at = None
|
||||
@@ -94,6 +113,8 @@ class ProcessService(BaseService):
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.pid = event.pid or process.pid
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
|
||||
process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
|
||||
process.stdout = event.stdout
|
||||
|
||||
@@ -16,13 +16,21 @@ from django.utils import timezone
|
||||
from rich.console import Console
|
||||
|
||||
from abx_dl.events import BinaryEvent
|
||||
from abx_dl.limits import CrawlLimitState
|
||||
from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
|
||||
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, prepare_install_plugins, setup_services as setup_abx_services
|
||||
from abx_dl.orchestrator import (
|
||||
create_bus,
|
||||
download,
|
||||
install_plugins as abx_install_plugins,
|
||||
prepare_install_plugins,
|
||||
setup_services as setup_abx_services,
|
||||
)
|
||||
|
||||
from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
from .crawl_service import CrawlService
|
||||
from .machine_service import MachineService
|
||||
from .process_request_service import ProcessRequestService
|
||||
from .process_service import ProcessService
|
||||
from .snapshot_service import SnapshotService
|
||||
from .tag_service import TagService
|
||||
@@ -54,6 +62,51 @@ def _runner_debug(message: str) -> None:
|
||||
print(f"[runner] {message}", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def _binary_env_key(name: str) -> str:
|
||||
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
|
||||
return f"{normalized}_BINARY"
|
||||
|
||||
|
||||
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str) -> list[str]:
|
||||
keys = [_binary_env_key(binary_name)]
|
||||
|
||||
if binary_name == "postlight-parser":
|
||||
keys.insert(0, "MERCURY_BINARY")
|
||||
|
||||
for plugin in plugins.values():
|
||||
for key, prop in plugin.config_schema.items():
|
||||
if key.endswith("_BINARY") and prop.get("default") == binary_name:
|
||||
keys.insert(0, key)
|
||||
|
||||
return list(dict.fromkeys(keys))
|
||||
|
||||
|
||||
def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str, str]:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
overrides: dict[str, str] = {}
|
||||
binaries = (
|
||||
Binary.objects.filter(machine=machine, status=Binary.StatusChoices.INSTALLED).exclude(abspath="").exclude(abspath__isnull=True)
|
||||
)
|
||||
|
||||
for binary in binaries:
|
||||
try:
|
||||
resolved_path = Path(binary.abspath).expanduser()
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if not resolved_path.is_file() or not os.access(resolved_path, os.X_OK):
|
||||
continue
|
||||
for key in _binary_config_keys_for_plugins(plugins, binary.name):
|
||||
overrides[key] = binary.abspath
|
||||
|
||||
return overrides
|
||||
|
||||
|
||||
def _limit_stop_reason(config: dict[str, Any]) -> str:
|
||||
return CrawlLimitState.from_config(config).get_stop_reason()
|
||||
|
||||
|
||||
def _attach_bus_trace(bus) -> None:
|
||||
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
|
||||
if not trace_target:
|
||||
@@ -105,6 +158,7 @@ def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
Process.cleanup_orphaned_workers()
|
||||
machine = Machine.current()
|
||||
if Process.objects.filter(
|
||||
machine=machine,
|
||||
@@ -149,6 +203,7 @@ class CrawlRunner:
|
||||
self.machine_service = MachineService(self.bus)
|
||||
self.binary_service = BinaryService(self.bus)
|
||||
self.tag_service = TagService(self.bus)
|
||||
self.process_request_service = ProcessRequestService(self.bus)
|
||||
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
|
||||
self.snapshot_service = SnapshotService(
|
||||
@@ -173,6 +228,7 @@ class CrawlRunner:
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ProcessRequestService(bus)
|
||||
CrawlService(bus, crawl_id=str(self.crawl.id))
|
||||
SnapshotService(
|
||||
bus,
|
||||
@@ -201,7 +257,10 @@ class CrawlRunner:
|
||||
self.abx_services = setup_abx_services(
|
||||
self.bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides=self.base_config,
|
||||
config_overrides={
|
||||
**self.base_config,
|
||||
"ABX_RUNTIME": "archivebox",
|
||||
},
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
@@ -293,6 +352,8 @@ class CrawlRunner:
|
||||
current_process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
self.persona = self.crawl.resolve_persona()
|
||||
self.base_config = get_config(crawl=self.crawl)
|
||||
self.base_config.update(_installed_binary_config_overrides(self.plugins))
|
||||
self.base_config["ABX_RUNTIME"] = "archivebox"
|
||||
if self.selected_plugins is None:
|
||||
self.selected_plugins = _selected_plugins_from_config(self.base_config)
|
||||
if self.persona:
|
||||
@@ -459,6 +520,11 @@ class CrawlRunner:
|
||||
|
||||
async with self.snapshot_semaphore:
|
||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
||||
if snapshot["status"] == "sealed":
|
||||
return
|
||||
if snapshot["depth"] > 0 and _limit_stop_reason(snapshot["config"]) == "max_size":
|
||||
await sync_to_async(self._cancel_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
|
||||
return
|
||||
abx_snapshot = AbxSnapshot(
|
||||
url=snapshot["url"],
|
||||
id=snapshot["id"],
|
||||
@@ -513,11 +579,22 @@ class CrawlRunner:
|
||||
"created_at": snapshot.created_at.isoformat() if snapshot.created_at else "",
|
||||
"tags": snapshot.tags_str(),
|
||||
"depth": snapshot.depth,
|
||||
"status": snapshot.status,
|
||||
"parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
|
||||
"output_dir": str(snapshot.output_dir),
|
||||
"config": self._snapshot_config(snapshot),
|
||||
}
|
||||
|
||||
def _cancel_snapshot_due_to_limit(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
|
||||
if snapshot is None or snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
return
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
|
||||
def run_crawl(
|
||||
crawl_id: str,
|
||||
@@ -535,7 +612,7 @@ def run_crawl(
|
||||
snapshot_ids=snapshot_ids,
|
||||
selected_plugins=selected_plugins,
|
||||
process_discovered_snapshots_inline=process_discovered_snapshots_inline,
|
||||
).run()
|
||||
).run(),
|
||||
)
|
||||
|
||||
|
||||
@@ -546,9 +623,17 @@ async def _run_binary(binary_id: str) -> None:
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
|
||||
config = get_config()
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
config.update(_installed_binary_config_overrides(plugins))
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ProcessRequestService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
@@ -556,11 +641,6 @@ async def _run_binary(binary_id: str) -> None:
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
|
||||
try:
|
||||
_attach_bus_trace(bus)
|
||||
@@ -592,9 +672,17 @@ def run_binary(binary_id: str) -> None:
|
||||
async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config = get_config()
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
config.update(_installed_binary_config_overrides(plugins))
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ProcessRequestService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
abx_services = setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
@@ -602,11 +690,6 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
live_stream = None
|
||||
|
||||
try:
|
||||
@@ -763,8 +846,7 @@ def recover_orphaned_snapshots() -> int:
|
||||
recovered = 0
|
||||
now = timezone.now()
|
||||
orphaned_snapshots = (
|
||||
Snapshot.objects
|
||||
.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
||||
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
||||
.select_related("crawl")
|
||||
.prefetch_related("archiveresult_set")
|
||||
)
|
||||
|
||||
@@ -4,6 +4,7 @@ from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
||||
from abx_dl.limits import CrawlLimitState
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
@@ -47,6 +48,8 @@ class SnapshotService(BaseService):
|
||||
|
||||
if event.depth > crawl.max_depth:
|
||||
return None
|
||||
if self._crawl_limit_stop_reason(crawl) == "max_size":
|
||||
return None
|
||||
|
||||
parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
|
||||
if parent_snapshot is None:
|
||||
@@ -84,15 +87,38 @@ class SnapshotService(BaseService):
|
||||
def _seal_snapshot(self, snapshot_id: str) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
|
||||
snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
|
||||
if snapshot is None:
|
||||
return None
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
|
||||
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
|
||||
self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
|
||||
return str(snapshot.id)
|
||||
|
||||
def _crawl_limit_stop_reason(self, crawl) -> str:
|
||||
config = dict(crawl.config or {})
|
||||
config["CRAWL_DIR"] = str(crawl.output_dir)
|
||||
return CrawlLimitState.from_config(config).get_stop_reason()
|
||||
|
||||
def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
return (
|
||||
Snapshot.objects.filter(
|
||||
crawl_id=crawl_id,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
.exclude(id=exclude_snapshot_id)
|
||||
.update(
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
)
|
||||
|
||||
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
|
||||
Reference in New Issue
Block a user