This commit is contained in:
Nick Sweeting
2026-03-23 03:58:32 -07:00
parent 268856bcfb
commit b749b26c5d
286 changed files with 21704 additions and 13480 deletions

View File

@@ -2,6 +2,7 @@ from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .machine_service import MachineService
from .process_request_service import ProcessRequestService
from .process_service import ProcessService
from .runner import run_binary, run_crawl, run_install, run_pending_crawls
from .snapshot_service import SnapshotService
@@ -12,6 +13,7 @@ __all__ = [
"BinaryService",
"CrawlService",
"MachineService",
"ProcessRequestService",
"ProcessService",
"SnapshotService",
"TagService",

View File

@@ -1,14 +1,16 @@
from __future__ import annotations
import json
import mimetypes
from collections import defaultdict
from collections.abc import Iterable
from pathlib import Path
from typing import Any
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
from abx_dl.output_files import guess_mimetype
from abx_dl.services.base import BaseService
from .db import run_db_op
@@ -35,27 +37,157 @@ def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, st
stat = file_path.stat()
except OSError:
continue
mime_type, _ = mimetypes.guess_type(str(file_path))
mime_type = mime_type or "application/octet-stream"
mime_type = guess_mimetype(file_path) or "application/octet-stream"
relative_path = str(file_path.relative_to(plugin_dir))
output_files[relative_path] = {}
output_files[relative_path] = {
"extension": file_path.suffix.lower().lstrip("."),
"mimetype": mime_type,
"size": stat.st_size,
}
mime_sizes[mime_type] += stat.st_size
total_size += stat.st_size
output_mimetypes = ",".join(
mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True)
)
output_mimetypes = ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
return output_files, total_size, output_mimetypes
def _coerce_output_file_size(value: Any) -> int:
try:
return max(int(value or 0), 0)
except (TypeError, ValueError):
return 0
def _normalize_output_files(raw_output_files: Any) -> dict[str, dict]:
def _enrich_metadata(path: str, metadata: dict[str, Any]) -> dict[str, Any]:
normalized = dict(metadata)
if "extension" not in normalized:
normalized["extension"] = Path(path).suffix.lower().lstrip(".")
if "mimetype" not in normalized:
guessed = guess_mimetype(path)
if guessed:
normalized["mimetype"] = guessed
return normalized
if raw_output_files is None:
return {}
if isinstance(raw_output_files, str):
try:
raw_output_files = json.loads(raw_output_files)
except json.JSONDecodeError:
return {}
if isinstance(raw_output_files, dict):
normalized: dict[str, dict] = {}
for path, metadata in raw_output_files.items():
if not path:
continue
metadata_dict = dict(metadata) if isinstance(metadata, dict) else {}
metadata_dict.pop("path", None)
normalized[str(path)] = _enrich_metadata(str(path), metadata_dict)
return normalized
if not isinstance(raw_output_files, Iterable):
return {}
normalized: dict[str, dict] = {}
for item in raw_output_files:
if isinstance(item, str):
normalized[item] = _enrich_metadata(item, {})
continue
if hasattr(item, "model_dump"):
item = item.model_dump()
elif hasattr(item, "path"):
item = {
"path": getattr(item, "path", ""),
"extension": getattr(item, "extension", ""),
"mimetype": getattr(item, "mimetype", ""),
"size": getattr(item, "size", 0),
}
if not isinstance(item, dict):
continue
path = str(item.get("path") or "").strip()
if not path:
continue
normalized[path] = _enrich_metadata(path, {key: value for key, value in item.items() if key != "path" and value not in (None, "")})
return normalized
def _has_structured_output_metadata(output_files: dict[str, dict]) -> bool:
return any(any(key in metadata for key in ("extension", "mimetype", "size")) for metadata in output_files.values())
def _summarize_output_files(output_files: dict[str, dict]) -> tuple[int, str]:
mime_sizes: dict[str, int] = defaultdict(int)
total_size = 0
for metadata in output_files.values():
if not isinstance(metadata, dict):
continue
size = _coerce_output_file_size(metadata.get("size"))
mimetype = str(metadata.get("mimetype") or "").strip()
total_size += size
if mimetype and size:
mime_sizes[mimetype] += size
output_mimetypes = ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
return total_size, output_mimetypes
def _resolve_output_metadata(raw_output_files: Any, plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
    """Prefer structured metadata carried on the event; fall back to disk scan.

    When the event payload normalizes to entries that already include
    extension/mimetype/size information, summarize those. Otherwise walk
    plugin_dir and collect metadata from the filesystem.
    """
    normalized = _normalize_output_files(raw_output_files)
    if not normalized or not _has_structured_output_metadata(normalized):
        return _collect_output_metadata(plugin_dir)
    total_size, mimetypes = _summarize_output_files(normalized)
    return normalized, total_size, mimetypes
def _normalize_status(status: str) -> str:
if status == "noresult":
return "noresults"
return status or "failed"
def _has_content_files(output_files: list[str]) -> bool:
return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in output_files)
def _normalize_snapshot_title(candidate: str, *, snapshot_url: str) -> str:
title = " ".join(line.strip() for line in str(candidate or "").splitlines() if line.strip()).strip()
if not title:
return ""
if title.lower() in {"pending...", "no title found"}:
return ""
if title == snapshot_url:
return ""
if "/" in title and title.lower().endswith(".txt"):
return ""
return title
def _extract_snapshot_title(snapshot_output_dir: str, plugin: str, output_str: str, *, snapshot_url: str) -> str:
if plugin != "title":
return ""
title_file = Path(snapshot_output_dir) / "title" / "title.txt"
if title_file.exists():
try:
file_title = _normalize_snapshot_title(title_file.read_text(encoding="utf-8"), snapshot_url=snapshot_url)
except OSError:
file_title = ""
if file_title:
return file_title
return _normalize_snapshot_title(output_str, snapshot_url=snapshot_url)
def _should_update_snapshot_title(current_title: str, next_title: str, *, snapshot_url: str) -> bool:
current = (current_title or "").strip()
if not current or current.lower() == "pending..." or current == snapshot_url:
return True
return len(next_title) > len(current)
def _has_content_files(output_files: Any) -> bool:
    """Return True when any normalized output path is real content.

    Paths ending in .log/.pid/.sh are bookkeeping, not archive content.
    The input may be any payload accepted by _normalize_output_files.
    """
    bookkeeping_suffixes = {".log", ".pid", ".sh"}
    normalized_paths = _normalize_output_files(output_files)
    return any(Path(path).suffix not in bookkeeping_suffixes for path in normalized_paths)
def _iter_archiveresult_records(stdout: str) -> list[dict]:
@@ -86,7 +218,7 @@ class ArchiveResultService(BaseService):
if snapshot_output_dir is None:
return
plugin_dir = Path(snapshot_output_dir) / event.plugin
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
@@ -94,7 +226,7 @@ class ArchiveResultService(BaseService):
return
plugin_dir = Path(event.output_dir)
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
records = _iter_archiveresult_records(event.stdout)
if records:
for record in records:
@@ -172,6 +304,11 @@ class ArchiveResultService(BaseService):
result.notes = event.error
result.save()
next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url)
if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
snapshot.title = next_title
snapshot.save(update_fields=["title", "modified_at"])
def _project_from_process_completed(
self,
event: ProcessCompletedEvent,
@@ -188,6 +325,7 @@ class ArchiveResultService(BaseService):
process_id=event.process_id,
output_str=record.get("output_str") or "",
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
output_files=event.output_files,
start_ts=event.start_ts,
end_ts=event.end_ts,
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),

View File

@@ -101,4 +101,6 @@ class BinaryService(BaseService):
binary.overrides = event.overrides
binary.status = Binary.StatusChoices.INSTALLED
binary.retry_at = None
binary.save(update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"])
binary.save(
update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"],
)

View File

@@ -1 +1,51 @@
from abx_dl.cli import LiveBusUI
from __future__ import annotations
from pathlib import Path
from typing import Any
from rich.console import Console
class LiveBusUI:
    """Minimal tty-only runner UI.

    The runner only needs a context manager plus a couple of print helpers,
    so this deliberately stays tiny and avoids depending on a heavier live
    dashboard. All output is suppressed when not attached to an
    interactive tty.
    """

    def __init__(
        self,
        bus: Any,
        *,
        total_hooks: int,
        timeout_seconds: int,
        ui_console: Console,
        interactive_tty: bool,
    ) -> None:
        self.bus = bus
        self.total_hooks = total_hooks
        self.timeout_seconds = timeout_seconds
        self.ui_console = ui_console
        self.interactive_tty = interactive_tty

    def __enter__(self) -> LiveBusUI:
        return self

    def __exit__(self, exc_type, exc, tb) -> bool:
        # Never suppress exceptions raised inside the with-block.
        return False

    def print_intro(self, *, url: str, output_dir: Path, plugins_label: str) -> None:
        """Announce the run; no-op when stdout is not an interactive tty."""
        if not self.interactive_tty:
            return
        intro = (
            f"[bold]ArchiveBox[/bold] {url} -> [dim]{output_dir}[/dim] "
            f"([cyan]{plugins_label}[/cyan], {self.total_hooks} hooks, {self.timeout_seconds}s timeout)"
        )
        self.ui_console.print(intro)

    def print_summary(self, results: list[Any] | tuple[Any, ...] | None, *, output_dir: Path) -> None:
        """Report how many results landed; no-op outside an interactive tty."""
        if not self.interactive_tty:
            return
        count = len(results) if results else 0
        self.ui_console.print(
            f"[green]Completed[/green] {count} result(s) in [dim]{output_dir}[/dim]",
        )

View File

@@ -0,0 +1,179 @@
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
import json
from pathlib import Path
import shlex
import socket
import time
from typing import ClassVar
from abxbus import BaseEvent
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.services.base import BaseService
def _is_port_listening(host: str, port: int) -> bool:
if not host or not port:
return False
try:
with socket.create_connection((host, port), timeout=0.5):
return True
except OSError:
return False
def _supervisor_env(env: dict[str, str]) -> str:
pairs = []
for key, value in env.items():
escaped = value.replace('"', '\\"')
pairs.append(f'{key}="{escaped}"')
return ",".join(pairs)
def _iso_from_epoch(value: object) -> str:
if not isinstance(value, (int, float)) or value <= 0:
return ""
return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Ensure a supervisord-managed daemon worker for this hook is RUNNING.

    Returns the supervisord process-info dict for the worker, or {} when the
    worker could not be confirmed as started. Blocking: polls for up to
    ``daemon_startup_timeout`` seconds (minimum 0.5s), so callers run this
    in a worker thread.
    """
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    output_dir = Path(process_event.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    worker_name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)
    existing = get_worker(supervisor, worker_name)
    # Reuse an already-RUNNING worker, but only if its startup port (when
    # one is configured) is actually accepting connections.
    if (
        isinstance(existing, dict)
        and existing.get("statename") == "RUNNING"
        and (
            not process_event.daemon_startup_host
            or not process_event.daemon_startup_port
            or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
        )
    ):
        return existing
    # supervisord program config; boolean options are passed as the strings
    # "true"/"false" as supervisord expects.
    daemon = {
        "name": worker_name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(output_dir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        daemon["environment"] = _supervisor_env(process_event.env)
    proc = start_worker(supervisor, daemon)
    # Poll until the worker reports RUNNING and (when configured) its
    # startup port is reachable, or the startup deadline passes.
    deadline = time.monotonic() + max(float(process_event.daemon_startup_timeout), 0.5)
    while time.monotonic() < deadline:
        current = get_worker(supervisor, worker_name)
        if isinstance(current, dict) and current.get("statename") == "RUNNING":
            if (
                not process_event.daemon_startup_host
                or not process_event.daemon_startup_port
                or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
            ):
                return current
        time.sleep(0.1)
    # Timed out waiting: fall back to whatever start_worker returned.
    return proc if isinstance(proc, dict) else {}
class ProcessRequestService(BaseService):
    """Bridge hook stdout "ProcessEvent" JSON lines to real process events.

    Hooks request subprocess/daemon execution by printing a JSON object with
    ``type="ProcessEvent"`` on stdout. Non-daemon requests are re-emitted
    directly as ProcessEvent for the normal process pipeline; daemon
    requests are handed to supervisord via ``_ensure_worker`` and reported
    back as ProcessStartedEvent (worker RUNNING) or ProcessCompletedEvent
    followed by a RuntimeError (startup failure).
    """

    LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent]
    EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]

    async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
        """Parse one stdout line; ignore anything that isn't a ProcessEvent request."""
        try:
            record = json.loads(event.line)
        except (json.JSONDecodeError, ValueError):
            # Not JSON: ordinary stdout noise, nothing to do.
            return
        # pop() also strips the discriminator so it never leaks downstream.
        if not isinstance(record, dict) or record.pop("type", "") != "ProcessEvent":
            return
        # Missing fields fall back to values from the carrying stdout event.
        # NOTE(review): record["hook_path"] raises KeyError when absent —
        # confirm emitters always include it, or this handler dies on
        # malformed input.
        process_event = ProcessEvent(
            plugin_name=record.get("plugin_name") or event.plugin_name,
            hook_name=record.get("hook_name") or "process_request",
            hook_path=record["hook_path"],
            hook_args=[str(arg) for arg in record.get("hook_args", [])],
            is_background=bool(record.get("is_background", True)),
            output_dir=record.get("output_dir") or event.output_dir,
            env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
            snapshot_id=record.get("snapshot_id") or event.snapshot_id,
            timeout=int(record.get("timeout") or 60),
            daemon=bool(record.get("daemon", False)),
            daemon_startup_host=str(record.get("daemon_startup_host") or ""),
            daemon_startup_port=int(record.get("daemon_startup_port") or 0),
            daemon_startup_timeout=float(record.get("daemon_startup_timeout") or 0.0),
            process_type=str(record.get("process_type") or ""),
            worker_type=str(record.get("worker_type") or ""),
            event_timeout=float(record.get("event_timeout") or 360.0),
            event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
        )
        if not process_event.daemon:
            # Plain subprocess request: let the normal process pipeline run it.
            await self.bus.emit(process_event)
            return
        # Daemon request: the blocking supervisord work runs in a thread.
        proc = await asyncio.to_thread(_ensure_worker, process_event)
        process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
        start_ts = _iso_from_epoch(proc.get("start"))
        pid = int(proc.get("pid") or 0)
        statename = str(proc.get("statename") or "")
        exitstatus = int(proc.get("exitstatus") or 0)
        process_type = process_event.process_type or "worker"
        worker_type = process_event.worker_type or process_event.plugin_name
        if statename == "RUNNING" and pid:
            # Worker confirmed up: report it as a started background process.
            await self.bus.emit(
                ProcessStartedEvent(
                    plugin_name=process_event.plugin_name,
                    hook_name=process_event.hook_name,
                    hook_path=process_event.hook_path,
                    hook_args=process_event.hook_args,
                    output_dir=process_event.output_dir,
                    env=process_event.env,
                    timeout=process_event.timeout,
                    pid=pid,
                    process_id=process_id,
                    snapshot_id=process_event.snapshot_id,
                    is_background=True,
                    process_type=process_type,
                    worker_type=worker_type,
                    start_ts=start_ts,
                ),
            )
            return
        # Startup failed (or state unknown): emit a failed completion...
        stderr = (
            f"Worker {process_event.hook_name} failed to start"
            if not statename
            else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
        )
        await self.bus.emit(
            ProcessCompletedEvent(
                plugin_name=process_event.plugin_name,
                hook_name=process_event.hook_name,
                hook_path=process_event.hook_path,
                hook_args=process_event.hook_args,
                env=process_event.env,
                stdout="",
                stderr=stderr,
                exit_code=exitstatus or 1,
                output_dir=process_event.output_dir,
                is_background=True,
                process_id=process_id,
                snapshot_id=process_event.snapshot_id,
                pid=pid,
                process_type=process_type,
                worker_type=worker_type,
                start_ts=start_ts,
                end_ts=datetime.now(tz=timezone.utc).isoformat(),
            ),
        )
        # ...and also surface the failure to the bus runner as an exception.
        raise RuntimeError(stderr)

View File

@@ -43,7 +43,7 @@ class ProcessService(BaseService):
def get_db_process_id(self, process_id: str) -> str | None:
return self.process_ids.get(process_id)
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process":
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> Process:
from archivebox.machine.models import NetworkInterface, Process
db_process_id = self.process_ids.get(event.process_id)
@@ -57,11 +57,28 @@ class ProcessService(BaseService):
process.save(update_fields=["iface", "machine", "modified_at"])
return process
process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
process_type = getattr(event, "process_type", "") or (
Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
)
worker_type = getattr(event, "worker_type", "") or ""
if process_type == Process.TypeChoices.WORKER and worker_type:
existing = (
Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
worker_type=worker_type,
pwd=event.output_dir,
)
.order_by("-modified_at")
.first()
)
if existing is not None:
self.process_ids[event.process_id] = str(existing.id)
return existing
process = Process.objects.create(
machine=iface.machine,
iface=iface,
process_type=process_type,
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
env=event.env,
@@ -81,6 +98,8 @@ class ProcessService(BaseService):
process.env = event.env
process.timeout = event.timeout
process.pid = event.pid or None
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
process.status = process.StatusChoices.RUNNING
process.retry_at = None
@@ -94,6 +113,8 @@ class ProcessService(BaseService):
process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.pid = event.pid or process.pid
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
process.stdout = event.stdout

View File

@@ -16,13 +16,21 @@ from django.utils import timezone
from rich.console import Console
from abx_dl.events import BinaryEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, prepare_install_plugins, setup_services as setup_abx_services
from abx_dl.orchestrator import (
create_bus,
download,
install_plugins as abx_install_plugins,
prepare_install_plugins,
setup_services as setup_abx_services,
)
from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .machine_service import MachineService
from .process_request_service import ProcessRequestService
from .process_service import ProcessService
from .snapshot_service import SnapshotService
from .tag_service import TagService
@@ -54,6 +62,51 @@ def _runner_debug(message: str) -> None:
print(f"[runner] {message}", file=sys.stderr, flush=True)
def _binary_env_key(name: str) -> str:
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
return f"{normalized}_BINARY"
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str) -> list[str]:
    """List config keys that may point at this binary, most specific first.

    Plugin-declared *_BINARY keys whose default matches the binary name take
    priority over the generic derived key; duplicates are removed while
    preserving order.
    """
    keys = [_binary_env_key(binary_name)]
    # Historical alias: postlight-parser is configured as MERCURY_BINARY.
    if binary_name == "postlight-parser":
        keys.insert(0, "MERCURY_BINARY")
    for plugin in plugins.values():
        for config_key, prop in plugin.config_schema.items():
            if config_key.endswith("_BINARY") and prop.get("default") == binary_name:
                keys.insert(0, config_key)
    # dict.fromkeys dedupes while keeping first-seen order.
    return list(dict.fromkeys(keys))
def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str, str]:
    """Map *_BINARY config keys to abspaths of binaries installed on this machine.

    Only binaries whose recorded abspath still resolves to an executable
    file are included; stale DB records are skipped.
    """
    from archivebox.machine.models import Binary, Machine

    machine = Machine.current()
    installed = (
        Binary.objects.filter(machine=machine, status=Binary.StatusChoices.INSTALLED).exclude(abspath="").exclude(abspath__isnull=True)
    )
    overrides: dict[str, str] = {}
    for binary in installed:
        try:
            candidate = Path(binary.abspath).expanduser()
        except (TypeError, ValueError):
            continue
        # Skip records pointing at removed or non-executable files.
        if not (candidate.is_file() and os.access(candidate, os.X_OK)):
            continue
        for config_key in _binary_config_keys_for_plugins(plugins, binary.name):
            overrides[config_key] = binary.abspath
    return overrides
def _limit_stop_reason(config: dict[str, Any]) -> str:
    """Return the stop reason reported by CrawlLimitState for this config."""
    limit_state = CrawlLimitState.from_config(config)
    return limit_state.get_stop_reason()
def _attach_bus_trace(bus) -> None:
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
if not trace_target:
@@ -105,6 +158,7 @@ def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
from archivebox.machine.models import Machine, Process
Process.cleanup_stale_running()
Process.cleanup_orphaned_workers()
machine = Machine.current()
if Process.objects.filter(
machine=machine,
@@ -149,6 +203,7 @@ class CrawlRunner:
self.machine_service = MachineService(self.bus)
self.binary_service = BinaryService(self.bus)
self.tag_service = TagService(self.bus)
self.process_request_service = ProcessRequestService(self.bus)
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
self.snapshot_service = SnapshotService(
@@ -173,6 +228,7 @@ class CrawlRunner:
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
CrawlService(bus, crawl_id=str(self.crawl.id))
SnapshotService(
bus,
@@ -201,7 +257,10 @@ class CrawlRunner:
self.abx_services = setup_abx_services(
self.bus,
plugins=self.plugins,
config_overrides=self.base_config,
config_overrides={
**self.base_config,
"ABX_RUNTIME": "archivebox",
},
auto_install=True,
emit_jsonl=False,
)
@@ -293,6 +352,8 @@ class CrawlRunner:
current_process.save(update_fields=["iface", "machine", "modified_at"])
self.persona = self.crawl.resolve_persona()
self.base_config = get_config(crawl=self.crawl)
self.base_config.update(_installed_binary_config_overrides(self.plugins))
self.base_config["ABX_RUNTIME"] = "archivebox"
if self.selected_plugins is None:
self.selected_plugins = _selected_plugins_from_config(self.base_config)
if self.persona:
@@ -459,6 +520,11 @@ class CrawlRunner:
async with self.snapshot_semaphore:
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
if snapshot["status"] == "sealed":
return
if snapshot["depth"] > 0 and _limit_stop_reason(snapshot["config"]) == "max_size":
await sync_to_async(self._cancel_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
return
abx_snapshot = AbxSnapshot(
url=snapshot["url"],
id=snapshot["id"],
@@ -513,11 +579,22 @@ class CrawlRunner:
"created_at": snapshot.created_at.isoformat() if snapshot.created_at else "",
"tags": snapshot.tags_str(),
"depth": snapshot.depth,
"status": snapshot.status,
"parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
"output_dir": str(snapshot.output_dir),
"config": self._snapshot_config(snapshot),
}
def _cancel_snapshot_due_to_limit(self, snapshot_id: str) -> None:
    """Seal a snapshot that must not run because a crawl limit was hit.

    No-op when the snapshot is missing or already sealed.
    """
    from archivebox.core.models import Snapshot

    snapshot = Snapshot.objects.filter(id=snapshot_id).first()
    if snapshot is not None and snapshot.status != Snapshot.StatusChoices.SEALED:
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
def run_crawl(
crawl_id: str,
@@ -535,7 +612,7 @@ def run_crawl(
snapshot_ids=snapshot_ids,
selected_plugins=selected_plugins,
process_discovered_snapshots_inline=process_discovered_snapshots_inline,
).run()
).run(),
)
@@ -546,9 +623,17 @@ async def _run_binary(binary_id: str) -> None:
from archivebox.machine.models import Binary
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
config = get_config()
plugins = discover_plugins()
config = get_config()
config.update(_installed_binary_config_overrides(plugins))
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
ArchiveResultService(bus, process_service=process_service)
setup_abx_services(
bus,
plugins=plugins,
@@ -556,11 +641,6 @@ async def _run_binary(binary_id: str) -> None:
auto_install=True,
emit_jsonl=False,
)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ArchiveResultService(bus, process_service=process_service)
try:
_attach_bus_trace(bus)
@@ -592,9 +672,17 @@ def run_binary(binary_id: str) -> None:
async def _run_install(plugin_names: list[str] | None = None) -> None:
from archivebox.config.configset import get_config
config = get_config()
plugins = discover_plugins()
config = get_config()
config.update(_installed_binary_config_overrides(plugins))
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
ArchiveResultService(bus, process_service=process_service)
abx_services = setup_abx_services(
bus,
plugins=plugins,
@@ -602,11 +690,6 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
auto_install=True,
emit_jsonl=False,
)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ArchiveResultService(bus, process_service=process_service)
live_stream = None
try:
@@ -763,8 +846,7 @@ def recover_orphaned_snapshots() -> int:
recovered = 0
now = timezone.now()
orphaned_snapshots = (
Snapshot.objects
.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
.select_related("crawl")
.prefetch_related("archiveresult_set")
)

View File

@@ -4,6 +4,7 @@ from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.services.base import BaseService
from .db import run_db_op
@@ -47,6 +48,8 @@ class SnapshotService(BaseService):
if event.depth > crawl.max_depth:
return None
if self._crawl_limit_stop_reason(crawl) == "max_size":
return None
parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
if parent_snapshot is None:
@@ -84,15 +87,38 @@ class SnapshotService(BaseService):
def _seal_snapshot(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
if snapshot is None:
return None
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
return str(snapshot.id)
def _crawl_limit_stop_reason(self, crawl) -> str:
    """Evaluate the crawl's limits with CRAWL_DIR pointed at its output dir."""
    limit_config = {**(crawl.config or {}), "CRAWL_DIR": str(crawl.output_dir)}
    return CrawlLimitState.from_config(limit_config).get_stop_reason()
def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
    """Seal every still-queued snapshot in a crawl; returns rows updated.

    The snapshot identified by exclude_snapshot_id is left untouched.
    """
    from archivebox.core.models import Snapshot

    pending = Snapshot.objects.filter(
        crawl_id=crawl_id,
        status=Snapshot.StatusChoices.QUEUED,
    ).exclude(id=exclude_snapshot_id)
    return pending.update(
        status=Snapshot.StatusChoices.SEALED,
        retry_at=None,
        modified_at=timezone.now(),
    )
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot