Files
ArchiveBox/archivebox/services/runner.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

963 lines
38 KiB
Python

from __future__ import annotations
import asyncio
import json
import os
import shutil
import subprocess
import sys
import time
from contextlib import nullcontext
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any
from django.utils import timezone
from rich.console import Console
from abx_dl.events import BinaryEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
from abx_dl.orchestrator import (
create_bus,
download,
install_plugins as abx_install_plugins,
prepare_install_plugins,
setup_services as setup_abx_services,
)
from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .machine_service import MachineService
from .process_request_service import ProcessRequestService
from .process_service import ProcessService
from .snapshot_service import SnapshotService
from .tag_service import TagService
from .live_ui import LiveBusUI
def _bus_name(prefix: str, identifier: str) -> str:
normalized = "".join(ch if ch.isalnum() else "_" for ch in identifier)
return f"{prefix}_{normalized}"
def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
raw = str(config.get("PLUGINS") or "").strip()
if not raw:
return None
return [name.strip() for name in raw.split(",") if name.strip()]
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
total = 0
for plugin in selected.values():
total += len(list(plugin.get_crawl_hooks()))
total += len(list(plugin.get_snapshot_hooks()))
return total
def _runner_debug(message: str) -> None:
print(f"[runner] {message}", file=sys.stderr, flush=True)
def _binary_env_key(name: str) -> str:
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
return f"{normalized}_BINARY"
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str) -> list[str]:
    """List every config key that should point at the binary *binary_name*.

    The result contains, de-duplicated and in this priority order:

    1. plugin schema keys ending in ``_BINARY`` whose ``default`` equals
       *binary_name* (the most recently discovered match comes first),
    2. ``MERCURY_BINARY`` when the binary is ``postlight-parser``,
    3. the generic key derived from the binary name.
    """
    schema_keys = [
        key
        for plugin in plugins.values()
        for key, prop in plugin.config_schema.items()
        if key.endswith("_BINARY") and prop.get("default") == binary_name
    ]
    # Later schema matches take precedence over earlier ones, hence the reversal.
    ordered = list(reversed(schema_keys))
    if binary_name == "postlight-parser":
        ordered.append("MERCURY_BINARY")
    ordered.append(_binary_env_key(binary_name))
    # dict.fromkeys keeps the first occurrence of each key and preserves order.
    return list(dict.fromkeys(ordered))
def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str, str]:
    """Map ``*_BINARY`` config keys to installed binaries' recorded paths.

    Only ``Binary`` rows for the current machine with status INSTALLED and a
    non-empty ``abspath`` are considered, and only when that path (after
    ``~`` expansion) is an existing, executable file. The stored value is the
    ``abspath`` exactly as recorded on the row.
    """
    from archivebox.machine.models import Binary, Machine

    installed = (
        Binary.objects.filter(machine=Machine.current(), status=Binary.StatusChoices.INSTALLED)
        .exclude(abspath="")
        .exclude(abspath__isnull=True)
    )

    overrides: dict[str, str] = {}
    for binary in installed:
        try:
            candidate = Path(binary.abspath).expanduser()
        except (TypeError, ValueError):
            continue
        usable = candidate.is_file() and os.access(candidate, os.X_OK)
        if not usable:
            continue
        config_keys = _binary_config_keys_for_plugins(plugins, binary.name)
        overrides.update(dict.fromkeys(config_keys, binary.abspath))
    return overrides
def _limit_stop_reason(config: dict[str, Any]) -> str:
    """Return the stop reason reported by the crawl limit state built from *config*."""
    limit_state = CrawlLimitState.from_config(config)
    return limit_state.get_stop_reason()
def _attach_bus_trace(bus) -> None:
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
if not trace_target:
return
if getattr(bus, "_archivebox_trace_task", None) is not None:
return
trace_path = None if trace_target in {"1", "-", "stderr"} else Path(trace_target)
stop_event = asyncio.Event()
async def trace_loop() -> None:
seen_event_ids: set[str] = set()
while not stop_event.is_set():
for event_id, event in list(bus.event_history.items()):
if event_id in seen_event_ids:
continue
seen_event_ids.add(event_id)
payload = event.model_dump(mode="json")
payload["bus_name"] = bus.name
line = json.dumps(payload, ensure_ascii=False, default=str, separators=(",", ":"))
if trace_path is None:
print(line, file=sys.stderr, flush=True)
else:
trace_path.parent.mkdir(parents=True, exist_ok=True)
with trace_path.open("a", encoding="utf-8") as handle:
handle.write(line + "\n")
await asyncio.sleep(0.05)
bus._archivebox_trace_stop = stop_event
bus._archivebox_trace_task = asyncio.create_task(trace_loop())
async def _stop_bus_trace(bus) -> None:
stop_event = getattr(bus, "_archivebox_trace_stop", None)
trace_task = getattr(bus, "_archivebox_trace_task", None)
if stop_event is None or trace_task is None:
return
stop_event.set()
await asyncio.gather(trace_task, return_exceptions=True)
bus._archivebox_trace_stop = None
bus._archivebox_trace_task = None
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
    """Spawn a detached ``archivebox run --daemon`` process if none is running.

    Returns ``True`` when a new process was launched and ``False`` when
    nothing was started: either because the call happens under pytest
    (``PYTEST_CURRENT_TEST`` is set) without *allow_under_pytest*, or because
    a RUNNING orchestrator ``Process`` row already exists for this machine.

    The child runs in its own session with the data dir as working
    directory, stdin detached, and stdout/stderr appended to
    ``LOGS_DIR/errors.log``.
    """
    running_under_pytest = bool(os.environ.get("PYTEST_CURRENT_TEST"))
    if running_under_pytest and not allow_under_pytest:
        return False

    from archivebox.config import CONSTANTS
    from archivebox.machine.models import Machine, Process

    # Clear out stale bookkeeping rows before checking for a live orchestrator.
    Process.cleanup_stale_running()
    Process.cleanup_orphaned_workers()

    orchestrator_running = Process.objects.filter(
        machine=Machine.current(),
        status=Process.StatusChoices.RUNNING,
        process_type=Process.TypeChoices.ORCHESTRATOR,
    ).exists()
    if orchestrator_running:
        return False

    errors_log = CONSTANTS.LOGS_DIR / "errors.log"
    errors_log.parent.mkdir(parents=True, exist_ok=True)

    child_env = dict(os.environ)
    child_env.setdefault("DATA_DIR", str(CONSTANTS.DATA_DIR))

    command = [sys.executable, "-m", "archivebox", "run", "--daemon"]
    with errors_log.open("a", encoding="utf-8") as log_handle:
        # start_new_session detaches the daemon from this process's session.
        subprocess.Popen(
            command,
            cwd=str(CONSTANTS.DATA_DIR),
            env=child_env,
            stdin=subprocess.DEVNULL,
            stdout=log_handle,
            stderr=log_handle,
            start_new_session=True,
        )
    return True
class CrawlRunner:
    """Run one ArchiveBox crawl by driving abx-dl over event buses.

    The runner owns a "main" bus used for crawl setup/cleanup (or for the
    whole run of an install crawl). Each snapshot is downloaded on its own
    bus, created by ``_create_projector_bus`` and stopped when the snapshot
    finishes; at most ``MAX_CONCURRENT_SNAPSHOTS`` snapshots run at once.
    """

    # Size of the semaphore that gates concurrently running snapshot tasks.
    MAX_CONCURRENT_SNAPSHOTS = 8

    def __init__(
        self,
        crawl,
        *,
        snapshot_ids: list[str] | None = None,
        selected_plugins: list[str] | None = None,
        process_discovered_snapshots_inline: bool = True,
    ):
        """Wire the main bus and its services for *crawl*.

        Args:
            crawl: the Crawl model instance to run.
            snapshot_ids: when given (non-empty), only these snapshots are
                run instead of creating/loading the crawl's root snapshots.
            selected_plugins: plugin names to run; ``None`` defers to the
                ``PLUGINS`` config value resolved in ``_prepare``.
            process_discovered_snapshots_inline: when true, SnapshotService
                is given ``enqueue_snapshot`` as its scheduling callback;
                otherwise it is given the no-op ``leave_snapshot_queued``.
        """
        self.crawl = crawl
        self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
        self.plugins = discover_plugins()
        self.process_service = ProcessService(self.bus)
        self.machine_service = MachineService(self.bus)
        self.binary_service = BinaryService(self.bus)
        self.tag_service = TagService(self.bus)
        self.process_request_service = ProcessRequestService(self.bus)
        self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
        self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
        self.snapshot_service = SnapshotService(
            self.bus,
            crawl_id=str(crawl.id),
            schedule_snapshot=self._snapshot_scheduler(),
        )
        self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
        self.selected_plugins = selected_plugins
        self.initial_snapshot_ids = snapshot_ids
        self.snapshot_tasks: dict[str, asyncio.Task[None]] = {}
        self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS)
        self.abx_services = None
        self.persona = None
        self.base_config: dict[str, Any] = {}
        self.primary_url = ""
        self._live_stream = None

    def _snapshot_scheduler(self):
        """Return the callback handed to SnapshotService as ``schedule_snapshot``."""
        if self.process_discovered_snapshots_inline:
            return self.enqueue_snapshot
        return self.leave_snapshot_queued

    def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
        """Create a fresh bus with the same service wiring as the main bus.

        Returns ``(bus, abx_services)``. The ArchiveBox services only need to
        be constructed against the bus, so their instances are not kept.
        """
        bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)
        process_service = ProcessService(bus)
        MachineService(bus)
        BinaryService(bus)
        TagService(bus)
        ProcessRequestService(bus)
        CrawlService(bus, crawl_id=str(self.crawl.id))
        SnapshotService(
            bus,
            crawl_id=str(self.crawl.id),
            schedule_snapshot=self._snapshot_scheduler(),
        )
        ArchiveResultService(bus, process_service=process_service)
        abx_services = setup_abx_services(
            bus,
            plugins=self.plugins,
            config_overrides=config_overrides,
            auto_install=True,
            emit_jsonl=False,
        )
        return bus, abx_services

    async def run(self) -> None:
        """Execute the crawl end to end and reconcile its final status.

        Regular crawls run: crawl setup on the root snapshot, all snapshot
        tasks, ``crawl.cleanup()``, then abx crawl cleanup. Install crawls
        (``get_system_task() == INSTALL_URL``) run ``_run_install_crawl``
        instead. Teardown (bus stop, live stream close, persona cleanup,
        crawl status reconciliation) always runs, even on failure.
        """
        from asgiref.sync import sync_to_async

        try:
            await sync_to_async(self._prepare, thread_sensitive=True)()
            live_ui = self._create_live_ui()
            with live_ui if live_ui is not None else nullcontext():
                _attach_bus_trace(self.bus)
                self.abx_services = setup_abx_services(
                    self.bus,
                    plugins=self.plugins,
                    config_overrides={
                        **self.base_config,
                        "ABX_RUNTIME": "archivebox",
                    },
                    auto_install=True,
                    emit_jsonl=False,
                )
                if self.crawl.get_system_task() == INSTALL_URL:
                    await self._run_install_crawl()
                else:
                    snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
                    if snapshot_ids:
                        # Crawl-level setup/cleanup hooks run against the first snapshot.
                        root_snapshot_id = snapshot_ids[0]
                        _runner_debug(f"crawl {self.crawl.id} starting crawl setup root_snapshot={root_snapshot_id}")
                        await self._run_crawl_setup(root_snapshot_id)
                        _runner_debug(f"crawl {self.crawl.id} finished crawl setup root_snapshot={root_snapshot_id}")
                        for snapshot_id in snapshot_ids:
                            await self.enqueue_snapshot(snapshot_id)
                        _runner_debug(f"crawl {self.crawl.id} waiting for snapshot tasks count={len(self.snapshot_tasks)}")
                        await self._wait_for_snapshot_tasks()
                        _runner_debug(f"crawl {self.crawl.id} finished waiting for snapshot tasks")
                        _runner_debug(f"crawl {self.crawl.id} starting django crawl.cleanup()")
                        await sync_to_async(self.crawl.cleanup, thread_sensitive=True)()
                        _runner_debug(f"crawl {self.crawl.id} finished django crawl.cleanup()")
                        _runner_debug(f"crawl {self.crawl.id} starting abx crawl cleanup root_snapshot={root_snapshot_id}")
                        await self._run_crawl_cleanup(root_snapshot_id)
                        _runner_debug(f"crawl {self.crawl.id} finished abx crawl cleanup root_snapshot={root_snapshot_id}")
                if self.abx_services is not None:
                    _runner_debug(f"crawl {self.crawl.id} waiting for main bus background monitors")
                    await self.abx_services.process.wait_for_background_monitors()
                    _runner_debug(f"crawl {self.crawl.id} finished waiting for main bus background monitors")
        finally:
            # Nested try/finally so a failure while stopping the bus cannot
            # skip persona cleanup or leave the crawl row in a stale status.
            try:
                await _stop_bus_trace(self.bus)
                await self.bus.stop()
            finally:
                self._close_live_stream()
                try:
                    await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
                finally:
                    await sync_to_async(self._reconcile_crawl_status, thread_sensitive=True)()

    def _close_live_stream(self) -> None:
        """Close the private ``/dev/tty`` stream opened for the live UI, if any."""
        if self._live_stream is None:
            return
        try:
            self._live_stream.close()
        except Exception:
            # Deliberately best-effort: a failing close must not mask the crawl result.
            pass
        self._live_stream = None

    def _reconcile_crawl_status(self) -> None:
        """Reload the crawl row and persist the status matching its progress.

        A finished crawl is SEALED (``retry_at`` cleared). An unfinished one
        is moved SEALED -> QUEUED, or to STARTED when it is neither SEALED
        nor already STARTED, and gets a ``retry_at`` if it has none.
        """
        from archivebox.crawls.models import Crawl

        crawl = Crawl.objects.get(id=self.crawl.id)
        if crawl.is_finished():
            if crawl.status != Crawl.StatusChoices.SEALED:
                crawl.status = Crawl.StatusChoices.SEALED
                crawl.retry_at = None
                crawl.save(update_fields=["status", "retry_at", "modified_at"])
            return

        if crawl.status == Crawl.StatusChoices.SEALED:
            crawl.status = Crawl.StatusChoices.QUEUED
        elif crawl.status != Crawl.StatusChoices.STARTED:
            crawl.status = Crawl.StatusChoices.STARTED
        crawl.retry_at = crawl.retry_at or timezone.now()
        crawl.save(update_fields=["status", "retry_at", "modified_at"])

    async def enqueue_snapshot(self, snapshot_id: str) -> None:
        """Start a task for *snapshot_id* unless one is already in flight."""
        task = self.snapshot_tasks.get(snapshot_id)
        if task is not None and not task.done():
            return
        task = asyncio.create_task(self._run_snapshot(snapshot_id))
        self.snapshot_tasks[snapshot_id] = task

    async def leave_snapshot_queued(self, snapshot_id: str) -> None:
        """No-op scheduling callback: the snapshot is not run by this runner."""
        return None

    async def _wait_for_snapshot_tasks(self) -> None:
        """Wait until no snapshot task is pending, re-raising task failures.

        The task table is re-scanned after every completion because running
        snapshots may enqueue further snapshots while we wait.
        """
        while True:
            pending_tasks: list[asyncio.Task[None]] = []
            for snapshot_id, task in list(self.snapshot_tasks.items()):
                if task.done():
                    # Only drop the entry if it has not been replaced by a newer task.
                    if self.snapshot_tasks.get(snapshot_id) is task:
                        self.snapshot_tasks.pop(snapshot_id, None)
                    task.result()
                    continue
                pending_tasks.append(task)
            if not pending_tasks:
                return
            done, _pending = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
            for task in done:
                task.result()

    def _prepare(self) -> None:
        """Resolve network identity, persona and base config (sync, uses the ORM)."""
        from archivebox.config.configset import get_config
        from archivebox.machine.models import NetworkInterface, Process

        urls = self.crawl.get_urls_list()
        self.primary_url = urls[0] if urls else ""

        # Keep the current Process row pointing at the freshly detected interface.
        current_iface = NetworkInterface.current(refresh=True)
        current_process = Process.current()
        if current_process.iface_id != current_iface.id or current_process.machine_id != current_iface.machine_id:
            current_process.iface = current_iface
            current_process.machine = current_iface.machine
            current_process.save(update_fields=["iface", "machine", "modified_at"])

        self.persona = self.crawl.resolve_persona()
        self.base_config = get_config(crawl=self.crawl)
        self.base_config.update(_installed_binary_config_overrides(self.plugins))
        self.base_config["ABX_RUNTIME"] = "archivebox"
        if self.selected_plugins is None:
            self.selected_plugins = _selected_plugins_from_config(self.base_config)
        if self.persona:
            chrome_binary = str(self.base_config.get("CHROME_BINARY") or "")
            self.base_config.update(self.persona.prepare_runtime_for_crawl(self.crawl, chrome_binary=chrome_binary))

    def _cleanup_persona(self) -> None:
        """Release the persona runtime prepared for this crawl, if any."""
        if self.persona:
            self.persona.cleanup_runtime_for_crawl(self.crawl)

    def _create_live_ui(self) -> LiveBusUI | None:
        """Build the live terminal UI, or return ``None`` without a TTY.

        Prefers writing to ``/dev/tty`` when it can be opened (the handle is
        kept in ``_live_stream`` so ``run`` can close it); otherwise uses
        stderr if it is a TTY, else stdout.
        """
        stdout_is_tty = sys.stdout.isatty()
        stderr_is_tty = sys.stderr.isatty()
        interactive_tty = stdout_is_tty or stderr_is_tty
        if not interactive_tty:
            return None

        stream = sys.stderr if stderr_is_tty else sys.stdout
        if os.path.exists("/dev/tty"):
            try:
                self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
                stream = self._live_stream
            except OSError:
                self._live_stream = None

        try:
            terminal_size = os.get_terminal_size(stream.fileno())
        except (AttributeError, OSError, ValueError):
            terminal_size = shutil.get_terminal_size(fallback=(160, 40))
        terminal_width = terminal_size.columns
        terminal_height = terminal_size.lines

        ui_console = Console(
            file=stream,
            force_terminal=True,
            width=terminal_width,
            height=terminal_height,
            _environ={
                "COLUMNS": str(terminal_width),
                "LINES": str(terminal_height),
            },
        )
        plugins_label = ", ".join(self.selected_plugins) if self.selected_plugins else f"all ({len(self.plugins)} available)"
        live_ui = LiveBusUI(
            self.bus,
            total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
            timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
            ui_console=ui_console,
            interactive_tty=True,
        )
        live_ui.print_intro(
            url=self.primary_url or INSTALL_URL,
            output_dir=Path(self.crawl.output_dir),
            plugins_label=plugins_label,
        )
        return live_ui

    def _create_root_snapshots(self) -> list[str]:
        """Create snapshots from the crawl URLs, falling back to existing depth-0 ones."""
        created = self.crawl.create_snapshots_from_urls()
        snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
        return [str(snapshot.id) for snapshot in snapshots]

    def _initial_snapshot_ids(self) -> list[str]:
        """Return the explicitly requested snapshot ids, else the root snapshot ids."""
        if self.initial_snapshot_ids:
            return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
        return self._create_root_snapshots()

    def _snapshot_config(self, snapshot) -> dict[str, Any]:
        """Build the config passed to abx-dl for *snapshot*.

        Starts from ``get_config(crawl, snapshot)``, overlays ``base_config``
        (so its keys win), then adds the per-snapshot identity keys.
        """
        from archivebox.config.configset import get_config

        config = get_config(crawl=self.crawl, snapshot=snapshot)
        config.update(self.base_config)
        config["CRAWL_DIR"] = str(self.crawl.output_dir)
        config["SNAP_DIR"] = str(snapshot.output_dir)
        config["SNAPSHOT_ID"] = str(snapshot.id)
        config["SNAPSHOT_DEPTH"] = snapshot.depth
        config["CRAWL_ID"] = str(self.crawl.id)
        config["SOURCE_URL"] = snapshot.url
        if snapshot.parent_snapshot_id:
            config["PARENT_SNAPSHOT_ID"] = str(snapshot.parent_snapshot_id)
        return config

    def _build_abx_snapshot(self, snapshot: dict[str, Any]) -> AbxSnapshot:
        """Convert a ``_load_snapshot_run_data`` dict into an abx-dl snapshot."""
        return AbxSnapshot(
            url=snapshot["url"],
            id=snapshot["id"],
            title=snapshot["title"],
            timestamp=snapshot["timestamp"],
            bookmarked_at=snapshot["bookmarked_at"],
            created_at=snapshot["created_at"],
            tags=snapshot["tags"],
            depth=snapshot["depth"],
            parent_snapshot_id=snapshot["parent_snapshot_id"],
            crawl_id=str(self.crawl.id),
        )

    async def _run_install_crawl(self) -> None:
        """Run an install crawl on the main bus, using the crawl dir for all output."""
        install_snapshot = AbxSnapshot(
            url=self.primary_url or INSTALL_URL,
            id=str(self.crawl.id),
            crawl_id=str(self.crawl.id),
        )
        await download(
            url=self.primary_url or INSTALL_URL,
            plugins=self.plugins,
            output_dir=Path(self.crawl.output_dir),
            selected_plugins=self.selected_plugins,
            config_overrides={
                **self.base_config,
                "CRAWL_DIR": str(self.crawl.output_dir),
                "SNAP_DIR": str(self.crawl.output_dir),
                "CRAWL_ID": str(self.crawl.id),
                "SOURCE_URL": self.crawl.urls,
            },
            bus=self.bus,
            emit_jsonl=False,
            snapshot=install_snapshot,
            crawl_only=True,
        )

    async def _run_crawl_phase(self, snapshot_id: str, **phase_flags: bool) -> None:
        """Run ``download`` on the main bus for one crawl-level phase.

        *phase_flags* is forwarded verbatim to ``download`` and selects the
        phase (``crawl_setup_only=True`` or ``crawl_cleanup_only=True``).
        """
        from asgiref.sync import sync_to_async

        snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
        await download(
            url=snapshot["url"],
            plugins=self.plugins,
            output_dir=Path(snapshot["output_dir"]),
            selected_plugins=self.selected_plugins,
            config_overrides=snapshot["config"],
            bus=self.bus,
            emit_jsonl=False,
            snapshot=self._build_abx_snapshot(snapshot),
            **phase_flags,
        )

    async def _run_crawl_setup(self, snapshot_id: str) -> None:
        """Run only the crawl setup phase against the given root snapshot."""
        await self._run_crawl_phase(snapshot_id, crawl_setup_only=True)

    async def _run_crawl_cleanup(self, snapshot_id: str) -> None:
        """Run only the crawl cleanup phase against the given root snapshot."""
        await self._run_crawl_phase(snapshot_id, crawl_cleanup_only=True)

    async def _run_snapshot(self, snapshot_id: str) -> None:
        """Download one snapshot on a dedicated bus, gated by the semaphore.

        Sealed snapshots are skipped. Snapshots below the root (depth > 0)
        are sealed without running when the crawl limit state reports
        ``max_size``.
        """
        from asgiref.sync import sync_to_async

        async with self.snapshot_semaphore:
            snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
            if snapshot["status"] == "sealed":
                return
            if snapshot["depth"] > 0 and _limit_stop_reason(snapshot["config"]) == "max_size":
                await sync_to_async(self._cancel_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
                return

            abx_snapshot = self._build_abx_snapshot(snapshot)
            snapshot_bus, snapshot_services = self._create_projector_bus(
                identifier=f"{self.crawl.id}_{snapshot['id']}",
                config_overrides=snapshot["config"],
            )
            try:
                _attach_bus_trace(snapshot_bus)
                _runner_debug(f"snapshot {snapshot_id} starting download()")
                await download(
                    url=snapshot["url"],
                    plugins=self.plugins,
                    output_dir=Path(snapshot["output_dir"]),
                    selected_plugins=self.selected_plugins,
                    config_overrides=snapshot["config"],
                    bus=snapshot_bus,
                    emit_jsonl=False,
                    snapshot=abx_snapshot,
                    skip_crawl_setup=True,
                    skip_crawl_cleanup=True,
                )
                _runner_debug(f"snapshot {snapshot_id} finished download(), waiting for background monitors")
                await snapshot_services.process.wait_for_background_monitors()
                _runner_debug(f"snapshot {snapshot_id} finished waiting for background monitors")
            finally:
                # Unregister this task (only if it is still the registered one)
                # so the same snapshot can be enqueued again later.
                current_task = asyncio.current_task()
                if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
                    self.snapshot_tasks.pop(snapshot_id, None)
                await _stop_bus_trace(snapshot_bus)
                await snapshot_bus.stop()

    def _load_snapshot_run_data(self, snapshot_id: str):
        """Load a snapshot row and flatten it into a plain dict (sync, uses the ORM).

        The dict carries everything the async code needs, including the
        resolved ``config``, so no ORM access is required afterwards.
        """
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
        return {
            "id": str(snapshot.id),
            "url": snapshot.url,
            "title": snapshot.title,
            "timestamp": snapshot.timestamp,
            "bookmarked_at": snapshot.bookmarked_at.isoformat() if snapshot.bookmarked_at else "",
            "created_at": snapshot.created_at.isoformat() if snapshot.created_at else "",
            "tags": snapshot.tags_str(),
            "depth": snapshot.depth,
            "status": snapshot.status,
            "parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
            "output_dir": str(snapshot.output_dir),
            "config": self._snapshot_config(snapshot),
        }

    def _cancel_snapshot_due_to_limit(self, snapshot_id: str) -> None:
        """Seal the snapshot without running it; no-op if missing or already sealed."""
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(id=snapshot_id).first()
        if snapshot is None or snapshot.status == Snapshot.StatusChoices.SEALED:
            return
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
def run_crawl(
    crawl_id: str,
    *,
    snapshot_ids: list[str] | None = None,
    selected_plugins: list[str] | None = None,
    process_discovered_snapshots_inline: bool = True,
) -> None:
    """Load the crawl with id *crawl_id* and run it to completion.

    Blocking entry point: builds a ``CrawlRunner`` with the given options and
    drives its ``run()`` coroutine on a new event loop via ``asyncio.run``.
    """
    from archivebox.crawls.models import Crawl

    runner = CrawlRunner(
        Crawl.objects.get(id=crawl_id),
        snapshot_ids=snapshot_ids,
        selected_plugins=selected_plugins,
        process_discovered_snapshots_inline=process_discovered_snapshots_inline,
    )
    asyncio.run(runner.run())
async def _run_binary(binary_id: str) -> None:
    """Emit a ``BinaryEvent`` for one Binary row on a dedicated bus.

    The bus is wired with the ArchiveBox services and the abx-dl services
    (``auto_install=True``), the event describing the stored binary is
    emitted, and the bus is always stopped afterwards.

    Raises whatever ``Binary.objects.get`` raises when *binary_id* is unknown.
    """
    from asgiref.sync import sync_to_async
    from archivebox.config.configset import get_config
    from archivebox.machine.models import Binary

    binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
    plugins = discover_plugins()

    def load_config() -> dict[str, Any]:
        # Config resolution and the installed-binary lookup hit the ORM, so
        # they run in the sync thread like CrawlRunner._prepare does.
        resolved = get_config()
        resolved.update(_installed_binary_config_overrides(plugins))
        resolved["ABX_RUNTIME"] = "archivebox"
        return resolved

    config = await sync_to_async(load_config, thread_sensitive=True)()

    bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
    try:
        # Wiring happens inside the try so the bus is stopped even if it fails.
        process_service = ProcessService(bus)
        MachineService(bus)
        BinaryService(bus)
        TagService(bus)
        ProcessRequestService(bus)
        ArchiveResultService(bus, process_service=process_service)
        setup_abx_services(
            bus,
            plugins=plugins,
            config_overrides=config,
            auto_install=True,
            emit_jsonl=False,
        )
        _attach_bus_trace(bus)
        await bus.emit(
            BinaryEvent(
                name=binary.name,
                plugin_name="archivebox",
                hook_name="archivebox_run",
                output_dir=str(binary.output_dir),
                binary_id=str(binary.id),
                machine_id=str(binary.machine_id),
                abspath=binary.abspath,
                version=binary.version,
                sha256=binary.sha256,
                binproviders=binary.binproviders,
                binprovider=binary.binprovider,
                overrides=binary.overrides or None,
            ),
        )
    finally:
        await _stop_bus_trace(bus)
        await bus.stop()
def run_binary(binary_id: str) -> None:
    """Blocking wrapper: run ``_run_binary`` for *binary_id* on a new event loop."""
    pending = _run_binary(binary_id)
    asyncio.run(pending)
async def _run_install(plugin_names: list[str] | None = None) -> None:
    """Install plugin dependencies through abx-dl on a dedicated bus.

    Args:
        plugin_names: plugins to install; ``None`` is passed through to
            abx-dl unchanged and is labelled "all" in the UI.

    Output goes to a temporary directory that is removed afterwards. When
    stdout or stderr is a TTY a live UI is shown (preferring ``/dev/tty``)
    and a summary is printed at the end. The bus is always stopped and the
    private TTY stream always closed.
    """
    from asgiref.sync import sync_to_async
    from archivebox.config.configset import get_config

    plugins = discover_plugins()

    def load_config() -> dict[str, Any]:
        # Config resolution and the installed-binary lookup hit the ORM, so
        # they run in the sync thread like CrawlRunner._prepare does.
        resolved = get_config()
        resolved.update(_installed_binary_config_overrides(plugins))
        resolved["ABX_RUNTIME"] = "archivebox"
        return resolved

    config = await sync_to_async(load_config, thread_sensitive=True)()

    bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
    live_stream = None
    try:
        # Wiring happens inside the try so the bus is stopped even if it fails.
        process_service = ProcessService(bus)
        MachineService(bus)
        BinaryService(bus)
        TagService(bus)
        ProcessRequestService(bus)
        ArchiveResultService(bus, process_service=process_service)
        abx_services = setup_abx_services(
            bus,
            plugins=plugins,
            config_overrides=config,
            auto_install=True,
            emit_jsonl=False,
        )

        selected_plugins = prepare_install_plugins(plugins, plugin_names=plugin_names)
        plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
        timeout_seconds = int(config.get("TIMEOUT") or 60)

        stdout_is_tty = sys.stdout.isatty()
        stderr_is_tty = sys.stderr.isatty()
        interactive_tty = stdout_is_tty or stderr_is_tty
        ui_console = None
        live_ui = None

        if interactive_tty:
            stream = sys.stderr if stderr_is_tty else sys.stdout
            if os.path.exists("/dev/tty"):
                try:
                    live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
                    stream = live_stream
                except OSError:
                    live_stream = None
            try:
                terminal_size = os.get_terminal_size(stream.fileno())
            except (AttributeError, OSError, ValueError):
                terminal_size = shutil.get_terminal_size(fallback=(160, 40))
            terminal_width = terminal_size.columns
            terminal_height = terminal_size.lines
            ui_console = Console(
                file=stream,
                force_terminal=True,
                width=terminal_width,
                height=terminal_height,
                _environ={
                    "COLUMNS": str(terminal_width),
                    "LINES": str(terminal_height),
                },
            )

        with TemporaryDirectory(prefix="archivebox-install-") as temp_dir:
            output_dir = Path(temp_dir)
            if ui_console is not None:
                live_ui = LiveBusUI(
                    bus,
                    total_hooks=_count_selected_hooks(selected_plugins, None),
                    timeout_seconds=timeout_seconds,
                    ui_console=ui_console,
                    interactive_tty=interactive_tty,
                )
                live_ui.print_intro(
                    url=INSTALL_URL,
                    output_dir=output_dir,
                    plugins_label=plugins_label,
                )
            with live_ui if live_ui is not None else nullcontext():
                _attach_bus_trace(bus)
                results = await abx_install_plugins(
                    plugin_names=plugin_names,
                    plugins=plugins,
                    output_dir=output_dir,
                    config_overrides=config,
                    emit_jsonl=False,
                    bus=bus,
                )
                await abx_services.process.wait_for_background_monitors()
            if live_ui is not None:
                live_ui.print_summary(results, output_dir=output_dir)
    finally:
        try:
            await _stop_bus_trace(bus)
            await bus.stop()
        finally:
            # Close the private TTY handle even if stopping the bus failed.
            try:
                if live_stream is not None:
                    live_stream.close()
            except Exception:
                # Deliberately best-effort: a failing close must not mask the install result.
                pass
def run_install(*, plugin_names: list[str] | None = None) -> None:
    """Blocking wrapper: run ``_run_install`` for *plugin_names* on a new event loop."""
    pending = _run_install(plugin_names=plugin_names)
    asyncio.run(pending)
def recover_orphaned_crawls() -> int:
    """Repair STARTED crawls that have no ``retry_at`` and no live process.

    A crawl counts as active when a RUNNING worker/hook/binary ``Process``
    row carries its id in ``env["CRAWL_ID"]``; active crawls are left alone.
    Each remaining crawl is either SEALED (it has no snapshots, or all of
    them are sealed) or given ``retry_at = now`` so it gets picked up again.

    Returns the number of crawls that were updated.
    """
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.machine.models import Process

    live_processes = Process.objects.filter(
        status=Process.StatusChoices.RUNNING,
        process_type__in=[
            Process.TypeChoices.WORKER,
            Process.TypeChoices.HOOK,
            Process.TypeChoices.BINARY,
        ],
    ).only("env")

    active_crawl_ids: set[str] = set()
    for process in live_processes:
        process_env = process.env or {}
        if isinstance(process_env, dict) and process_env.get("CRAWL_ID"):
            active_crawl_ids.add(str(process_env["CRAWL_ID"]))

    now = timezone.now()
    stalled_crawls = Crawl.objects.filter(
        status=Crawl.StatusChoices.STARTED,
        retry_at__isnull=True,
    ).prefetch_related("snapshot_set")

    recovered = 0
    for crawl in stalled_crawls:
        if str(crawl.id) in active_crawl_ids:
            continue
        # all() over an empty snapshot list is True, covering the "no snapshots" case.
        every_snapshot_sealed = all(
            snapshot.status == Snapshot.StatusChoices.SEALED for snapshot in crawl.snapshot_set.all()
        )
        if every_snapshot_sealed:
            crawl.status = Crawl.StatusChoices.SEALED
            crawl.retry_at = None
            crawl.save(update_fields=["status", "retry_at", "modified_at"])
        else:
            crawl.retry_at = now
            crawl.save(update_fields=["retry_at", "modified_at"])
        recovered += 1
    return recovered
def recover_orphaned_snapshots() -> int:
    """Repair STARTED snapshots that have no ``retry_at`` and no live process.

    A snapshot counts as active when a RUNNING worker/hook/binary ``Process``
    row carries its id in ``env["SNAPSHOT_ID"]``; active snapshots are left
    alone. For every other matching snapshot:

    * if it has archive results and all are in a final state, it is SEALED
      (and its crawl too, once the crawl reports itself finished);
    * otherwise the snapshot and its crawl are re-queued with
      ``retry_at = now``.

    Returns the number of snapshots that were updated.
    """
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import ArchiveResult, Snapshot
    from archivebox.machine.models import Process

    live_processes = Process.objects.filter(
        status=Process.StatusChoices.RUNNING,
        process_type__in=[
            Process.TypeChoices.WORKER,
            Process.TypeChoices.HOOK,
            Process.TypeChoices.BINARY,
        ],
    ).only("env")

    active_snapshot_ids: set[str] = set()
    for process in live_processes:
        process_env = process.env or {}
        if isinstance(process_env, dict) and process_env.get("SNAPSHOT_ID"):
            active_snapshot_ids.add(str(process_env["SNAPSHOT_ID"]))

    now = timezone.now()
    stalled_snapshots = (
        Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
        .select_related("crawl")
        .prefetch_related("archiveresult_set")
    )

    recovered = 0
    for snapshot in stalled_snapshots:
        if str(snapshot.id) in active_snapshot_ids:
            continue

        results = list(snapshot.archiveresult_set.all())
        all_results_final = bool(results) and all(
            result.status in ArchiveResult.FINAL_STATES for result in results
        )
        crawl = snapshot.crawl

        if all_results_final:
            snapshot.status = Snapshot.StatusChoices.SEALED
            snapshot.retry_at = None
            snapshot.downloaded_at = snapshot.downloaded_at or now
            snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
            if crawl.is_finished() and crawl.status != Crawl.StatusChoices.SEALED:
                crawl.status = Crawl.StatusChoices.SEALED
                crawl.retry_at = None
                crawl.save(update_fields=["status", "retry_at", "modified_at"])
        else:
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = now
            snapshot.save(update_fields=["status", "retry_at", "modified_at"])
            crawl.status = Crawl.StatusChoices.QUEUED
            crawl.retry_at = now
            crawl.save(update_fields=["status", "retry_at", "modified_at"])
        recovered += 1
    return recovered
def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int:
    """Process due work items one at a time until none are left.

    Each loop iteration handles at most one item, checked in this order:

    1. (daemon mode, no *crawl_id*) enqueue every enabled schedule that is due;
    2. (no *crawl_id*) a due Binary that is not INSTALLED;
    3. a due QUEUED crawl;
    4. (no *crawl_id*) a due snapshot that is not SEALED;
    5. a due STARTED crawl.

    An item is only run after ``claim_processing_lock(lock_seconds=60)``
    succeeds; otherwise the loop simply starts over. Crawls and snapshots are
    run with ``process_discovered_snapshots_inline=False``.

    Args:
        daemon: when true, never return; sleep 2 seconds whenever idle.
        crawl_id: restrict crawl processing to this crawl and skip the
            binary, snapshot and schedule steps.

    Returns:
        ``0`` once nothing is pending (non-daemon mode only).
    """
    from archivebox.crawls.models import Crawl, CrawlSchedule
    from archivebox.core.models import Snapshot
    from archivebox.machine.models import Binary

    unrestricted = crawl_id is None

    while True:
        # 1. Scheduled crawls.
        if daemon and unrestricted:
            tick = timezone.now()
            enabled_schedules = CrawlSchedule.objects.filter(is_enabled=True).select_related(
                "template",
                "template__created_by",
            )
            for schedule in enabled_schedules:
                if schedule.is_due(tick):
                    schedule.enqueue(queued_at=tick)

        # 2. Binaries waiting to be installed.
        if unrestricted:
            due_binary = (
                Binary.objects.filter(retry_at__lte=timezone.now())
                .exclude(status=Binary.StatusChoices.INSTALLED)
                .order_by("retry_at", "created_at")
                .first()
            )
            if due_binary is not None:
                if due_binary.claim_processing_lock(lock_seconds=60):
                    run_binary(str(due_binary.id))
                continue

        # 3. Queued crawls.
        queued_query = Crawl.objects.filter(
            retry_at__lte=timezone.now(),
            status=Crawl.StatusChoices.QUEUED,
        )
        if crawl_id:
            queued_query = queued_query.filter(id=crawl_id)
        due_queued_crawl = queued_query.order_by("retry_at", "created_at").first()
        if due_queued_crawl is not None:
            if due_queued_crawl.claim_processing_lock(lock_seconds=60):
                run_crawl(str(due_queued_crawl.id), process_discovered_snapshots_inline=False)
            continue

        # 4. Individual snapshots that are due.
        if unrestricted:
            due_snapshot = (
                Snapshot.objects.filter(retry_at__lte=timezone.now())
                .exclude(status=Snapshot.StatusChoices.SEALED)
                .select_related("crawl")
                .order_by("retry_at", "created_at")
                .first()
            )
            if due_snapshot is not None:
                if due_snapshot.claim_processing_lock(lock_seconds=60):
                    run_crawl(
                        str(due_snapshot.crawl_id),
                        snapshot_ids=[str(due_snapshot.id)],
                        process_discovered_snapshots_inline=False,
                    )
                continue

        # 5. Crawls that were started earlier and are due for another pass.
        started_query = Crawl.objects.filter(
            retry_at__lte=timezone.now(),
            status=Crawl.StatusChoices.STARTED,
        )
        if crawl_id:
            started_query = started_query.filter(id=crawl_id)
        due_started_crawl = started_query.order_by("retry_at", "created_at").first()

        if due_started_crawl is None:
            if not daemon:
                return 0
            time.sleep(2.0)
            continue

        if due_started_crawl.claim_processing_lock(lock_seconds=60):
            run_crawl(str(due_started_crawl.id), process_discovered_snapshots_inline=False)