This commit is contained in:
Nick Sweeting
2026-03-23 03:58:32 -07:00
parent 268856bcfb
commit b749b26c5d
286 changed files with 21704 additions and 13480 deletions

View File

@@ -2,6 +2,7 @@ from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .machine_service import MachineService
from .process_request_service import ProcessRequestService
from .process_service import ProcessService
from .runner import run_binary, run_crawl, run_install, run_pending_crawls
from .snapshot_service import SnapshotService
@@ -12,6 +13,7 @@ __all__ = [
"BinaryService",
"CrawlService",
"MachineService",
"ProcessRequestService",
"ProcessService",
"SnapshotService",
"TagService",

View File

@@ -1,14 +1,16 @@
from __future__ import annotations
import json
import mimetypes
from collections import defaultdict
from collections.abc import Iterable
from pathlib import Path
from typing import Any
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
from abx_dl.output_files import guess_mimetype
from abx_dl.services.base import BaseService
from .db import run_db_op
@@ -35,27 +37,157 @@ def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, st
stat = file_path.stat()
except OSError:
continue
mime_type, _ = mimetypes.guess_type(str(file_path))
mime_type = mime_type or "application/octet-stream"
mime_type = guess_mimetype(file_path) or "application/octet-stream"
relative_path = str(file_path.relative_to(plugin_dir))
output_files[relative_path] = {}
output_files[relative_path] = {
"extension": file_path.suffix.lower().lstrip("."),
"mimetype": mime_type,
"size": stat.st_size,
}
mime_sizes[mime_type] += stat.st_size
total_size += stat.st_size
output_mimetypes = ",".join(
mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True)
)
output_mimetypes = ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
return output_files, total_size, output_mimetypes
def _coerce_output_file_size(value: Any) -> int:
try:
return max(int(value or 0), 0)
except (TypeError, ValueError):
return 0
def _normalize_output_files(raw_output_files: Any) -> dict[str, dict]:
def _enrich_metadata(path: str, metadata: dict[str, Any]) -> dict[str, Any]:
normalized = dict(metadata)
if "extension" not in normalized:
normalized["extension"] = Path(path).suffix.lower().lstrip(".")
if "mimetype" not in normalized:
guessed = guess_mimetype(path)
if guessed:
normalized["mimetype"] = guessed
return normalized
if raw_output_files is None:
return {}
if isinstance(raw_output_files, str):
try:
raw_output_files = json.loads(raw_output_files)
except json.JSONDecodeError:
return {}
if isinstance(raw_output_files, dict):
normalized: dict[str, dict] = {}
for path, metadata in raw_output_files.items():
if not path:
continue
metadata_dict = dict(metadata) if isinstance(metadata, dict) else {}
metadata_dict.pop("path", None)
normalized[str(path)] = _enrich_metadata(str(path), metadata_dict)
return normalized
if not isinstance(raw_output_files, Iterable):
return {}
normalized: dict[str, dict] = {}
for item in raw_output_files:
if isinstance(item, str):
normalized[item] = _enrich_metadata(item, {})
continue
if hasattr(item, "model_dump"):
item = item.model_dump()
elif hasattr(item, "path"):
item = {
"path": getattr(item, "path", ""),
"extension": getattr(item, "extension", ""),
"mimetype": getattr(item, "mimetype", ""),
"size": getattr(item, "size", 0),
}
if not isinstance(item, dict):
continue
path = str(item.get("path") or "").strip()
if not path:
continue
normalized[path] = _enrich_metadata(path, {key: value for key, value in item.items() if key != "path" and value not in (None, "")})
return normalized
def _has_structured_output_metadata(output_files: dict[str, dict]) -> bool:
return any(any(key in metadata for key in ("extension", "mimetype", "size")) for metadata in output_files.values())
def _summarize_output_files(output_files: dict[str, dict]) -> tuple[int, str]:
mime_sizes: dict[str, int] = defaultdict(int)
total_size = 0
for metadata in output_files.values():
if not isinstance(metadata, dict):
continue
size = _coerce_output_file_size(metadata.get("size"))
mimetype = str(metadata.get("mimetype") or "").strip()
total_size += size
if mimetype and size:
mime_sizes[mimetype] += size
output_mimetypes = ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
return total_size, output_mimetypes
def _resolve_output_metadata(raw_output_files: Any, plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
    """Prefer structured metadata carried on the event; fall back to disk scan.

    When the event payload normalizes to entries that already include
    extension/mimetype/size information, summarize those. Otherwise walk
    plugin_dir and collect metadata from the filesystem.
    """
    normalized = _normalize_output_files(raw_output_files)
    if not normalized or not _has_structured_output_metadata(normalized):
        return _collect_output_metadata(plugin_dir)
    total_size, mimetypes = _summarize_output_files(normalized)
    return normalized, total_size, mimetypes
def _normalize_status(status: str) -> str:
if status == "noresult":
return "noresults"
return status or "failed"
def _has_content_files(output_files: list[str]) -> bool:
return any(Path(path).suffix not in {".log", ".pid", ".sh"} for path in output_files)
def _normalize_snapshot_title(candidate: str, *, snapshot_url: str) -> str:
title = " ".join(line.strip() for line in str(candidate or "").splitlines() if line.strip()).strip()
if not title:
return ""
if title.lower() in {"pending...", "no title found"}:
return ""
if title == snapshot_url:
return ""
if "/" in title and title.lower().endswith(".txt"):
return ""
return title
def _extract_snapshot_title(snapshot_output_dir: str, plugin: str, output_str: str, *, snapshot_url: str) -> str:
if plugin != "title":
return ""
title_file = Path(snapshot_output_dir) / "title" / "title.txt"
if title_file.exists():
try:
file_title = _normalize_snapshot_title(title_file.read_text(encoding="utf-8"), snapshot_url=snapshot_url)
except OSError:
file_title = ""
if file_title:
return file_title
return _normalize_snapshot_title(output_str, snapshot_url=snapshot_url)
def _should_update_snapshot_title(current_title: str, next_title: str, *, snapshot_url: str) -> bool:
current = (current_title or "").strip()
if not current or current.lower() == "pending..." or current == snapshot_url:
return True
return len(next_title) > len(current)
def _has_content_files(output_files: Any) -> bool:
    """Return True when any normalized output path is real content.

    Paths ending in .log/.pid/.sh are bookkeeping, not archive content.
    The input may be any payload accepted by _normalize_output_files.
    """
    bookkeeping_suffixes = {".log", ".pid", ".sh"}
    normalized_paths = _normalize_output_files(output_files)
    return any(Path(path).suffix not in bookkeeping_suffixes for path in normalized_paths)
def _iter_archiveresult_records(stdout: str) -> list[dict]:
@@ -86,7 +218,7 @@ class ArchiveResultService(BaseService):
if snapshot_output_dir is None:
return
plugin_dir = Path(snapshot_output_dir) / event.plugin
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
@@ -94,7 +226,7 @@ class ArchiveResultService(BaseService):
return
plugin_dir = Path(event.output_dir)
output_files, output_size, output_mimetypes = await sync_to_async(_collect_output_metadata)(plugin_dir)
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
records = _iter_archiveresult_records(event.stdout)
if records:
for record in records:
@@ -172,6 +304,11 @@ class ArchiveResultService(BaseService):
result.notes = event.error
result.save()
next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url)
if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
snapshot.title = next_title
snapshot.save(update_fields=["title", "modified_at"])
def _project_from_process_completed(
self,
event: ProcessCompletedEvent,
@@ -188,6 +325,7 @@ class ArchiveResultService(BaseService):
process_id=event.process_id,
output_str=record.get("output_str") or "",
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
output_files=event.output_files,
start_ts=event.start_ts,
end_ts=event.end_ts,
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),

View File

@@ -101,4 +101,6 @@ class BinaryService(BaseService):
binary.overrides = event.overrides
binary.status = Binary.StatusChoices.INSTALLED
binary.retry_at = None
binary.save(update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"])
binary.save(
update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"],
)

View File

@@ -1 +1,51 @@
from abx_dl.cli import LiveBusUI
from __future__ import annotations
from pathlib import Path
from typing import Any
from rich.console import Console
class LiveBusUI:
    """Minimal tty-only runner UI.

    The runner only needs a context manager plus a couple of print helpers,
    so this deliberately stays tiny and avoids depending on a heavier live
    dashboard. All output is suppressed when not attached to an
    interactive tty.
    """

    def __init__(
        self,
        bus: Any,
        *,
        total_hooks: int,
        timeout_seconds: int,
        ui_console: Console,
        interactive_tty: bool,
    ) -> None:
        self.bus = bus
        self.total_hooks = total_hooks
        self.timeout_seconds = timeout_seconds
        self.ui_console = ui_console
        self.interactive_tty = interactive_tty

    def __enter__(self) -> LiveBusUI:
        return self

    def __exit__(self, exc_type, exc, tb) -> bool:
        # Never suppress exceptions raised inside the with-block.
        return False

    def print_intro(self, *, url: str, output_dir: Path, plugins_label: str) -> None:
        """Announce the run; no-op when stdout is not an interactive tty."""
        if not self.interactive_tty:
            return
        intro = (
            f"[bold]ArchiveBox[/bold] {url} -> [dim]{output_dir}[/dim] "
            f"([cyan]{plugins_label}[/cyan], {self.total_hooks} hooks, {self.timeout_seconds}s timeout)"
        )
        self.ui_console.print(intro)

    def print_summary(self, results: list[Any] | tuple[Any, ...] | None, *, output_dir: Path) -> None:
        """Report how many results landed; no-op outside an interactive tty."""
        if not self.interactive_tty:
            return
        count = len(results) if results else 0
        self.ui_console.print(
            f"[green]Completed[/green] {count} result(s) in [dim]{output_dir}[/dim]",
        )

View File

@@ -0,0 +1,179 @@
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
import json
from pathlib import Path
import shlex
import socket
import time
from typing import ClassVar
from abxbus import BaseEvent
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.services.base import BaseService
def _is_port_listening(host: str, port: int) -> bool:
if not host or not port:
return False
try:
with socket.create_connection((host, port), timeout=0.5):
return True
except OSError:
return False
def _supervisor_env(env: dict[str, str]) -> str:
pairs = []
for key, value in env.items():
escaped = value.replace('"', '\\"')
pairs.append(f'{key}="{escaped}"')
return ",".join(pairs)
def _iso_from_epoch(value: object) -> str:
if not isinstance(value, (int, float)) or value <= 0:
return ""
return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Ensure a supervisord-managed daemon worker for this hook is RUNNING.

    Returns the supervisord process-info dict for the worker, or {} when the
    worker could not be confirmed as started. Blocking: polls for up to
    ``daemon_startup_timeout`` seconds (minimum 0.5s), so callers run this
    in a worker thread.
    """
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    output_dir = Path(process_event.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    worker_name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)
    existing = get_worker(supervisor, worker_name)
    # Reuse an already-RUNNING worker, but only if its startup port (when
    # one is configured) is actually accepting connections.
    if (
        isinstance(existing, dict)
        and existing.get("statename") == "RUNNING"
        and (
            not process_event.daemon_startup_host
            or not process_event.daemon_startup_port
            or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
        )
    ):
        return existing
    # supervisord program config; boolean options are passed as the strings
    # "true"/"false" as supervisord expects.
    daemon = {
        "name": worker_name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(output_dir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        daemon["environment"] = _supervisor_env(process_event.env)
    proc = start_worker(supervisor, daemon)
    # Poll until the worker reports RUNNING and (when configured) its
    # startup port is reachable, or the startup deadline passes.
    deadline = time.monotonic() + max(float(process_event.daemon_startup_timeout), 0.5)
    while time.monotonic() < deadline:
        current = get_worker(supervisor, worker_name)
        if isinstance(current, dict) and current.get("statename") == "RUNNING":
            if (
                not process_event.daemon_startup_host
                or not process_event.daemon_startup_port
                or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
            ):
                return current
        time.sleep(0.1)
    # Timed out waiting: fall back to whatever start_worker returned.
    return proc if isinstance(proc, dict) else {}
class ProcessRequestService(BaseService):
    """Bridge hook stdout "ProcessEvent" JSON lines to real process events.

    Hooks request subprocess/daemon execution by printing a JSON object with
    ``type="ProcessEvent"`` on stdout. Non-daemon requests are re-emitted
    directly as ProcessEvent for the normal process pipeline; daemon
    requests are handed to supervisord via ``_ensure_worker`` and reported
    back as ProcessStartedEvent (worker RUNNING) or ProcessCompletedEvent
    followed by a RuntimeError (startup failure).
    """

    LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent]
    EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]

    async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
        """Parse one stdout line; ignore anything that isn't a ProcessEvent request."""
        try:
            record = json.loads(event.line)
        except (json.JSONDecodeError, ValueError):
            # Not JSON: ordinary stdout noise, nothing to do.
            return
        # pop() also strips the discriminator so it never leaks downstream.
        if not isinstance(record, dict) or record.pop("type", "") != "ProcessEvent":
            return
        # Missing fields fall back to values from the carrying stdout event.
        # NOTE(review): record["hook_path"] raises KeyError when absent —
        # confirm emitters always include it, or this handler dies on
        # malformed input.
        process_event = ProcessEvent(
            plugin_name=record.get("plugin_name") or event.plugin_name,
            hook_name=record.get("hook_name") or "process_request",
            hook_path=record["hook_path"],
            hook_args=[str(arg) for arg in record.get("hook_args", [])],
            is_background=bool(record.get("is_background", True)),
            output_dir=record.get("output_dir") or event.output_dir,
            env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
            snapshot_id=record.get("snapshot_id") or event.snapshot_id,
            timeout=int(record.get("timeout") or 60),
            daemon=bool(record.get("daemon", False)),
            daemon_startup_host=str(record.get("daemon_startup_host") or ""),
            daemon_startup_port=int(record.get("daemon_startup_port") or 0),
            daemon_startup_timeout=float(record.get("daemon_startup_timeout") or 0.0),
            process_type=str(record.get("process_type") or ""),
            worker_type=str(record.get("worker_type") or ""),
            event_timeout=float(record.get("event_timeout") or 360.0),
            event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
        )
        if not process_event.daemon:
            # Plain subprocess request: let the normal process pipeline run it.
            await self.bus.emit(process_event)
            return
        # Daemon request: the blocking supervisord work runs in a thread.
        proc = await asyncio.to_thread(_ensure_worker, process_event)
        process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
        start_ts = _iso_from_epoch(proc.get("start"))
        pid = int(proc.get("pid") or 0)
        statename = str(proc.get("statename") or "")
        exitstatus = int(proc.get("exitstatus") or 0)
        process_type = process_event.process_type or "worker"
        worker_type = process_event.worker_type or process_event.plugin_name
        if statename == "RUNNING" and pid:
            # Worker confirmed up: report it as a started background process.
            await self.bus.emit(
                ProcessStartedEvent(
                    plugin_name=process_event.plugin_name,
                    hook_name=process_event.hook_name,
                    hook_path=process_event.hook_path,
                    hook_args=process_event.hook_args,
                    output_dir=process_event.output_dir,
                    env=process_event.env,
                    timeout=process_event.timeout,
                    pid=pid,
                    process_id=process_id,
                    snapshot_id=process_event.snapshot_id,
                    is_background=True,
                    process_type=process_type,
                    worker_type=worker_type,
                    start_ts=start_ts,
                ),
            )
            return
        # Startup failed (or state unknown): emit a failed completion...
        stderr = (
            f"Worker {process_event.hook_name} failed to start"
            if not statename
            else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
        )
        await self.bus.emit(
            ProcessCompletedEvent(
                plugin_name=process_event.plugin_name,
                hook_name=process_event.hook_name,
                hook_path=process_event.hook_path,
                hook_args=process_event.hook_args,
                env=process_event.env,
                stdout="",
                stderr=stderr,
                exit_code=exitstatus or 1,
                output_dir=process_event.output_dir,
                is_background=True,
                process_id=process_id,
                snapshot_id=process_event.snapshot_id,
                pid=pid,
                process_type=process_type,
                worker_type=worker_type,
                start_ts=start_ts,
                end_ts=datetime.now(tz=timezone.utc).isoformat(),
            ),
        )
        # ...and also surface the failure to the bus runner as an exception.
        raise RuntimeError(stderr)

View File

@@ -43,7 +43,7 @@ class ProcessService(BaseService):
def get_db_process_id(self, process_id: str) -> str | None:
return self.process_ids.get(process_id)
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process":
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> Process:
from archivebox.machine.models import NetworkInterface, Process
db_process_id = self.process_ids.get(event.process_id)
@@ -57,11 +57,28 @@ class ProcessService(BaseService):
process.save(update_fields=["iface", "machine", "modified_at"])
return process
process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
process_type = getattr(event, "process_type", "") or (
Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
)
worker_type = getattr(event, "worker_type", "") or ""
if process_type == Process.TypeChoices.WORKER and worker_type:
existing = (
Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
worker_type=worker_type,
pwd=event.output_dir,
)
.order_by("-modified_at")
.first()
)
if existing is not None:
self.process_ids[event.process_id] = str(existing.id)
return existing
process = Process.objects.create(
machine=iface.machine,
iface=iface,
process_type=process_type,
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
env=event.env,
@@ -81,6 +98,8 @@ class ProcessService(BaseService):
process.env = event.env
process.timeout = event.timeout
process.pid = event.pid or None
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
process.status = process.StatusChoices.RUNNING
process.retry_at = None
@@ -94,6 +113,8 @@ class ProcessService(BaseService):
process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.pid = event.pid or process.pid
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
process.stdout = event.stdout

View File

@@ -16,13 +16,21 @@ from django.utils import timezone
from rich.console import Console
from abx_dl.events import BinaryEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.models import INSTALL_URL, Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, prepare_install_plugins, setup_services as setup_abx_services
from abx_dl.orchestrator import (
create_bus,
download,
install_plugins as abx_install_plugins,
prepare_install_plugins,
setup_services as setup_abx_services,
)
from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .machine_service import MachineService
from .process_request_service import ProcessRequestService
from .process_service import ProcessService
from .snapshot_service import SnapshotService
from .tag_service import TagService
@@ -54,6 +62,51 @@ def _runner_debug(message: str) -> None:
print(f"[runner] {message}", file=sys.stderr, flush=True)
def _binary_env_key(name: str) -> str:
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
return f"{normalized}_BINARY"
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str) -> list[str]:
    """List config keys that may point at this binary, most specific first.

    Plugin-declared *_BINARY keys whose default matches the binary name take
    priority over the generic derived key; duplicates are removed while
    preserving order.
    """
    keys = [_binary_env_key(binary_name)]
    # Historical alias: postlight-parser is configured as MERCURY_BINARY.
    if binary_name == "postlight-parser":
        keys.insert(0, "MERCURY_BINARY")
    for plugin in plugins.values():
        for config_key, prop in plugin.config_schema.items():
            if config_key.endswith("_BINARY") and prop.get("default") == binary_name:
                keys.insert(0, config_key)
    # dict.fromkeys dedupes while keeping first-seen order.
    return list(dict.fromkeys(keys))
def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str, str]:
    """Map *_BINARY config keys to abspaths of binaries installed on this machine.

    Only binaries whose recorded abspath still resolves to an executable
    file are included; stale DB records are skipped.
    """
    from archivebox.machine.models import Binary, Machine

    machine = Machine.current()
    installed = (
        Binary.objects.filter(machine=machine, status=Binary.StatusChoices.INSTALLED).exclude(abspath="").exclude(abspath__isnull=True)
    )
    overrides: dict[str, str] = {}
    for binary in installed:
        try:
            candidate = Path(binary.abspath).expanduser()
        except (TypeError, ValueError):
            continue
        # Skip records pointing at removed or non-executable files.
        if not (candidate.is_file() and os.access(candidate, os.X_OK)):
            continue
        for config_key in _binary_config_keys_for_plugins(plugins, binary.name):
            overrides[config_key] = binary.abspath
    return overrides
def _limit_stop_reason(config: dict[str, Any]) -> str:
    """Return the stop reason reported by CrawlLimitState for this config."""
    limit_state = CrawlLimitState.from_config(config)
    return limit_state.get_stop_reason()
def _attach_bus_trace(bus) -> None:
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
if not trace_target:
@@ -105,6 +158,7 @@ def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
from archivebox.machine.models import Machine, Process
Process.cleanup_stale_running()
Process.cleanup_orphaned_workers()
machine = Machine.current()
if Process.objects.filter(
machine=machine,
@@ -149,6 +203,7 @@ class CrawlRunner:
self.machine_service = MachineService(self.bus)
self.binary_service = BinaryService(self.bus)
self.tag_service = TagService(self.bus)
self.process_request_service = ProcessRequestService(self.bus)
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
self.snapshot_service = SnapshotService(
@@ -173,6 +228,7 @@ class CrawlRunner:
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
CrawlService(bus, crawl_id=str(self.crawl.id))
SnapshotService(
bus,
@@ -201,7 +257,10 @@ class CrawlRunner:
self.abx_services = setup_abx_services(
self.bus,
plugins=self.plugins,
config_overrides=self.base_config,
config_overrides={
**self.base_config,
"ABX_RUNTIME": "archivebox",
},
auto_install=True,
emit_jsonl=False,
)
@@ -293,6 +352,8 @@ class CrawlRunner:
current_process.save(update_fields=["iface", "machine", "modified_at"])
self.persona = self.crawl.resolve_persona()
self.base_config = get_config(crawl=self.crawl)
self.base_config.update(_installed_binary_config_overrides(self.plugins))
self.base_config["ABX_RUNTIME"] = "archivebox"
if self.selected_plugins is None:
self.selected_plugins = _selected_plugins_from_config(self.base_config)
if self.persona:
@@ -459,6 +520,11 @@ class CrawlRunner:
async with self.snapshot_semaphore:
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
if snapshot["status"] == "sealed":
return
if snapshot["depth"] > 0 and _limit_stop_reason(snapshot["config"]) == "max_size":
await sync_to_async(self._cancel_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
return
abx_snapshot = AbxSnapshot(
url=snapshot["url"],
id=snapshot["id"],
@@ -513,11 +579,22 @@ class CrawlRunner:
"created_at": snapshot.created_at.isoformat() if snapshot.created_at else "",
"tags": snapshot.tags_str(),
"depth": snapshot.depth,
"status": snapshot.status,
"parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
"output_dir": str(snapshot.output_dir),
"config": self._snapshot_config(snapshot),
}
def _cancel_snapshot_due_to_limit(self, snapshot_id: str) -> None:
    """Seal a snapshot that must not run because a crawl limit was hit.

    No-op when the snapshot is missing or already sealed.
    """
    from archivebox.core.models import Snapshot

    snapshot = Snapshot.objects.filter(id=snapshot_id).first()
    if snapshot is not None and snapshot.status != Snapshot.StatusChoices.SEALED:
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
def run_crawl(
crawl_id: str,
@@ -535,7 +612,7 @@ def run_crawl(
snapshot_ids=snapshot_ids,
selected_plugins=selected_plugins,
process_discovered_snapshots_inline=process_discovered_snapshots_inline,
).run()
).run(),
)
@@ -546,9 +623,17 @@ async def _run_binary(binary_id: str) -> None:
from archivebox.machine.models import Binary
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
config = get_config()
plugins = discover_plugins()
config = get_config()
config.update(_installed_binary_config_overrides(plugins))
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
ArchiveResultService(bus, process_service=process_service)
setup_abx_services(
bus,
plugins=plugins,
@@ -556,11 +641,6 @@ async def _run_binary(binary_id: str) -> None:
auto_install=True,
emit_jsonl=False,
)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ArchiveResultService(bus, process_service=process_service)
try:
_attach_bus_trace(bus)
@@ -592,9 +672,17 @@ def run_binary(binary_id: str) -> None:
async def _run_install(plugin_names: list[str] | None = None) -> None:
from archivebox.config.configset import get_config
config = get_config()
plugins = discover_plugins()
config = get_config()
config.update(_installed_binary_config_overrides(plugins))
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
ArchiveResultService(bus, process_service=process_service)
abx_services = setup_abx_services(
bus,
plugins=plugins,
@@ -602,11 +690,6 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
auto_install=True,
emit_jsonl=False,
)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ArchiveResultService(bus, process_service=process_service)
live_stream = None
try:
@@ -763,8 +846,7 @@ def recover_orphaned_snapshots() -> int:
recovered = 0
now = timezone.now()
orphaned_snapshots = (
Snapshot.objects
.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
.select_related("crawl")
.prefetch_related("archiveresult_set")
)

View File

@@ -4,6 +4,7 @@ from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.services.base import BaseService
from .db import run_db_op
@@ -47,6 +48,8 @@ class SnapshotService(BaseService):
if event.depth > crawl.max_depth:
return None
if self._crawl_limit_stop_reason(crawl) == "max_size":
return None
parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
if parent_snapshot is None:
@@ -84,15 +87,38 @@ class SnapshotService(BaseService):
def _seal_snapshot(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
if snapshot is None:
return None
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
return str(snapshot.id)
def _crawl_limit_stop_reason(self, crawl) -> str:
    """Evaluate the crawl's limits with CRAWL_DIR pointed at its output dir."""
    limit_config = {**(crawl.config or {}), "CRAWL_DIR": str(crawl.output_dir)}
    return CrawlLimitState.from_config(limit_config).get_stop_reason()
def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
    """Seal every still-queued snapshot in a crawl; returns rows updated.

    The snapshot identified by exclude_snapshot_id is left untouched.
    """
    from archivebox.core.models import Snapshot

    pending = Snapshot.objects.filter(
        crawl_id=crawl_id,
        status=Snapshot.StatusChoices.QUEUED,
    ).exclude(id=exclude_snapshot_id)
    return pending.update(
        status=Snapshot.StatusChoices.SEALED,
        retry_at=None,
        modified_at=timezone.now(),
    )
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot