update working changes

This commit is contained in:
Nick Sweeting
2026-03-25 05:36:07 -07:00
parent 80243accfd
commit f3622d8cd3
29 changed files with 985 additions and 1666 deletions

View File

@@ -26,7 +26,7 @@ EVENT_FLOW_DIAGRAM = """
│ CrawlStartEvent │
│ └─ SnapshotEvent │
│ └─ on_Snapshot__* │
│ └─ Snapshot / ArchiveResult / Tag / Machine / BinaryRequest
│ └─ ArchiveResult / Snapshot / Tag
│ │
│ SnapshotCleanupEvent -> internal cleanup, no direct hook family │
│ CrawlCleanupEvent -> internal cleanup, no direct hook family │
@@ -89,8 +89,8 @@ def pluginmap(
"emits": ["ProcessEvent"],
},
"SnapshotEvent": {
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, Tag, and BinaryRequest records.",
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "BinaryRequestEvent", "ProcessEvent"],
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, and Tag records.",
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "ProcessEvent"],
},
"SnapshotCleanupEvent": {
"description": "Internal snapshot cleanup phase.",

View File

@@ -267,19 +267,13 @@ def get_config(
if crawl and hasattr(crawl, "output_dir"):
config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir)
config["CRAWL_DIR"] = str(crawl.output_dir)
config["CRAWL_ID"] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get("CRAWL_ID")
# Apply snapshot config overrides (highest priority)
if snapshot and hasattr(snapshot, "config") and snapshot.config:
config.update(snapshot.config)
if snapshot:
config["SNAPSHOT_ID"] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get("SNAPSHOT_ID")
config["SNAPSHOT_DEPTH"] = int(getattr(snapshot, "depth", 0) or 0)
if hasattr(snapshot, "output_dir"):
config["SNAP_DIR"] = str(snapshot.output_dir)
if getattr(snapshot, "crawl_id", None):
config["CRAWL_ID"] = str(snapshot.crawl_id)
if snapshot and hasattr(snapshot, "output_dir"):
config["SNAP_DIR"] = str(snapshot.output_dir)
# Normalize all aliases to canonical names (after all sources merged)
# This handles aliases that came from user/crawl/snapshot configs, not just env

View File

@@ -38,8 +38,8 @@ def _quote_shell_string(value: str) -> str:
def _get_replay_source_url(result: ArchiveResult) -> str:
process_env = getattr(getattr(result, "process", None), "env", None) or {}
return str(process_env.get("SOURCE_URL") or result.snapshot.url or "")
process = getattr(result, "process", None)
return str(getattr(process, "url", None) or result.snapshot.url or "")
def build_abx_dl_display_command(result: ArchiveResult) -> str:

View File

@@ -1322,6 +1322,17 @@ def live_progress_view(request):
# Build hierarchical active crawls with nested snapshots and archive results
active_crawls_qs = (
Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
.prefetch_related(
"snapshot_set",
"snapshot_set__archiveresult_set",
"snapshot_set__archiveresult_set__process",
)
.distinct()
.order_by("-modified_at")[:10]
)
running_processes = Process.objects.filter(
machine=machine,
status=Process.StatusChoices.RUNNING,
@@ -1343,28 +1354,45 @@ def live_progress_view(request):
process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {}
process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {}
seen_process_records: set[str] = set()
snapshots = [snapshot for crawl in active_crawls_qs for snapshot in crawl.snapshot_set.all()]
for proc in running_processes:
env = proc.env or {}
if not isinstance(env, dict):
env = {}
crawl_id = env.get("CRAWL_ID")
snapshot_id = env.get("SNAPSHOT_ID")
if not proc.pwd:
continue
proc_pwd = Path(proc.pwd)
matched_snapshot = None
for snapshot in snapshots:
try:
proc_pwd.relative_to(snapshot.output_dir)
matched_snapshot = snapshot
break
except ValueError:
continue
if matched_snapshot is None:
continue
crawl_id = str(matched_snapshot.crawl_id)
snapshot_id = str(matched_snapshot.id)
_plugin, _label, phase, _hook_name = process_label(proc.cmd)
if crawl_id and proc.pid:
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
crawl_process_pids.setdefault(crawl_id, proc.pid)
if phase == "snapshot" and snapshot_id and proc.pid:
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
snapshot_process_pids.setdefault(snapshot_id, proc.pid)
for proc in recent_processes:
env = proc.env or {}
if not isinstance(env, dict):
env = {}
crawl_id = env.get("CRAWL_ID")
snapshot_id = env.get("SNAPSHOT_ID")
if not crawl_id and not snapshot_id:
if not proc.pwd:
continue
proc_pwd = Path(proc.pwd)
matched_snapshot = None
for snapshot in snapshots:
try:
proc_pwd.relative_to(snapshot.output_dir)
matched_snapshot = snapshot
break
except ValueError:
continue
if matched_snapshot is None:
continue
crawl_id = str(matched_snapshot.crawl_id)
snapshot_id = str(matched_snapshot.id)
plugin, label, phase, hook_name = process_label(proc.cmd)
@@ -1393,20 +1421,9 @@ def live_progress_view(request):
payload["pid"] = proc.pid
proc_started_at = proc.started_at or proc.modified_at
if phase == "snapshot" and snapshot_id:
process_records_by_snapshot.setdefault(str(snapshot_id), []).append((payload, proc_started_at))
process_records_by_snapshot.setdefault(snapshot_id, []).append((payload, proc_started_at))
elif crawl_id:
process_records_by_crawl.setdefault(str(crawl_id), []).append((payload, proc_started_at))
active_crawls_qs = (
Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
.prefetch_related(
"snapshot_set",
"snapshot_set__archiveresult_set",
"snapshot_set__archiveresult_set__process",
)
.distinct()
.order_by("-modified_at")[:10]
)
process_records_by_crawl.setdefault(crawl_id, []).append((payload, proc_started_at))
active_crawls = []
total_workers = 0

View File

@@ -827,7 +827,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
for record in records[:3]:
print(f" Record: type={record.get('type')}, keys={list(record.keys())[:5]}")
if system_task:
records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary", "Machine")]
records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary")]
overrides = {"crawl": self}
stats = process_hook_records(records, overrides=overrides)
if stats:

View File

@@ -13,13 +13,9 @@ Hook-backed event families are discovered from filenames like:
on_CrawlSetup__*
on_Snapshot__*
InstallEvent itself is still part of the runtime lifecycle, but it has no
corresponding hook family. Its dependency declarations come directly from each
plugin's `config.json > required_binaries`.
Lifecycle event names like `InstallEvent` or `SnapshotCleanupEvent` are
normalized to the corresponding `on_{EventFamily}__*` prefix by a simple
string transform. If no scripts exist for that prefix, discovery returns `[]`.
Internal bus event names are normalized to the corresponding
`on_{EventFamily}__*` prefix by a simple string transform. If no scripts exist
for that prefix, discovery returns `[]`.
Directory structure:
abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package)
@@ -120,7 +116,6 @@ def normalize_hook_event_name(event_name: str) -> str | None:
Normalize a hook event family or event class name to its on_* prefix.
Examples:
InstallEvent -> Install
BinaryRequestEvent -> BinaryRequest
CrawlSetupEvent -> CrawlSetup
SnapshotEvent -> Snapshot
@@ -171,7 +166,7 @@ def discover_hooks(
Args:
event_name: Hook event family or event class name.
Examples: 'Install', 'InstallEvent', 'BinaryRequestEvent', 'Snapshot'.
Examples: 'BinaryRequestEvent', 'Snapshot'.
Event names are normalized by stripping a trailing `Event`.
If no matching `on_{EventFamily}__*` scripts exist, returns [].
filter_disabled: If True, skip hooks from disabled plugins (default: True)
@@ -1070,9 +1065,8 @@ def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any
Process JSONL records emitted by hook stdout.
This handles hook-emitted record types such as Snapshot, Tag, BinaryRequest,
Binary, and Machine. It does not process bus lifecycle events like
InstallEvent, CrawlEvent, CrawlCleanupEvent, or SnapshotCleanupEvent, since
those are not emitted as JSONL records by hook subprocesses.
and Binary. It does not process internal bus lifecycle events, since those
are not emitted as JSONL records by hook subprocesses.
Args:
records: List of JSONL record dicts from result['records']
@@ -1131,13 +1125,6 @@ def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any
if obj:
stats[record_type] = stats.get(record_type, 0) + 1
elif record_type == "Machine":
from archivebox.machine.models import Machine
obj = Machine.from_json(record.copy(), overrides)
if obj:
stats["Machine"] = stats.get("Machine", 0) + 1
else:
import sys

View File

@@ -566,33 +566,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
return None
return {provider.strip() for provider in providers.split(",") if provider.strip()}
def _get_custom_install_command(self) -> str | None:
"""Extract a custom install command from overrides when the custom provider is used."""
import shlex
if not isinstance(self.overrides, dict):
return None
for key in ("custom_cmd", "cmd", "command"):
value = self.overrides.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
custom_overrides = self.overrides.get("custom")
if isinstance(custom_overrides, dict):
for key in ("custom_cmd", "cmd", "command"):
value = custom_overrides.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
install_args = custom_overrides.get("install_args")
if isinstance(install_args, str) and install_args.strip():
return install_args.strip()
if isinstance(install_args, list) and install_args:
return " ".join(shlex.quote(str(arg)) for arg in install_args if str(arg).strip())
return None
def run(self):
"""
Execute binary installation by running on_BinaryRequest__* hooks.
@@ -637,13 +610,8 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
plugin_output_dir = output_dir / plugin_name
plugin_output_dir.mkdir(parents=True, exist_ok=True)
custom_cmd = None
overrides_json = None
if plugin_name == "custom":
custom_cmd = self._get_custom_install_command()
if not custom_cmd:
continue
elif self.overrides:
if self.overrides:
overrides_json = json.dumps(self.overrides)
# Run the hook
@@ -656,7 +624,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
machine_id=str(self.machine_id),
name=self.name,
binproviders=self.binproviders,
custom_cmd=custom_cmd,
overrides=overrides_json,
)

View File

@@ -9,12 +9,11 @@ from typing import Any
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent, ProcessStartedEvent, SnapshotEvent
from abx_dl.output_files import guess_mimetype
from abx_dl.services.base import BaseService
from .db import run_db_op
from .process_service import ProcessService, parse_event_datetime
from .process_service import parse_event_datetime
def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
@@ -209,79 +208,41 @@ class ArchiveResultService(BaseService):
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
EMITS = []
def __init__(self, bus, *, process_service: ProcessService):
self.process_service = process_service
def __init__(self, bus):
super().__init__(bus)
self.bus.on(ArchiveResultEvent, self.on_ArchiveResultEvent__save_to_db)
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None:
snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
if snapshot_output_dir is None:
return
plugin_dir = Path(snapshot_output_dir) / event.plugin
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
return
plugin_dir = Path(event.output_dir)
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
records = _iter_archiveresult_records(event.stdout)
if records:
for record in records:
await run_db_op(
self._project_from_process_completed,
event,
record,
output_files,
output_size,
output_mimetypes,
)
return
synthetic_record = {
"plugin": event.plugin_name,
"hook_name": event.hook_name,
"status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
"output_str": event.stderr if event.exit_code != 0 else "",
"error": event.stderr if event.exit_code != 0 else "",
}
await run_db_op(
self._project_from_process_completed,
event,
synthetic_record,
output_files,
output_size,
output_mimetypes,
)
def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first()
return str(snapshot.output_dir) if snapshot is not None else None
def _project(
self,
event: ArchiveResultEvent,
output_files: dict[str, dict],
output_size: int,
output_mimetypes: str,
) -> None:
async def on_ArchiveResultEvent__save_to_db(self, event: ArchiveResultEvent) -> None:
from archivebox.core.models import ArchiveResult, Snapshot
from archivebox.machine.models import Process
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first()
snapshot = await Snapshot.objects.filter(id=event.snapshot_id).select_related("crawl", "crawl__created_by").afirst()
if snapshot is None:
return
plugin_dir = Path(snapshot.output_dir) / event.plugin
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
process_started = await self.bus.find(
ProcessStartedEvent,
past=True,
future=False,
where=lambda candidate: self.bus.event_is_child_of(event, candidate),
)
process = None
db_process_id = self.process_service.get_db_process_id(event.process_id)
if db_process_id:
process = Process.objects.filter(id=db_process_id).first()
if process_started is not None:
started_at = parse_event_datetime(process_started.start_ts)
if started_at is None:
raise ValueError("ProcessStartedEvent.start_ts is required")
process_query = Process.objects.filter(
pwd=process_started.output_dir,
cmd=[process_started.hook_path, *process_started.hook_args],
started_at=started_at,
)
if process_started.pid:
process_query = process_query.filter(pid=process_started.pid)
process = await process_query.order_by("-modified_at").afirst()
result, _created = ArchiveResult.objects.get_or_create(
result, _created = await ArchiveResult.objects.aget_or_create(
snapshot=snapshot,
plugin=event.plugin,
hook_name=event.hook_name,
@@ -302,32 +263,54 @@ class ArchiveResultService(BaseService):
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
if event.error:
result.notes = event.error
result.save()
await result.asave()
next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url)
if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
snapshot.title = next_title
snapshot.save(update_fields=["title", "modified_at"])
await snapshot.asave(update_fields=["title", "modified_at"])
def _project_from_process_completed(
self,
event: ProcessCompletedEvent,
record: dict,
output_files: dict[str, dict],
output_size: int,
output_mimetypes: str,
) -> None:
archive_result_event = ArchiveResultEvent(
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
plugin=record.get("plugin") or event.plugin_name,
hook_name=record.get("hook_name") or event.hook_name,
status=record.get("status") or "",
process_id=event.process_id,
output_str=record.get("output_str") or "",
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
output_files=event.output_files,
start_ts=event.start_ts,
end_ts=event.end_ts,
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
if not event.hook_name.startswith("on_Snapshot"):
return
snapshot_event = await self.bus.find(
SnapshotEvent,
past=True,
future=False,
where=lambda candidate: self.bus.event_is_child_of(event, candidate),
)
if snapshot_event is None:
return
records = _iter_archiveresult_records(event.stdout)
if records:
for record in records:
await self.bus.emit(
ArchiveResultEvent(
snapshot_id=record.get("snapshot_id") or snapshot_event.snapshot_id,
plugin=record.get("plugin") or event.plugin_name,
hook_name=record.get("hook_name") or event.hook_name,
status=record.get("status") or "",
output_str=record.get("output_str") or "",
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
output_files=event.output_files,
start_ts=event.start_ts,
end_ts=event.end_ts,
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
),
)
return
await self.bus.emit(
ArchiveResultEvent(
snapshot_id=snapshot_event.snapshot_id,
plugin=event.plugin_name,
hook_name=event.hook_name,
status="failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
output_str=event.stderr if event.exit_code != 0 else "",
output_files=event.output_files,
start_ts=event.start_ts,
end_ts=event.end_ts,
error=event.stderr if event.exit_code != 0 else "",
),
)
self._project(archive_result_event, output_files, output_size, output_mimetypes)

View File

@@ -1,20 +1,62 @@
from __future__ import annotations
import asyncio
from asgiref.sync import sync_to_async
from abx_dl.events import BinaryRequestEvent, BinaryEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class BinaryService(BaseService):
LISTENS_TO = [BinaryRequestEvent, BinaryEvent]
EMITS = []
async def on_BinaryRequestEvent__Outer(self, event: BinaryRequestEvent) -> None:
await run_db_op(self._project_binary, event)
cached = await run_db_op(self._load_cached_binary, event)
def __init__(self, bus):
super().__init__(bus)
self.bus.on(BinaryRequestEvent, self.on_BinaryRequestEvent)
self.bus.on(BinaryEvent, self.on_BinaryEvent)
async def on_BinaryRequestEvent(self, event: BinaryRequestEvent) -> None:
from archivebox.machine.models import Binary, Machine
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
existing = await Binary.objects.filter(machine=machine, name=event.name).afirst()
if existing and existing.status == Binary.StatusChoices.INSTALLED:
changed = False
if event.binproviders and existing.binproviders != event.binproviders:
existing.binproviders = event.binproviders
changed = True
if event.overrides and existing.overrides != event.overrides:
existing.overrides = event.overrides
changed = True
if changed:
await existing.asave(update_fields=["binproviders", "overrides", "modified_at"])
elif existing is None:
await Binary.objects.acreate(
machine=machine,
name=event.name,
binproviders=event.binproviders,
overrides=event.overrides or {},
status=Binary.StatusChoices.QUEUED,
)
installed = (
await Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
.exclude(abspath="")
.exclude(abspath__isnull=True)
.order_by("-modified_at")
.afirst()
)
cached = None
if installed is not None:
cached = {
"abspath": installed.abspath,
"version": installed.version or "",
"sha256": installed.sha256 or "",
"binproviders": installed.binproviders or "",
"binprovider": installed.binprovider or "",
"machine_id": str(installed.machine_id),
"overrides": installed.overrides or {},
}
if cached is not None:
await self.bus.emit(
BinaryEvent(
@@ -28,126 +70,34 @@ class BinaryService(BaseService):
binprovider=cached["binprovider"],
overrides=event.overrides or cached["overrides"],
binary_id=event.binary_id,
machine_id=event.machine_id or cached["machine_id"],
machine_id=cached["machine_id"],
),
)
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
await run_db_op(self._project_installed_binary, event, resolved)
def _project_binary(self, event: BinaryRequestEvent) -> None:
async def on_BinaryEvent(self, event: BinaryEvent) -> None:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
existing = Binary.objects.filter(machine=machine, name=event.name).first()
if existing and existing.status == Binary.StatusChoices.INSTALLED:
changed = False
if event.binproviders and existing.binproviders != event.binproviders:
existing.binproviders = event.binproviders
changed = True
if event.overrides and existing.overrides != event.overrides:
existing.overrides = event.overrides
changed = True
if changed:
existing.save(update_fields=["binproviders", "overrides", "modified_at"])
return
Binary.from_json(
{
"name": event.name,
"binproviders": event.binproviders,
"overrides": event.overrides or {},
},
)
def _load_cached_binary(self, event: BinaryRequestEvent) -> dict[str, str] | None:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
installed = (
Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
.exclude(abspath="")
.exclude(abspath__isnull=True)
.order_by("-modified_at")
.first()
)
if installed is None:
return None
return {
"abspath": installed.abspath,
"version": installed.version or "",
"sha256": installed.sha256 or "",
"binproviders": installed.binproviders or "",
"binprovider": installed.binprovider or "",
"machine_id": str(installed.machine_id),
"overrides": installed.overrides or {},
}
def _resolve_installed_binary_metadata(self, event: BinaryEvent) -> dict[str, str]:
resolved = {
"abspath": event.abspath or "",
"version": event.version or "",
"sha256": event.sha256 or "",
"binproviders": event.binproviders or "",
"binprovider": event.binprovider or "",
}
if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
return resolved
if resolved["abspath"] and not resolved["version"]:
try:
from abx_pkg.semver import bin_version
detected_version = bin_version(resolved["abspath"])
except Exception:
detected_version = None
if detected_version:
resolved["version"] = str(detected_version)
if resolved["version"] and resolved["binprovider"]:
return resolved
try:
from abx_dl.dependencies import load_binary
allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
spec = {
"name": event.name,
"binproviders": allowed_providers,
"overrides": event.overrides or {},
}
binary = load_binary(spec)
resolved["abspath"] = str(binary.abspath or resolved["abspath"] or "")
resolved["version"] = str(binary.version or resolved["version"] or "")
resolved["sha256"] = str(binary.sha256 or resolved["sha256"] or "")
if binary.loaded_binprovider is not None and binary.loaded_binprovider.name:
resolved["binprovider"] = str(binary.loaded_binprovider.name)
except Exception:
pass
return resolved
def _project_installed_binary(self, event: BinaryEvent, resolved: dict[str, str]) -> None:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
binary, _ = Binary.objects.get_or_create(
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
binary, _ = await Binary.objects.aget_or_create(
machine=machine,
name=event.name,
defaults={
"status": Binary.StatusChoices.QUEUED,
},
)
binary.abspath = resolved["abspath"] or binary.abspath
binary.version = resolved["version"] or binary.version
binary.sha256 = resolved["sha256"] or binary.sha256
if resolved["binproviders"]:
binary.binproviders = resolved["binproviders"]
binary.binprovider = resolved["binprovider"] or binary.binprovider
binary.abspath = event.abspath
if event.version:
binary.version = event.version
if event.sha256:
binary.sha256 = event.sha256
if event.binproviders:
binary.binproviders = event.binproviders
if event.binprovider:
binary.binprovider = event.binprovider
if event.overrides and binary.overrides != event.overrides:
binary.overrides = event.overrides
binary.status = Binary.StatusChoices.INSTALLED
binary.retry_at = None
binary.save(
await binary.asave(
update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"],
)

View File

@@ -3,8 +3,6 @@ from __future__ import annotations
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class CrawlService(BaseService):
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
@@ -13,32 +11,42 @@ class CrawlService(BaseService):
def __init__(self, bus, *, crawl_id: str):
self.crawl_id = crawl_id
super().__init__(bus)
self.bus.on(CrawlSetupEvent, self.on_CrawlSetupEvent__save_to_db)
self.bus.on(CrawlStartEvent, self.on_CrawlStartEvent__save_to_db)
self.bus.on(CrawlCleanupEvent, self.on_CrawlCleanupEvent__save_to_db)
self.bus.on(CrawlCompletedEvent, self.on_CrawlCompletedEvent__save_to_db)
async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
await run_db_op(self._mark_completed)
def _mark_started(self) -> None:
async def on_CrawlSetupEvent__save_to_db(self, event: CrawlSetupEvent) -> None:
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id=self.crawl_id)
crawl = await Crawl.objects.aget(id=self.crawl_id)
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = None
crawl.save(update_fields=["status", "retry_at", "modified_at"])
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
def _mark_completed(self) -> None:
async def on_CrawlStartEvent__save_to_db(self, event: CrawlStartEvent) -> None:
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id=self.crawl_id)
crawl = await Crawl.objects.aget(id=self.crawl_id)
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = None
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
async def on_CrawlCleanupEvent__save_to_db(self, event: CrawlCleanupEvent) -> None:
from archivebox.crawls.models import Crawl
crawl = await Crawl.objects.aget(id=self.crawl_id)
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = None
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
async def on_CrawlCompletedEvent__save_to_db(self, event: CrawlCompletedEvent) -> None:
from archivebox.crawls.models import Crawl
crawl = await Crawl.objects.aget(id=self.crawl_id)
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
crawl.save(update_fields=["status", "retry_at", "modified_at"])
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])

View File

@@ -1,16 +0,0 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from django.db import close_old_connections
def _run_db_op(func, *args, **kwargs):
close_old_connections()
try:
return func(*args, **kwargs)
finally:
close_old_connections()
async def run_db_op(func, *args, **kwargs):
return await sync_to_async(_run_db_op, thread_sensitive=True)(func, *args, **kwargs)

View File

@@ -1,22 +1,23 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from abx_dl.events import MachineEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class MachineService(BaseService):
LISTENS_TO = [MachineEvent]
EMITS = []
async def on_MachineEvent__Outer(self, event: MachineEvent) -> None:
await run_db_op(self._project, event)
def __init__(self, bus):
super().__init__(bus)
self.bus.on(MachineEvent, self.on_MachineEvent__save_to_db)
def _project(self, event: MachineEvent) -> None:
async def on_MachineEvent__save_to_db(self, event: MachineEvent) -> None:
from archivebox.machine.models import Machine, _sanitize_machine_config
machine = Machine.current()
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
config = dict(machine.config or {})
if event.config is not None:
@@ -29,4 +30,4 @@ class MachineService(BaseService):
return
machine.config = _sanitize_machine_config(config)
machine.save(update_fields=["config", "modified_at"])
await machine.asave(update_fields=["config", "modified_at"])

View File

@@ -1,29 +1,15 @@
from __future__ import annotations
import asyncio
from datetime import datetime, timezone as datetime_timezone
import json
from pathlib import Path
import shlex
import socket
import time
from typing import TYPE_CHECKING, Any, ClassVar
from urllib.parse import urlparse
from datetime import datetime
from typing import ClassVar
from asgiref.sync import sync_to_async
from django.utils import timezone
from abxbus import BaseEvent
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
if TYPE_CHECKING:
from archivebox.machine.models import Process
WORKER_READY_TIMEOUT = 10.0
def parse_event_datetime(value: str | None):
if not value:
@@ -37,308 +23,133 @@ def parse_event_datetime(value: str | None):
return dt
def _is_port_listening(host: str, port: int) -> bool:
if not host or not port:
return False
try:
with socket.create_connection((host, port), timeout=0.5):
return True
except OSError:
return False
def _worker_socket_from_url(url: str) -> tuple[str, int] | None:
if not url:
return None
parsed = urlparse(url)
if parsed.scheme != "tcp" or not parsed.hostname or not parsed.port:
return None
return parsed.hostname, parsed.port
def _supervisor_env(env: dict[str, str]) -> str:
pairs = []
for key, value in env.items():
escaped = value.replace('"', '\\"')
pairs.append(f'{key}="{escaped}"')
return ",".join(pairs)
def _iso_from_epoch(value: object) -> str:
if not isinstance(value, (int, float)) or value <= 0:
return ""
return datetime.fromtimestamp(value, tz=datetime_timezone.utc).isoformat()
def _int_from_object(value: object) -> int:
if isinstance(value, bool):
return int(value)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
if isinstance(value, str):
try:
return int(value)
except ValueError:
return 0
return 0
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker
output_dir = Path(process_event.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
worker_name = process_event.hook_name
supervisor = get_or_create_supervisord_process(daemonize=True)
worker_socket = _worker_socket_from_url(getattr(process_event, "url", ""))
existing = get_worker(supervisor, worker_name)
if (
isinstance(existing, dict)
and existing.get("statename") == "RUNNING"
and (worker_socket is None or _is_port_listening(*worker_socket))
):
return existing
daemon = {
"name": worker_name,
"command": shlex.join([process_event.hook_path, *process_event.hook_args]),
"directory": str(output_dir),
"autostart": "false",
"autorestart": "true",
"stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
"redirect_stderr": "true",
}
if process_event.env:
daemon["environment"] = _supervisor_env(process_event.env)
proc = start_worker(supervisor, daemon)
deadline = time.monotonic() + WORKER_READY_TIMEOUT
while time.monotonic() < deadline:
current = get_worker(supervisor, worker_name)
if isinstance(current, dict) and current.get("statename") == "RUNNING":
if worker_socket is None or _is_port_listening(*worker_socket):
return current
time.sleep(0.1)
return proc if isinstance(proc, dict) else {}
class ProcessService(BaseService):
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent, ProcessStartedEvent, ProcessCompletedEvent]
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStartedEvent, ProcessCompletedEvent]
EMITS: ClassVar[list[type[BaseEvent]]] = []
def __init__(self, bus):
self.process_ids: dict[str, str] = {}
super().__init__(bus)
self.bus.on(ProcessStartedEvent, self.on_ProcessStartedEvent__save_to_db)
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
try:
record = json.loads(event.line)
except (json.JSONDecodeError, ValueError):
return
if not isinstance(record, dict) or record.get("type") != "ProcessEvent":
return
passthrough_fields: dict[str, Any] = {
key: value
for key, value in record.items()
if key
not in {
"type",
"plugin_name",
"hook_name",
"hook_path",
"hook_args",
"is_background",
"output_dir",
"env",
"snapshot_id",
"process_id",
"url",
"timeout",
"daemon",
"process_type",
"worker_type",
"event_timeout",
"event_handler_timeout",
}
}
process_event = ProcessEvent(
plugin_name=record.get("plugin_name") or event.plugin_name,
hook_name=record.get("hook_name") or "process",
hook_path=record["hook_path"],
hook_args=[str(arg) for arg in record.get("hook_args", [])],
is_background=bool(record.get("is_background", True)),
output_dir=record.get("output_dir") or event.output_dir,
env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
timeout=int(record.get("timeout") or 60),
daemon=bool(record.get("daemon", False)),
url=str(record.get("url") or ""),
process_type=str(record.get("process_type") or ""),
worker_type=str(record.get("worker_type") or ""),
event_timeout=float(record.get("event_timeout") or 360.0),
event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
**passthrough_fields,
)
if not process_event.daemon:
await self.bus.emit(process_event)
return
proc = await asyncio.to_thread(_ensure_worker, process_event)
process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
start_ts = _iso_from_epoch(proc.get("start"))
pid = _int_from_object(proc.get("pid"))
statename = str(proc.get("statename") or "")
exitstatus = _int_from_object(proc.get("exitstatus"))
process_type = process_event.process_type or "worker"
worker_type = process_event.worker_type or process_event.plugin_name
if statename == "RUNNING" and pid:
await self.bus.emit(
ProcessStartedEvent(
plugin_name=process_event.plugin_name,
hook_name=process_event.hook_name,
hook_path=process_event.hook_path,
hook_args=process_event.hook_args,
output_dir=process_event.output_dir,
env=process_event.env,
timeout=process_event.timeout,
pid=pid,
process_id=process_id,
snapshot_id=process_event.snapshot_id,
is_background=True,
url=process_event.url,
process_type=process_type,
worker_type=worker_type,
start_ts=start_ts,
**passthrough_fields,
),
)
return
stderr = (
f"Worker {process_event.hook_name} failed to start"
if not statename
else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
)
await self.bus.emit(
ProcessCompletedEvent(
plugin_name=process_event.plugin_name,
hook_name=process_event.hook_name,
hook_path=process_event.hook_path,
hook_args=process_event.hook_args,
env=process_event.env,
stdout="",
stderr=stderr,
exit_code=exitstatus or 1,
output_dir=process_event.output_dir,
is_background=True,
process_id=process_id,
snapshot_id=process_event.snapshot_id,
pid=pid,
url=process_event.url,
process_type=process_type,
worker_type=worker_type,
start_ts=start_ts,
end_ts=datetime.now(tz=datetime_timezone.utc).isoformat(),
**passthrough_fields,
),
)
raise RuntimeError(stderr)
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
await run_db_op(self._project_started, event)
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
await run_db_op(self._project_completed, event)
def get_db_process_id(self, process_id: str) -> str | None:
return self.process_ids.get(process_id)
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> Process:
async def on_ProcessStartedEvent__save_to_db(self, event: ProcessStartedEvent) -> None:
from archivebox.machine.models import NetworkInterface, Process
db_process_id = self.process_ids.get(event.process_id)
iface = NetworkInterface.current(refresh=True)
if db_process_id:
process = Process.objects.filter(id=db_process_id).first()
if process is not None:
if getattr(process, "iface_id", None) != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
process.save(update_fields=["iface", "machine", "modified_at"])
return process
process_type = getattr(event, "process_type", "") or (
iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
process_type = event.process_type or (
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
)
worker_type = getattr(event, "worker_type", "") or ""
if process_type == Process.TypeChoices.WORKER and worker_type:
existing = (
Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
worker_type=worker_type,
pwd=event.output_dir,
)
.order_by("-modified_at")
.first()
)
if existing is not None:
self.process_ids[event.process_id] = str(existing.id)
return existing
process = Process.objects.create(
machine=iface.machine,
iface=iface,
worker_type = event.worker_type or ""
started_at = parse_event_datetime(event.start_ts)
if started_at is None:
raise ValueError("ProcessStartedEvent.start_ts is required")
process_query = Process.objects.filter(
process_type=process_type,
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
env=event.env,
timeout=getattr(event, "timeout", 60),
pid=event.pid or None,
url=getattr(event, "url", "") or None,
started_at=parse_event_datetime(getattr(event, "start_ts", "")),
status=Process.StatusChoices.RUNNING,
retry_at=None,
started_at=started_at,
)
self.process_ids[event.process_id] = str(process.id)
return process
if event.pid:
process_query = process_query.filter(pid=event.pid)
process = await process_query.order_by("-modified_at").afirst()
if process is None:
process = await Process.objects.acreate(
machine=iface.machine,
iface=iface,
process_type=process_type,
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
env=event.env,
timeout=event.timeout,
pid=event.pid or None,
url=event.url or None,
started_at=started_at,
status=Process.StatusChoices.RUNNING,
retry_at=None,
)
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
await process.asave(update_fields=["iface", "machine", "modified_at"])
def _project_started(self, event: ProcessStartedEvent) -> None:
process = self._get_or_create_process(event)
process.pwd = event.output_dir
process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.timeout = event.timeout
process.pid = event.pid or None
process.url = getattr(event, "url", "") or process.url
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
process.url = event.url or process.url
process.process_type = process_type or process.process_type
process.worker_type = worker_type or process.worker_type
process.started_at = started_at
process.status = process.StatusChoices.RUNNING
process.retry_at = None
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
process.save()
await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
plugin_name=event.plugin_name,
hook_path=event.hook_path,
)
await process.asave()
async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
from archivebox.machine.models import NetworkInterface, Process
iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
process_type = event.process_type or (
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
)
worker_type = event.worker_type or ""
started_at = parse_event_datetime(event.start_ts)
if started_at is None:
raise ValueError("ProcessCompletedEvent.start_ts is required")
process_query = Process.objects.filter(
process_type=process_type,
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
started_at=started_at,
)
if event.pid:
process_query = process_query.filter(pid=event.pid)
process = await process_query.order_by("-modified_at").afirst()
if process is None:
process = await Process.objects.acreate(
machine=iface.machine,
iface=iface,
process_type=process_type,
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
env=event.env,
timeout=event.timeout,
pid=event.pid or None,
url=event.url or None,
started_at=started_at,
status=Process.StatusChoices.RUNNING,
retry_at=None,
)
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
await process.asave(update_fields=["iface", "machine", "modified_at"])
def _project_completed(self, event: ProcessCompletedEvent) -> None:
process = self._get_or_create_process(event)
process.pwd = event.output_dir
if not process.cmd:
process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.pid = event.pid or process.pid
process.url = getattr(event, "url", "") or process.url
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
process.url = event.url or process.url
process.process_type = process_type or process.process_type
process.worker_type = worker_type or process.worker_type
process.started_at = started_at
process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
process.stdout = event.stdout
process.stderr = event.stderr
process.exit_code = event.exit_code
process.status = process.StatusChoices.EXITED
process.retry_at = None
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
process.save()
await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
plugin_name=event.plugin_name,
hook_path=event.hook_path,
)
await process.asave()

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
import asyncio
import json
import os
import re
import shutil
import subprocess
import sys
@@ -13,12 +12,13 @@ from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any
from asgiref.sync import sync_to_async
from django.utils import timezone
from rich.console import Console
from abx_dl.events import BinaryRequestEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.models import Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
from abx_dl.models import Plugin, discover_plugins, filter_plugins
from abx_dl.orchestrator import (
create_bus,
download,
@@ -40,150 +40,9 @@ def _bus_name(prefix: str, identifier: str) -> str:
return f"{prefix}_{normalized}"
def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
raw = str(config.get("PLUGINS") or "").strip()
if not raw:
return None
return [name.strip() for name in raw.split(",") if name.strip()]
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
return sum(
1
for plugin in selected.values()
for hook in plugin.hooks
if "Install" in hook.name or "CrawlSetup" in hook.name or "Snapshot" in hook.name
)
_TEMPLATE_NAME_RE = re.compile(r"^\{([A-Z0-9_]+)\}$")
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str, config: dict[str, Any]) -> list[str]:
keys: list[str] = []
for plugin in plugins.values():
for spec in plugin.binaries:
template_name = str(spec.get("name") or "").strip()
match = _TEMPLATE_NAME_RE.fullmatch(template_name)
if match is None:
continue
key = match.group(1)
configured_value = config.get(key)
if configured_value is not None and str(configured_value).strip() == binary_name:
keys.append(key)
for key, prop in plugin.config_schema.items():
if key.endswith("_BINARY") and prop.get("default") == binary_name:
keys.append(key)
return list(dict.fromkeys(keys))
def _installed_binary_config_overrides(plugins: dict[str, Plugin], config: dict[str, Any] | None = None) -> dict[str, str]:
    """Derive *_BINARY config overrides (and lib-dir paths) from installed binaries.

    Scans the current Machine's INSTALLED Binary rows, maps each executable
    back to its config key(s) via _binary_config_keys_for_plugins, and infers
    shared pip/npm directory layout from the binary's filesystem location.
    Returns a flat {CONFIG_KEY: value} dict of string overrides.
    """
    # Imported lazily to avoid requiring Django setup at module import time.
    from archivebox.machine.models import Binary, Machine
    machine = Machine.current()
    active_config = dict(config or {})
    overrides: dict[str, str] = {}
    # Directory roots inferred from where installed binaries live; first match wins.
    shared_lib_dir: Path | None = None
    pip_home: Path | None = None
    pip_bin_dir: Path | None = None
    npm_home: Path | None = None
    node_modules_dir: Path | None = None
    npm_bin_dir: Path | None = None
    # Only consider binaries on this machine that are installed and have a real path.
    binaries = (
        Binary.objects.filter(machine=machine, status=Binary.StatusChoices.INSTALLED).exclude(abspath="").exclude(abspath__isnull=True)
    )
    for binary in binaries:
        try:
            resolved_path = Path(binary.abspath).expanduser()
        except (TypeError, ValueError):
            continue
        # Skip stale DB rows whose file is gone or not executable.
        if not resolved_path.is_file() or not os.access(resolved_path, os.X_OK):
            continue
        for key in _binary_config_keys_for_plugins(plugins, binary.name, active_config):
            overrides[key] = binary.abspath
        # Layout: .../<lib>/node_modules/.bin/<binary>  -> npm-managed install.
        if resolved_path.parent.name == ".bin" and resolved_path.parent.parent.name == "node_modules":
            npm_bin_dir = npm_bin_dir or resolved_path.parent
            node_modules_dir = node_modules_dir or resolved_path.parent.parent
            npm_home = npm_home or resolved_path.parent.parent.parent
            shared_lib_dir = shared_lib_dir or resolved_path.parent.parent.parent.parent
        # Layout: .../<lib>/pip/venv/bin/<binary>  -> pip-venv-managed install.
        elif (
            resolved_path.parent.name == "bin"
            and resolved_path.parent.parent.name == "venv"
            and resolved_path.parent.parent.parent.name == "pip"
        ):
            pip_bin_dir = pip_bin_dir or resolved_path.parent
            pip_home = pip_home or resolved_path.parent.parent.parent
            shared_lib_dir = shared_lib_dir or resolved_path.parent.parent.parent.parent
    # Export any inferred directory roots as config overrides.
    if shared_lib_dir is not None:
        overrides["LIB_DIR"] = str(shared_lib_dir)
        overrides["LIB_BIN_DIR"] = str(shared_lib_dir / "bin")
    if pip_home is not None:
        overrides["PIP_HOME"] = str(pip_home)
    if pip_bin_dir is not None:
        overrides["PIP_BIN_DIR"] = str(pip_bin_dir)
    if npm_home is not None:
        overrides["NPM_HOME"] = str(npm_home)
    if node_modules_dir is not None:
        # Three spellings kept in sync for downstream consumers (incl. NODE_PATH for node itself).
        overrides["NODE_MODULES_DIR"] = str(node_modules_dir)
        overrides["NODE_MODULE_DIR"] = str(node_modules_dir)
        overrides["NODE_PATH"] = str(node_modules_dir)
    if npm_bin_dir is not None:
        overrides["NPM_BIN_DIR"] = str(npm_bin_dir)
    return overrides
def _limit_stop_reason(config: dict[str, Any]) -> str:
    """Return the stop reason derived from the crawl-limit state in *config*."""
    limit_state = CrawlLimitState.from_config(config)
    return limit_state.get_stop_reason()
def _attach_bus_trace(bus) -> None:
    """Attach a background task that streams the bus's event history as JSONL.

    Controlled by the ARCHIVEBOX_BUS_TRACE env var: unset/blank disables
    tracing; "1", "-", or "stderr" write to stderr; any other value is
    treated as a file path to append to. Idempotent per bus (no-op if a
    trace task is already attached).
    """
    trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
    if not trace_target:
        return
    # Already attached to this bus — don't start a second trace loop.
    if getattr(bus, "_archivebox_trace_task", None) is not None:
        return
    trace_path = None if trace_target in {"1", "-", "stderr"} else Path(trace_target)
    stop_event = asyncio.Event()
    async def trace_loop() -> None:
        # Track emitted event ids so each history entry is written exactly once.
        seen_event_ids: set[str] = set()
        while not stop_event.is_set():
            # Snapshot the history dict so concurrent emits can't break iteration.
            for event_id, event in list(bus.event_history.items()):
                if event_id in seen_event_ids:
                    continue
                seen_event_ids.add(event_id)
                payload = event.model_dump(mode="json")
                payload["bus_name"] = bus.name
                line = json.dumps(payload, ensure_ascii=False, default=str, separators=(",", ":"))
                if trace_path is None:
                    print(line, file=sys.stderr, flush=True)
                else:
                    trace_path.parent.mkdir(parents=True, exist_ok=True)
                    with trace_path.open("a", encoding="utf-8") as handle:
                        handle.write(line + "\n")
            await asyncio.sleep(0.05)
    # Stash the stop event and task on the bus so _stop_bus_trace can tear down.
    bus._archivebox_trace_stop = stop_event
    bus._archivebox_trace_task = asyncio.create_task(trace_loop())
async def _stop_bus_trace(bus) -> None:
stop_event = getattr(bus, "_archivebox_trace_stop", None)
trace_task = getattr(bus, "_archivebox_trace_task", None)
if stop_event is None or trace_task is None:
return
stop_event.set()
await asyncio.gather(trace_task, return_exceptions=True)
bus._archivebox_trace_stop = None
bus._archivebox_trace_task = None
return sum(1 for plugin in selected.values() for hook in plugin.hooks if "CrawlSetup" in hook.name or "Snapshot" in hook.name)
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
@@ -235,22 +94,25 @@ class CrawlRunner:
self.crawl = crawl
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
self.plugins = discover_plugins()
self.process_service = ProcessService(self.bus)
self.binary_service = BinaryService(self.bus)
self.tag_service = TagService(self.bus)
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
ProcessService(self.bus)
BinaryService(self.bus)
TagService(self.bus)
CrawlService(self.bus, crawl_id=str(crawl.id))
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
self.snapshot_service = SnapshotService(
async def ignore_snapshot(_snapshot_id: str) -> None:
return None
SnapshotService(
self.bus,
crawl_id=str(crawl.id),
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued,
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else ignore_snapshot,
)
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
ArchiveResultService(self.bus)
self.selected_plugins = selected_plugins
self.initial_snapshot_ids = snapshot_ids
self.snapshot_tasks: dict[str, asyncio.Task[None]] = {}
self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS)
self.abx_services = None
self.persona = None
self.base_config: dict[str, Any] = {}
self.derived_config: dict[str, Any] = {}
@@ -258,15 +120,11 @@ class CrawlRunner:
self._live_stream = None
async def run(self) -> None:
from asgiref.sync import sync_to_async
from archivebox.crawls.models import Crawl
try:
await sync_to_async(self._prepare, thread_sensitive=True)()
snapshot_ids = await sync_to_async(self.load_run_state, thread_sensitive=True)()
live_ui = self._create_live_ui()
with live_ui if live_ui is not None else nullcontext():
_attach_bus_trace(self.bus)
self.abx_services = setup_abx_services(
setup_abx_services(
self.bus,
plugins=self.plugins,
config_overrides={
@@ -278,18 +136,14 @@ class CrawlRunner:
auto_install=True,
emit_jsonl=False,
)
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
if snapshot_ids:
root_snapshot_id = snapshot_ids[0]
await self._run_crawl_setup(root_snapshot_id)
await self.run_crawl_setup(root_snapshot_id)
for snapshot_id in snapshot_ids:
await self.enqueue_snapshot(snapshot_id)
await self._wait_for_snapshot_tasks()
await self._run_crawl_cleanup(root_snapshot_id)
if self.abx_services is not None:
await self.abx_services.process.wait_for_background_monitors()
await self.wait_for_snapshot_tasks()
await self.run_crawl_cleanup(root_snapshot_id)
finally:
await _stop_bus_trace(self.bus)
await self.bus.stop()
if self._live_stream is not None:
try:
@@ -297,33 +151,16 @@ class CrawlRunner:
except Exception:
pass
self._live_stream = None
await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
if crawl_is_finished:
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
else:
if crawl.status == Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.QUEUED
elif crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = crawl.retry_at or timezone.now()
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
await sync_to_async(self.finalize_run_state, thread_sensitive=True)()
async def enqueue_snapshot(self, snapshot_id: str) -> None:
task = self.snapshot_tasks.get(snapshot_id)
if task is not None and not task.done():
return
task = asyncio.create_task(self._run_snapshot(snapshot_id))
task = asyncio.create_task(self.run_snapshot(snapshot_id))
self.snapshot_tasks[snapshot_id] = task
async def leave_snapshot_queued(self, snapshot_id: str) -> None:
return None
async def _wait_for_snapshot_tasks(self) -> None:
async def wait_for_snapshot_tasks(self) -> None:
while True:
pending_tasks: list[asyncio.Task[None]] = []
for snapshot_id, task in list(self.snapshot_tasks.items()):
@@ -339,9 +176,9 @@ class CrawlRunner:
for task in done:
task.result()
def _prepare(self) -> None:
def load_run_state(self) -> list[str]:
from archivebox.config.configset import get_config
from archivebox.machine.models import NetworkInterface, Process
from archivebox.machine.models import Machine, NetworkInterface, Process
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
current_iface = NetworkInterface.current(refresh=True)
@@ -352,17 +189,42 @@ class CrawlRunner:
current_process.save(update_fields=["iface", "machine", "modified_at"])
self.persona = self.crawl.resolve_persona()
self.base_config = get_config(crawl=self.crawl)
self.derived_config = _installed_binary_config_overrides(self.plugins, self.base_config)
self.derived_config = dict(Machine.current().config)
self.base_config["ABX_RUNTIME"] = "archivebox"
if self.selected_plugins is None:
self.selected_plugins = _selected_plugins_from_config(self.base_config)
raw_plugins = self.base_config["PLUGINS"].strip()
self.selected_plugins = [name.strip() for name in raw_plugins.split(",") if name.strip()] if raw_plugins else None
if self.persona:
chrome_binary = str(self.base_config.get("CHROME_BINARY") or "")
self.base_config.update(self.persona.prepare_runtime_for_crawl(self.crawl, chrome_binary=chrome_binary))
self.base_config.update(
self.persona.prepare_runtime_for_crawl(
self.crawl,
chrome_binary=self.base_config["CHROME_BINARY"],
),
)
if self.initial_snapshot_ids:
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
created = self.crawl.create_snapshots_from_urls()
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
return [str(snapshot.id) for snapshot in snapshots]
def finalize_run_state(self) -> None:
from archivebox.crawls.models import Crawl
def _cleanup_persona(self) -> None:
if self.persona:
self.persona.cleanup_runtime_for_crawl(self.crawl)
crawl = Crawl.objects.get(id=self.crawl.id)
if crawl.is_finished():
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
crawl.save(update_fields=["status", "retry_at", "modified_at"])
return
if crawl.status == Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.QUEUED
elif crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = crawl.retry_at or timezone.now()
crawl.save(update_fields=["status", "retry_at", "modified_at"])
def _create_live_ui(self) -> LiveBusUI | None:
stdout_is_tty = sys.stdout.isatty()
@@ -373,7 +235,7 @@ class CrawlRunner:
stream = sys.stderr if stderr_is_tty else sys.stdout
if os.path.exists("/dev/tty"):
try:
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
stream = self._live_stream
except OSError:
self._live_stream = None
@@ -399,7 +261,7 @@ class CrawlRunner:
live_ui = LiveBusUI(
self.bus,
total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
timeout_seconds=self.base_config["TIMEOUT"],
ui_console=ui_console,
interactive_tty=True,
)
@@ -410,128 +272,24 @@ class CrawlRunner:
)
return live_ui
def _create_root_snapshots(self) -> list[str]:
created = self.crawl.create_snapshots_from_urls()
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
return [str(snapshot.id) for snapshot in snapshots]
def _initial_snapshot_ids(self) -> list[str]:
if self.initial_snapshot_ids:
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
return self._create_root_snapshots()
def _snapshot_config(self, snapshot) -> dict[str, Any]:
def load_snapshot_payload(self, snapshot_id: str) -> dict[str, Any]:
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
config = get_config(crawl=self.crawl, snapshot=snapshot)
config.update(self.base_config)
config["CRAWL_DIR"] = str(self.crawl.output_dir)
config["SNAP_DIR"] = str(snapshot.output_dir)
config["SNAPSHOT_ID"] = str(snapshot.id)
config["SNAPSHOT_DEPTH"] = snapshot.depth
config["CRAWL_ID"] = str(self.crawl.id)
config["SOURCE_URL"] = snapshot.url
if snapshot.parent_snapshot_id:
config["PARENT_SNAPSHOT_ID"] = str(snapshot.parent_snapshot_id)
return config
async def _run_crawl_setup(self, snapshot_id: str) -> None:
from asgiref.sync import sync_to_async
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
setup_snapshot = AbxSnapshot(
url=snapshot["url"],
id=snapshot["id"],
title=snapshot["title"],
timestamp=snapshot["timestamp"],
bookmarked_at=snapshot["bookmarked_at"],
created_at=snapshot["created_at"],
tags=snapshot["tags"],
depth=snapshot["depth"],
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
bus=self.bus,
emit_jsonl=False,
snapshot=setup_snapshot,
crawl_setup_only=True,
)
async def _run_crawl_cleanup(self, snapshot_id: str) -> None:
from asgiref.sync import sync_to_async
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
cleanup_snapshot = AbxSnapshot(
url=snapshot["url"],
id=snapshot["id"],
title=snapshot["title"],
timestamp=snapshot["timestamp"],
bookmarked_at=snapshot["bookmarked_at"],
created_at=snapshot["created_at"],
tags=snapshot["tags"],
depth=snapshot["depth"],
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
bus=self.bus,
emit_jsonl=False,
snapshot=cleanup_snapshot,
crawl_cleanup_only=True,
)
async def _run_snapshot(self, snapshot_id: str) -> None:
from asgiref.sync import sync_to_async
async with self.snapshot_semaphore:
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
if snapshot["status"] == "sealed":
return
if snapshot["depth"] > 0 and _limit_stop_reason(snapshot["config"]) == "max_size":
await sync_to_async(self._cancel_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
return
abx_snapshot = AbxSnapshot(
url=snapshot["url"],
id=snapshot["id"],
title=snapshot["title"],
timestamp=snapshot["timestamp"],
bookmarked_at=snapshot["bookmarked_at"],
created_at=snapshot["created_at"],
tags=snapshot["tags"],
depth=snapshot["depth"],
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
try:
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
bus=self.bus,
emit_jsonl=False,
snapshot=abx_snapshot,
skip_crawl_setup=True,
skip_crawl_cleanup=True,
)
finally:
current_task = asyncio.current_task()
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
self.snapshot_tasks.pop(snapshot_id, None)
def _load_snapshot_run_data(self, snapshot_id: str):
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
extra_context: dict[str, Any] = {}
if config.get("EXTRA_CONTEXT"):
parsed_extra_context = json.loads(str(config["EXTRA_CONTEXT"]))
if not isinstance(parsed_extra_context, dict):
raise TypeError("EXTRA_CONTEXT must decode to an object")
extra_context = parsed_extra_context
extra_context["snapshot_id"] = str(snapshot.id)
extra_context["snapshot_depth"] = snapshot.depth
config["EXTRA_CONTEXT"] = json.dumps(extra_context, separators=(",", ":"), sort_keys=True)
return {
"id": str(snapshot.id),
"url": snapshot.url,
@@ -542,12 +300,91 @@ class CrawlRunner:
"tags": snapshot.tags_str(),
"depth": snapshot.depth,
"status": snapshot.status,
"parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
"output_dir": str(snapshot.output_dir),
"config": self._snapshot_config(snapshot),
"config": config,
}
def _cancel_snapshot_due_to_limit(self, snapshot_id: str) -> None:
async def run_crawl_setup(self, snapshot_id: str) -> None:
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
derived_config_overrides=self.derived_config,
bus=self.bus,
emit_jsonl=False,
install_enabled=True,
crawl_setup_enabled=True,
crawl_start_enabled=False,
snapshot_cleanup_enabled=False,
crawl_cleanup_enabled=False,
machine_service=None,
binary_service=None,
process_service=None,
archive_result_service=None,
tag_service=None,
)
async def run_crawl_cleanup(self, snapshot_id: str) -> None:
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
await download(
bus=self.bus,
url=snapshot["url"],
output_dir=Path(snapshot["output_dir"]),
plugins=self.plugins,
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
derived_config_overrides=self.derived_config,
emit_jsonl=False,
install_enabled=False,
crawl_setup_enabled=False,
crawl_start_enabled=False,
snapshot_cleanup_enabled=False,
crawl_cleanup_enabled=True,
machine_service=None,
binary_service=None,
process_service=None,
archive_result_service=None,
tag_service=None,
)
async def run_snapshot(self, snapshot_id: str) -> None:
async with self.snapshot_semaphore:
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
if snapshot["status"] == "sealed":
return
if snapshot["depth"] > 0 and CrawlLimitState.from_config(snapshot["config"]).get_stop_reason() == "max_size":
await sync_to_async(self.seal_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
return
try:
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
derived_config_overrides=self.derived_config,
bus=self.bus,
emit_jsonl=False,
install_enabled=False,
crawl_setup_enabled=False,
crawl_start_enabled=True,
snapshot_cleanup_enabled=True,
crawl_cleanup_enabled=False,
machine_service=None,
binary_service=None,
process_service=None,
archive_result_service=None,
tag_service=None,
)
finally:
current_task = asyncio.current_task()
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
self.snapshot_tasks.pop(snapshot_id, None)
def seal_snapshot_due_to_limit(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
@@ -579,21 +416,20 @@ def run_crawl(
async def _run_binary(binary_id: str) -> None:
from asgiref.sync import sync_to_async
from archivebox.config.configset import get_config
from archivebox.machine.models import Binary
from archivebox.machine.models import Binary, Machine
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
binary = await Binary.objects.aget(id=binary_id)
plugins = discover_plugins()
config = get_config()
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
derived_config = dict(machine.config)
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
process_service = ProcessService(bus)
ProcessService(bus)
BinaryService(bus)
TagService(bus)
ArchiveResultService(bus, process_service=process_service)
ArchiveResultService(bus)
setup_abx_services(
bus,
plugins=plugins,
@@ -605,7 +441,6 @@ async def _run_binary(binary_id: str) -> None:
)
try:
_attach_bus_trace(bus)
await bus.emit(
BinaryRequestEvent(
name=binary.name,
@@ -619,7 +454,6 @@ async def _run_binary(binary_id: str) -> None:
),
)
finally:
await _stop_bus_trace(bus)
await bus.stop()
@@ -628,20 +462,20 @@ def run_binary(binary_id: str) -> None:
async def _run_install(plugin_names: list[str] | None = None) -> None:
from asgiref.sync import sync_to_async
from archivebox.config.configset import get_config
from archivebox.machine.models import Machine
plugins = discover_plugins()
config = get_config()
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
derived_config = dict(machine.config)
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
process_service = ProcessService(bus)
ProcessService(bus)
BinaryService(bus)
TagService(bus)
ArchiveResultService(bus, process_service=process_service)
abx_services = setup_abx_services(
ArchiveResultService(bus)
setup_abx_services(
bus,
plugins=plugins,
config_overrides=config,
@@ -657,7 +491,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
if not selected_plugins:
return
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
timeout_seconds = int(config.get("TIMEOUT") or 60)
timeout_seconds = config["TIMEOUT"]
stdout_is_tty = sys.stdout.isatty()
stderr_is_tty = sys.stderr.isatty()
interactive_tty = stdout_is_tty or stderr_is_tty
@@ -668,7 +502,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
stream = sys.stderr if stderr_is_tty else sys.stdout
if os.path.exists("/dev/tty"):
try:
live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
stream = live_stream
except OSError:
live_stream = None
@@ -707,20 +541,21 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
plugins_label=plugins_label,
)
with live_ui if live_ui is not None else nullcontext():
_attach_bus_trace(bus)
results = await abx_install_plugins(
plugin_names=plugin_names,
plugins=plugins,
output_dir=output_dir,
config_overrides=config,
derived_config_overrides=derived_config,
emit_jsonl=False,
bus=bus,
machine_service=None,
binary_service=None,
process_service=None,
)
await abx_services.process.wait_for_background_monitors()
if live_ui is not None:
live_ui.print_summary(results, output_dir=output_dir)
finally:
await _stop_bus_trace(bus)
await bus.stop()
try:
if live_stream is not None:
@@ -739,6 +574,12 @@ def recover_orphaned_crawls() -> int:
from archivebox.machine.models import Process
active_crawl_ids: set[str] = set()
orphaned_crawls = list(
Crawl.objects.filter(
status=Crawl.StatusChoices.STARTED,
retry_at__isnull=True,
).prefetch_related("snapshot_set"),
)
running_processes = Process.objects.filter(
status=Process.StatusChoices.RUNNING,
process_type__in=[
@@ -746,23 +587,27 @@ def recover_orphaned_crawls() -> int:
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
).only("env")
).only("pwd")
for proc in running_processes:
env = proc.env or {}
if not isinstance(env, dict):
if not proc.pwd:
continue
crawl_id = env.get("CRAWL_ID")
if crawl_id:
active_crawl_ids.add(str(crawl_id))
proc_pwd = Path(proc.pwd)
for crawl in orphaned_crawls:
matched_snapshot = None
for snapshot in crawl.snapshot_set.all():
try:
proc_pwd.relative_to(snapshot.output_dir)
matched_snapshot = snapshot
break
except ValueError:
continue
if matched_snapshot is not None:
active_crawl_ids.add(str(crawl.id))
break
recovered = 0
now = timezone.now()
orphaned_crawls = Crawl.objects.filter(
status=Crawl.StatusChoices.STARTED,
retry_at__isnull=True,
).prefetch_related("snapshot_set")
for crawl in orphaned_crawls:
if str(crawl.id) in active_crawl_ids:
continue
@@ -788,6 +633,11 @@ def recover_orphaned_snapshots() -> int:
from archivebox.machine.models import Process
active_snapshot_ids: set[str] = set()
orphaned_snapshots = list(
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
.select_related("crawl")
.prefetch_related("archiveresult_set"),
)
running_processes = Process.objects.filter(
status=Process.StatusChoices.RUNNING,
process_type__in=[
@@ -795,24 +645,22 @@ def recover_orphaned_snapshots() -> int:
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
).only("env")
).only("pwd")
for proc in running_processes:
env = proc.env or {}
if not isinstance(env, dict):
if not proc.pwd:
continue
snapshot_id = env.get("SNAPSHOT_ID")
if snapshot_id:
active_snapshot_ids.add(str(snapshot_id))
proc_pwd = Path(proc.pwd)
for snapshot in orphaned_snapshots:
try:
proc_pwd.relative_to(snapshot.output_dir)
active_snapshot_ids.add(str(snapshot.id))
break
except ValueError:
continue
recovered = 0
now = timezone.now()
orphaned_snapshots = (
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
.select_related("crawl")
.prefetch_related("archiveresult_set")
)
for snapshot in orphaned_snapshots:
if str(snapshot.id) in active_snapshot_ids:
continue

View File

@@ -7,8 +7,6 @@ from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.services.base import BaseService
from .db import run_db_op
class SnapshotService(BaseService):
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
@@ -18,120 +16,96 @@ class SnapshotService(BaseService):
self.crawl_id = crawl_id
self.schedule_snapshot = schedule_snapshot
super().__init__(bus)
self.bus.on(SnapshotEvent, self.on_SnapshotEvent)
self.bus.on(SnapshotCompletedEvent, self.on_SnapshotCompletedEvent)
async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
snapshot_id = await run_db_op(self._project_snapshot, event)
if snapshot_id:
await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
if snapshot_id and event.depth > 0:
await self.schedule_snapshot(snapshot_id)
async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
if snapshot_id:
await sync_to_async(self._write_snapshot_details)(snapshot_id)
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id=self.crawl_id)
crawl = await Crawl.objects.aget(id=self.crawl_id)
snapshot_id: str | None = None
snapshot = await Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).afirst()
if event.depth == 0:
snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
if snapshot is None:
return None
if snapshot is not None:
snapshot.status = Snapshot.StatusChoices.STARTED
snapshot.retry_at = None
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
return str(snapshot.id)
await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
snapshot_id = str(snapshot.id)
elif event.depth > 0:
if event.depth <= crawl.max_depth and self._crawl_limit_stop_reason(crawl) != "max_size":
parent_event = await self.bus.find(
SnapshotEvent,
past=True,
future=False,
where=lambda candidate: candidate.depth == event.depth - 1 and self.bus.event_is_child_of(event, candidate),
)
parent_snapshot = None
if parent_event is not None:
parent_snapshot = await Snapshot.objects.filter(id=parent_event.snapshot_id, crawl=crawl).afirst()
if parent_snapshot is not None and self._url_passes_filters(crawl, parent_snapshot, event.url):
snapshot = await sync_to_async(Snapshot.from_json, thread_sensitive=True)(
{
"url": event.url,
"depth": event.depth,
"parent_snapshot_id": str(parent_snapshot.id),
"crawl_id": str(crawl.id),
},
overrides={
"crawl": crawl,
"snapshot": parent_snapshot,
"created_by_id": crawl.created_by_id,
},
queue_for_extraction=False,
)
if snapshot is not None and snapshot.status != Snapshot.StatusChoices.SEALED:
snapshot.retry_at = None
snapshot.status = Snapshot.StatusChoices.QUEUED
await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
snapshot_id = str(snapshot.id)
if event.depth > crawl.max_depth:
return None
if self._crawl_limit_stop_reason(crawl) == "max_size":
return None
if snapshot_id:
snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
if snapshot is not None:
await sync_to_async(snapshot.ensure_crawl_symlink, thread_sensitive=True)()
if snapshot_id and event.depth > 0:
await self.schedule_snapshot(snapshot_id)
parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
if parent_snapshot is None:
return None
if not self._url_passes_filters(crawl, parent_snapshot, event.url):
return None
async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.from_json(
{
"url": event.url,
"depth": event.depth,
"parent_snapshot_id": str(parent_snapshot.id),
"crawl_id": str(crawl.id),
},
overrides={
"crawl": crawl,
"snapshot": parent_snapshot,
"created_by_id": crawl.created_by_id,
},
queue_for_extraction=False,
)
if snapshot is None:
return None
if snapshot.status == Snapshot.StatusChoices.SEALED:
return None
snapshot.retry_at = None
if snapshot.status != Snapshot.StatusChoices.SEALED:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
return str(snapshot.id)
snapshot = await Snapshot.objects.select_related("crawl").filter(id=event.snapshot_id).afirst()
snapshot_id: str | None = None
if snapshot is not None:
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
await snapshot.asave(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
await (
Snapshot.objects.filter(
crawl_id=snapshot.crawl_id,
status=Snapshot.StatusChoices.QUEUED,
)
.exclude(id=snapshot.id)
.aupdate(
status=Snapshot.StatusChoices.SEALED,
retry_at=None,
modified_at=timezone.now(),
)
)
snapshot_id = str(snapshot.id)
if snapshot_id:
snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
if snapshot is not None:
await sync_to_async(snapshot.write_index_jsonl, thread_sensitive=True)()
await sync_to_async(snapshot.write_json_details, thread_sensitive=True)()
await sync_to_async(snapshot.write_html_details, thread_sensitive=True)()
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
return crawl.url_passes_filters(url, snapshot=parent_snapshot)
def _seal_snapshot(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
if snapshot is None:
return None
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
return str(snapshot.id)
def _crawl_limit_stop_reason(self, crawl) -> str:
config = dict(crawl.config or {})
config["CRAWL_DIR"] = str(crawl.output_dir)
return CrawlLimitState.from_config(config).get_stop_reason()
def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
from archivebox.core.models import Snapshot
return (
Snapshot.objects.filter(
crawl_id=crawl_id,
status=Snapshot.StatusChoices.QUEUED,
)
.exclude(id=exclude_snapshot_id)
.update(
status=Snapshot.StatusChoices.SEALED,
retry_at=None,
modified_at=timezone.now(),
)
)
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
if snapshot is not None:
snapshot.ensure_crawl_symlink()
def _write_snapshot_details(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
if snapshot is None:
return
snapshot.write_index_jsonl()
snapshot.write_json_details()
snapshot.write_html_details()

View File

@@ -3,20 +3,20 @@ from __future__ import annotations
from abx_dl.events import TagEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
class TagService(BaseService):
LISTENS_TO = [TagEvent]
EMITS = []
async def on_TagEvent__Outer(self, event: TagEvent) -> None:
await run_db_op(self._project, event)
def __init__(self, bus):
super().__init__(bus)
self.bus.on(TagEvent, self.on_TagEvent__save_to_db)
def _project(self, event: TagEvent) -> None:
from archivebox.core.models import Snapshot, Tag
async def on_TagEvent__save_to_db(self, event: TagEvent) -> None:
from archivebox.core.models import Snapshot, SnapshotTag, Tag
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first()
snapshot = await Snapshot.objects.filter(id=event.snapshot_id).afirst()
if snapshot is None:
return
Tag.from_json({"name": event.name}, overrides={"snapshot": snapshot})
tag, _ = await Tag.objects.aget_or_create(name=event.name)
await SnapshotTag.objects.aget_or_create(snapshot=snapshot, tag=tag)

View File

@@ -312,7 +312,7 @@ CREATE TABLE IF NOT EXISTS machine_dependency (
modified_at DATETIME,
bin_name VARCHAR(63) NOT NULL UNIQUE,
bin_providers VARCHAR(127) NOT NULL DEFAULT '*',
custom_cmds TEXT DEFAULT '{}',
overrides TEXT DEFAULT '{}',
config TEXT DEFAULT '{}'
);
@@ -973,7 +973,6 @@ def seed_0_8_data(db_path: Path) -> dict[str, list[dict]]:
("machine", "0003_alter_installedbinary_options_and_more"),
("machine", "0004_alter_installedbinary_abspath_and_more"),
# Then the new migrations after squashing
("machine", "0002_rename_custom_cmds_to_overrides"),
("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"),
("machine", "0004_drop_dependency_table"),
# Crawls must come before core.0024 because 0024_b depends on it

View File

@@ -144,13 +144,13 @@ def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
pwd=str(snapshot.output_dir / "wget"),
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
env={
"SOURCE_URL": "https://example.com",
"SAFE_FLAG": "1",
"API_KEY": "super-secret-key",
"ACCESS_TOKEN": "super-secret-token",
"SHARED_SECRET": "super-secret-secret",
},
status=Process.StatusChoices.EXITED,
url="https://example.com",
)
result = ArchiveResult.objects.create(
snapshot=snapshot,
@@ -164,7 +164,7 @@ def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
cmd_html = str(admin.cmd_str(result))
assert "SAFE_FLAG=1" in cmd_html
assert "SOURCE_URL=https://example.com" in cmd_html
assert "https://example.com" in cmd_html
assert "API_KEY" not in cmd_html
assert "ACCESS_TOKEN" not in cmd_html
assert "SHARED_SECRET" not in cmd_html

View File

@@ -8,6 +8,7 @@ Tests cover:
- Snapshot progress statistics
"""
import json
import pytest
import uuid
from pathlib import Path
@@ -822,7 +823,6 @@ class TestAdminSnapshotListView:
pwd="/tmp/archivebox",
cmd=["python", "/tmp/job.py", "--url=https://example.com"],
env={
"SNAPSHOT_ID": "abc123",
"ENABLED": True,
"API_KEY": "super-secret-key",
"ACCESS_TOKEN": "super-secret-token",
@@ -843,7 +843,6 @@ class TestAdminSnapshotListView:
assert response.status_code == 200
assert b"Kill" in response.content
assert b"python /tmp/job.py --url=https://example.com" in response.content
assert b"SNAPSHOT_ID=abc123" in response.content
assert b"ENABLED=True" in response.content
assert b"52s" in response.content
assert b"API_KEY=" not in response.content
@@ -1065,7 +1064,7 @@ class TestAdminSnapshotListView:
pid=54321,
exit_code=0,
cmd=["/plugins/title/on_Snapshot__54_title.js", "--url=https://example.com"],
env={"SNAPSHOT_ID": str(snapshot.id)},
env={"EXTRA_CONTEXT": json.dumps({"snapshot_id": str(snapshot.id)})},
started_at=timezone.now(),
ended_at=timezone.now(),
)
@@ -1252,11 +1251,8 @@ class TestLiveProgressView:
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
pid=pid,
pwd=str(snapshot.output_dir / "chrome"),
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js", "--url=https://example.com"],
env={
"CRAWL_ID": str(snapshot.crawl_id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(),
)
@@ -1290,11 +1286,8 @@ class TestLiveProgressView:
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
pid=pid,
pwd=str(snapshot.output_dir / "title"),
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
env={
"CRAWL_ID": str(snapshot.crawl_id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(),
)
@@ -1327,11 +1320,8 @@ class TestLiveProgressView:
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
pid=os.getpid(),
pwd=str(snapshot.output_dir / "chrome"),
cmd=["/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"],
env={
"CRAWL_ID": str(snapshot.crawl_id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(),
)
ArchiveResult.objects.create(
@@ -1369,11 +1359,8 @@ class TestLiveProgressView:
status=Process.StatusChoices.EXITED,
exit_code=0,
pid=99999,
pwd=str(snapshot.output_dir / "title"),
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
env={
"CRAWL_ID": str(snapshot.crawl_id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(),
ended_at=timezone.now(),
)

View File

@@ -5,12 +5,12 @@ import pytest
from django.db import connection
from abx_dl.events import BinaryRequestEvent, ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.events import ArchiveResultEvent, BinaryRequestEvent, ProcessEvent, ProcessStartedEvent
from abx_dl.orchestrator import create_bus
from abx_dl.output_files import OutputFile
pytestmark = pytest.mark.django_db
pytestmark = pytest.mark.django_db(transaction=True)
def _cleanup_machine_process_rows() -> None:
@@ -75,8 +75,8 @@ def _create_iface(machine):
def test_process_completed_projects_inline_archiveresult():
from archivebox.core.models import ArchiveResult
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
from archivebox.services.process_service import ProcessService
from archivebox.services.archive_result_service import ArchiveResultService
import asyncio
snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "wget"
@@ -84,37 +84,23 @@ def test_process_completed_projects_inline_archiveresult():
(plugin_dir / "index.html").write_text("<html>ok</html>")
bus = create_bus(name="test_inline_archiveresult")
process_service = ProcessService(bus)
service = ArchiveResultService(bus, process_service=process_service)
service = ArchiveResultService(bus)
event = ProcessCompletedEvent(
plugin_name="wget",
hook_name="on_Snapshot__06_wget.finite.bg",
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"wget/index.html"}\n' % snapshot.id,
stderr="",
exit_code=0,
output_dir=str(plugin_dir),
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
process_id="proc-inline",
event = ArchiveResultEvent(
snapshot_id=str(snapshot.id),
plugin="wget",
hook_name="on_Snapshot__06_wget.finite.bg",
status="succeeded",
output_str="wget/index.html",
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:00:01+00:00",
)
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
service._project_from_process_completed(
event,
{
"snapshot_id": str(snapshot.id),
"plugin": "wget",
"hook_name": "on_Snapshot__06_wget.finite.bg",
"status": "succeeded",
"output_str": "wget/index.html",
},
output_files,
output_size,
output_mimetypes,
)
async def emit_event() -> None:
await service.on_ArchiveResultEvent__save_to_db(event)
asyncio.run(emit_event())
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg")
assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
@@ -127,45 +113,31 @@ def test_process_completed_projects_inline_archiveresult():
def test_process_completed_projects_synthetic_failed_archiveresult():
from archivebox.core.models import ArchiveResult
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
from archivebox.services.process_service import ProcessService
from archivebox.services.archive_result_service import ArchiveResultService
import asyncio
snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "chrome"
plugin_dir.mkdir(parents=True, exist_ok=True)
bus = create_bus(name="test_synthetic_archiveresult")
process_service = ProcessService(bus)
service = ArchiveResultService(bus, process_service=process_service)
service = ArchiveResultService(bus)
event = ProcessCompletedEvent(
plugin_name="chrome",
hook_name="on_Snapshot__11_chrome_wait",
stdout="",
stderr="Hook timed out after 60 seconds",
exit_code=-1,
output_dir=str(plugin_dir),
output_files=[],
process_id="proc-failed",
event = ArchiveResultEvent(
snapshot_id=str(snapshot.id),
plugin="chrome",
hook_name="on_Snapshot__11_chrome_wait",
status="failed",
output_str="Hook timed out after 60 seconds",
error="Hook timed out after 60 seconds",
start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:01:00+00:00",
)
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
service._project_from_process_completed(
event,
{
"plugin": "chrome",
"hook_name": "on_Snapshot__11_chrome_wait",
"status": "failed",
"output_str": "Hook timed out after 60 seconds",
"error": "Hook timed out after 60 seconds",
},
output_files,
output_size,
output_mimetypes,
)
async def emit_event() -> None:
await service.on_ArchiveResultEvent__save_to_db(event)
asyncio.run(emit_event())
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait")
assert result.status == ArchiveResult.StatusChoices.FAILED
@@ -176,45 +148,30 @@ def test_process_completed_projects_synthetic_failed_archiveresult():
def test_process_completed_projects_noresults_archiveresult():
from archivebox.core.models import ArchiveResult
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
from archivebox.services.process_service import ProcessService
from archivebox.services.archive_result_service import ArchiveResultService
import asyncio
snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "title"
plugin_dir.mkdir(parents=True, exist_ok=True)
bus = create_bus(name="test_noresults_archiveresult")
process_service = ProcessService(bus)
service = ArchiveResultService(bus, process_service=process_service)
service = ArchiveResultService(bus)
event = ProcessCompletedEvent(
plugin_name="title",
hook_name="on_Snapshot__54_title.js",
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
stderr="",
exit_code=0,
output_dir=str(plugin_dir),
output_files=[],
process_id="proc-noresults",
event = ArchiveResultEvent(
snapshot_id=str(snapshot.id),
plugin="title",
hook_name="on_Snapshot__54_title.js",
status="noresults",
output_str="No title found",
start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:00:01+00:00",
)
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
service._project_from_process_completed(
event,
{
"snapshot_id": str(snapshot.id),
"plugin": "title",
"hook_name": "on_Snapshot__54_title.js",
"status": "noresults",
"output_str": "No title found",
},
output_files,
output_size,
output_mimetypes,
)
async def emit_event() -> None:
await service.on_ArchiveResultEvent__save_to_db(event)
asyncio.run(emit_event())
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
assert result.status == ArchiveResult.StatusChoices.NORESULTS
@@ -258,45 +215,30 @@ def test_retry_failed_archiveresults_requeues_snapshot_in_queued_state():
def test_process_completed_projects_snapshot_title_from_output_str():
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
from archivebox.services.process_service import ProcessService
from archivebox.services.archive_result_service import ArchiveResultService
import asyncio
snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "title"
plugin_dir.mkdir(parents=True, exist_ok=True)
bus = create_bus(name="test_snapshot_title_output_str")
process_service = ProcessService(bus)
service = ArchiveResultService(bus, process_service=process_service)
service = ArchiveResultService(bus)
event = ProcessCompletedEvent(
plugin_name="title",
hook_name="on_Snapshot__54_title.js",
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"Example Domain"}\n' % snapshot.id,
stderr="",
exit_code=0,
output_dir=str(plugin_dir),
output_files=[],
process_id="proc-title-output-str",
event = ArchiveResultEvent(
snapshot_id=str(snapshot.id),
plugin="title",
hook_name="on_Snapshot__54_title.js",
status="succeeded",
output_str="Example Domain",
start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:00:01+00:00",
)
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
service._project_from_process_completed(
event,
{
"snapshot_id": str(snapshot.id),
"plugin": "title",
"hook_name": "on_Snapshot__54_title.js",
"status": "succeeded",
"output_str": "Example Domain",
},
output_files,
output_size,
output_mimetypes,
)
async def emit_event() -> None:
await service.on_ArchiveResultEvent__save_to_db(event)
asyncio.run(emit_event())
snapshot.refresh_from_db()
assert snapshot.title == "Example Domain"
@@ -304,8 +246,8 @@ def test_process_completed_projects_snapshot_title_from_output_str():
def test_process_completed_projects_snapshot_title_from_title_file():
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
from archivebox.services.process_service import ProcessService
from archivebox.services.archive_result_service import ArchiveResultService
import asyncio
snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "title"
@@ -313,37 +255,23 @@ def test_process_completed_projects_snapshot_title_from_title_file():
(plugin_dir / "title.txt").write_text("Example Domain")
bus = create_bus(name="test_snapshot_title_file")
process_service = ProcessService(bus)
service = ArchiveResultService(bus, process_service=process_service)
service = ArchiveResultService(bus)
event = ProcessCompletedEvent(
plugin_name="title",
hook_name="on_Snapshot__54_title.js",
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
stderr="",
exit_code=0,
output_dir=str(plugin_dir),
output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)],
process_id="proc-title-file",
event = ArchiveResultEvent(
snapshot_id=str(snapshot.id),
plugin="title",
hook_name="on_Snapshot__54_title.js",
status="noresults",
output_str="No title found",
output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)],
start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:00:01+00:00",
)
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
service._project_from_process_completed(
event,
{
"snapshot_id": str(snapshot.id),
"plugin": "title",
"hook_name": "on_Snapshot__54_title.js",
"status": "noresults",
"output_str": "No title found",
},
output_files,
output_size,
output_mimetypes,
)
async def emit_event() -> None:
await service.on_ArchiveResultEvent__save_to_db(event)
asyncio.run(emit_event())
snapshot.refresh_from_db()
assert snapshot.title == "Example Domain"
@@ -410,9 +338,12 @@ def test_collect_output_metadata_detects_warc_gz_mimetype(tmp_path):
assert output_mimetypes == "application/warc"
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch):
@pytest.mark.django_db(transaction=True)
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch, tmp_path):
from archivebox.machine.models import Binary, NetworkInterface
from archivebox.services.process_service import ProcessService
from archivebox.machine.models import Process as MachineProcess
from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService
from abx_dl.services.process_service import ProcessService as DlProcessService
machine = _create_machine()
iface = _create_iface(machine)
@@ -428,35 +359,60 @@ def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(
status=Binary.StatusChoices.INSTALLED,
)
hook_path = tmp_path / "on_Snapshot__57_mercury.py"
hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
hook_path.chmod(0o755)
output_dir = tmp_path / "mercury"
output_dir.mkdir()
bus = create_bus(name="test_process_started_binary_hydration")
service = ProcessService(bus)
event = ProcessStartedEvent(
plugin_name="mercury",
hook_name="on_Snapshot__57_mercury.py",
hook_path="/plugins/mercury/on_Snapshot__57_mercury.py",
hook_args=["--url=https://example.com"],
output_dir="/tmp/mercury",
env={
"MERCURY_BINARY": binary.abspath,
"NODE_BINARY": "/tmp/node",
},
timeout=60,
pid=4321,
process_id="proc-mercury",
snapshot_id="",
start_ts="2026-03-22T12:00:00+00:00",
DlProcessService(bus, emit_jsonl=False, stderr_is_tty=False)
ArchiveBoxProcessService(bus)
async def run_test() -> None:
await bus.emit(
ProcessEvent(
plugin_name="mercury",
hook_name="on_Snapshot__57_mercury.py",
hook_path=str(hook_path),
hook_args=["--url=https://example.com"],
is_background=False,
output_dir=str(output_dir),
env={
"MERCURY_BINARY": binary.abspath,
"NODE_BINARY": "/tmp/node",
},
timeout=60,
url="https://example.com",
),
)
started = await bus.find(
ProcessStartedEvent,
past=True,
future=False,
hook_name="on_Snapshot__57_mercury.py",
output_dir=str(output_dir),
)
assert started is not None
import asyncio
asyncio.run(run_test())
process = MachineProcess.objects.get(
pwd=str(output_dir),
cmd=[str(hook_path), "--url=https://example.com"],
)
service._project_started(event)
process = service._get_or_create_process(event)
assert process.binary_id == binary.id
assert process.iface_id == iface.id
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch):
@pytest.mark.django_db(transaction=True)
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch, tmp_path):
from archivebox.machine.models import Binary, NetworkInterface
from archivebox.services.process_service import ProcessService
from archivebox.machine.models import Process as MachineProcess
from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService
from abx_dl.services.process_service import ProcessService as DlProcessService
machine = _create_machine()
iface = _create_iface(machine)
@@ -472,27 +428,47 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
status=Binary.StatusChoices.INSTALLED,
)
hook_path = tmp_path / "on_Snapshot__75_parse_dom_outlinks.js"
hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
hook_path.chmod(0o755)
output_dir = tmp_path / "parse-dom-outlinks"
output_dir.mkdir()
bus = create_bus(name="test_process_started_node_fallback")
service = ProcessService(bus)
event = ProcessStartedEvent(
plugin_name="parse_dom_outlinks",
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
hook_path="/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js",
hook_args=["--url=https://example.com"],
output_dir="/tmp/parse-dom-outlinks",
env={
"NODE_BINARY": node.abspath,
},
timeout=60,
pid=9876,
process_id="proc-parse-dom-outlinks",
snapshot_id="",
start_ts="2026-03-22T12:00:00+00:00",
DlProcessService(bus, emit_jsonl=False, stderr_is_tty=False)
ArchiveBoxProcessService(bus)
async def run_test() -> None:
await bus.emit(
ProcessEvent(
plugin_name="parse_dom_outlinks",
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
hook_path=str(hook_path),
hook_args=["--url=https://example.com"],
is_background=False,
output_dir=str(output_dir),
env={"NODE_BINARY": node.abspath},
timeout=60,
url="https://example.com",
),
)
started = await bus.find(
ProcessStartedEvent,
past=True,
future=False,
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
output_dir=str(output_dir),
)
assert started is not None
import asyncio
asyncio.run(run_test())
process = MachineProcess.objects.get(
pwd=str(output_dir),
cmd=[str(hook_path), "--url=https://example.com"],
)
service._project_started(event)
process = service._get_or_create_process(event)
assert process.binary_id == node.id
assert process.iface_id == iface.id
@@ -500,6 +476,7 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
from archivebox.machine.models import Binary, Machine
from archivebox.services.binary_service import BinaryService as ArchiveBoxBinaryService
import asyncio
machine = _create_machine()
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
@@ -522,7 +499,7 @@ def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
binproviders="provider",
)
service._project_binary(event)
asyncio.run(service.on_BinaryRequestEvent(event))
binary.refresh_from_db()
assert Binary.objects.filter(machine=machine, name="wget").count() == 1

View File

@@ -378,11 +378,8 @@ class TestRecoverOrphanedCrawls:
machine=machine,
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
pwd=str(snapshot.output_dir / "chrome"),
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js"],
env={
"CRAWL_ID": str(crawl.id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(),
)

View File

@@ -464,23 +464,24 @@ class TestDependencyRecordOutput(unittest.TestCase):
self.assertEqual(data["name"], "wget")
self.assertTrue(data["abspath"].startswith("/"))
def test_dependency_record_outputs_machine_config(self):
"""Dependency resolution should output Machine config update JSONL."""
def test_dependency_record_outputs_binary_jsonl(self):
"""Dependency resolution should output Binary JSONL."""
hook_output = json.dumps(
{
"type": "Machine",
"config": {
"WGET_BINARY": "/usr/bin/wget",
},
"type": "Binary",
"name": "wget",
"abspath": "/usr/bin/wget",
"version": "1.21.3",
"binprovider": "env",
},
)
from archivebox.machine.models import Process
data = Process.parse_records_from_text(hook_output)[0]
self.assertEqual(data["type"], "Machine")
self.assertIn("config", data)
self.assertEqual(data["config"]["WGET_BINARY"], "/usr/bin/wget")
self.assertEqual(data["type"], "Binary")
self.assertEqual(data["name"], "wget")
self.assertEqual(data["abspath"], "/usr/bin/wget")
class TestSnapshotHookOutput(unittest.TestCase):

View File

@@ -269,12 +269,12 @@ class TestBinaryModel(TestCase):
self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
self.assertGreater(binary.modified_at, old_modified)
def test_binary_from_json_preserves_install_args_overrides(self):
"""Binary.from_json() should persist canonical install_args overrides unchanged."""
def test_binary_from_json_preserves_provider_overrides(self):
"""Binary.from_json() should persist provider overrides unchanged."""
overrides = {
"apt": {"install_args": ["chromium"]},
"npm": {"install_args": "puppeteer"},
"custom": {"install_args": ["bash", "-lc", "echo ok"]},
"custom": {"install": "bash -lc 'echo ok'"},
}
binary = Binary.from_json(

View File

@@ -1,69 +1,4 @@
import asyncio
import json
import pytest
from abx_dl.events import ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.orchestrator import create_bus
pytestmark = pytest.mark.django_db
def test_process_service_emits_process_started_from_inline_process_event(monkeypatch):
from archivebox.services import process_service as process_service_module
from archivebox.services.process_service import ProcessService
bus = create_bus(name="test_process_service_inline_process_event")
ProcessService(bus)
monkeypatch.setattr(
process_service_module,
"_ensure_worker",
lambda event: {
"pid": 4321,
"start": 1711111111.0,
"statename": "RUNNING",
"exitstatus": 0,
},
)
async def run_test():
await bus.emit(
ProcessStdoutEvent(
line=json.dumps(
{
"type": "ProcessEvent",
"plugin_name": "search_backend_sonic",
"hook_name": "worker_sonic",
"hook_path": "/usr/bin/sonic",
"hook_args": ["-c", "/tmp/sonic/config.cfg"],
"is_background": True,
"daemon": True,
"url": "tcp://127.0.0.1:1491",
"output_dir": "/tmp/sonic",
"env": {},
"process_type": "worker",
"worker_type": "sonic",
"process_id": "worker:sonic",
"output_str": "127.0.0.1:1491",
},
),
plugin_name="search_backend_sonic",
hook_name="on_CrawlSetup__55_sonic_start.py",
output_dir="/tmp/search_backend_sonic",
snapshot_id="snap-1",
process_id="proc-hook",
),
)
started = await bus.find(ProcessStartedEvent, process_id="worker:sonic")
await bus.stop()
return started
started = asyncio.run(run_test())
assert started is not None
assert started.hook_name == "worker_sonic"
assert started.process_type == "worker"
assert started.worker_type == "sonic"
assert getattr(started, "url", "") == "tcp://127.0.0.1:1491"
assert getattr(started, "output_str", "") == "127.0.0.1:1491"

View File

@@ -34,18 +34,6 @@ class _DummyService:
pass
class _DummyAbxServices:
def __init__(self):
self.process = SimpleNamespace(wait_for_background_monitors=self._wait)
async def _wait(self):
return None
async def _call_sync(func, *args, **kwargs):
return func(*args, **kwargs)
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
@@ -82,18 +70,18 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
download_calls = []
async def fake_download(*, url, bus, snapshot, **kwargs):
async def fake_download(*, url, bus, config_overrides, **kwargs):
extra_context = json.loads(config_overrides["EXTRA_CONTEXT"])
download_calls.append(
{
"url": url,
"bus": bus,
"snapshot_id": snapshot.id,
"source_url": snapshot.url,
"abx_snapshot_id": snapshot.id,
"snapshot_id": extra_context["snapshot_id"],
"source_url": url,
},
)
await asyncio.sleep(0)
@@ -113,9 +101,8 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
"created_at": snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
"tags": snapshot_a.tags_str(),
"depth": snapshot_a.depth,
"parent_snapshot_id": str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
"output_dir": str(snapshot_a.output_dir),
"config": crawl_runner._snapshot_config(snapshot_a),
"config": crawl_runner.load_snapshot_payload(str(snapshot_a.id))["config"],
},
str(snapshot_b.id): {
"id": str(snapshot_b.id),
@@ -127,17 +114,16 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
"created_at": snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
"tags": snapshot_b.tags_str(),
"depth": snapshot_b.depth,
"parent_snapshot_id": str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
"output_dir": str(snapshot_b.output_dir),
"config": crawl_runner._snapshot_config(snapshot_b),
"config": crawl_runner.load_snapshot_payload(str(snapshot_b.id))["config"],
},
}
monkeypatch.setattr(crawl_runner, "_load_snapshot_run_data", lambda snapshot_id: snapshot_data[snapshot_id])
monkeypatch.setattr(crawl_runner, "load_snapshot_payload", lambda snapshot_id: snapshot_data[snapshot_id])
async def run_both():
await asyncio.gather(
crawl_runner._run_snapshot(str(snapshot_a.id)),
crawl_runner._run_snapshot(str(snapshot_b.id)),
crawl_runner.run_snapshot(str(snapshot_a.id)),
crawl_runner.run_snapshot(str(snapshot_b.id)),
)
asyncio.run(run_both())
@@ -243,10 +229,10 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
refresh_calls = []
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {})
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
crawl_runner = runner_module.CrawlRunner(crawl)
crawl_runner._prepare()
crawl_runner.load_run_state()
assert refresh_calls == [True]
assert proc.iface is not None
@@ -254,10 +240,12 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
assert saved_updates == [("iface", "machine", "modified_at")]
def test_installed_binary_config_overrides_include_valid_installed_binaries(monkeypatch):
from archivebox.machine.models import Binary, Machine
def test_load_run_state_uses_machine_config_as_derived_config(monkeypatch):
from archivebox.machine.models import Machine, NetworkInterface, Process
from archivebox.services import runner as runner_module
from abx_dl.models import Plugin
from archivebox.config import configset as configset_module
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
machine = Machine.objects.create(
guid="test-guid-runner-overrides",
@@ -273,143 +261,30 @@ def test_installed_binary_config_overrides_include_valid_installed_binaries(monk
os_release="14.0",
os_kernel="Darwin",
stats={},
config={},
config={"WGET_BINARY": "/tmp/wget", "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}},
)
mercury_binary = Binary.objects.create(
machine=machine,
name="postlight-parser",
abspath=sys.executable,
version="2.0.0",
binprovider="pip",
binproviders="env,pip",
status=Binary.StatusChoices.INSTALLED,
)
wget_binary = Binary.objects.create(
machine=machine,
name="wget",
abspath="/tmp/not-an-executable",
version="1.0.0",
binprovider="env",
binproviders="env",
status=Binary.StatusChoices.INSTALLED,
)
puppeteer_binary = Binary.objects.create(
machine=machine,
name="puppeteer",
abspath="/tmp/shared-lib/npm/node_modules/.bin/puppeteer",
version="24.40.0",
binprovider="npm",
binproviders="npm",
status=Binary.StatusChoices.INSTALLED,
)
ytdlp_binary = Binary.objects.create(
machine=machine,
name="yt-dlp",
abspath="/tmp/shared-lib/pip/venv/bin/yt-dlp",
version="2026.3.17",
binprovider="pip",
binproviders="pip",
status=Binary.StatusChoices.INSTALLED,
crawl = Crawl.objects.create(
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
)
proc = SimpleNamespace(iface_id=str(machine.id), machine_id=str(machine.id), iface=None, machine=machine, save=lambda **kwargs: None)
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
monkeypatch.setattr(
Path,
"is_file",
lambda self: (
str(self) in {sys.executable, mercury_binary.abspath, wget_binary.abspath, puppeteer_binary.abspath, ytdlp_binary.abspath}
),
NetworkInterface,
"current",
classmethod(lambda cls, refresh=False: SimpleNamespace(id=machine.id, machine=machine)),
)
monkeypatch.setattr(
runner_module.os,
"access",
lambda path, mode: str(path) in {sys.executable, puppeteer_binary.abspath, ytdlp_binary.abspath},
)
overrides = runner_module._installed_binary_config_overrides(
{
"mercury": Plugin(
name="mercury",
path=Path("."),
hooks=[],
config_schema={"MERCURY_BINARY": {"type": "string", "default": "postlight-parser"}},
),
},
)
assert overrides["MERCURY_BINARY"] == sys.executable
assert "POSTLIGHT_PARSER_BINARY" not in overrides
assert "WGET_BINARY" not in overrides
assert overrides["LIB_DIR"] == "/tmp/shared-lib"
assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
assert overrides["PIP_HOME"] == "/tmp/shared-lib/pip"
assert overrides["PIP_BIN_DIR"] == "/tmp/shared-lib/pip/venv/bin"
assert overrides["NPM_HOME"] == "/tmp/shared-lib/npm"
assert overrides["NPM_BIN_DIR"] == "/tmp/shared-lib/npm/node_modules/.bin"
assert overrides["NODE_MODULES_DIR"] == "/tmp/shared-lib/npm/node_modules"
assert overrides["NODE_MODULE_DIR"] == "/tmp/shared-lib/npm/node_modules"
assert overrides["NODE_PATH"] == "/tmp/shared-lib/npm/node_modules"
def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_configurable_binary_keys(monkeypatch):
from archivebox.machine.models import Binary, Machine
from archivebox.services import runner as runner_module
from abx_dl.models import Plugin
machine = Machine.objects.create(
guid="test-guid-runner-singlefile-cache",
hostname="runner-host-singlefile",
hw_in_docker=False,
hw_in_vm=False,
hw_manufacturer="Test",
hw_product="Test Product",
hw_uuid="test-hw-runner-singlefile-cache",
os_arch="arm64",
os_family="darwin",
os_platform="macOS",
os_release="14.0",
os_kernel="Darwin",
stats={},
config={},
)
singlefile_extension = Binary.objects.create(
machine=machine,
name="singlefile",
abspath="/tmp/shared-lib/bin/singlefile",
version="1.0.0",
binprovider="chromewebstore",
binproviders="chromewebstore",
status=Binary.StatusChoices.INSTALLED,
)
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
monkeypatch.setattr(Path, "is_file", lambda self: str(self) == singlefile_extension.abspath)
monkeypatch.setattr(runner_module.os, "access", lambda path, mode: str(path) == singlefile_extension.abspath)
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
overrides = runner_module._installed_binary_config_overrides(
{
"singlefile": Plugin(
name="singlefile",
path=Path("."),
hooks=[],
config_schema={"SINGLEFILE_BINARY": {"type": "string", "default": "single-file"}},
binaries=[
{"name": "{SINGLEFILE_BINARY}", "binproviders": "env,npm"},
{"name": "singlefile", "binproviders": "chromewebstore"},
],
),
},
config={"SINGLEFILE_BINARY": "single-file"},
)
crawl_runner = runner_module.CrawlRunner(crawl)
crawl_runner.load_run_state()
assert "SINGLEFILE_BINARY" not in overrides
assert "LIB_DIR" not in overrides
assert "LIB_BIN_DIR" not in overrides
assert crawl_runner.derived_config == machine.config
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
import asgiref.sync
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch, tmp_path):
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.services import runner as runner_module
@@ -428,12 +303,6 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
monkeypatch.setattr(runner_module, "_limit_stop_reason", lambda config: "max_size")
monkeypatch.setattr(
asgiref.sync,
"sync_to_async",
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
)
monkeypatch.setattr(
runner_module,
"download",
@@ -441,8 +310,21 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
)
crawl_runner = runner_module.CrawlRunner(crawl)
state_dir = tmp_path / ".abx-dl"
state_dir.mkdir(parents=True, exist_ok=True)
(state_dir / "limits.json").write_text(
json.dumps(
{
"admitted_snapshot_ids": ["child-1"],
"counted_process_ids": ["proc-1"],
"total_size": 32,
"stop_reason": "max_size",
},
),
encoding="utf-8",
)
cancelled: list[str] = []
crawl_runner._load_snapshot_run_data = lambda snapshot_id: {
crawl_runner.load_snapshot_payload = lambda snapshot_id: {
"id": snapshot_id,
"url": "https://example.com/child",
"title": "",
@@ -452,22 +334,23 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
"tags": "",
"depth": 1,
"status": "queued",
"parent_snapshot_id": None,
"output_dir": "/tmp/child",
"config": {"CRAWL_DIR": "/tmp/crawl", "MAX_SIZE": 16},
"config": {"CRAWL_DIR": str(tmp_path), "MAX_SIZE": 16},
}
crawl_runner._cancel_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id)
crawl_runner.seal_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id)
asyncio.run(crawl_runner._run_snapshot("child-1"))
asyncio.run(crawl_runner.run_snapshot("child-1"))
assert cancelled == ["child-1"]
@pytest.mark.django_db(transaction=True)
def test_seal_snapshot_cancels_queued_descendants_after_max_size():
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.services.snapshot_service import SnapshotService
from abx_dl.events import SnapshotCompletedEvent
from abx_dl.orchestrator import create_bus
crawl = Crawl.objects.create(
@@ -505,13 +388,22 @@ def test_seal_snapshot_cancels_queued_descendants_after_max_size():
bus = create_bus(name="test_snapshot_limit_cancel")
service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None)
try:
sealed_id = service._seal_snapshot(str(root.id))
async def emit_event() -> None:
await service.on_SnapshotCompletedEvent(
SnapshotCompletedEvent(
url=root.url,
snapshot_id=str(root.id),
output_dir=str(root.output_dir),
),
)
asyncio.run(emit_event())
finally:
asyncio.run(bus.stop())
root.refresh_from_db()
child.refresh_from_db()
assert sealed_id == str(root.id)
assert root.status == Snapshot.StatusChoices.SEALED
assert child.status == Snapshot.StatusChoices.SEALED
assert child.retry_at is None
@@ -548,7 +440,6 @@ def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
import asgiref.sync
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
@@ -565,35 +456,23 @@ def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
status=Snapshot.StatusChoices.STARTED,
)
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
monkeypatch.setattr(
asgiref.sync,
"sync_to_async",
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
)
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
monkeypatch.setattr(crawl, "is_finished", lambda: False)
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
crawl.refresh_from_db()
assert crawl.status != Crawl.StatusChoices.SEALED
assert crawl.retry_at is not None
def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
import asgiref.sync
def test_crawl_runner_calls_load_and_finalize_run_state(monkeypatch):
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
@@ -618,50 +497,34 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
monkeypatch.setattr(crawl, "cleanup", lambda: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
sync_to_async_wrapped: list[str] = []
sync_to_async_active = False
method_calls: list[str] = []
def fake_sync_to_async(func, thread_sensitive=True):
async def wrapper(*args, **kwargs):
nonlocal sync_to_async_active
sync_to_async_wrapped.append(getattr(func, "__name__", repr(func)))
previous = sync_to_async_active
sync_to_async_active = True
try:
return func(*args, **kwargs)
finally:
sync_to_async_active = previous
def wrapped_finalize(self):
method_calls.append("finalize_run_state")
return None
return wrapper
def wrapped_load(self):
method_calls.append("load_run_state")
return [str(snapshot.id)]
def guarded_is_finished():
assert sync_to_async_active is True
return False
monkeypatch.setattr(asgiref.sync, "sync_to_async", fake_sync_to_async)
monkeypatch.setattr(crawl, "is_finished", guarded_is_finished)
monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", wrapped_finalize)
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", wrapped_load)
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
crawl.refresh_from_db()
assert crawl.status == Crawl.StatusChoices.STARTED
assert crawl.retry_at is not None
assert "guarded_is_finished" in sync_to_async_wrapped
assert method_calls == ["load_run_state", "finalize_run_state"]
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
@@ -680,7 +543,7 @@ def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
task.set_exception(RuntimeError("snapshot failed"))
crawl_runner.snapshot_tasks["snap-1"] = task
with pytest.raises(RuntimeError, match="snapshot failed"):
await crawl_runner._wait_for_snapshot_tasks()
await crawl_runner.wait_for_snapshot_tasks()
asyncio.run(run_test())
@@ -702,14 +565,13 @@ def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
async def run_test():
task = asyncio.create_task(finish_snapshot())
crawl_runner.snapshot_tasks["snap-1"] = task
await asyncio.wait_for(crawl_runner._wait_for_snapshot_tasks(), timeout=0.5)
await asyncio.wait_for(crawl_runner.wait_for_snapshot_tasks(), timeout=0.5)
assert crawl_runner.snapshot_tasks == {}
asyncio.run(run_test())
def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
import asgiref.sync
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
@@ -726,30 +588,18 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
status=Snapshot.StatusChoices.STARTED,
)
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
monkeypatch.setattr(
asgiref.sync,
"sync_to_async",
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
)
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
monkeypatch.setattr(crawl, "is_finished", lambda: False)
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
cleanup_calls = []
monkeypatch.setattr(
runner_module.CrawlRunner,
"_run_crawl_cleanup",
"run_crawl_cleanup",
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
)
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
@@ -757,17 +607,20 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
assert cleanup_calls == ["abx_cleanup"]
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
def test_abx_process_service_background_process_finishes_after_process_exit(monkeypatch, tmp_path):
from abx_dl.models import Process as AbxProcess, now_iso
from abx_dl.services.process_service import ProcessService
from abx_dl.events import ProcessCompletedEvent
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
service = object.__new__(ProcessService)
service.emit_jsonl = False
emitted_events = []
async def fake_emit_event(event, *, detach_from_parent):
emitted_events.append((event, detach_from_parent))
class FakeBus:
async def emit(self, event):
emitted_events.append(event)
service.bus = FakeBus()
async def fake_stream_stdout(**kwargs):
try:
@@ -775,19 +628,8 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
except asyncio.CancelledError:
return ["daemon output\n"]
service._emit_event = fake_emit_event
monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout)
class FakeAsyncProcess:
def __init__(self):
self.pid = 42424
self.returncode = None
async def wait(self):
await asyncio.sleep(0)
self.returncode = 0
return 0
plugin_output_dir = tmp_path / "chrome"
plugin_output_dir.mkdir()
stdout_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.stdout.log"
@@ -804,41 +646,45 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
plugin="chrome",
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
)
process = FakeAsyncProcess()
event = SimpleNamespace(
plugin_name="chrome",
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
hook_path="hook",
hook_args=["--url=https://example.org/"],
env={},
output_dir=str(plugin_output_dir),
timeout=60,
snapshot_id="snap-1",
is_background=True,
url="https://example.org/",
process_type="hook",
worker_type="hook",
)
async def run_test():
process = await asyncio.create_subprocess_exec(
sys.executable,
"-c",
"pass",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
event = ProcessStartedEvent(
plugin_name="chrome",
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
hook_path="hook",
hook_args=["--url=https://example.org/"],
env={},
output_dir=str(plugin_output_dir),
timeout=60,
pid=process.pid,
is_background=True,
url="https://example.org/",
process_type="hook",
worker_type="hook",
start_ts=proc.started_at or "",
subprocess=process,
stdout_file=stdout_file,
stderr_file=stderr_file,
pid_file=pid_file,
cmd_file=plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.sh",
files_before=set(),
)
await asyncio.wait_for(
service._monitor_background_process(
event=event,
proc=proc,
process=process,
plugin_output_dir=plugin_output_dir,
stdout_file=stdout_file,
stderr_file=stderr_file,
pid_file=pid_file,
files_before=set(),
),
service.on_ProcessStartedEvent(event),
timeout=0.5,
)
asyncio.run(run_test())
assert pid_file.exists() is False
assert any(isinstance(event, ProcessCompletedEvent) for event, _ in emitted_events)
assert any(isinstance(event, ProcessCompletedEvent) for event in emitted_events)
def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):

View File

@@ -0,0 +1,48 @@
import asyncio
import pytest
from abx_dl.events import TagEvent
from abx_dl.orchestrator import create_bus
pytestmark = pytest.mark.django_db(transaction=True)
def _create_snapshot():
    """Create and return a STARTED Snapshot attached to a fresh Crawl owned by the system user."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    parent_crawl = Crawl.objects.create(
        created_by_id=get_or_create_system_user_pk(),
        urls="https://example.com",
    )
    snapshot = Snapshot.objects.create(
        crawl=parent_crawl,
        status=Snapshot.StatusChoices.STARTED,
        url="https://example.com",
    )
    return snapshot
def test_tag_event_projects_tag_to_snapshot():
    """Emitting a TagEvent on the bus creates the Tag row and attaches it to the snapshot."""
    from archivebox.core.models import Tag
    from archivebox.services.tag_service import TagService

    snapshot = _create_snapshot()
    event_bus = create_bus(name="test_tag_service")
    TagService(event_bus)  # constructing the service subscribes its handlers to the bus

    async def _publish() -> None:
        tag_event = TagEvent(
            name="example",
            snapshot_id=str(snapshot.id),
        )
        await event_bus.emit(tag_event)

    asyncio.run(_publish())

    snapshot.refresh_from_db()
    assert Tag.objects.filter(name="example").exists()
    assert snapshot.tags.filter(name="example").exists()

2
docs

Submodule docs updated: be25d9bfa2...7244076ece

View File

@@ -42,7 +42,7 @@ Crawl.run()
{'type': 'Dependency', 'bin_name': 'wget', 'bin_providers': 'apt,brew', 'overrides': {...}}
# ❌ WRONG - uses different field names
{'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'custom_cmds': {...}}
{'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'overrides': {...}}
```
4. **No hardcoding** - Never hardcode binary names, provider names, or anything else. Use discovery.
@@ -84,7 +84,7 @@ Crawl.run()
# ❌ WRONG - complex transformation logic
if obj.get('type') == 'Dependency':
dep = Dependency.objects.create(name=obj['bin_name']) # renaming fields
dep.custom_commands = transform_overrides(obj['overrides']) # transforming data
dep.overrides = transform_overrides(obj['overrides']) # transforming data
```
### Pattern Consistency

View File

@@ -159,6 +159,11 @@ environments = ["sys_platform == 'darwin'", "sys_platform == 'linux'"]
package = true
# compile-bytecode = true
[tool.uv.sources]
abx-pkg = { path = "../abx-pkg", editable = true }
abx-plugins = { path = "../abx-plugins", editable = true }
abx-dl = { path = "../abx-dl", editable = true }
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"