update working changes

This commit is contained in:
Nick Sweeting
2026-03-25 05:36:07 -07:00
parent 80243accfd
commit f3622d8cd3
29 changed files with 985 additions and 1666 deletions

View File

@@ -26,7 +26,7 @@ EVENT_FLOW_DIAGRAM = """
│ CrawlStartEvent │ │ CrawlStartEvent │
│ └─ SnapshotEvent │ │ └─ SnapshotEvent │
│ └─ on_Snapshot__* │ │ └─ on_Snapshot__* │
│ └─ Snapshot / ArchiveResult / Tag / Machine / BinaryRequest │ └─ ArchiveResult / Snapshot / Tag
│ │ │ │
│ SnapshotCleanupEvent -> internal cleanup, no direct hook family │ │ SnapshotCleanupEvent -> internal cleanup, no direct hook family │
│ CrawlCleanupEvent -> internal cleanup, no direct hook family │ │ CrawlCleanupEvent -> internal cleanup, no direct hook family │
@@ -89,8 +89,8 @@ def pluginmap(
"emits": ["ProcessEvent"], "emits": ["ProcessEvent"],
}, },
"SnapshotEvent": { "SnapshotEvent": {
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, Tag, and BinaryRequest records.", "description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, and Tag records.",
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "BinaryRequestEvent", "ProcessEvent"], "emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "ProcessEvent"],
}, },
"SnapshotCleanupEvent": { "SnapshotCleanupEvent": {
"description": "Internal snapshot cleanup phase.", "description": "Internal snapshot cleanup phase.",

View File

@@ -267,19 +267,13 @@ def get_config(
if crawl and hasattr(crawl, "output_dir"): if crawl and hasattr(crawl, "output_dir"):
config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir) config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir)
config["CRAWL_DIR"] = str(crawl.output_dir) config["CRAWL_DIR"] = str(crawl.output_dir)
config["CRAWL_ID"] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get("CRAWL_ID")
# Apply snapshot config overrides (highest priority) # Apply snapshot config overrides (highest priority)
if snapshot and hasattr(snapshot, "config") and snapshot.config: if snapshot and hasattr(snapshot, "config") and snapshot.config:
config.update(snapshot.config) config.update(snapshot.config)
if snapshot: if snapshot and hasattr(snapshot, "output_dir"):
config["SNAPSHOT_ID"] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get("SNAPSHOT_ID") config["SNAP_DIR"] = str(snapshot.output_dir)
config["SNAPSHOT_DEPTH"] = int(getattr(snapshot, "depth", 0) or 0)
if hasattr(snapshot, "output_dir"):
config["SNAP_DIR"] = str(snapshot.output_dir)
if getattr(snapshot, "crawl_id", None):
config["CRAWL_ID"] = str(snapshot.crawl_id)
# Normalize all aliases to canonical names (after all sources merged) # Normalize all aliases to canonical names (after all sources merged)
# This handles aliases that came from user/crawl/snapshot configs, not just env # This handles aliases that came from user/crawl/snapshot configs, not just env

View File

@@ -38,8 +38,8 @@ def _quote_shell_string(value: str) -> str:
def _get_replay_source_url(result: ArchiveResult) -> str: def _get_replay_source_url(result: ArchiveResult) -> str:
process_env = getattr(getattr(result, "process", None), "env", None) or {} process = getattr(result, "process", None)
return str(process_env.get("SOURCE_URL") or result.snapshot.url or "") return str(getattr(process, "url", None) or result.snapshot.url or "")
def build_abx_dl_display_command(result: ArchiveResult) -> str: def build_abx_dl_display_command(result: ArchiveResult) -> str:

View File

@@ -1322,6 +1322,17 @@ def live_progress_view(request):
# Build hierarchical active crawls with nested snapshots and archive results # Build hierarchical active crawls with nested snapshots and archive results
active_crawls_qs = (
Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
.prefetch_related(
"snapshot_set",
"snapshot_set__archiveresult_set",
"snapshot_set__archiveresult_set__process",
)
.distinct()
.order_by("-modified_at")[:10]
)
running_processes = Process.objects.filter( running_processes = Process.objects.filter(
machine=machine, machine=machine,
status=Process.StatusChoices.RUNNING, status=Process.StatusChoices.RUNNING,
@@ -1343,28 +1354,45 @@ def live_progress_view(request):
process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {} process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {}
process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {} process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {}
seen_process_records: set[str] = set() seen_process_records: set[str] = set()
snapshots = [snapshot for crawl in active_crawls_qs for snapshot in crawl.snapshot_set.all()]
for proc in running_processes: for proc in running_processes:
env = proc.env or {} if not proc.pwd:
if not isinstance(env, dict): continue
env = {} proc_pwd = Path(proc.pwd)
matched_snapshot = None
crawl_id = env.get("CRAWL_ID") for snapshot in snapshots:
snapshot_id = env.get("SNAPSHOT_ID") try:
proc_pwd.relative_to(snapshot.output_dir)
matched_snapshot = snapshot
break
except ValueError:
continue
if matched_snapshot is None:
continue
crawl_id = str(matched_snapshot.crawl_id)
snapshot_id = str(matched_snapshot.id)
_plugin, _label, phase, _hook_name = process_label(proc.cmd) _plugin, _label, phase, _hook_name = process_label(proc.cmd)
if crawl_id and proc.pid: if crawl_id and proc.pid:
crawl_process_pids.setdefault(str(crawl_id), proc.pid) crawl_process_pids.setdefault(crawl_id, proc.pid)
if phase == "snapshot" and snapshot_id and proc.pid: if phase == "snapshot" and snapshot_id and proc.pid:
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid) snapshot_process_pids.setdefault(snapshot_id, proc.pid)
for proc in recent_processes: for proc in recent_processes:
env = proc.env or {} if not proc.pwd:
if not isinstance(env, dict):
env = {}
crawl_id = env.get("CRAWL_ID")
snapshot_id = env.get("SNAPSHOT_ID")
if not crawl_id and not snapshot_id:
continue continue
proc_pwd = Path(proc.pwd)
matched_snapshot = None
for snapshot in snapshots:
try:
proc_pwd.relative_to(snapshot.output_dir)
matched_snapshot = snapshot
break
except ValueError:
continue
if matched_snapshot is None:
continue
crawl_id = str(matched_snapshot.crawl_id)
snapshot_id = str(matched_snapshot.id)
plugin, label, phase, hook_name = process_label(proc.cmd) plugin, label, phase, hook_name = process_label(proc.cmd)
@@ -1393,20 +1421,9 @@ def live_progress_view(request):
payload["pid"] = proc.pid payload["pid"] = proc.pid
proc_started_at = proc.started_at or proc.modified_at proc_started_at = proc.started_at or proc.modified_at
if phase == "snapshot" and snapshot_id: if phase == "snapshot" and snapshot_id:
process_records_by_snapshot.setdefault(str(snapshot_id), []).append((payload, proc_started_at)) process_records_by_snapshot.setdefault(snapshot_id, []).append((payload, proc_started_at))
elif crawl_id: elif crawl_id:
process_records_by_crawl.setdefault(str(crawl_id), []).append((payload, proc_started_at)) process_records_by_crawl.setdefault(crawl_id, []).append((payload, proc_started_at))
active_crawls_qs = (
Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
.prefetch_related(
"snapshot_set",
"snapshot_set__archiveresult_set",
"snapshot_set__archiveresult_set__process",
)
.distinct()
.order_by("-modified_at")[:10]
)
active_crawls = [] active_crawls = []
total_workers = 0 total_workers = 0

View File

@@ -827,7 +827,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
for record in records[:3]: for record in records[:3]:
print(f" Record: type={record.get('type')}, keys={list(record.keys())[:5]}") print(f" Record: type={record.get('type')}, keys={list(record.keys())[:5]}")
if system_task: if system_task:
records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary", "Machine")] records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary")]
overrides = {"crawl": self} overrides = {"crawl": self}
stats = process_hook_records(records, overrides=overrides) stats = process_hook_records(records, overrides=overrides)
if stats: if stats:

View File

@@ -13,13 +13,9 @@ Hook-backed event families are discovered from filenames like:
on_CrawlSetup__* on_CrawlSetup__*
on_Snapshot__* on_Snapshot__*
InstallEvent itself is still part of the runtime lifecycle, but it has no Internal bus event names are normalized to the corresponding
corresponding hook family. Its dependency declarations come directly from each `on_{EventFamily}__*` prefix by a simple string transform. If no scripts exist
plugin's `config.json > required_binaries`. for that prefix, discovery returns `[]`.
Lifecycle event names like `InstallEvent` or `SnapshotCleanupEvent` are
normalized to the corresponding `on_{EventFamily}__*` prefix by a simple
string transform. If no scripts exist for that prefix, discovery returns `[]`.
Directory structure: Directory structure:
abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package) abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package)
@@ -120,7 +116,6 @@ def normalize_hook_event_name(event_name: str) -> str | None:
Normalize a hook event family or event class name to its on_* prefix. Normalize a hook event family or event class name to its on_* prefix.
Examples: Examples:
InstallEvent -> Install
BinaryRequestEvent -> BinaryRequest BinaryRequestEvent -> BinaryRequest
CrawlSetupEvent -> CrawlSetup CrawlSetupEvent -> CrawlSetup
SnapshotEvent -> Snapshot SnapshotEvent -> Snapshot
@@ -171,7 +166,7 @@ def discover_hooks(
Args: Args:
event_name: Hook event family or event class name. event_name: Hook event family or event class name.
Examples: 'Install', 'InstallEvent', 'BinaryRequestEvent', 'Snapshot'. Examples: 'BinaryRequestEvent', 'Snapshot'.
Event names are normalized by stripping a trailing `Event`. Event names are normalized by stripping a trailing `Event`.
If no matching `on_{EventFamily}__*` scripts exist, returns []. If no matching `on_{EventFamily}__*` scripts exist, returns [].
filter_disabled: If True, skip hooks from disabled plugins (default: True) filter_disabled: If True, skip hooks from disabled plugins (default: True)
@@ -1070,9 +1065,8 @@ def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any
Process JSONL records emitted by hook stdout. Process JSONL records emitted by hook stdout.
This handles hook-emitted record types such as Snapshot, Tag, BinaryRequest, This handles hook-emitted record types such as Snapshot, Tag, BinaryRequest,
Binary, and Machine. It does not process bus lifecycle events like and Binary. It does not process internal bus lifecycle events, since those
InstallEvent, CrawlEvent, CrawlCleanupEvent, or SnapshotCleanupEvent, since are not emitted as JSONL records by hook subprocesses.
those are not emitted as JSONL records by hook subprocesses.
Args: Args:
records: List of JSONL record dicts from result['records'] records: List of JSONL record dicts from result['records']
@@ -1131,13 +1125,6 @@ def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any
if obj: if obj:
stats[record_type] = stats.get(record_type, 0) + 1 stats[record_type] = stats.get(record_type, 0) + 1
elif record_type == "Machine":
from archivebox.machine.models import Machine
obj = Machine.from_json(record.copy(), overrides)
if obj:
stats["Machine"] = stats.get("Machine", 0) + 1
else: else:
import sys import sys

View File

@@ -566,33 +566,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
return None return None
return {provider.strip() for provider in providers.split(",") if provider.strip()} return {provider.strip() for provider in providers.split(",") if provider.strip()}
def _get_custom_install_command(self) -> str | None:
"""Extract a custom install command from overrides when the custom provider is used."""
import shlex
if not isinstance(self.overrides, dict):
return None
for key in ("custom_cmd", "cmd", "command"):
value = self.overrides.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
custom_overrides = self.overrides.get("custom")
if isinstance(custom_overrides, dict):
for key in ("custom_cmd", "cmd", "command"):
value = custom_overrides.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
install_args = custom_overrides.get("install_args")
if isinstance(install_args, str) and install_args.strip():
return install_args.strip()
if isinstance(install_args, list) and install_args:
return " ".join(shlex.quote(str(arg)) for arg in install_args if str(arg).strip())
return None
def run(self): def run(self):
""" """
Execute binary installation by running on_BinaryRequest__* hooks. Execute binary installation by running on_BinaryRequest__* hooks.
@@ -637,13 +610,8 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
plugin_output_dir = output_dir / plugin_name plugin_output_dir = output_dir / plugin_name
plugin_output_dir.mkdir(parents=True, exist_ok=True) plugin_output_dir.mkdir(parents=True, exist_ok=True)
custom_cmd = None
overrides_json = None overrides_json = None
if plugin_name == "custom": if self.overrides:
custom_cmd = self._get_custom_install_command()
if not custom_cmd:
continue
elif self.overrides:
overrides_json = json.dumps(self.overrides) overrides_json = json.dumps(self.overrides)
# Run the hook # Run the hook
@@ -656,7 +624,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
machine_id=str(self.machine_id), machine_id=str(self.machine_id),
name=self.name, name=self.name,
binproviders=self.binproviders, binproviders=self.binproviders,
custom_cmd=custom_cmd,
overrides=overrides_json, overrides=overrides_json,
) )

View File

@@ -9,12 +9,11 @@ from typing import Any
from asgiref.sync import sync_to_async from asgiref.sync import sync_to_async
from django.utils import timezone from django.utils import timezone
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent, ProcessStartedEvent, SnapshotEvent
from abx_dl.output_files import guess_mimetype from abx_dl.output_files import guess_mimetype
from abx_dl.services.base import BaseService from abx_dl.services.base import BaseService
from .db import run_db_op from .process_service import parse_event_datetime
from .process_service import ProcessService, parse_event_datetime
def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]: def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
@@ -209,79 +208,41 @@ class ArchiveResultService(BaseService):
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent] LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
EMITS = [] EMITS = []
def __init__(self, bus, *, process_service: ProcessService): def __init__(self, bus):
self.process_service = process_service
super().__init__(bus) super().__init__(bus)
self.bus.on(ArchiveResultEvent, self.on_ArchiveResultEvent__save_to_db)
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None: async def on_ArchiveResultEvent__save_to_db(self, event: ArchiveResultEvent) -> None:
snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
if snapshot_output_dir is None:
return
plugin_dir = Path(snapshot_output_dir) / event.plugin
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
return
plugin_dir = Path(event.output_dir)
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
records = _iter_archiveresult_records(event.stdout)
if records:
for record in records:
await run_db_op(
self._project_from_process_completed,
event,
record,
output_files,
output_size,
output_mimetypes,
)
return
synthetic_record = {
"plugin": event.plugin_name,
"hook_name": event.hook_name,
"status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
"output_str": event.stderr if event.exit_code != 0 else "",
"error": event.stderr if event.exit_code != 0 else "",
}
await run_db_op(
self._project_from_process_completed,
event,
synthetic_record,
output_files,
output_size,
output_mimetypes,
)
def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first()
return str(snapshot.output_dir) if snapshot is not None else None
def _project(
self,
event: ArchiveResultEvent,
output_files: dict[str, dict],
output_size: int,
output_mimetypes: str,
) -> None:
from archivebox.core.models import ArchiveResult, Snapshot from archivebox.core.models import ArchiveResult, Snapshot
from archivebox.machine.models import Process from archivebox.machine.models import Process
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first() snapshot = await Snapshot.objects.filter(id=event.snapshot_id).select_related("crawl", "crawl__created_by").afirst()
if snapshot is None: if snapshot is None:
return return
plugin_dir = Path(snapshot.output_dir) / event.plugin
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
process_started = await self.bus.find(
ProcessStartedEvent,
past=True,
future=False,
where=lambda candidate: self.bus.event_is_child_of(event, candidate),
)
process = None process = None
db_process_id = self.process_service.get_db_process_id(event.process_id) if process_started is not None:
if db_process_id: started_at = parse_event_datetime(process_started.start_ts)
process = Process.objects.filter(id=db_process_id).first() if started_at is None:
raise ValueError("ProcessStartedEvent.start_ts is required")
process_query = Process.objects.filter(
pwd=process_started.output_dir,
cmd=[process_started.hook_path, *process_started.hook_args],
started_at=started_at,
)
if process_started.pid:
process_query = process_query.filter(pid=process_started.pid)
process = await process_query.order_by("-modified_at").afirst()
result, _created = ArchiveResult.objects.get_or_create( result, _created = await ArchiveResult.objects.aget_or_create(
snapshot=snapshot, snapshot=snapshot,
plugin=event.plugin, plugin=event.plugin,
hook_name=event.hook_name, hook_name=event.hook_name,
@@ -302,32 +263,54 @@ class ArchiveResultService(BaseService):
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now() result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
if event.error: if event.error:
result.notes = event.error result.notes = event.error
result.save() await result.asave()
next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url) next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url)
if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url): if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
snapshot.title = next_title snapshot.title = next_title
snapshot.save(update_fields=["title", "modified_at"]) await snapshot.asave(update_fields=["title", "modified_at"])
def _project_from_process_completed( async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
self, if not event.hook_name.startswith("on_Snapshot"):
event: ProcessCompletedEvent, return
record: dict, snapshot_event = await self.bus.find(
output_files: dict[str, dict], SnapshotEvent,
output_size: int, past=True,
output_mimetypes: str, future=False,
) -> None: where=lambda candidate: self.bus.event_is_child_of(event, candidate),
archive_result_event = ArchiveResultEvent( )
snapshot_id=record.get("snapshot_id") or event.snapshot_id, if snapshot_event is None:
plugin=record.get("plugin") or event.plugin_name, return
hook_name=record.get("hook_name") or event.hook_name,
status=record.get("status") or "", records = _iter_archiveresult_records(event.stdout)
process_id=event.process_id, if records:
output_str=record.get("output_str") or "", for record in records:
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None, await self.bus.emit(
output_files=event.output_files, ArchiveResultEvent(
start_ts=event.start_ts, snapshot_id=record.get("snapshot_id") or snapshot_event.snapshot_id,
end_ts=event.end_ts, plugin=record.get("plugin") or event.plugin_name,
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""), hook_name=record.get("hook_name") or event.hook_name,
status=record.get("status") or "",
output_str=record.get("output_str") or "",
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
output_files=event.output_files,
start_ts=event.start_ts,
end_ts=event.end_ts,
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
),
)
return
await self.bus.emit(
ArchiveResultEvent(
snapshot_id=snapshot_event.snapshot_id,
plugin=event.plugin_name,
hook_name=event.hook_name,
status="failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
output_str=event.stderr if event.exit_code != 0 else "",
output_files=event.output_files,
start_ts=event.start_ts,
end_ts=event.end_ts,
error=event.stderr if event.exit_code != 0 else "",
),
) )
self._project(archive_result_event, output_files, output_size, output_mimetypes)

View File

@@ -1,20 +1,62 @@
from __future__ import annotations from __future__ import annotations
import asyncio from asgiref.sync import sync_to_async
from abx_dl.events import BinaryRequestEvent, BinaryEvent from abx_dl.events import BinaryRequestEvent, BinaryEvent
from abx_dl.services.base import BaseService from abx_dl.services.base import BaseService
from .db import run_db_op
class BinaryService(BaseService): class BinaryService(BaseService):
LISTENS_TO = [BinaryRequestEvent, BinaryEvent] LISTENS_TO = [BinaryRequestEvent, BinaryEvent]
EMITS = [] EMITS = []
async def on_BinaryRequestEvent__Outer(self, event: BinaryRequestEvent) -> None: def __init__(self, bus):
await run_db_op(self._project_binary, event) super().__init__(bus)
cached = await run_db_op(self._load_cached_binary, event) self.bus.on(BinaryRequestEvent, self.on_BinaryRequestEvent)
self.bus.on(BinaryEvent, self.on_BinaryEvent)
async def on_BinaryRequestEvent(self, event: BinaryRequestEvent) -> None:
from archivebox.machine.models import Binary, Machine
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
existing = await Binary.objects.filter(machine=machine, name=event.name).afirst()
if existing and existing.status == Binary.StatusChoices.INSTALLED:
changed = False
if event.binproviders and existing.binproviders != event.binproviders:
existing.binproviders = event.binproviders
changed = True
if event.overrides and existing.overrides != event.overrides:
existing.overrides = event.overrides
changed = True
if changed:
await existing.asave(update_fields=["binproviders", "overrides", "modified_at"])
elif existing is None:
await Binary.objects.acreate(
machine=machine,
name=event.name,
binproviders=event.binproviders,
overrides=event.overrides or {},
status=Binary.StatusChoices.QUEUED,
)
installed = (
await Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
.exclude(abspath="")
.exclude(abspath__isnull=True)
.order_by("-modified_at")
.afirst()
)
cached = None
if installed is not None:
cached = {
"abspath": installed.abspath,
"version": installed.version or "",
"sha256": installed.sha256 or "",
"binproviders": installed.binproviders or "",
"binprovider": installed.binprovider or "",
"machine_id": str(installed.machine_id),
"overrides": installed.overrides or {},
}
if cached is not None: if cached is not None:
await self.bus.emit( await self.bus.emit(
BinaryEvent( BinaryEvent(
@@ -28,126 +70,34 @@ class BinaryService(BaseService):
binprovider=cached["binprovider"], binprovider=cached["binprovider"],
overrides=event.overrides or cached["overrides"], overrides=event.overrides or cached["overrides"],
binary_id=event.binary_id, binary_id=event.binary_id,
machine_id=event.machine_id or cached["machine_id"], machine_id=cached["machine_id"],
), ),
) )
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None: async def on_BinaryEvent(self, event: BinaryEvent) -> None:
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
await run_db_op(self._project_installed_binary, event, resolved)
def _project_binary(self, event: BinaryRequestEvent) -> None:
from archivebox.machine.models import Binary, Machine from archivebox.machine.models import Binary, Machine
machine = Machine.current() machine = await sync_to_async(Machine.current, thread_sensitive=True)()
existing = Binary.objects.filter(machine=machine, name=event.name).first() binary, _ = await Binary.objects.aget_or_create(
if existing and existing.status == Binary.StatusChoices.INSTALLED:
changed = False
if event.binproviders and existing.binproviders != event.binproviders:
existing.binproviders = event.binproviders
changed = True
if event.overrides and existing.overrides != event.overrides:
existing.overrides = event.overrides
changed = True
if changed:
existing.save(update_fields=["binproviders", "overrides", "modified_at"])
return
Binary.from_json(
{
"name": event.name,
"binproviders": event.binproviders,
"overrides": event.overrides or {},
},
)
def _load_cached_binary(self, event: BinaryRequestEvent) -> dict[str, str] | None:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
installed = (
Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
.exclude(abspath="")
.exclude(abspath__isnull=True)
.order_by("-modified_at")
.first()
)
if installed is None:
return None
return {
"abspath": installed.abspath,
"version": installed.version or "",
"sha256": installed.sha256 or "",
"binproviders": installed.binproviders or "",
"binprovider": installed.binprovider or "",
"machine_id": str(installed.machine_id),
"overrides": installed.overrides or {},
}
def _resolve_installed_binary_metadata(self, event: BinaryEvent) -> dict[str, str]:
resolved = {
"abspath": event.abspath or "",
"version": event.version or "",
"sha256": event.sha256 or "",
"binproviders": event.binproviders or "",
"binprovider": event.binprovider or "",
}
if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
return resolved
if resolved["abspath"] and not resolved["version"]:
try:
from abx_pkg.semver import bin_version
detected_version = bin_version(resolved["abspath"])
except Exception:
detected_version = None
if detected_version:
resolved["version"] = str(detected_version)
if resolved["version"] and resolved["binprovider"]:
return resolved
try:
from abx_dl.dependencies import load_binary
allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
spec = {
"name": event.name,
"binproviders": allowed_providers,
"overrides": event.overrides or {},
}
binary = load_binary(spec)
resolved["abspath"] = str(binary.abspath or resolved["abspath"] or "")
resolved["version"] = str(binary.version or resolved["version"] or "")
resolved["sha256"] = str(binary.sha256 or resolved["sha256"] or "")
if binary.loaded_binprovider is not None and binary.loaded_binprovider.name:
resolved["binprovider"] = str(binary.loaded_binprovider.name)
except Exception:
pass
return resolved
def _project_installed_binary(self, event: BinaryEvent, resolved: dict[str, str]) -> None:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
binary, _ = Binary.objects.get_or_create(
machine=machine, machine=machine,
name=event.name, name=event.name,
defaults={ defaults={
"status": Binary.StatusChoices.QUEUED, "status": Binary.StatusChoices.QUEUED,
}, },
) )
binary.abspath = resolved["abspath"] or binary.abspath binary.abspath = event.abspath
binary.version = resolved["version"] or binary.version if event.version:
binary.sha256 = resolved["sha256"] or binary.sha256 binary.version = event.version
if resolved["binproviders"]: if event.sha256:
binary.binproviders = resolved["binproviders"] binary.sha256 = event.sha256
binary.binprovider = resolved["binprovider"] or binary.binprovider if event.binproviders:
binary.binproviders = event.binproviders
if event.binprovider:
binary.binprovider = event.binprovider
if event.overrides and binary.overrides != event.overrides: if event.overrides and binary.overrides != event.overrides:
binary.overrides = event.overrides binary.overrides = event.overrides
binary.status = Binary.StatusChoices.INSTALLED binary.status = Binary.StatusChoices.INSTALLED
binary.retry_at = None binary.retry_at = None
binary.save( await binary.asave(
update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"], update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"],
) )

View File

@@ -3,8 +3,6 @@ from __future__ import annotations
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
from abx_dl.services.base import BaseService from abx_dl.services.base import BaseService
from .db import run_db_op
class CrawlService(BaseService): class CrawlService(BaseService):
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent] LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
@@ -13,32 +11,42 @@ class CrawlService(BaseService):
def __init__(self, bus, *, crawl_id: str): def __init__(self, bus, *, crawl_id: str):
self.crawl_id = crawl_id self.crawl_id = crawl_id
super().__init__(bus) super().__init__(bus)
self.bus.on(CrawlSetupEvent, self.on_CrawlSetupEvent__save_to_db)
self.bus.on(CrawlStartEvent, self.on_CrawlStartEvent__save_to_db)
self.bus.on(CrawlCleanupEvent, self.on_CrawlCleanupEvent__save_to_db)
self.bus.on(CrawlCompletedEvent, self.on_CrawlCompletedEvent__save_to_db)
async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None: async def on_CrawlSetupEvent__save_to_db(self, event: CrawlSetupEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
await run_db_op(self._mark_started)
async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
await run_db_op(self._mark_completed)
def _mark_started(self) -> None:
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id=self.crawl_id) crawl = await Crawl.objects.aget(id=self.crawl_id)
if crawl.status != Crawl.StatusChoices.SEALED: if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.STARTED crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = None crawl.retry_at = None
crawl.save(update_fields=["status", "retry_at", "modified_at"]) await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
def _mark_completed(self) -> None: async def on_CrawlStartEvent__save_to_db(self, event: CrawlStartEvent) -> None:
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id=self.crawl_id) crawl = await Crawl.objects.aget(id=self.crawl_id)
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = None
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
async def on_CrawlCleanupEvent__save_to_db(self, event: CrawlCleanupEvent) -> None:
from archivebox.crawls.models import Crawl
crawl = await Crawl.objects.aget(id=self.crawl_id)
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = None
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
async def on_CrawlCompletedEvent__save_to_db(self, event: CrawlCompletedEvent) -> None:
from archivebox.crawls.models import Crawl
crawl = await Crawl.objects.aget(id=self.crawl_id)
crawl.status = Crawl.StatusChoices.SEALED crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None crawl.retry_at = None
crawl.save(update_fields=["status", "retry_at", "modified_at"]) await crawl.asave(update_fields=["status", "retry_at", "modified_at"])

View File

@@ -1,16 +0,0 @@
from __future__ import annotations
from asgiref.sync import sync_to_async
from django.db import close_old_connections
def _run_db_op(func, *args, **kwargs):
close_old_connections()
try:
return func(*args, **kwargs)
finally:
close_old_connections()
async def run_db_op(func, *args, **kwargs):
return await sync_to_async(_run_db_op, thread_sensitive=True)(func, *args, **kwargs)

View File

@@ -1,22 +1,23 @@
from __future__ import annotations from __future__ import annotations
from asgiref.sync import sync_to_async
from abx_dl.events import MachineEvent from abx_dl.events import MachineEvent
from abx_dl.services.base import BaseService from abx_dl.services.base import BaseService
from .db import run_db_op
class MachineService(BaseService): class MachineService(BaseService):
LISTENS_TO = [MachineEvent] LISTENS_TO = [MachineEvent]
EMITS = [] EMITS = []
async def on_MachineEvent__Outer(self, event: MachineEvent) -> None: def __init__(self, bus):
await run_db_op(self._project, event) super().__init__(bus)
self.bus.on(MachineEvent, self.on_MachineEvent__save_to_db)
def _project(self, event: MachineEvent) -> None: async def on_MachineEvent__save_to_db(self, event: MachineEvent) -> None:
from archivebox.machine.models import Machine, _sanitize_machine_config from archivebox.machine.models import Machine, _sanitize_machine_config
machine = Machine.current() machine = await sync_to_async(Machine.current, thread_sensitive=True)()
config = dict(machine.config or {}) config = dict(machine.config or {})
if event.config is not None: if event.config is not None:
@@ -29,4 +30,4 @@ class MachineService(BaseService):
return return
machine.config = _sanitize_machine_config(config) machine.config = _sanitize_machine_config(config)
machine.save(update_fields=["config", "modified_at"]) await machine.asave(update_fields=["config", "modified_at"])

View File

@@ -1,29 +1,15 @@
from __future__ import annotations from __future__ import annotations
import asyncio from datetime import datetime
from datetime import datetime, timezone as datetime_timezone from typing import ClassVar
import json
from pathlib import Path
import shlex
import socket
import time
from typing import TYPE_CHECKING, Any, ClassVar
from urllib.parse import urlparse
from asgiref.sync import sync_to_async
from django.utils import timezone from django.utils import timezone
from abxbus import BaseEvent from abxbus import BaseEvent
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.services.base import BaseService from abx_dl.services.base import BaseService
from .db import run_db_op
if TYPE_CHECKING:
from archivebox.machine.models import Process
WORKER_READY_TIMEOUT = 10.0
def parse_event_datetime(value: str | None): def parse_event_datetime(value: str | None):
if not value: if not value:
@@ -37,308 +23,133 @@ def parse_event_datetime(value: str | None):
return dt return dt
def _is_port_listening(host: str, port: int) -> bool:
if not host or not port:
return False
try:
with socket.create_connection((host, port), timeout=0.5):
return True
except OSError:
return False
def _worker_socket_from_url(url: str) -> tuple[str, int] | None:
if not url:
return None
parsed = urlparse(url)
if parsed.scheme != "tcp" or not parsed.hostname or not parsed.port:
return None
return parsed.hostname, parsed.port
def _supervisor_env(env: dict[str, str]) -> str:
pairs = []
for key, value in env.items():
escaped = value.replace('"', '\\"')
pairs.append(f'{key}="{escaped}"')
return ",".join(pairs)
def _iso_from_epoch(value: object) -> str:
if not isinstance(value, (int, float)) or value <= 0:
return ""
return datetime.fromtimestamp(value, tz=datetime_timezone.utc).isoformat()
def _int_from_object(value: object) -> int:
if isinstance(value, bool):
return int(value)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
if isinstance(value, str):
try:
return int(value)
except ValueError:
return 0
return 0
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker
output_dir = Path(process_event.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
worker_name = process_event.hook_name
supervisor = get_or_create_supervisord_process(daemonize=True)
worker_socket = _worker_socket_from_url(getattr(process_event, "url", ""))
existing = get_worker(supervisor, worker_name)
if (
isinstance(existing, dict)
and existing.get("statename") == "RUNNING"
and (worker_socket is None or _is_port_listening(*worker_socket))
):
return existing
daemon = {
"name": worker_name,
"command": shlex.join([process_event.hook_path, *process_event.hook_args]),
"directory": str(output_dir),
"autostart": "false",
"autorestart": "true",
"stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
"redirect_stderr": "true",
}
if process_event.env:
daemon["environment"] = _supervisor_env(process_event.env)
proc = start_worker(supervisor, daemon)
deadline = time.monotonic() + WORKER_READY_TIMEOUT
while time.monotonic() < deadline:
current = get_worker(supervisor, worker_name)
if isinstance(current, dict) and current.get("statename") == "RUNNING":
if worker_socket is None or _is_port_listening(*worker_socket):
return current
time.sleep(0.1)
return proc if isinstance(proc, dict) else {}
class ProcessService(BaseService): class ProcessService(BaseService):
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent, ProcessStartedEvent, ProcessCompletedEvent] LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStartedEvent, ProcessCompletedEvent]
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent] EMITS: ClassVar[list[type[BaseEvent]]] = []
def __init__(self, bus): def __init__(self, bus):
self.process_ids: dict[str, str] = {}
super().__init__(bus) super().__init__(bus)
self.bus.on(ProcessStartedEvent, self.on_ProcessStartedEvent__save_to_db)
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None: async def on_ProcessStartedEvent__save_to_db(self, event: ProcessStartedEvent) -> None:
try:
record = json.loads(event.line)
except (json.JSONDecodeError, ValueError):
return
if not isinstance(record, dict) or record.get("type") != "ProcessEvent":
return
passthrough_fields: dict[str, Any] = {
key: value
for key, value in record.items()
if key
not in {
"type",
"plugin_name",
"hook_name",
"hook_path",
"hook_args",
"is_background",
"output_dir",
"env",
"snapshot_id",
"process_id",
"url",
"timeout",
"daemon",
"process_type",
"worker_type",
"event_timeout",
"event_handler_timeout",
}
}
process_event = ProcessEvent(
plugin_name=record.get("plugin_name") or event.plugin_name,
hook_name=record.get("hook_name") or "process",
hook_path=record["hook_path"],
hook_args=[str(arg) for arg in record.get("hook_args", [])],
is_background=bool(record.get("is_background", True)),
output_dir=record.get("output_dir") or event.output_dir,
env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
timeout=int(record.get("timeout") or 60),
daemon=bool(record.get("daemon", False)),
url=str(record.get("url") or ""),
process_type=str(record.get("process_type") or ""),
worker_type=str(record.get("worker_type") or ""),
event_timeout=float(record.get("event_timeout") or 360.0),
event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
**passthrough_fields,
)
if not process_event.daemon:
await self.bus.emit(process_event)
return
proc = await asyncio.to_thread(_ensure_worker, process_event)
process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
start_ts = _iso_from_epoch(proc.get("start"))
pid = _int_from_object(proc.get("pid"))
statename = str(proc.get("statename") or "")
exitstatus = _int_from_object(proc.get("exitstatus"))
process_type = process_event.process_type or "worker"
worker_type = process_event.worker_type or process_event.plugin_name
if statename == "RUNNING" and pid:
await self.bus.emit(
ProcessStartedEvent(
plugin_name=process_event.plugin_name,
hook_name=process_event.hook_name,
hook_path=process_event.hook_path,
hook_args=process_event.hook_args,
output_dir=process_event.output_dir,
env=process_event.env,
timeout=process_event.timeout,
pid=pid,
process_id=process_id,
snapshot_id=process_event.snapshot_id,
is_background=True,
url=process_event.url,
process_type=process_type,
worker_type=worker_type,
start_ts=start_ts,
**passthrough_fields,
),
)
return
stderr = (
f"Worker {process_event.hook_name} failed to start"
if not statename
else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
)
await self.bus.emit(
ProcessCompletedEvent(
plugin_name=process_event.plugin_name,
hook_name=process_event.hook_name,
hook_path=process_event.hook_path,
hook_args=process_event.hook_args,
env=process_event.env,
stdout="",
stderr=stderr,
exit_code=exitstatus or 1,
output_dir=process_event.output_dir,
is_background=True,
process_id=process_id,
snapshot_id=process_event.snapshot_id,
pid=pid,
url=process_event.url,
process_type=process_type,
worker_type=worker_type,
start_ts=start_ts,
end_ts=datetime.now(tz=datetime_timezone.utc).isoformat(),
**passthrough_fields,
),
)
raise RuntimeError(stderr)
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
await run_db_op(self._project_started, event)
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
await run_db_op(self._project_completed, event)
def get_db_process_id(self, process_id: str) -> str | None:
return self.process_ids.get(process_id)
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> Process:
from archivebox.machine.models import NetworkInterface, Process from archivebox.machine.models import NetworkInterface, Process
db_process_id = self.process_ids.get(event.process_id) iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
iface = NetworkInterface.current(refresh=True) process_type = event.process_type or (
if db_process_id:
process = Process.objects.filter(id=db_process_id).first()
if process is not None:
if getattr(process, "iface_id", None) != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
process.save(update_fields=["iface", "machine", "modified_at"])
return process
process_type = getattr(event, "process_type", "") or (
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
) )
worker_type = getattr(event, "worker_type", "") or "" worker_type = event.worker_type or ""
if process_type == Process.TypeChoices.WORKER and worker_type: started_at = parse_event_datetime(event.start_ts)
existing = ( if started_at is None:
Process.objects.filter( raise ValueError("ProcessStartedEvent.start_ts is required")
process_type=Process.TypeChoices.WORKER, process_query = Process.objects.filter(
worker_type=worker_type,
pwd=event.output_dir,
)
.order_by("-modified_at")
.first()
)
if existing is not None:
self.process_ids[event.process_id] = str(existing.id)
return existing
process = Process.objects.create(
machine=iface.machine,
iface=iface,
process_type=process_type, process_type=process_type,
worker_type=worker_type, worker_type=worker_type,
pwd=event.output_dir, pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args], cmd=[event.hook_path, *event.hook_args],
env=event.env, started_at=started_at,
timeout=getattr(event, "timeout", 60),
pid=event.pid or None,
url=getattr(event, "url", "") or None,
started_at=parse_event_datetime(getattr(event, "start_ts", "")),
status=Process.StatusChoices.RUNNING,
retry_at=None,
) )
self.process_ids[event.process_id] = str(process.id) if event.pid:
return process process_query = process_query.filter(pid=event.pid)
process = await process_query.order_by("-modified_at").afirst()
if process is None:
process = await Process.objects.acreate(
machine=iface.machine,
iface=iface,
process_type=process_type,
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
env=event.env,
timeout=event.timeout,
pid=event.pid or None,
url=event.url or None,
started_at=started_at,
status=Process.StatusChoices.RUNNING,
retry_at=None,
)
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
await process.asave(update_fields=["iface", "machine", "modified_at"])
def _project_started(self, event: ProcessStartedEvent) -> None:
process = self._get_or_create_process(event)
process.pwd = event.output_dir process.pwd = event.output_dir
process.cmd = [event.hook_path, *event.hook_args] process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env process.env = event.env
process.timeout = event.timeout process.timeout = event.timeout
process.pid = event.pid or None process.pid = event.pid or None
process.url = getattr(event, "url", "") or process.url process.url = event.url or process.url
process.process_type = getattr(event, "process_type", "") or process.process_type process.process_type = process_type or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type process.worker_type = worker_type or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now() process.started_at = started_at
process.status = process.StatusChoices.RUNNING process.status = process.StatusChoices.RUNNING
process.retry_at = None process.retry_at = None
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path) await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
process.save() plugin_name=event.plugin_name,
hook_path=event.hook_path,
)
await process.asave()
async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
from archivebox.machine.models import NetworkInterface, Process
iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
process_type = event.process_type or (
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
)
worker_type = event.worker_type or ""
started_at = parse_event_datetime(event.start_ts)
if started_at is None:
raise ValueError("ProcessCompletedEvent.start_ts is required")
process_query = Process.objects.filter(
process_type=process_type,
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
started_at=started_at,
)
if event.pid:
process_query = process_query.filter(pid=event.pid)
process = await process_query.order_by("-modified_at").afirst()
if process is None:
process = await Process.objects.acreate(
machine=iface.machine,
iface=iface,
process_type=process_type,
worker_type=worker_type,
pwd=event.output_dir,
cmd=[event.hook_path, *event.hook_args],
env=event.env,
timeout=event.timeout,
pid=event.pid or None,
url=event.url or None,
started_at=started_at,
status=Process.StatusChoices.RUNNING,
retry_at=None,
)
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
await process.asave(update_fields=["iface", "machine", "modified_at"])
def _project_completed(self, event: ProcessCompletedEvent) -> None:
process = self._get_or_create_process(event)
process.pwd = event.output_dir process.pwd = event.output_dir
if not process.cmd: if not process.cmd:
process.cmd = [event.hook_path, *event.hook_args] process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env process.env = event.env
process.pid = event.pid or process.pid process.pid = event.pid or process.pid
process.url = getattr(event, "url", "") or process.url process.url = event.url or process.url
process.process_type = getattr(event, "process_type", "") or process.process_type process.process_type = process_type or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type process.worker_type = worker_type or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at process.started_at = started_at
process.ended_at = parse_event_datetime(event.end_ts) or timezone.now() process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
process.stdout = event.stdout process.stdout = event.stdout
process.stderr = event.stderr process.stderr = event.stderr
process.exit_code = event.exit_code process.exit_code = event.exit_code
process.status = process.StatusChoices.EXITED process.status = process.StatusChoices.EXITED
process.retry_at = None process.retry_at = None
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path) await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
process.save() plugin_name=event.plugin_name,
hook_path=event.hook_path,
)
await process.asave()

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
import asyncio import asyncio
import json import json
import os import os
import re
import shutil import shutil
import subprocess import subprocess
import sys import sys
@@ -13,12 +12,13 @@ from pathlib import Path
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from typing import Any from typing import Any
from asgiref.sync import sync_to_async
from django.utils import timezone from django.utils import timezone
from rich.console import Console from rich.console import Console
from abx_dl.events import BinaryRequestEvent from abx_dl.events import BinaryRequestEvent
from abx_dl.limits import CrawlLimitState from abx_dl.limits import CrawlLimitState
from abx_dl.models import Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins from abx_dl.models import Plugin, discover_plugins, filter_plugins
from abx_dl.orchestrator import ( from abx_dl.orchestrator import (
create_bus, create_bus,
download, download,
@@ -40,150 +40,9 @@ def _bus_name(prefix: str, identifier: str) -> str:
return f"{prefix}_{normalized}" return f"{prefix}_{normalized}"
def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
raw = str(config.get("PLUGINS") or "").strip()
if not raw:
return None
return [name.strip() for name in raw.split(",") if name.strip()]
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int: def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
return sum( return sum(1 for plugin in selected.values() for hook in plugin.hooks if "CrawlSetup" in hook.name or "Snapshot" in hook.name)
1
for plugin in selected.values()
for hook in plugin.hooks
if "Install" in hook.name or "CrawlSetup" in hook.name or "Snapshot" in hook.name
)
_TEMPLATE_NAME_RE = re.compile(r"^\{([A-Z0-9_]+)\}$")
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str, config: dict[str, Any]) -> list[str]:
keys: list[str] = []
for plugin in plugins.values():
for spec in plugin.binaries:
template_name = str(spec.get("name") or "").strip()
match = _TEMPLATE_NAME_RE.fullmatch(template_name)
if match is None:
continue
key = match.group(1)
configured_value = config.get(key)
if configured_value is not None and str(configured_value).strip() == binary_name:
keys.append(key)
for key, prop in plugin.config_schema.items():
if key.endswith("_BINARY") and prop.get("default") == binary_name:
keys.append(key)
return list(dict.fromkeys(keys))
def _installed_binary_config_overrides(plugins: dict[str, Plugin], config: dict[str, Any] | None = None) -> dict[str, str]:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
active_config = dict(config or {})
overrides: dict[str, str] = {}
shared_lib_dir: Path | None = None
pip_home: Path | None = None
pip_bin_dir: Path | None = None
npm_home: Path | None = None
node_modules_dir: Path | None = None
npm_bin_dir: Path | None = None
binaries = (
Binary.objects.filter(machine=machine, status=Binary.StatusChoices.INSTALLED).exclude(abspath="").exclude(abspath__isnull=True)
)
for binary in binaries:
try:
resolved_path = Path(binary.abspath).expanduser()
except (TypeError, ValueError):
continue
if not resolved_path.is_file() or not os.access(resolved_path, os.X_OK):
continue
for key in _binary_config_keys_for_plugins(plugins, binary.name, active_config):
overrides[key] = binary.abspath
if resolved_path.parent.name == ".bin" and resolved_path.parent.parent.name == "node_modules":
npm_bin_dir = npm_bin_dir or resolved_path.parent
node_modules_dir = node_modules_dir or resolved_path.parent.parent
npm_home = npm_home or resolved_path.parent.parent.parent
shared_lib_dir = shared_lib_dir or resolved_path.parent.parent.parent.parent
elif (
resolved_path.parent.name == "bin"
and resolved_path.parent.parent.name == "venv"
and resolved_path.parent.parent.parent.name == "pip"
):
pip_bin_dir = pip_bin_dir or resolved_path.parent
pip_home = pip_home or resolved_path.parent.parent.parent
shared_lib_dir = shared_lib_dir or resolved_path.parent.parent.parent.parent
if shared_lib_dir is not None:
overrides["LIB_DIR"] = str(shared_lib_dir)
overrides["LIB_BIN_DIR"] = str(shared_lib_dir / "bin")
if pip_home is not None:
overrides["PIP_HOME"] = str(pip_home)
if pip_bin_dir is not None:
overrides["PIP_BIN_DIR"] = str(pip_bin_dir)
if npm_home is not None:
overrides["NPM_HOME"] = str(npm_home)
if node_modules_dir is not None:
overrides["NODE_MODULES_DIR"] = str(node_modules_dir)
overrides["NODE_MODULE_DIR"] = str(node_modules_dir)
overrides["NODE_PATH"] = str(node_modules_dir)
if npm_bin_dir is not None:
overrides["NPM_BIN_DIR"] = str(npm_bin_dir)
return overrides
def _limit_stop_reason(config: dict[str, Any]) -> str:
return CrawlLimitState.from_config(config).get_stop_reason()
def _attach_bus_trace(bus) -> None:
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
if not trace_target:
return
if getattr(bus, "_archivebox_trace_task", None) is not None:
return
trace_path = None if trace_target in {"1", "-", "stderr"} else Path(trace_target)
stop_event = asyncio.Event()
async def trace_loop() -> None:
seen_event_ids: set[str] = set()
while not stop_event.is_set():
for event_id, event in list(bus.event_history.items()):
if event_id in seen_event_ids:
continue
seen_event_ids.add(event_id)
payload = event.model_dump(mode="json")
payload["bus_name"] = bus.name
line = json.dumps(payload, ensure_ascii=False, default=str, separators=(",", ":"))
if trace_path is None:
print(line, file=sys.stderr, flush=True)
else:
trace_path.parent.mkdir(parents=True, exist_ok=True)
with trace_path.open("a", encoding="utf-8") as handle:
handle.write(line + "\n")
await asyncio.sleep(0.05)
bus._archivebox_trace_stop = stop_event
bus._archivebox_trace_task = asyncio.create_task(trace_loop())
async def _stop_bus_trace(bus) -> None:
stop_event = getattr(bus, "_archivebox_trace_stop", None)
trace_task = getattr(bus, "_archivebox_trace_task", None)
if stop_event is None or trace_task is None:
return
stop_event.set()
await asyncio.gather(trace_task, return_exceptions=True)
bus._archivebox_trace_stop = None
bus._archivebox_trace_task = None
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool: def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
@@ -235,22 +94,25 @@ class CrawlRunner:
self.crawl = crawl self.crawl = crawl
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0) self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
self.plugins = discover_plugins() self.plugins = discover_plugins()
self.process_service = ProcessService(self.bus) ProcessService(self.bus)
self.binary_service = BinaryService(self.bus) BinaryService(self.bus)
self.tag_service = TagService(self.bus) TagService(self.bus)
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id)) CrawlService(self.bus, crawl_id=str(crawl.id))
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
self.snapshot_service = SnapshotService(
async def ignore_snapshot(_snapshot_id: str) -> None:
return None
SnapshotService(
self.bus, self.bus,
crawl_id=str(crawl.id), crawl_id=str(crawl.id),
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued, schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else ignore_snapshot,
) )
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service) ArchiveResultService(self.bus)
self.selected_plugins = selected_plugins self.selected_plugins = selected_plugins
self.initial_snapshot_ids = snapshot_ids self.initial_snapshot_ids = snapshot_ids
self.snapshot_tasks: dict[str, asyncio.Task[None]] = {} self.snapshot_tasks: dict[str, asyncio.Task[None]] = {}
self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS) self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS)
self.abx_services = None
self.persona = None self.persona = None
self.base_config: dict[str, Any] = {} self.base_config: dict[str, Any] = {}
self.derived_config: dict[str, Any] = {} self.derived_config: dict[str, Any] = {}
@@ -258,15 +120,11 @@ class CrawlRunner:
self._live_stream = None self._live_stream = None
async def run(self) -> None: async def run(self) -> None:
from asgiref.sync import sync_to_async
from archivebox.crawls.models import Crawl
try: try:
await sync_to_async(self._prepare, thread_sensitive=True)() snapshot_ids = await sync_to_async(self.load_run_state, thread_sensitive=True)()
live_ui = self._create_live_ui() live_ui = self._create_live_ui()
with live_ui if live_ui is not None else nullcontext(): with live_ui if live_ui is not None else nullcontext():
_attach_bus_trace(self.bus) setup_abx_services(
self.abx_services = setup_abx_services(
self.bus, self.bus,
plugins=self.plugins, plugins=self.plugins,
config_overrides={ config_overrides={
@@ -278,18 +136,14 @@ class CrawlRunner:
auto_install=True, auto_install=True,
emit_jsonl=False, emit_jsonl=False,
) )
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
if snapshot_ids: if snapshot_ids:
root_snapshot_id = snapshot_ids[0] root_snapshot_id = snapshot_ids[0]
await self._run_crawl_setup(root_snapshot_id) await self.run_crawl_setup(root_snapshot_id)
for snapshot_id in snapshot_ids: for snapshot_id in snapshot_ids:
await self.enqueue_snapshot(snapshot_id) await self.enqueue_snapshot(snapshot_id)
await self._wait_for_snapshot_tasks() await self.wait_for_snapshot_tasks()
await self._run_crawl_cleanup(root_snapshot_id) await self.run_crawl_cleanup(root_snapshot_id)
if self.abx_services is not None:
await self.abx_services.process.wait_for_background_monitors()
finally: finally:
await _stop_bus_trace(self.bus)
await self.bus.stop() await self.bus.stop()
if self._live_stream is not None: if self._live_stream is not None:
try: try:
@@ -297,33 +151,16 @@ class CrawlRunner:
except Exception: except Exception:
pass pass
self._live_stream = None self._live_stream = None
await sync_to_async(self._cleanup_persona, thread_sensitive=True)() await sync_to_async(self.finalize_run_state, thread_sensitive=True)()
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
if crawl_is_finished:
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
else:
if crawl.status == Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.QUEUED
elif crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = crawl.retry_at or timezone.now()
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
async def enqueue_snapshot(self, snapshot_id: str) -> None: async def enqueue_snapshot(self, snapshot_id: str) -> None:
task = self.snapshot_tasks.get(snapshot_id) task = self.snapshot_tasks.get(snapshot_id)
if task is not None and not task.done(): if task is not None and not task.done():
return return
task = asyncio.create_task(self._run_snapshot(snapshot_id)) task = asyncio.create_task(self.run_snapshot(snapshot_id))
self.snapshot_tasks[snapshot_id] = task self.snapshot_tasks[snapshot_id] = task
async def leave_snapshot_queued(self, snapshot_id: str) -> None: async def wait_for_snapshot_tasks(self) -> None:
return None
async def _wait_for_snapshot_tasks(self) -> None:
while True: while True:
pending_tasks: list[asyncio.Task[None]] = [] pending_tasks: list[asyncio.Task[None]] = []
for snapshot_id, task in list(self.snapshot_tasks.items()): for snapshot_id, task in list(self.snapshot_tasks.items()):
@@ -339,9 +176,9 @@ class CrawlRunner:
for task in done: for task in done:
task.result() task.result()
def _prepare(self) -> None: def load_run_state(self) -> list[str]:
from archivebox.config.configset import get_config from archivebox.config.configset import get_config
from archivebox.machine.models import NetworkInterface, Process from archivebox.machine.models import Machine, NetworkInterface, Process
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else "" self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
current_iface = NetworkInterface.current(refresh=True) current_iface = NetworkInterface.current(refresh=True)
@@ -352,17 +189,42 @@ class CrawlRunner:
current_process.save(update_fields=["iface", "machine", "modified_at"]) current_process.save(update_fields=["iface", "machine", "modified_at"])
self.persona = self.crawl.resolve_persona() self.persona = self.crawl.resolve_persona()
self.base_config = get_config(crawl=self.crawl) self.base_config = get_config(crawl=self.crawl)
self.derived_config = _installed_binary_config_overrides(self.plugins, self.base_config) self.derived_config = dict(Machine.current().config)
self.base_config["ABX_RUNTIME"] = "archivebox" self.base_config["ABX_RUNTIME"] = "archivebox"
if self.selected_plugins is None: if self.selected_plugins is None:
self.selected_plugins = _selected_plugins_from_config(self.base_config) raw_plugins = self.base_config["PLUGINS"].strip()
self.selected_plugins = [name.strip() for name in raw_plugins.split(",") if name.strip()] if raw_plugins else None
if self.persona: if self.persona:
chrome_binary = str(self.base_config.get("CHROME_BINARY") or "") self.base_config.update(
self.base_config.update(self.persona.prepare_runtime_for_crawl(self.crawl, chrome_binary=chrome_binary)) self.persona.prepare_runtime_for_crawl(
self.crawl,
chrome_binary=self.base_config["CHROME_BINARY"],
),
)
if self.initial_snapshot_ids:
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
created = self.crawl.create_snapshots_from_urls()
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
return [str(snapshot.id) for snapshot in snapshots]
def finalize_run_state(self) -> None:
from archivebox.crawls.models import Crawl
def _cleanup_persona(self) -> None:
if self.persona: if self.persona:
self.persona.cleanup_runtime_for_crawl(self.crawl) self.persona.cleanup_runtime_for_crawl(self.crawl)
crawl = Crawl.objects.get(id=self.crawl.id)
if crawl.is_finished():
if crawl.status != Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.SEALED
crawl.retry_at = None
crawl.save(update_fields=["status", "retry_at", "modified_at"])
return
if crawl.status == Crawl.StatusChoices.SEALED:
crawl.status = Crawl.StatusChoices.QUEUED
elif crawl.status != Crawl.StatusChoices.STARTED:
crawl.status = Crawl.StatusChoices.STARTED
crawl.retry_at = crawl.retry_at or timezone.now()
crawl.save(update_fields=["status", "retry_at", "modified_at"])
def _create_live_ui(self) -> LiveBusUI | None: def _create_live_ui(self) -> LiveBusUI | None:
stdout_is_tty = sys.stdout.isatty() stdout_is_tty = sys.stdout.isatty()
@@ -373,7 +235,7 @@ class CrawlRunner:
stream = sys.stderr if stderr_is_tty else sys.stdout stream = sys.stderr if stderr_is_tty else sys.stdout
if os.path.exists("/dev/tty"): if os.path.exists("/dev/tty"):
try: try:
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8") self._live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
stream = self._live_stream stream = self._live_stream
except OSError: except OSError:
self._live_stream = None self._live_stream = None
@@ -399,7 +261,7 @@ class CrawlRunner:
live_ui = LiveBusUI( live_ui = LiveBusUI(
self.bus, self.bus,
total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins), total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
timeout_seconds=int(self.base_config.get("TIMEOUT") or 60), timeout_seconds=self.base_config["TIMEOUT"],
ui_console=ui_console, ui_console=ui_console,
interactive_tty=True, interactive_tty=True,
) )
@@ -410,128 +272,24 @@ class CrawlRunner:
) )
return live_ui return live_ui
def _create_root_snapshots(self) -> list[str]: def load_snapshot_payload(self, snapshot_id: str) -> dict[str, Any]:
created = self.crawl.create_snapshots_from_urls() from archivebox.core.models import Snapshot
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
return [str(snapshot.id) for snapshot in snapshots]
def _initial_snapshot_ids(self) -> list[str]:
if self.initial_snapshot_ids:
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
return self._create_root_snapshots()
def _snapshot_config(self, snapshot) -> dict[str, Any]:
from archivebox.config.configset import get_config from archivebox.config.configset import get_config
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
config = get_config(crawl=self.crawl, snapshot=snapshot) config = get_config(crawl=self.crawl, snapshot=snapshot)
config.update(self.base_config) config.update(self.base_config)
config["CRAWL_DIR"] = str(self.crawl.output_dir) config["CRAWL_DIR"] = str(self.crawl.output_dir)
config["SNAP_DIR"] = str(snapshot.output_dir) config["SNAP_DIR"] = str(snapshot.output_dir)
config["SNAPSHOT_ID"] = str(snapshot.id) extra_context: dict[str, Any] = {}
config["SNAPSHOT_DEPTH"] = snapshot.depth if config.get("EXTRA_CONTEXT"):
config["CRAWL_ID"] = str(self.crawl.id) parsed_extra_context = json.loads(str(config["EXTRA_CONTEXT"]))
config["SOURCE_URL"] = snapshot.url if not isinstance(parsed_extra_context, dict):
if snapshot.parent_snapshot_id: raise TypeError("EXTRA_CONTEXT must decode to an object")
config["PARENT_SNAPSHOT_ID"] = str(snapshot.parent_snapshot_id) extra_context = parsed_extra_context
return config extra_context["snapshot_id"] = str(snapshot.id)
extra_context["snapshot_depth"] = snapshot.depth
async def _run_crawl_setup(self, snapshot_id: str) -> None: config["EXTRA_CONTEXT"] = json.dumps(extra_context, separators=(",", ":"), sort_keys=True)
from asgiref.sync import sync_to_async
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
setup_snapshot = AbxSnapshot(
url=snapshot["url"],
id=snapshot["id"],
title=snapshot["title"],
timestamp=snapshot["timestamp"],
bookmarked_at=snapshot["bookmarked_at"],
created_at=snapshot["created_at"],
tags=snapshot["tags"],
depth=snapshot["depth"],
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
bus=self.bus,
emit_jsonl=False,
snapshot=setup_snapshot,
crawl_setup_only=True,
)
async def _run_crawl_cleanup(self, snapshot_id: str) -> None:
from asgiref.sync import sync_to_async
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
cleanup_snapshot = AbxSnapshot(
url=snapshot["url"],
id=snapshot["id"],
title=snapshot["title"],
timestamp=snapshot["timestamp"],
bookmarked_at=snapshot["bookmarked_at"],
created_at=snapshot["created_at"],
tags=snapshot["tags"],
depth=snapshot["depth"],
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
bus=self.bus,
emit_jsonl=False,
snapshot=cleanup_snapshot,
crawl_cleanup_only=True,
)
async def _run_snapshot(self, snapshot_id: str) -> None:
from asgiref.sync import sync_to_async
async with self.snapshot_semaphore:
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
if snapshot["status"] == "sealed":
return
if snapshot["depth"] > 0 and _limit_stop_reason(snapshot["config"]) == "max_size":
await sync_to_async(self._cancel_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
return
abx_snapshot = AbxSnapshot(
url=snapshot["url"],
id=snapshot["id"],
title=snapshot["title"],
timestamp=snapshot["timestamp"],
bookmarked_at=snapshot["bookmarked_at"],
created_at=snapshot["created_at"],
tags=snapshot["tags"],
depth=snapshot["depth"],
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
try:
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
bus=self.bus,
emit_jsonl=False,
snapshot=abx_snapshot,
skip_crawl_setup=True,
skip_crawl_cleanup=True,
)
finally:
current_task = asyncio.current_task()
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
self.snapshot_tasks.pop(snapshot_id, None)
def _load_snapshot_run_data(self, snapshot_id: str):
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
return { return {
"id": str(snapshot.id), "id": str(snapshot.id),
"url": snapshot.url, "url": snapshot.url,
@@ -542,12 +300,91 @@ class CrawlRunner:
"tags": snapshot.tags_str(), "tags": snapshot.tags_str(),
"depth": snapshot.depth, "depth": snapshot.depth,
"status": snapshot.status, "status": snapshot.status,
"parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
"output_dir": str(snapshot.output_dir), "output_dir": str(snapshot.output_dir),
"config": self._snapshot_config(snapshot), "config": config,
} }
def _cancel_snapshot_due_to_limit(self, snapshot_id: str) -> None: async def run_crawl_setup(self, snapshot_id: str) -> None:
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
derived_config_overrides=self.derived_config,
bus=self.bus,
emit_jsonl=False,
install_enabled=True,
crawl_setup_enabled=True,
crawl_start_enabled=False,
snapshot_cleanup_enabled=False,
crawl_cleanup_enabled=False,
machine_service=None,
binary_service=None,
process_service=None,
archive_result_service=None,
tag_service=None,
)
async def run_crawl_cleanup(self, snapshot_id: str) -> None:
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
await download(
bus=self.bus,
url=snapshot["url"],
output_dir=Path(snapshot["output_dir"]),
plugins=self.plugins,
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
derived_config_overrides=self.derived_config,
emit_jsonl=False,
install_enabled=False,
crawl_setup_enabled=False,
crawl_start_enabled=False,
snapshot_cleanup_enabled=False,
crawl_cleanup_enabled=True,
machine_service=None,
binary_service=None,
process_service=None,
archive_result_service=None,
tag_service=None,
)
async def run_snapshot(self, snapshot_id: str) -> None:
async with self.snapshot_semaphore:
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
if snapshot["status"] == "sealed":
return
if snapshot["depth"] > 0 and CrawlLimitState.from_config(snapshot["config"]).get_stop_reason() == "max_size":
await sync_to_async(self.seal_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
return
try:
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
derived_config_overrides=self.derived_config,
bus=self.bus,
emit_jsonl=False,
install_enabled=False,
crawl_setup_enabled=False,
crawl_start_enabled=True,
snapshot_cleanup_enabled=True,
crawl_cleanup_enabled=False,
machine_service=None,
binary_service=None,
process_service=None,
archive_result_service=None,
tag_service=None,
)
finally:
current_task = asyncio.current_task()
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
self.snapshot_tasks.pop(snapshot_id, None)
def seal_snapshot_due_to_limit(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).first() snapshot = Snapshot.objects.filter(id=snapshot_id).first()
@@ -579,21 +416,20 @@ def run_crawl(
async def _run_binary(binary_id: str) -> None: async def _run_binary(binary_id: str) -> None:
from asgiref.sync import sync_to_async
from archivebox.config.configset import get_config from archivebox.config.configset import get_config
from archivebox.machine.models import Binary from archivebox.machine.models import Binary, Machine
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id) binary = await Binary.objects.aget(id=binary_id)
plugins = discover_plugins() plugins = discover_plugins()
config = get_config() config = get_config()
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config) machine = await sync_to_async(Machine.current, thread_sensitive=True)()
derived_config = dict(machine.config)
config["ABX_RUNTIME"] = "archivebox" config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0) bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
process_service = ProcessService(bus) ProcessService(bus)
BinaryService(bus) BinaryService(bus)
TagService(bus) TagService(bus)
ArchiveResultService(bus, process_service=process_service) ArchiveResultService(bus)
setup_abx_services( setup_abx_services(
bus, bus,
plugins=plugins, plugins=plugins,
@@ -605,7 +441,6 @@ async def _run_binary(binary_id: str) -> None:
) )
try: try:
_attach_bus_trace(bus)
await bus.emit( await bus.emit(
BinaryRequestEvent( BinaryRequestEvent(
name=binary.name, name=binary.name,
@@ -619,7 +454,6 @@ async def _run_binary(binary_id: str) -> None:
), ),
) )
finally: finally:
await _stop_bus_trace(bus)
await bus.stop() await bus.stop()
@@ -628,20 +462,20 @@ def run_binary(binary_id: str) -> None:
async def _run_install(plugin_names: list[str] | None = None) -> None: async def _run_install(plugin_names: list[str] | None = None) -> None:
from asgiref.sync import sync_to_async
from archivebox.config.configset import get_config from archivebox.config.configset import get_config
from archivebox.machine.models import Machine
plugins = discover_plugins() plugins = discover_plugins()
config = get_config() config = get_config()
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config) machine = await sync_to_async(Machine.current, thread_sensitive=True)()
derived_config = dict(machine.config)
config["ABX_RUNTIME"] = "archivebox" config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0) bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
process_service = ProcessService(bus) ProcessService(bus)
BinaryService(bus) BinaryService(bus)
TagService(bus) TagService(bus)
ArchiveResultService(bus, process_service=process_service) ArchiveResultService(bus)
abx_services = setup_abx_services( setup_abx_services(
bus, bus,
plugins=plugins, plugins=plugins,
config_overrides=config, config_overrides=config,
@@ -657,7 +491,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
if not selected_plugins: if not selected_plugins:
return return
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)" plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
timeout_seconds = int(config.get("TIMEOUT") or 60) timeout_seconds = config["TIMEOUT"]
stdout_is_tty = sys.stdout.isatty() stdout_is_tty = sys.stdout.isatty()
stderr_is_tty = sys.stderr.isatty() stderr_is_tty = sys.stderr.isatty()
interactive_tty = stdout_is_tty or stderr_is_tty interactive_tty = stdout_is_tty or stderr_is_tty
@@ -668,7 +502,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
stream = sys.stderr if stderr_is_tty else sys.stdout stream = sys.stderr if stderr_is_tty else sys.stdout
if os.path.exists("/dev/tty"): if os.path.exists("/dev/tty"):
try: try:
live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8") live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
stream = live_stream stream = live_stream
except OSError: except OSError:
live_stream = None live_stream = None
@@ -707,20 +541,21 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
plugins_label=plugins_label, plugins_label=plugins_label,
) )
with live_ui if live_ui is not None else nullcontext(): with live_ui if live_ui is not None else nullcontext():
_attach_bus_trace(bus)
results = await abx_install_plugins( results = await abx_install_plugins(
plugin_names=plugin_names, plugin_names=plugin_names,
plugins=plugins, plugins=plugins,
output_dir=output_dir, output_dir=output_dir,
config_overrides=config, config_overrides=config,
derived_config_overrides=derived_config,
emit_jsonl=False, emit_jsonl=False,
bus=bus, bus=bus,
machine_service=None,
binary_service=None,
process_service=None,
) )
await abx_services.process.wait_for_background_monitors()
if live_ui is not None: if live_ui is not None:
live_ui.print_summary(results, output_dir=output_dir) live_ui.print_summary(results, output_dir=output_dir)
finally: finally:
await _stop_bus_trace(bus)
await bus.stop() await bus.stop()
try: try:
if live_stream is not None: if live_stream is not None:
@@ -739,6 +574,12 @@ def recover_orphaned_crawls() -> int:
from archivebox.machine.models import Process from archivebox.machine.models import Process
active_crawl_ids: set[str] = set() active_crawl_ids: set[str] = set()
orphaned_crawls = list(
Crawl.objects.filter(
status=Crawl.StatusChoices.STARTED,
retry_at__isnull=True,
).prefetch_related("snapshot_set"),
)
running_processes = Process.objects.filter( running_processes = Process.objects.filter(
status=Process.StatusChoices.RUNNING, status=Process.StatusChoices.RUNNING,
process_type__in=[ process_type__in=[
@@ -746,23 +587,27 @@ def recover_orphaned_crawls() -> int:
Process.TypeChoices.HOOK, Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY, Process.TypeChoices.BINARY,
], ],
).only("env") ).only("pwd")
for proc in running_processes: for proc in running_processes:
env = proc.env or {} if not proc.pwd:
if not isinstance(env, dict):
continue continue
crawl_id = env.get("CRAWL_ID") proc_pwd = Path(proc.pwd)
if crawl_id: for crawl in orphaned_crawls:
active_crawl_ids.add(str(crawl_id)) matched_snapshot = None
for snapshot in crawl.snapshot_set.all():
try:
proc_pwd.relative_to(snapshot.output_dir)
matched_snapshot = snapshot
break
except ValueError:
continue
if matched_snapshot is not None:
active_crawl_ids.add(str(crawl.id))
break
recovered = 0 recovered = 0
now = timezone.now() now = timezone.now()
orphaned_crawls = Crawl.objects.filter(
status=Crawl.StatusChoices.STARTED,
retry_at__isnull=True,
).prefetch_related("snapshot_set")
for crawl in orphaned_crawls: for crawl in orphaned_crawls:
if str(crawl.id) in active_crawl_ids: if str(crawl.id) in active_crawl_ids:
continue continue
@@ -788,6 +633,11 @@ def recover_orphaned_snapshots() -> int:
from archivebox.machine.models import Process from archivebox.machine.models import Process
active_snapshot_ids: set[str] = set() active_snapshot_ids: set[str] = set()
orphaned_snapshots = list(
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
.select_related("crawl")
.prefetch_related("archiveresult_set"),
)
running_processes = Process.objects.filter( running_processes = Process.objects.filter(
status=Process.StatusChoices.RUNNING, status=Process.StatusChoices.RUNNING,
process_type__in=[ process_type__in=[
@@ -795,24 +645,22 @@ def recover_orphaned_snapshots() -> int:
Process.TypeChoices.HOOK, Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY, Process.TypeChoices.BINARY,
], ],
).only("env") ).only("pwd")
for proc in running_processes: for proc in running_processes:
env = proc.env or {} if not proc.pwd:
if not isinstance(env, dict):
continue continue
snapshot_id = env.get("SNAPSHOT_ID") proc_pwd = Path(proc.pwd)
if snapshot_id: for snapshot in orphaned_snapshots:
active_snapshot_ids.add(str(snapshot_id)) try:
proc_pwd.relative_to(snapshot.output_dir)
active_snapshot_ids.add(str(snapshot.id))
break
except ValueError:
continue
recovered = 0 recovered = 0
now = timezone.now() now = timezone.now()
orphaned_snapshots = (
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
.select_related("crawl")
.prefetch_related("archiveresult_set")
)
for snapshot in orphaned_snapshots: for snapshot in orphaned_snapshots:
if str(snapshot.id) in active_snapshot_ids: if str(snapshot.id) in active_snapshot_ids:
continue continue

View File

@@ -7,8 +7,6 @@ from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
from abx_dl.limits import CrawlLimitState from abx_dl.limits import CrawlLimitState
from abx_dl.services.base import BaseService from abx_dl.services.base import BaseService
from .db import run_db_op
class SnapshotService(BaseService): class SnapshotService(BaseService):
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent] LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
@@ -18,120 +16,96 @@ class SnapshotService(BaseService):
self.crawl_id = crawl_id self.crawl_id = crawl_id
self.schedule_snapshot = schedule_snapshot self.schedule_snapshot = schedule_snapshot
super().__init__(bus) super().__init__(bus)
self.bus.on(SnapshotEvent, self.on_SnapshotEvent)
self.bus.on(SnapshotCompletedEvent, self.on_SnapshotCompletedEvent)
async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None: async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
snapshot_id = await run_db_op(self._project_snapshot, event)
if snapshot_id:
await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
if snapshot_id and event.depth > 0:
await self.schedule_snapshot(snapshot_id)
async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
if snapshot_id:
await sync_to_async(self._write_snapshot_details)(snapshot_id)
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
from archivebox.core.models import Snapshot from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
crawl = Crawl.objects.get(id=self.crawl_id) crawl = await Crawl.objects.aget(id=self.crawl_id)
snapshot_id: str | None = None
snapshot = await Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).afirst()
if event.depth == 0: if snapshot is not None:
snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
if snapshot is None:
return None
snapshot.status = Snapshot.StatusChoices.STARTED snapshot.status = Snapshot.StatusChoices.STARTED
snapshot.retry_at = None snapshot.retry_at = None
snapshot.save(update_fields=["status", "retry_at", "modified_at"]) await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
return str(snapshot.id) snapshot_id = str(snapshot.id)
elif event.depth > 0:
if event.depth <= crawl.max_depth and self._crawl_limit_stop_reason(crawl) != "max_size":
parent_event = await self.bus.find(
SnapshotEvent,
past=True,
future=False,
where=lambda candidate: candidate.depth == event.depth - 1 and self.bus.event_is_child_of(event, candidate),
)
parent_snapshot = None
if parent_event is not None:
parent_snapshot = await Snapshot.objects.filter(id=parent_event.snapshot_id, crawl=crawl).afirst()
if parent_snapshot is not None and self._url_passes_filters(crawl, parent_snapshot, event.url):
snapshot = await sync_to_async(Snapshot.from_json, thread_sensitive=True)(
{
"url": event.url,
"depth": event.depth,
"parent_snapshot_id": str(parent_snapshot.id),
"crawl_id": str(crawl.id),
},
overrides={
"crawl": crawl,
"snapshot": parent_snapshot,
"created_by_id": crawl.created_by_id,
},
queue_for_extraction=False,
)
if snapshot is not None and snapshot.status != Snapshot.StatusChoices.SEALED:
snapshot.retry_at = None
snapshot.status = Snapshot.StatusChoices.QUEUED
await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
snapshot_id = str(snapshot.id)
if event.depth > crawl.max_depth: if snapshot_id:
return None snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
if self._crawl_limit_stop_reason(crawl) == "max_size": if snapshot is not None:
return None await sync_to_async(snapshot.ensure_crawl_symlink, thread_sensitive=True)()
if snapshot_id and event.depth > 0:
await self.schedule_snapshot(snapshot_id)
parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first() async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
if parent_snapshot is None: from archivebox.core.models import Snapshot
return None
if not self._url_passes_filters(crawl, parent_snapshot, event.url):
return None
snapshot = Snapshot.from_json( snapshot = await Snapshot.objects.select_related("crawl").filter(id=event.snapshot_id).afirst()
{ snapshot_id: str | None = None
"url": event.url, if snapshot is not None:
"depth": event.depth, snapshot.status = Snapshot.StatusChoices.SEALED
"parent_snapshot_id": str(parent_snapshot.id), snapshot.retry_at = None
"crawl_id": str(crawl.id), snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
}, await snapshot.asave(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
overrides={ if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
"crawl": crawl, await (
"snapshot": parent_snapshot, Snapshot.objects.filter(
"created_by_id": crawl.created_by_id, crawl_id=snapshot.crawl_id,
}, status=Snapshot.StatusChoices.QUEUED,
queue_for_extraction=False, )
) .exclude(id=snapshot.id)
if snapshot is None: .aupdate(
return None status=Snapshot.StatusChoices.SEALED,
if snapshot.status == Snapshot.StatusChoices.SEALED: retry_at=None,
return None modified_at=timezone.now(),
snapshot.retry_at = None )
if snapshot.status != Snapshot.StatusChoices.SEALED: )
snapshot.status = Snapshot.StatusChoices.QUEUED snapshot_id = str(snapshot.id)
snapshot.save(update_fields=["status", "retry_at", "modified_at"]) if snapshot_id:
return str(snapshot.id) snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
if snapshot is not None:
await sync_to_async(snapshot.write_index_jsonl, thread_sensitive=True)()
await sync_to_async(snapshot.write_json_details, thread_sensitive=True)()
await sync_to_async(snapshot.write_html_details, thread_sensitive=True)()
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool: def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
return crawl.url_passes_filters(url, snapshot=parent_snapshot) return crawl.url_passes_filters(url, snapshot=parent_snapshot)
def _seal_snapshot(self, snapshot_id: str) -> str | None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
if snapshot is None:
return None
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
return str(snapshot.id)
def _crawl_limit_stop_reason(self, crawl) -> str: def _crawl_limit_stop_reason(self, crawl) -> str:
config = dict(crawl.config or {}) config = dict(crawl.config or {})
config["CRAWL_DIR"] = str(crawl.output_dir) config["CRAWL_DIR"] = str(crawl.output_dir)
return CrawlLimitState.from_config(config).get_stop_reason() return CrawlLimitState.from_config(config).get_stop_reason()
def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
from archivebox.core.models import Snapshot
return (
Snapshot.objects.filter(
crawl_id=crawl_id,
status=Snapshot.StatusChoices.QUEUED,
)
.exclude(id=exclude_snapshot_id)
.update(
status=Snapshot.StatusChoices.SEALED,
retry_at=None,
modified_at=timezone.now(),
)
)
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
if snapshot is not None:
snapshot.ensure_crawl_symlink()
def _write_snapshot_details(self, snapshot_id: str) -> None:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
if snapshot is None:
return
snapshot.write_index_jsonl()
snapshot.write_json_details()
snapshot.write_html_details()

View File

@@ -3,20 +3,20 @@ from __future__ import annotations
from abx_dl.events import TagEvent from abx_dl.events import TagEvent
from abx_dl.services.base import BaseService from abx_dl.services.base import BaseService
from .db import run_db_op
class TagService(BaseService): class TagService(BaseService):
LISTENS_TO = [TagEvent] LISTENS_TO = [TagEvent]
EMITS = [] EMITS = []
async def on_TagEvent__Outer(self, event: TagEvent) -> None: def __init__(self, bus):
await run_db_op(self._project, event) super().__init__(bus)
self.bus.on(TagEvent, self.on_TagEvent__save_to_db)
def _project(self, event: TagEvent) -> None: async def on_TagEvent__save_to_db(self, event: TagEvent) -> None:
from archivebox.core.models import Snapshot, Tag from archivebox.core.models import Snapshot, SnapshotTag, Tag
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first() snapshot = await Snapshot.objects.filter(id=event.snapshot_id).afirst()
if snapshot is None: if snapshot is None:
return return
Tag.from_json({"name": event.name}, overrides={"snapshot": snapshot}) tag, _ = await Tag.objects.aget_or_create(name=event.name)
await SnapshotTag.objects.aget_or_create(snapshot=snapshot, tag=tag)

View File

@@ -312,7 +312,7 @@ CREATE TABLE IF NOT EXISTS machine_dependency (
modified_at DATETIME, modified_at DATETIME,
bin_name VARCHAR(63) NOT NULL UNIQUE, bin_name VARCHAR(63) NOT NULL UNIQUE,
bin_providers VARCHAR(127) NOT NULL DEFAULT '*', bin_providers VARCHAR(127) NOT NULL DEFAULT '*',
custom_cmds TEXT DEFAULT '{}', overrides TEXT DEFAULT '{}',
config TEXT DEFAULT '{}' config TEXT DEFAULT '{}'
); );
@@ -973,7 +973,6 @@ def seed_0_8_data(db_path: Path) -> dict[str, list[dict]]:
("machine", "0003_alter_installedbinary_options_and_more"), ("machine", "0003_alter_installedbinary_options_and_more"),
("machine", "0004_alter_installedbinary_abspath_and_more"), ("machine", "0004_alter_installedbinary_abspath_and_more"),
# Then the new migrations after squashing # Then the new migrations after squashing
("machine", "0002_rename_custom_cmds_to_overrides"),
("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"), ("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"),
("machine", "0004_drop_dependency_table"), ("machine", "0004_drop_dependency_table"),
# Crawls must come before core.0024 because 0024_b depends on it # Crawls must come before core.0024 because 0024_b depends on it

View File

@@ -144,13 +144,13 @@ def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
pwd=str(snapshot.output_dir / "wget"), pwd=str(snapshot.output_dir / "wget"),
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"], cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
env={ env={
"SOURCE_URL": "https://example.com",
"SAFE_FLAG": "1", "SAFE_FLAG": "1",
"API_KEY": "super-secret-key", "API_KEY": "super-secret-key",
"ACCESS_TOKEN": "super-secret-token", "ACCESS_TOKEN": "super-secret-token",
"SHARED_SECRET": "super-secret-secret", "SHARED_SECRET": "super-secret-secret",
}, },
status=Process.StatusChoices.EXITED, status=Process.StatusChoices.EXITED,
url="https://example.com",
) )
result = ArchiveResult.objects.create( result = ArchiveResult.objects.create(
snapshot=snapshot, snapshot=snapshot,
@@ -164,7 +164,7 @@ def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
cmd_html = str(admin.cmd_str(result)) cmd_html = str(admin.cmd_str(result))
assert "SAFE_FLAG=1" in cmd_html assert "SAFE_FLAG=1" in cmd_html
assert "SOURCE_URL=https://example.com" in cmd_html assert "https://example.com" in cmd_html
assert "API_KEY" not in cmd_html assert "API_KEY" not in cmd_html
assert "ACCESS_TOKEN" not in cmd_html assert "ACCESS_TOKEN" not in cmd_html
assert "SHARED_SECRET" not in cmd_html assert "SHARED_SECRET" not in cmd_html

View File

@@ -8,6 +8,7 @@ Tests cover:
- Snapshot progress statistics - Snapshot progress statistics
""" """
import json
import pytest import pytest
import uuid import uuid
from pathlib import Path from pathlib import Path
@@ -822,7 +823,6 @@ class TestAdminSnapshotListView:
pwd="/tmp/archivebox", pwd="/tmp/archivebox",
cmd=["python", "/tmp/job.py", "--url=https://example.com"], cmd=["python", "/tmp/job.py", "--url=https://example.com"],
env={ env={
"SNAPSHOT_ID": "abc123",
"ENABLED": True, "ENABLED": True,
"API_KEY": "super-secret-key", "API_KEY": "super-secret-key",
"ACCESS_TOKEN": "super-secret-token", "ACCESS_TOKEN": "super-secret-token",
@@ -843,7 +843,6 @@ class TestAdminSnapshotListView:
assert response.status_code == 200 assert response.status_code == 200
assert b"Kill" in response.content assert b"Kill" in response.content
assert b"python /tmp/job.py --url=https://example.com" in response.content assert b"python /tmp/job.py --url=https://example.com" in response.content
assert b"SNAPSHOT_ID=abc123" in response.content
assert b"ENABLED=True" in response.content assert b"ENABLED=True" in response.content
assert b"52s" in response.content assert b"52s" in response.content
assert b"API_KEY=" not in response.content assert b"API_KEY=" not in response.content
@@ -1065,7 +1064,7 @@ class TestAdminSnapshotListView:
pid=54321, pid=54321,
exit_code=0, exit_code=0,
cmd=["/plugins/title/on_Snapshot__54_title.js", "--url=https://example.com"], cmd=["/plugins/title/on_Snapshot__54_title.js", "--url=https://example.com"],
env={"SNAPSHOT_ID": str(snapshot.id)}, env={"EXTRA_CONTEXT": json.dumps({"snapshot_id": str(snapshot.id)})},
started_at=timezone.now(), started_at=timezone.now(),
ended_at=timezone.now(), ended_at=timezone.now(),
) )
@@ -1252,11 +1251,8 @@ class TestLiveProgressView:
process_type=Process.TypeChoices.HOOK, process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING, status=Process.StatusChoices.RUNNING,
pid=pid, pid=pid,
pwd=str(snapshot.output_dir / "chrome"),
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js", "--url=https://example.com"], cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js", "--url=https://example.com"],
env={
"CRAWL_ID": str(snapshot.crawl_id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(), started_at=timezone.now(),
) )
@@ -1290,11 +1286,8 @@ class TestLiveProgressView:
process_type=Process.TypeChoices.HOOK, process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING, status=Process.StatusChoices.RUNNING,
pid=pid, pid=pid,
pwd=str(snapshot.output_dir / "title"),
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"], cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
env={
"CRAWL_ID": str(snapshot.crawl_id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(), started_at=timezone.now(),
) )
@@ -1327,11 +1320,8 @@ class TestLiveProgressView:
process_type=Process.TypeChoices.HOOK, process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING, status=Process.StatusChoices.RUNNING,
pid=os.getpid(), pid=os.getpid(),
pwd=str(snapshot.output_dir / "chrome"),
cmd=["/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"], cmd=["/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"],
env={
"CRAWL_ID": str(snapshot.crawl_id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(), started_at=timezone.now(),
) )
ArchiveResult.objects.create( ArchiveResult.objects.create(
@@ -1369,11 +1359,8 @@ class TestLiveProgressView:
status=Process.StatusChoices.EXITED, status=Process.StatusChoices.EXITED,
exit_code=0, exit_code=0,
pid=99999, pid=99999,
pwd=str(snapshot.output_dir / "title"),
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"], cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
env={
"CRAWL_ID": str(snapshot.crawl_id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(), started_at=timezone.now(),
ended_at=timezone.now(), ended_at=timezone.now(),
) )

View File

@@ -5,12 +5,12 @@ import pytest
from django.db import connection from django.db import connection
from abx_dl.events import BinaryRequestEvent, ProcessCompletedEvent, ProcessStartedEvent from abx_dl.events import ArchiveResultEvent, BinaryRequestEvent, ProcessEvent, ProcessStartedEvent
from abx_dl.orchestrator import create_bus from abx_dl.orchestrator import create_bus
from abx_dl.output_files import OutputFile from abx_dl.output_files import OutputFile
pytestmark = pytest.mark.django_db pytestmark = pytest.mark.django_db(transaction=True)
def _cleanup_machine_process_rows() -> None: def _cleanup_machine_process_rows() -> None:
@@ -75,8 +75,8 @@ def _create_iface(machine):
def test_process_completed_projects_inline_archiveresult(): def test_process_completed_projects_inline_archiveresult():
from archivebox.core.models import ArchiveResult from archivebox.core.models import ArchiveResult
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata from archivebox.services.archive_result_service import ArchiveResultService
from archivebox.services.process_service import ProcessService import asyncio
snapshot = _create_snapshot() snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "wget" plugin_dir = Path(snapshot.output_dir) / "wget"
@@ -84,37 +84,23 @@ def test_process_completed_projects_inline_archiveresult():
(plugin_dir / "index.html").write_text("<html>ok</html>") (plugin_dir / "index.html").write_text("<html>ok</html>")
bus = create_bus(name="test_inline_archiveresult") bus = create_bus(name="test_inline_archiveresult")
process_service = ProcessService(bus) service = ArchiveResultService(bus)
service = ArchiveResultService(bus, process_service=process_service)
event = ProcessCompletedEvent( event = ArchiveResultEvent(
plugin_name="wget",
hook_name="on_Snapshot__06_wget.finite.bg",
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"wget/index.html"}\n' % snapshot.id,
stderr="",
exit_code=0,
output_dir=str(plugin_dir),
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
process_id="proc-inline",
snapshot_id=str(snapshot.id), snapshot_id=str(snapshot.id),
plugin="wget",
hook_name="on_Snapshot__06_wget.finite.bg",
status="succeeded",
output_str="wget/index.html",
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
start_ts="2026-03-22T12:00:00+00:00", start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:00:01+00:00", end_ts="2026-03-22T12:00:01+00:00",
) )
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir) async def emit_event() -> None:
service._project_from_process_completed( await service.on_ArchiveResultEvent__save_to_db(event)
event,
{ asyncio.run(emit_event())
"snapshot_id": str(snapshot.id),
"plugin": "wget",
"hook_name": "on_Snapshot__06_wget.finite.bg",
"status": "succeeded",
"output_str": "wget/index.html",
},
output_files,
output_size,
output_mimetypes,
)
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg") result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg")
assert result.status == ArchiveResult.StatusChoices.SUCCEEDED assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
@@ -127,45 +113,31 @@ def test_process_completed_projects_inline_archiveresult():
def test_process_completed_projects_synthetic_failed_archiveresult(): def test_process_completed_projects_synthetic_failed_archiveresult():
from archivebox.core.models import ArchiveResult from archivebox.core.models import ArchiveResult
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata from archivebox.services.archive_result_service import ArchiveResultService
from archivebox.services.process_service import ProcessService import asyncio
snapshot = _create_snapshot() snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "chrome" plugin_dir = Path(snapshot.output_dir) / "chrome"
plugin_dir.mkdir(parents=True, exist_ok=True) plugin_dir.mkdir(parents=True, exist_ok=True)
bus = create_bus(name="test_synthetic_archiveresult") bus = create_bus(name="test_synthetic_archiveresult")
process_service = ProcessService(bus) service = ArchiveResultService(bus)
service = ArchiveResultService(bus, process_service=process_service)
event = ProcessCompletedEvent( event = ArchiveResultEvent(
plugin_name="chrome",
hook_name="on_Snapshot__11_chrome_wait",
stdout="",
stderr="Hook timed out after 60 seconds",
exit_code=-1,
output_dir=str(plugin_dir),
output_files=[],
process_id="proc-failed",
snapshot_id=str(snapshot.id), snapshot_id=str(snapshot.id),
plugin="chrome",
hook_name="on_Snapshot__11_chrome_wait",
status="failed",
output_str="Hook timed out after 60 seconds",
error="Hook timed out after 60 seconds",
start_ts="2026-03-22T12:00:00+00:00", start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:01:00+00:00", end_ts="2026-03-22T12:01:00+00:00",
) )
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir) async def emit_event() -> None:
service._project_from_process_completed( await service.on_ArchiveResultEvent__save_to_db(event)
event,
{ asyncio.run(emit_event())
"plugin": "chrome",
"hook_name": "on_Snapshot__11_chrome_wait",
"status": "failed",
"output_str": "Hook timed out after 60 seconds",
"error": "Hook timed out after 60 seconds",
},
output_files,
output_size,
output_mimetypes,
)
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait") result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait")
assert result.status == ArchiveResult.StatusChoices.FAILED assert result.status == ArchiveResult.StatusChoices.FAILED
@@ -176,45 +148,30 @@ def test_process_completed_projects_synthetic_failed_archiveresult():
def test_process_completed_projects_noresults_archiveresult(): def test_process_completed_projects_noresults_archiveresult():
from archivebox.core.models import ArchiveResult from archivebox.core.models import ArchiveResult
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata from archivebox.services.archive_result_service import ArchiveResultService
from archivebox.services.process_service import ProcessService import asyncio
snapshot = _create_snapshot() snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "title" plugin_dir = Path(snapshot.output_dir) / "title"
plugin_dir.mkdir(parents=True, exist_ok=True) plugin_dir.mkdir(parents=True, exist_ok=True)
bus = create_bus(name="test_noresults_archiveresult") bus = create_bus(name="test_noresults_archiveresult")
process_service = ProcessService(bus) service = ArchiveResultService(bus)
service = ArchiveResultService(bus, process_service=process_service)
event = ProcessCompletedEvent( event = ArchiveResultEvent(
plugin_name="title",
hook_name="on_Snapshot__54_title.js",
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
stderr="",
exit_code=0,
output_dir=str(plugin_dir),
output_files=[],
process_id="proc-noresults",
snapshot_id=str(snapshot.id), snapshot_id=str(snapshot.id),
plugin="title",
hook_name="on_Snapshot__54_title.js",
status="noresults",
output_str="No title found",
start_ts="2026-03-22T12:00:00+00:00", start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:00:01+00:00", end_ts="2026-03-22T12:00:01+00:00",
) )
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir) async def emit_event() -> None:
service._project_from_process_completed( await service.on_ArchiveResultEvent__save_to_db(event)
event,
{ asyncio.run(emit_event())
"snapshot_id": str(snapshot.id),
"plugin": "title",
"hook_name": "on_Snapshot__54_title.js",
"status": "noresults",
"output_str": "No title found",
},
output_files,
output_size,
output_mimetypes,
)
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js") result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
assert result.status == ArchiveResult.StatusChoices.NORESULTS assert result.status == ArchiveResult.StatusChoices.NORESULTS
@@ -258,45 +215,30 @@ def test_retry_failed_archiveresults_requeues_snapshot_in_queued_state():
def test_process_completed_projects_snapshot_title_from_output_str(): def test_process_completed_projects_snapshot_title_from_output_str():
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata from archivebox.services.archive_result_service import ArchiveResultService
from archivebox.services.process_service import ProcessService import asyncio
snapshot = _create_snapshot() snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "title" plugin_dir = Path(snapshot.output_dir) / "title"
plugin_dir.mkdir(parents=True, exist_ok=True) plugin_dir.mkdir(parents=True, exist_ok=True)
bus = create_bus(name="test_snapshot_title_output_str") bus = create_bus(name="test_snapshot_title_output_str")
process_service = ProcessService(bus) service = ArchiveResultService(bus)
service = ArchiveResultService(bus, process_service=process_service)
event = ProcessCompletedEvent( event = ArchiveResultEvent(
plugin_name="title",
hook_name="on_Snapshot__54_title.js",
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"Example Domain"}\n' % snapshot.id,
stderr="",
exit_code=0,
output_dir=str(plugin_dir),
output_files=[],
process_id="proc-title-output-str",
snapshot_id=str(snapshot.id), snapshot_id=str(snapshot.id),
plugin="title",
hook_name="on_Snapshot__54_title.js",
status="succeeded",
output_str="Example Domain",
start_ts="2026-03-22T12:00:00+00:00", start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:00:01+00:00", end_ts="2026-03-22T12:00:01+00:00",
) )
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir) async def emit_event() -> None:
service._project_from_process_completed( await service.on_ArchiveResultEvent__save_to_db(event)
event,
{ asyncio.run(emit_event())
"snapshot_id": str(snapshot.id),
"plugin": "title",
"hook_name": "on_Snapshot__54_title.js",
"status": "succeeded",
"output_str": "Example Domain",
},
output_files,
output_size,
output_mimetypes,
)
snapshot.refresh_from_db() snapshot.refresh_from_db()
assert snapshot.title == "Example Domain" assert snapshot.title == "Example Domain"
@@ -304,8 +246,8 @@ def test_process_completed_projects_snapshot_title_from_output_str():
def test_process_completed_projects_snapshot_title_from_title_file(): def test_process_completed_projects_snapshot_title_from_title_file():
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata from archivebox.services.archive_result_service import ArchiveResultService
from archivebox.services.process_service import ProcessService import asyncio
snapshot = _create_snapshot() snapshot = _create_snapshot()
plugin_dir = Path(snapshot.output_dir) / "title" plugin_dir = Path(snapshot.output_dir) / "title"
@@ -313,37 +255,23 @@ def test_process_completed_projects_snapshot_title_from_title_file():
(plugin_dir / "title.txt").write_text("Example Domain") (plugin_dir / "title.txt").write_text("Example Domain")
bus = create_bus(name="test_snapshot_title_file") bus = create_bus(name="test_snapshot_title_file")
process_service = ProcessService(bus) service = ArchiveResultService(bus)
service = ArchiveResultService(bus, process_service=process_service)
event = ProcessCompletedEvent( event = ArchiveResultEvent(
plugin_name="title",
hook_name="on_Snapshot__54_title.js",
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
stderr="",
exit_code=0,
output_dir=str(plugin_dir),
output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)],
process_id="proc-title-file",
snapshot_id=str(snapshot.id), snapshot_id=str(snapshot.id),
plugin="title",
hook_name="on_Snapshot__54_title.js",
status="noresults",
output_str="No title found",
output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)],
start_ts="2026-03-22T12:00:00+00:00", start_ts="2026-03-22T12:00:00+00:00",
end_ts="2026-03-22T12:00:01+00:00", end_ts="2026-03-22T12:00:01+00:00",
) )
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir) async def emit_event() -> None:
service._project_from_process_completed( await service.on_ArchiveResultEvent__save_to_db(event)
event,
{ asyncio.run(emit_event())
"snapshot_id": str(snapshot.id),
"plugin": "title",
"hook_name": "on_Snapshot__54_title.js",
"status": "noresults",
"output_str": "No title found",
},
output_files,
output_size,
output_mimetypes,
)
snapshot.refresh_from_db() snapshot.refresh_from_db()
assert snapshot.title == "Example Domain" assert snapshot.title == "Example Domain"
@@ -410,9 +338,12 @@ def test_collect_output_metadata_detects_warc_gz_mimetype(tmp_path):
assert output_mimetypes == "application/warc" assert output_mimetypes == "application/warc"
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch): @pytest.mark.django_db(transaction=True)
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch, tmp_path):
from archivebox.machine.models import Binary, NetworkInterface from archivebox.machine.models import Binary, NetworkInterface
from archivebox.services.process_service import ProcessService from archivebox.machine.models import Process as MachineProcess
from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService
from abx_dl.services.process_service import ProcessService as DlProcessService
machine = _create_machine() machine = _create_machine()
iface = _create_iface(machine) iface = _create_iface(machine)
@@ -428,35 +359,60 @@ def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(
status=Binary.StatusChoices.INSTALLED, status=Binary.StatusChoices.INSTALLED,
) )
hook_path = tmp_path / "on_Snapshot__57_mercury.py"
hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
hook_path.chmod(0o755)
output_dir = tmp_path / "mercury"
output_dir.mkdir()
bus = create_bus(name="test_process_started_binary_hydration") bus = create_bus(name="test_process_started_binary_hydration")
service = ProcessService(bus) DlProcessService(bus, emit_jsonl=False, stderr_is_tty=False)
event = ProcessStartedEvent( ArchiveBoxProcessService(bus)
plugin_name="mercury",
hook_name="on_Snapshot__57_mercury.py", async def run_test() -> None:
hook_path="/plugins/mercury/on_Snapshot__57_mercury.py", await bus.emit(
hook_args=["--url=https://example.com"], ProcessEvent(
output_dir="/tmp/mercury", plugin_name="mercury",
env={ hook_name="on_Snapshot__57_mercury.py",
"MERCURY_BINARY": binary.abspath, hook_path=str(hook_path),
"NODE_BINARY": "/tmp/node", hook_args=["--url=https://example.com"],
}, is_background=False,
timeout=60, output_dir=str(output_dir),
pid=4321, env={
process_id="proc-mercury", "MERCURY_BINARY": binary.abspath,
snapshot_id="", "NODE_BINARY": "/tmp/node",
start_ts="2026-03-22T12:00:00+00:00", },
timeout=60,
url="https://example.com",
),
)
started = await bus.find(
ProcessStartedEvent,
past=True,
future=False,
hook_name="on_Snapshot__57_mercury.py",
output_dir=str(output_dir),
)
assert started is not None
import asyncio
asyncio.run(run_test())
process = MachineProcess.objects.get(
pwd=str(output_dir),
cmd=[str(hook_path), "--url=https://example.com"],
) )
service._project_started(event)
process = service._get_or_create_process(event)
assert process.binary_id == binary.id assert process.binary_id == binary.id
assert process.iface_id == iface.id assert process.iface_id == iface.id
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch): @pytest.mark.django_db(transaction=True)
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch, tmp_path):
from archivebox.machine.models import Binary, NetworkInterface from archivebox.machine.models import Binary, NetworkInterface
from archivebox.services.process_service import ProcessService from archivebox.machine.models import Process as MachineProcess
from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService
from abx_dl.services.process_service import ProcessService as DlProcessService
machine = _create_machine() machine = _create_machine()
iface = _create_iface(machine) iface = _create_iface(machine)
@@ -472,27 +428,47 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
status=Binary.StatusChoices.INSTALLED, status=Binary.StatusChoices.INSTALLED,
) )
hook_path = tmp_path / "on_Snapshot__75_parse_dom_outlinks.js"
hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
hook_path.chmod(0o755)
output_dir = tmp_path / "parse-dom-outlinks"
output_dir.mkdir()
bus = create_bus(name="test_process_started_node_fallback") bus = create_bus(name="test_process_started_node_fallback")
service = ProcessService(bus) DlProcessService(bus, emit_jsonl=False, stderr_is_tty=False)
event = ProcessStartedEvent( ArchiveBoxProcessService(bus)
plugin_name="parse_dom_outlinks",
hook_name="on_Snapshot__75_parse_dom_outlinks.js", async def run_test() -> None:
hook_path="/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js", await bus.emit(
hook_args=["--url=https://example.com"], ProcessEvent(
output_dir="/tmp/parse-dom-outlinks", plugin_name="parse_dom_outlinks",
env={ hook_name="on_Snapshot__75_parse_dom_outlinks.js",
"NODE_BINARY": node.abspath, hook_path=str(hook_path),
}, hook_args=["--url=https://example.com"],
timeout=60, is_background=False,
pid=9876, output_dir=str(output_dir),
process_id="proc-parse-dom-outlinks", env={"NODE_BINARY": node.abspath},
snapshot_id="", timeout=60,
start_ts="2026-03-22T12:00:00+00:00", url="https://example.com",
),
)
started = await bus.find(
ProcessStartedEvent,
past=True,
future=False,
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
output_dir=str(output_dir),
)
assert started is not None
import asyncio
asyncio.run(run_test())
process = MachineProcess.objects.get(
pwd=str(output_dir),
cmd=[str(hook_path), "--url=https://example.com"],
) )
service._project_started(event)
process = service._get_or_create_process(event)
assert process.binary_id == node.id assert process.binary_id == node.id
assert process.iface_id == iface.id assert process.iface_id == iface.id
@@ -500,6 +476,7 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
def test_binary_event_reuses_existing_installed_binary_row(monkeypatch): def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
from archivebox.machine.models import Binary, Machine from archivebox.machine.models import Binary, Machine
from archivebox.services.binary_service import BinaryService as ArchiveBoxBinaryService from archivebox.services.binary_service import BinaryService as ArchiveBoxBinaryService
import asyncio
machine = _create_machine() machine = _create_machine()
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine)) monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
@@ -522,7 +499,7 @@ def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
binproviders="provider", binproviders="provider",
) )
service._project_binary(event) asyncio.run(service.on_BinaryRequestEvent(event))
binary.refresh_from_db() binary.refresh_from_db()
assert Binary.objects.filter(machine=machine, name="wget").count() == 1 assert Binary.objects.filter(machine=machine, name="wget").count() == 1

View File

@@ -378,11 +378,8 @@ class TestRecoverOrphanedCrawls:
machine=machine, machine=machine,
process_type=Process.TypeChoices.HOOK, process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING, status=Process.StatusChoices.RUNNING,
pwd=str(snapshot.output_dir / "chrome"),
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js"], cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js"],
env={
"CRAWL_ID": str(crawl.id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(), started_at=timezone.now(),
) )

View File

@@ -464,23 +464,24 @@ class TestDependencyRecordOutput(unittest.TestCase):
self.assertEqual(data["name"], "wget") self.assertEqual(data["name"], "wget")
self.assertTrue(data["abspath"].startswith("/")) self.assertTrue(data["abspath"].startswith("/"))
def test_dependency_record_outputs_machine_config(self): def test_dependency_record_outputs_binary_jsonl(self):
"""Dependency resolution should output Machine config update JSONL.""" """Dependency resolution should output Binary JSONL."""
hook_output = json.dumps( hook_output = json.dumps(
{ {
"type": "Machine", "type": "Binary",
"config": { "name": "wget",
"WGET_BINARY": "/usr/bin/wget", "abspath": "/usr/bin/wget",
}, "version": "1.21.3",
"binprovider": "env",
}, },
) )
from archivebox.machine.models import Process from archivebox.machine.models import Process
data = Process.parse_records_from_text(hook_output)[0] data = Process.parse_records_from_text(hook_output)[0]
self.assertEqual(data["type"], "Machine") self.assertEqual(data["type"], "Binary")
self.assertIn("config", data) self.assertEqual(data["name"], "wget")
self.assertEqual(data["config"]["WGET_BINARY"], "/usr/bin/wget") self.assertEqual(data["abspath"], "/usr/bin/wget")
class TestSnapshotHookOutput(unittest.TestCase): class TestSnapshotHookOutput(unittest.TestCase):

View File

@@ -269,12 +269,12 @@ class TestBinaryModel(TestCase):
self.assertEqual(binary.status, Binary.StatusChoices.QUEUED) self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
self.assertGreater(binary.modified_at, old_modified) self.assertGreater(binary.modified_at, old_modified)
def test_binary_from_json_preserves_install_args_overrides(self): def test_binary_from_json_preserves_provider_overrides(self):
"""Binary.from_json() should persist canonical install_args overrides unchanged.""" """Binary.from_json() should persist provider overrides unchanged."""
overrides = { overrides = {
"apt": {"install_args": ["chromium"]}, "apt": {"install_args": ["chromium"]},
"npm": {"install_args": "puppeteer"}, "npm": {"install_args": "puppeteer"},
"custom": {"install_args": ["bash", "-lc", "echo ok"]}, "custom": {"install": "bash -lc 'echo ok'"},
} }
binary = Binary.from_json( binary = Binary.from_json(

View File

@@ -1,69 +1,4 @@
import asyncio
import json
import pytest import pytest
from abx_dl.events import ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.orchestrator import create_bus
pytestmark = pytest.mark.django_db pytestmark = pytest.mark.django_db
def test_process_service_emits_process_started_from_inline_process_event(monkeypatch):
from archivebox.services import process_service as process_service_module
from archivebox.services.process_service import ProcessService
bus = create_bus(name="test_process_service_inline_process_event")
ProcessService(bus)
monkeypatch.setattr(
process_service_module,
"_ensure_worker",
lambda event: {
"pid": 4321,
"start": 1711111111.0,
"statename": "RUNNING",
"exitstatus": 0,
},
)
async def run_test():
await bus.emit(
ProcessStdoutEvent(
line=json.dumps(
{
"type": "ProcessEvent",
"plugin_name": "search_backend_sonic",
"hook_name": "worker_sonic",
"hook_path": "/usr/bin/sonic",
"hook_args": ["-c", "/tmp/sonic/config.cfg"],
"is_background": True,
"daemon": True,
"url": "tcp://127.0.0.1:1491",
"output_dir": "/tmp/sonic",
"env": {},
"process_type": "worker",
"worker_type": "sonic",
"process_id": "worker:sonic",
"output_str": "127.0.0.1:1491",
},
),
plugin_name="search_backend_sonic",
hook_name="on_CrawlSetup__55_sonic_start.py",
output_dir="/tmp/search_backend_sonic",
snapshot_id="snap-1",
process_id="proc-hook",
),
)
started = await bus.find(ProcessStartedEvent, process_id="worker:sonic")
await bus.stop()
return started
started = asyncio.run(run_test())
assert started is not None
assert started.hook_name == "worker_sonic"
assert started.process_type == "worker"
assert started.worker_type == "sonic"
assert getattr(started, "url", "") == "tcp://127.0.0.1:1491"
assert getattr(started, "output_str", "") == "127.0.0.1:1491"

View File

@@ -34,18 +34,6 @@ class _DummyService:
pass pass
class _DummyAbxServices:
def __init__(self):
self.process = SimpleNamespace(wait_for_background_monitors=self._wait)
async def _wait(self):
return None
async def _call_sync(func, *args, **kwargs):
return func(*args, **kwargs)
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch): def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
@@ -82,18 +70,18 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
monkeypatch.setattr(runner_module, "CrawlService", _DummyService) monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService) monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService) monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices()) monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
download_calls = [] download_calls = []
async def fake_download(*, url, bus, snapshot, **kwargs): async def fake_download(*, url, bus, config_overrides, **kwargs):
extra_context = json.loads(config_overrides["EXTRA_CONTEXT"])
download_calls.append( download_calls.append(
{ {
"url": url, "url": url,
"bus": bus, "bus": bus,
"snapshot_id": snapshot.id, "snapshot_id": extra_context["snapshot_id"],
"source_url": snapshot.url, "source_url": url,
"abx_snapshot_id": snapshot.id,
}, },
) )
await asyncio.sleep(0) await asyncio.sleep(0)
@@ -113,9 +101,8 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
"created_at": snapshot_a.created_at.isoformat() if snapshot_a.created_at else "", "created_at": snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
"tags": snapshot_a.tags_str(), "tags": snapshot_a.tags_str(),
"depth": snapshot_a.depth, "depth": snapshot_a.depth,
"parent_snapshot_id": str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
"output_dir": str(snapshot_a.output_dir), "output_dir": str(snapshot_a.output_dir),
"config": crawl_runner._snapshot_config(snapshot_a), "config": crawl_runner.load_snapshot_payload(str(snapshot_a.id))["config"],
}, },
str(snapshot_b.id): { str(snapshot_b.id): {
"id": str(snapshot_b.id), "id": str(snapshot_b.id),
@@ -127,17 +114,16 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
"created_at": snapshot_b.created_at.isoformat() if snapshot_b.created_at else "", "created_at": snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
"tags": snapshot_b.tags_str(), "tags": snapshot_b.tags_str(),
"depth": snapshot_b.depth, "depth": snapshot_b.depth,
"parent_snapshot_id": str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
"output_dir": str(snapshot_b.output_dir), "output_dir": str(snapshot_b.output_dir),
"config": crawl_runner._snapshot_config(snapshot_b), "config": crawl_runner.load_snapshot_payload(str(snapshot_b.id))["config"],
}, },
} }
monkeypatch.setattr(crawl_runner, "_load_snapshot_run_data", lambda snapshot_id: snapshot_data[snapshot_id]) monkeypatch.setattr(crawl_runner, "load_snapshot_payload", lambda snapshot_id: snapshot_data[snapshot_id])
async def run_both(): async def run_both():
await asyncio.gather( await asyncio.gather(
crawl_runner._run_snapshot(str(snapshot_a.id)), crawl_runner.run_snapshot(str(snapshot_a.id)),
crawl_runner._run_snapshot(str(snapshot_b.id)), crawl_runner.run_snapshot(str(snapshot_b.id)),
) )
asyncio.run(run_both()) asyncio.run(run_both())
@@ -243,10 +229,10 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
refresh_calls = [] refresh_calls = []
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface())) monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc)) monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {}) monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
crawl_runner = runner_module.CrawlRunner(crawl) crawl_runner = runner_module.CrawlRunner(crawl)
crawl_runner._prepare() crawl_runner.load_run_state()
assert refresh_calls == [True] assert refresh_calls == [True]
assert proc.iface is not None assert proc.iface is not None
@@ -254,10 +240,12 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
assert saved_updates == [("iface", "machine", "modified_at")] assert saved_updates == [("iface", "machine", "modified_at")]
def test_installed_binary_config_overrides_include_valid_installed_binaries(monkeypatch): def test_load_run_state_uses_machine_config_as_derived_config(monkeypatch):
from archivebox.machine.models import Binary, Machine from archivebox.machine.models import Machine, NetworkInterface, Process
from archivebox.services import runner as runner_module from archivebox.services import runner as runner_module
from abx_dl.models import Plugin from archivebox.config import configset as configset_module
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
machine = Machine.objects.create( machine = Machine.objects.create(
guid="test-guid-runner-overrides", guid="test-guid-runner-overrides",
@@ -273,143 +261,30 @@ def test_installed_binary_config_overrides_include_valid_installed_binaries(monk
os_release="14.0", os_release="14.0",
os_kernel="Darwin", os_kernel="Darwin",
stats={}, stats={},
config={}, config={"WGET_BINARY": "/tmp/wget", "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}},
) )
mercury_binary = Binary.objects.create( crawl = Crawl.objects.create(
machine=machine, urls="https://example.com",
name="postlight-parser", created_by_id=get_or_create_system_user_pk(),
abspath=sys.executable,
version="2.0.0",
binprovider="pip",
binproviders="env,pip",
status=Binary.StatusChoices.INSTALLED,
)
wget_binary = Binary.objects.create(
machine=machine,
name="wget",
abspath="/tmp/not-an-executable",
version="1.0.0",
binprovider="env",
binproviders="env",
status=Binary.StatusChoices.INSTALLED,
)
puppeteer_binary = Binary.objects.create(
machine=machine,
name="puppeteer",
abspath="/tmp/shared-lib/npm/node_modules/.bin/puppeteer",
version="24.40.0",
binprovider="npm",
binproviders="npm",
status=Binary.StatusChoices.INSTALLED,
)
ytdlp_binary = Binary.objects.create(
machine=machine,
name="yt-dlp",
abspath="/tmp/shared-lib/pip/venv/bin/yt-dlp",
version="2026.3.17",
binprovider="pip",
binproviders="pip",
status=Binary.StatusChoices.INSTALLED,
) )
proc = SimpleNamespace(iface_id=str(machine.id), machine_id=str(machine.id), iface=None, machine=machine, save=lambda **kwargs: None)
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
monkeypatch.setattr( monkeypatch.setattr(
Path, NetworkInterface,
"is_file", "current",
lambda self: ( classmethod(lambda cls, refresh=False: SimpleNamespace(id=machine.id, machine=machine)),
str(self) in {sys.executable, mercury_binary.abspath, wget_binary.abspath, puppeteer_binary.abspath, ytdlp_binary.abspath}
),
) )
monkeypatch.setattr( monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
runner_module.os,
"access",
lambda path, mode: str(path) in {sys.executable, puppeteer_binary.abspath, ytdlp_binary.abspath},
)
overrides = runner_module._installed_binary_config_overrides(
{
"mercury": Plugin(
name="mercury",
path=Path("."),
hooks=[],
config_schema={"MERCURY_BINARY": {"type": "string", "default": "postlight-parser"}},
),
},
)
assert overrides["MERCURY_BINARY"] == sys.executable
assert "POSTLIGHT_PARSER_BINARY" not in overrides
assert "WGET_BINARY" not in overrides
assert overrides["LIB_DIR"] == "/tmp/shared-lib"
assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
assert overrides["PIP_HOME"] == "/tmp/shared-lib/pip"
assert overrides["PIP_BIN_DIR"] == "/tmp/shared-lib/pip/venv/bin"
assert overrides["NPM_HOME"] == "/tmp/shared-lib/npm"
assert overrides["NPM_BIN_DIR"] == "/tmp/shared-lib/npm/node_modules/.bin"
assert overrides["NODE_MODULES_DIR"] == "/tmp/shared-lib/npm/node_modules"
assert overrides["NODE_MODULE_DIR"] == "/tmp/shared-lib/npm/node_modules"
assert overrides["NODE_PATH"] == "/tmp/shared-lib/npm/node_modules"
def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_configurable_binary_keys(monkeypatch):
from archivebox.machine.models import Binary, Machine
from archivebox.services import runner as runner_module
from abx_dl.models import Plugin
machine = Machine.objects.create(
guid="test-guid-runner-singlefile-cache",
hostname="runner-host-singlefile",
hw_in_docker=False,
hw_in_vm=False,
hw_manufacturer="Test",
hw_product="Test Product",
hw_uuid="test-hw-runner-singlefile-cache",
os_arch="arm64",
os_family="darwin",
os_platform="macOS",
os_release="14.0",
os_kernel="Darwin",
stats={},
config={},
)
singlefile_extension = Binary.objects.create(
machine=machine,
name="singlefile",
abspath="/tmp/shared-lib/bin/singlefile",
version="1.0.0",
binprovider="chromewebstore",
binproviders="chromewebstore",
status=Binary.StatusChoices.INSTALLED,
)
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine)) monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
monkeypatch.setattr(Path, "is_file", lambda self: str(self) == singlefile_extension.abspath) monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
monkeypatch.setattr(runner_module.os, "access", lambda path, mode: str(path) == singlefile_extension.abspath)
overrides = runner_module._installed_binary_config_overrides( crawl_runner = runner_module.CrawlRunner(crawl)
{ crawl_runner.load_run_state()
"singlefile": Plugin(
name="singlefile",
path=Path("."),
hooks=[],
config_schema={"SINGLEFILE_BINARY": {"type": "string", "default": "single-file"}},
binaries=[
{"name": "{SINGLEFILE_BINARY}", "binproviders": "env,npm"},
{"name": "singlefile", "binproviders": "chromewebstore"},
],
),
},
config={"SINGLEFILE_BINARY": "single-file"},
)
assert "SINGLEFILE_BINARY" not in overrides assert crawl_runner.derived_config == machine.config
assert "LIB_DIR" not in overrides
assert "LIB_BIN_DIR" not in overrides
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch): def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch, tmp_path):
import asgiref.sync
from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
from archivebox.services import runner as runner_module from archivebox.services import runner as runner_module
@@ -428,12 +303,6 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
monkeypatch.setattr(runner_module, "CrawlService", _DummyService) monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService) monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService) monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
monkeypatch.setattr(runner_module, "_limit_stop_reason", lambda config: "max_size")
monkeypatch.setattr(
asgiref.sync,
"sync_to_async",
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
)
monkeypatch.setattr( monkeypatch.setattr(
runner_module, runner_module,
"download", "download",
@@ -441,8 +310,21 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
) )
crawl_runner = runner_module.CrawlRunner(crawl) crawl_runner = runner_module.CrawlRunner(crawl)
state_dir = tmp_path / ".abx-dl"
state_dir.mkdir(parents=True, exist_ok=True)
(state_dir / "limits.json").write_text(
json.dumps(
{
"admitted_snapshot_ids": ["child-1"],
"counted_process_ids": ["proc-1"],
"total_size": 32,
"stop_reason": "max_size",
},
),
encoding="utf-8",
)
cancelled: list[str] = [] cancelled: list[str] = []
crawl_runner._load_snapshot_run_data = lambda snapshot_id: { crawl_runner.load_snapshot_payload = lambda snapshot_id: {
"id": snapshot_id, "id": snapshot_id,
"url": "https://example.com/child", "url": "https://example.com/child",
"title": "", "title": "",
@@ -452,22 +334,23 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
"tags": "", "tags": "",
"depth": 1, "depth": 1,
"status": "queued", "status": "queued",
"parent_snapshot_id": None,
"output_dir": "/tmp/child", "output_dir": "/tmp/child",
"config": {"CRAWL_DIR": "/tmp/crawl", "MAX_SIZE": 16}, "config": {"CRAWL_DIR": str(tmp_path), "MAX_SIZE": 16},
} }
crawl_runner._cancel_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id) crawl_runner.seal_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id)
asyncio.run(crawl_runner._run_snapshot("child-1")) asyncio.run(crawl_runner.run_snapshot("child-1"))
assert cancelled == ["child-1"] assert cancelled == ["child-1"]
@pytest.mark.django_db(transaction=True)
def test_seal_snapshot_cancels_queued_descendants_after_max_size(): def test_seal_snapshot_cancels_queued_descendants_after_max_size():
from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot from archivebox.core.models import Snapshot
from archivebox.services.snapshot_service import SnapshotService from archivebox.services.snapshot_service import SnapshotService
from abx_dl.events import SnapshotCompletedEvent
from abx_dl.orchestrator import create_bus from abx_dl.orchestrator import create_bus
crawl = Crawl.objects.create( crawl = Crawl.objects.create(
@@ -505,13 +388,22 @@ def test_seal_snapshot_cancels_queued_descendants_after_max_size():
bus = create_bus(name="test_snapshot_limit_cancel") bus = create_bus(name="test_snapshot_limit_cancel")
service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None) service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None)
try: try:
sealed_id = service._seal_snapshot(str(root.id))
async def emit_event() -> None:
await service.on_SnapshotCompletedEvent(
SnapshotCompletedEvent(
url=root.url,
snapshot_id=str(root.id),
output_dir=str(root.output_dir),
),
)
asyncio.run(emit_event())
finally: finally:
asyncio.run(bus.stop()) asyncio.run(bus.stop())
root.refresh_from_db() root.refresh_from_db()
child.refresh_from_db() child.refresh_from_db()
assert sealed_id == str(root.id)
assert root.status == Snapshot.StatusChoices.SEALED assert root.status == Snapshot.StatusChoices.SEALED
assert child.status == Snapshot.StatusChoices.SEALED assert child.status == Snapshot.StatusChoices.SEALED
assert child.retry_at is None assert child.retry_at is None
@@ -548,7 +440,6 @@ def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch): def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
import asgiref.sync
from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot from archivebox.core.models import Snapshot
@@ -565,35 +456,23 @@ def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
status=Snapshot.StatusChoices.STARTED, status=Snapshot.StatusChoices.STARTED,
) )
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None) monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
monkeypatch.setattr(
asgiref.sync,
"sync_to_async",
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
)
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
monkeypatch.setattr(crawl, "is_finished", lambda: False)
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None) monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)]) monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None) monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run()) asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
crawl.refresh_from_db()
assert crawl.status != Crawl.StatusChoices.SEALED assert crawl.status != Crawl.StatusChoices.SEALED
assert crawl.retry_at is not None assert crawl.retry_at is not None
def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch): def test_crawl_runner_calls_load_and_finalize_run_state(monkeypatch):
import asgiref.sync
from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot from archivebox.core.models import Snapshot
@@ -618,50 +497,34 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
monkeypatch.setattr(runner_module, "CrawlService", _DummyService) monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService) monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService) monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None) monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
monkeypatch.setattr(crawl, "cleanup", lambda: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None) monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)]) monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None) monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
sync_to_async_wrapped: list[str] = [] method_calls: list[str] = []
sync_to_async_active = False
def fake_sync_to_async(func, thread_sensitive=True): def wrapped_finalize(self):
async def wrapper(*args, **kwargs): method_calls.append("finalize_run_state")
nonlocal sync_to_async_active return None
sync_to_async_wrapped.append(getattr(func, "__name__", repr(func)))
previous = sync_to_async_active
sync_to_async_active = True
try:
return func(*args, **kwargs)
finally:
sync_to_async_active = previous
return wrapper def wrapped_load(self):
method_calls.append("load_run_state")
return [str(snapshot.id)]
def guarded_is_finished(): monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", wrapped_finalize)
assert sync_to_async_active is True monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", wrapped_load)
return False
monkeypatch.setattr(asgiref.sync, "sync_to_async", fake_sync_to_async)
monkeypatch.setattr(crawl, "is_finished", guarded_is_finished)
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run()) asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
crawl.refresh_from_db() crawl.refresh_from_db()
assert crawl.status == Crawl.StatusChoices.STARTED assert crawl.status == Crawl.StatusChoices.STARTED
assert crawl.retry_at is not None assert crawl.retry_at is not None
assert "guarded_is_finished" in sync_to_async_wrapped assert method_calls == ["load_run_state", "finalize_run_state"]
def test_wait_for_snapshot_tasks_surfaces_already_failed_task(): def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
@@ -680,7 +543,7 @@ def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
task.set_exception(RuntimeError("snapshot failed")) task.set_exception(RuntimeError("snapshot failed"))
crawl_runner.snapshot_tasks["snap-1"] = task crawl_runner.snapshot_tasks["snap-1"] = task
with pytest.raises(RuntimeError, match="snapshot failed"): with pytest.raises(RuntimeError, match="snapshot failed"):
await crawl_runner._wait_for_snapshot_tasks() await crawl_runner.wait_for_snapshot_tasks()
asyncio.run(run_test()) asyncio.run(run_test())
@@ -702,14 +565,13 @@ def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
async def run_test(): async def run_test():
task = asyncio.create_task(finish_snapshot()) task = asyncio.create_task(finish_snapshot())
crawl_runner.snapshot_tasks["snap-1"] = task crawl_runner.snapshot_tasks["snap-1"] = task
await asyncio.wait_for(crawl_runner._wait_for_snapshot_tasks(), timeout=0.5) await asyncio.wait_for(crawl_runner.wait_for_snapshot_tasks(), timeout=0.5)
assert crawl_runner.snapshot_tasks == {} assert crawl_runner.snapshot_tasks == {}
asyncio.run(run_test()) asyncio.run(run_test())
def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch): def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
import asgiref.sync
from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot from archivebox.core.models import Snapshot
@@ -726,30 +588,18 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
status=Snapshot.StatusChoices.STARTED, status=Snapshot.StatusChoices.STARTED,
) )
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None) monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
monkeypatch.setattr(
asgiref.sync,
"sync_to_async",
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
)
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
monkeypatch.setattr(crawl, "is_finished", lambda: False)
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None) monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)]) monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0)) monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None) monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
cleanup_calls = [] cleanup_calls = []
monkeypatch.setattr( monkeypatch.setattr(
runner_module.CrawlRunner, runner_module.CrawlRunner,
"_run_crawl_cleanup", "run_crawl_cleanup",
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0), lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
) )
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run()) asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
@@ -757,17 +607,20 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
assert cleanup_calls == ["abx_cleanup"] assert cleanup_calls == ["abx_cleanup"]
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path): def test_abx_process_service_background_process_finishes_after_process_exit(monkeypatch, tmp_path):
from abx_dl.models import Process as AbxProcess, now_iso from abx_dl.models import Process as AbxProcess, now_iso
from abx_dl.services.process_service import ProcessService from abx_dl.services.process_service import ProcessService
from abx_dl.events import ProcessCompletedEvent from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
service = object.__new__(ProcessService) service = object.__new__(ProcessService)
service.emit_jsonl = False service.emit_jsonl = False
emitted_events = [] emitted_events = []
async def fake_emit_event(event, *, detach_from_parent): class FakeBus:
emitted_events.append((event, detach_from_parent)) async def emit(self, event):
emitted_events.append(event)
service.bus = FakeBus()
async def fake_stream_stdout(**kwargs): async def fake_stream_stdout(**kwargs):
try: try:
@@ -775,19 +628,8 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
except asyncio.CancelledError: except asyncio.CancelledError:
return ["daemon output\n"] return ["daemon output\n"]
service._emit_event = fake_emit_event
monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout) monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout)
class FakeAsyncProcess:
def __init__(self):
self.pid = 42424
self.returncode = None
async def wait(self):
await asyncio.sleep(0)
self.returncode = 0
return 0
plugin_output_dir = tmp_path / "chrome" plugin_output_dir = tmp_path / "chrome"
plugin_output_dir.mkdir() plugin_output_dir.mkdir()
stdout_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.stdout.log" stdout_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.stdout.log"
@@ -804,41 +646,45 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
plugin="chrome", plugin="chrome",
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg", hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
) )
process = FakeAsyncProcess()
event = SimpleNamespace(
plugin_name="chrome",
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
hook_path="hook",
hook_args=["--url=https://example.org/"],
env={},
output_dir=str(plugin_output_dir),
timeout=60,
snapshot_id="snap-1",
is_background=True,
url="https://example.org/",
process_type="hook",
worker_type="hook",
)
async def run_test(): async def run_test():
process = await asyncio.create_subprocess_exec(
sys.executable,
"-c",
"pass",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
event = ProcessStartedEvent(
plugin_name="chrome",
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
hook_path="hook",
hook_args=["--url=https://example.org/"],
env={},
output_dir=str(plugin_output_dir),
timeout=60,
pid=process.pid,
is_background=True,
url="https://example.org/",
process_type="hook",
worker_type="hook",
start_ts=proc.started_at or "",
subprocess=process,
stdout_file=stdout_file,
stderr_file=stderr_file,
pid_file=pid_file,
cmd_file=plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.sh",
files_before=set(),
)
await asyncio.wait_for( await asyncio.wait_for(
service._monitor_background_process( service.on_ProcessStartedEvent(event),
event=event,
proc=proc,
process=process,
plugin_output_dir=plugin_output_dir,
stdout_file=stdout_file,
stderr_file=stderr_file,
pid_file=pid_file,
files_before=set(),
),
timeout=0.5, timeout=0.5,
) )
asyncio.run(run_test()) asyncio.run(run_test())
assert pid_file.exists() is False assert pid_file.exists() is False
assert any(isinstance(event, ProcessCompletedEvent) for event, _ in emitted_events) assert any(isinstance(event, ProcessCompletedEvent) for event in emitted_events)
def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch): def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):

View File

@@ -0,0 +1,48 @@
import asyncio
import pytest
from abx_dl.events import TagEvent
from abx_dl.orchestrator import create_bus
pytestmark = pytest.mark.django_db(transaction=True)
def _create_snapshot():
    """Create and return a STARTED Snapshot attached to a fresh Crawl fixture.

    Imports are deferred to call time so Django app registry setup has
    already happened when the fixture is built.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    # Every Snapshot must belong to a Crawl; make a minimal one owned by the system user.
    parent_crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
    )
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=parent_crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    return snapshot
def test_tag_event_projects_tag_to_snapshot():
    """Emitting a TagEvent on the bus should create the Tag row and link it to the snapshot."""
    from archivebox.core.models import Tag
    from archivebox.services.tag_service import TagService

    snapshot = _create_snapshot()
    bus = create_bus(name="test_tag_service")
    # TagService subscribes itself to the bus on construction; we only need the side effect.
    TagService(bus)

    event = TagEvent(
        name="example",
        snapshot_id=str(snapshot.id),
    )

    async def _emit() -> None:
        await bus.emit(event)

    asyncio.run(_emit())

    snapshot.refresh_from_db()
    # The tag must exist both globally and on the snapshot's M2M relation.
    assert snapshot.tags.filter(name="example").exists()
    assert Tag.objects.filter(name="example").exists()
assert Tag.objects.filter(name="example").exists()

2
docs

Submodule docs updated: be25d9bfa2...7244076ece

View File

@@ -42,7 +42,7 @@ Crawl.run()
{'type': 'Dependency', 'bin_name': 'wget', 'bin_providers': 'apt,brew', 'overrides': {...}} {'type': 'Dependency', 'bin_name': 'wget', 'bin_providers': 'apt,brew', 'overrides': {...}}
# ❌ WRONG - uses different field names # ❌ WRONG - uses different field names
{'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'custom_cmds': {...}} {'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'overrides': {...}}
``` ```
4. **No hardcoding** - Never hardcode binary names, provider names, or anything else. Use discovery. 4. **No hardcoding** - Never hardcode binary names, provider names, or anything else. Use discovery.
@@ -84,7 +84,7 @@ Crawl.run()
# ❌ WRONG - complex transformation logic # ❌ WRONG - complex transformation logic
if obj.get('type') == 'Dependency': if obj.get('type') == 'Dependency':
dep = Dependency.objects.create(name=obj['bin_name']) # renaming fields dep = Dependency.objects.create(name=obj['bin_name']) # renaming fields
dep.custom_commands = transform_overrides(obj['overrides']) # transforming data dep.overrides = transform_overrides(obj['overrides']) # transforming data
``` ```
### Pattern Consistency ### Pattern Consistency

View File

@@ -159,6 +159,11 @@ environments = ["sys_platform == 'darwin'", "sys_platform == 'linux'"]
package = true package = true
# compile-bytecode = true # compile-bytecode = true
[tool.uv.sources]
abx-pkg = { path = "../abx-pkg", editable = true }
abx-plugins = { path = "../abx-plugins", editable = true }
abx-dl = { path = "../abx-dl", editable = true }
[build-system] [build-system]
requires = ["pdm-backend"] requires = ["pdm-backend"]
build-backend = "pdm.backend" build-backend = "pdm.backend"