mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
update working changes
This commit is contained in:
@@ -26,7 +26,7 @@ EVENT_FLOW_DIAGRAM = """
|
||||
│ CrawlStartEvent │
|
||||
│ └─ SnapshotEvent │
|
||||
│ └─ on_Snapshot__* │
|
||||
│ └─ Snapshot / ArchiveResult / Tag / Machine / BinaryRequest │
|
||||
│ └─ ArchiveResult / Snapshot / Tag │
|
||||
│ │
|
||||
│ SnapshotCleanupEvent -> internal cleanup, no direct hook family │
|
||||
│ CrawlCleanupEvent -> internal cleanup, no direct hook family │
|
||||
@@ -89,8 +89,8 @@ def pluginmap(
|
||||
"emits": ["ProcessEvent"],
|
||||
},
|
||||
"SnapshotEvent": {
|
||||
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, Tag, and BinaryRequest records.",
|
||||
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "BinaryRequestEvent", "ProcessEvent"],
|
||||
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, and Tag records.",
|
||||
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "ProcessEvent"],
|
||||
},
|
||||
"SnapshotCleanupEvent": {
|
||||
"description": "Internal snapshot cleanup phase.",
|
||||
|
||||
@@ -267,19 +267,13 @@ def get_config(
|
||||
if crawl and hasattr(crawl, "output_dir"):
|
||||
config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir)
|
||||
config["CRAWL_DIR"] = str(crawl.output_dir)
|
||||
config["CRAWL_ID"] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get("CRAWL_ID")
|
||||
|
||||
# Apply snapshot config overrides (highest priority)
|
||||
if snapshot and hasattr(snapshot, "config") and snapshot.config:
|
||||
config.update(snapshot.config)
|
||||
|
||||
if snapshot:
|
||||
config["SNAPSHOT_ID"] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get("SNAPSHOT_ID")
|
||||
config["SNAPSHOT_DEPTH"] = int(getattr(snapshot, "depth", 0) or 0)
|
||||
if hasattr(snapshot, "output_dir"):
|
||||
config["SNAP_DIR"] = str(snapshot.output_dir)
|
||||
if getattr(snapshot, "crawl_id", None):
|
||||
config["CRAWL_ID"] = str(snapshot.crawl_id)
|
||||
if snapshot and hasattr(snapshot, "output_dir"):
|
||||
config["SNAP_DIR"] = str(snapshot.output_dir)
|
||||
|
||||
# Normalize all aliases to canonical names (after all sources merged)
|
||||
# This handles aliases that came from user/crawl/snapshot configs, not just env
|
||||
|
||||
@@ -38,8 +38,8 @@ def _quote_shell_string(value: str) -> str:
|
||||
|
||||
|
||||
def _get_replay_source_url(result: ArchiveResult) -> str:
|
||||
process_env = getattr(getattr(result, "process", None), "env", None) or {}
|
||||
return str(process_env.get("SOURCE_URL") or result.snapshot.url or "")
|
||||
process = getattr(result, "process", None)
|
||||
return str(getattr(process, "url", None) or result.snapshot.url or "")
|
||||
|
||||
|
||||
def build_abx_dl_display_command(result: ArchiveResult) -> str:
|
||||
|
||||
@@ -1322,6 +1322,17 @@ def live_progress_view(request):
|
||||
|
||||
# Build hierarchical active crawls with nested snapshots and archive results
|
||||
|
||||
active_crawls_qs = (
|
||||
Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
|
||||
.prefetch_related(
|
||||
"snapshot_set",
|
||||
"snapshot_set__archiveresult_set",
|
||||
"snapshot_set__archiveresult_set__process",
|
||||
)
|
||||
.distinct()
|
||||
.order_by("-modified_at")[:10]
|
||||
)
|
||||
|
||||
running_processes = Process.objects.filter(
|
||||
machine=machine,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
@@ -1343,28 +1354,45 @@ def live_progress_view(request):
|
||||
process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {}
|
||||
process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {}
|
||||
seen_process_records: set[str] = set()
|
||||
snapshots = [snapshot for crawl in active_crawls_qs for snapshot in crawl.snapshot_set.all()]
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
env = {}
|
||||
|
||||
crawl_id = env.get("CRAWL_ID")
|
||||
snapshot_id = env.get("SNAPSHOT_ID")
|
||||
if not proc.pwd:
|
||||
continue
|
||||
proc_pwd = Path(proc.pwd)
|
||||
matched_snapshot = None
|
||||
for snapshot in snapshots:
|
||||
try:
|
||||
proc_pwd.relative_to(snapshot.output_dir)
|
||||
matched_snapshot = snapshot
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
if matched_snapshot is None:
|
||||
continue
|
||||
crawl_id = str(matched_snapshot.crawl_id)
|
||||
snapshot_id = str(matched_snapshot.id)
|
||||
_plugin, _label, phase, _hook_name = process_label(proc.cmd)
|
||||
if crawl_id and proc.pid:
|
||||
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
|
||||
crawl_process_pids.setdefault(crawl_id, proc.pid)
|
||||
if phase == "snapshot" and snapshot_id and proc.pid:
|
||||
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
|
||||
snapshot_process_pids.setdefault(snapshot_id, proc.pid)
|
||||
|
||||
for proc in recent_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
env = {}
|
||||
|
||||
crawl_id = env.get("CRAWL_ID")
|
||||
snapshot_id = env.get("SNAPSHOT_ID")
|
||||
if not crawl_id and not snapshot_id:
|
||||
if not proc.pwd:
|
||||
continue
|
||||
proc_pwd = Path(proc.pwd)
|
||||
matched_snapshot = None
|
||||
for snapshot in snapshots:
|
||||
try:
|
||||
proc_pwd.relative_to(snapshot.output_dir)
|
||||
matched_snapshot = snapshot
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
if matched_snapshot is None:
|
||||
continue
|
||||
crawl_id = str(matched_snapshot.crawl_id)
|
||||
snapshot_id = str(matched_snapshot.id)
|
||||
|
||||
plugin, label, phase, hook_name = process_label(proc.cmd)
|
||||
|
||||
@@ -1393,20 +1421,9 @@ def live_progress_view(request):
|
||||
payload["pid"] = proc.pid
|
||||
proc_started_at = proc.started_at or proc.modified_at
|
||||
if phase == "snapshot" and snapshot_id:
|
||||
process_records_by_snapshot.setdefault(str(snapshot_id), []).append((payload, proc_started_at))
|
||||
process_records_by_snapshot.setdefault(snapshot_id, []).append((payload, proc_started_at))
|
||||
elif crawl_id:
|
||||
process_records_by_crawl.setdefault(str(crawl_id), []).append((payload, proc_started_at))
|
||||
|
||||
active_crawls_qs = (
|
||||
Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
|
||||
.prefetch_related(
|
||||
"snapshot_set",
|
||||
"snapshot_set__archiveresult_set",
|
||||
"snapshot_set__archiveresult_set__process",
|
||||
)
|
||||
.distinct()
|
||||
.order_by("-modified_at")[:10]
|
||||
)
|
||||
process_records_by_crawl.setdefault(crawl_id, []).append((payload, proc_started_at))
|
||||
|
||||
active_crawls = []
|
||||
total_workers = 0
|
||||
|
||||
@@ -827,7 +827,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
for record in records[:3]:
|
||||
print(f" Record: type={record.get('type')}, keys={list(record.keys())[:5]}")
|
||||
if system_task:
|
||||
records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary", "Machine")]
|
||||
records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary")]
|
||||
overrides = {"crawl": self}
|
||||
stats = process_hook_records(records, overrides=overrides)
|
||||
if stats:
|
||||
|
||||
@@ -13,13 +13,9 @@ Hook-backed event families are discovered from filenames like:
|
||||
on_CrawlSetup__*
|
||||
on_Snapshot__*
|
||||
|
||||
InstallEvent itself is still part of the runtime lifecycle, but it has no
|
||||
corresponding hook family. Its dependency declarations come directly from each
|
||||
plugin's `config.json > required_binaries`.
|
||||
|
||||
Lifecycle event names like `InstallEvent` or `SnapshotCleanupEvent` are
|
||||
normalized to the corresponding `on_{EventFamily}__*` prefix by a simple
|
||||
string transform. If no scripts exist for that prefix, discovery returns `[]`.
|
||||
Internal bus event names are normalized to the corresponding
|
||||
`on_{EventFamily}__*` prefix by a simple string transform. If no scripts exist
|
||||
for that prefix, discovery returns `[]`.
|
||||
|
||||
Directory structure:
|
||||
abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package)
|
||||
@@ -120,7 +116,6 @@ def normalize_hook_event_name(event_name: str) -> str | None:
|
||||
Normalize a hook event family or event class name to its on_* prefix.
|
||||
|
||||
Examples:
|
||||
InstallEvent -> Install
|
||||
BinaryRequestEvent -> BinaryRequest
|
||||
CrawlSetupEvent -> CrawlSetup
|
||||
SnapshotEvent -> Snapshot
|
||||
@@ -171,7 +166,7 @@ def discover_hooks(
|
||||
|
||||
Args:
|
||||
event_name: Hook event family or event class name.
|
||||
Examples: 'Install', 'InstallEvent', 'BinaryRequestEvent', 'Snapshot'.
|
||||
Examples: 'BinaryRequestEvent', 'Snapshot'.
|
||||
Event names are normalized by stripping a trailing `Event`.
|
||||
If no matching `on_{EventFamily}__*` scripts exist, returns [].
|
||||
filter_disabled: If True, skip hooks from disabled plugins (default: True)
|
||||
@@ -1070,9 +1065,8 @@ def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any
|
||||
Process JSONL records emitted by hook stdout.
|
||||
|
||||
This handles hook-emitted record types such as Snapshot, Tag, BinaryRequest,
|
||||
Binary, and Machine. It does not process bus lifecycle events like
|
||||
InstallEvent, CrawlEvent, CrawlCleanupEvent, or SnapshotCleanupEvent, since
|
||||
those are not emitted as JSONL records by hook subprocesses.
|
||||
and Binary. It does not process internal bus lifecycle events, since those
|
||||
are not emitted as JSONL records by hook subprocesses.
|
||||
|
||||
Args:
|
||||
records: List of JSONL record dicts from result['records']
|
||||
@@ -1131,13 +1125,6 @@ def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any
|
||||
if obj:
|
||||
stats[record_type] = stats.get(record_type, 0) + 1
|
||||
|
||||
elif record_type == "Machine":
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
obj = Machine.from_json(record.copy(), overrides)
|
||||
if obj:
|
||||
stats["Machine"] = stats.get("Machine", 0) + 1
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
||||
|
||||
@@ -566,33 +566,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
return None
|
||||
return {provider.strip() for provider in providers.split(",") if provider.strip()}
|
||||
|
||||
def _get_custom_install_command(self) -> str | None:
|
||||
"""Extract a custom install command from overrides when the custom provider is used."""
|
||||
import shlex
|
||||
|
||||
if not isinstance(self.overrides, dict):
|
||||
return None
|
||||
|
||||
for key in ("custom_cmd", "cmd", "command"):
|
||||
value = self.overrides.get(key)
|
||||
if isinstance(value, str) and value.strip():
|
||||
return value.strip()
|
||||
|
||||
custom_overrides = self.overrides.get("custom")
|
||||
if isinstance(custom_overrides, dict):
|
||||
for key in ("custom_cmd", "cmd", "command"):
|
||||
value = custom_overrides.get(key)
|
||||
if isinstance(value, str) and value.strip():
|
||||
return value.strip()
|
||||
|
||||
install_args = custom_overrides.get("install_args")
|
||||
if isinstance(install_args, str) and install_args.strip():
|
||||
return install_args.strip()
|
||||
if isinstance(install_args, list) and install_args:
|
||||
return " ".join(shlex.quote(str(arg)) for arg in install_args if str(arg).strip())
|
||||
|
||||
return None
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Execute binary installation by running on_BinaryRequest__* hooks.
|
||||
@@ -637,13 +610,8 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
plugin_output_dir = output_dir / plugin_name
|
||||
plugin_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
custom_cmd = None
|
||||
overrides_json = None
|
||||
if plugin_name == "custom":
|
||||
custom_cmd = self._get_custom_install_command()
|
||||
if not custom_cmd:
|
||||
continue
|
||||
elif self.overrides:
|
||||
if self.overrides:
|
||||
overrides_json = json.dumps(self.overrides)
|
||||
|
||||
# Run the hook
|
||||
@@ -656,7 +624,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
machine_id=str(self.machine_id),
|
||||
name=self.name,
|
||||
binproviders=self.binproviders,
|
||||
custom_cmd=custom_cmd,
|
||||
overrides=overrides_json,
|
||||
)
|
||||
|
||||
|
||||
@@ -9,12 +9,11 @@ from typing import Any
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
|
||||
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent, ProcessStartedEvent, SnapshotEvent
|
||||
from abx_dl.output_files import guess_mimetype
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
from .process_service import ProcessService, parse_event_datetime
|
||||
from .process_service import parse_event_datetime
|
||||
|
||||
|
||||
def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
|
||||
@@ -209,79 +208,41 @@ class ArchiveResultService(BaseService):
|
||||
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
|
||||
EMITS = []
|
||||
|
||||
def __init__(self, bus, *, process_service: ProcessService):
|
||||
self.process_service = process_service
|
||||
def __init__(self, bus):
|
||||
super().__init__(bus)
|
||||
self.bus.on(ArchiveResultEvent, self.on_ArchiveResultEvent__save_to_db)
|
||||
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
|
||||
|
||||
async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None:
    """Resolve output metadata for an ArchiveResultEvent and hand it to ``_project``.

    The snapshot's output directory is looked up through ``run_db_op``; when
    that lookup returns None the event is ignored. Otherwise the plugin
    directory is ``<snapshot output dir>/<event.plugin>``, the metadata is
    resolved from it, and the result is passed on to ``self._project``.
    """
    snapshot_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
    if snapshot_dir is not None:
        resolve_metadata = sync_to_async(_resolve_output_metadata)
        metadata = await resolve_metadata(event.output_files, Path(snapshot_dir) / event.plugin)
        # The resolver yields a (files, total size, mimetypes) triple.
        files, size, mimetypes = metadata
        await run_db_op(self._project, event, files, size, mimetypes)
|
||||
|
||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
||||
if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
|
||||
return
|
||||
|
||||
plugin_dir = Path(event.output_dir)
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
||||
records = _iter_archiveresult_records(event.stdout)
|
||||
if records:
|
||||
for record in records:
|
||||
await run_db_op(
|
||||
self._project_from_process_completed,
|
||||
event,
|
||||
record,
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
return
|
||||
|
||||
synthetic_record = {
|
||||
"plugin": event.plugin_name,
|
||||
"hook_name": event.hook_name,
|
||||
"status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
|
||||
"output_str": event.stderr if event.exit_code != 0 else "",
|
||||
"error": event.stderr if event.exit_code != 0 else "",
|
||||
}
|
||||
await run_db_op(
|
||||
self._project_from_process_completed,
|
||||
event,
|
||||
synthetic_record,
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
|
||||
def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
    """Return the output directory of the snapshot with ``snapshot_id`` as a string.

    Returns None when no Snapshot row has that id. Only the ``output_dir``
    field is requested from the queryset.
    """
    from archivebox.core.models import Snapshot

    matches = Snapshot.objects.filter(id=snapshot_id).only("output_dir")
    snapshot = matches.first()
    if snapshot is None:
        return None
    return str(snapshot.output_dir)
|
||||
|
||||
def _project(
|
||||
self,
|
||||
event: ArchiveResultEvent,
|
||||
output_files: dict[str, dict],
|
||||
output_size: int,
|
||||
output_mimetypes: str,
|
||||
) -> None:
|
||||
async def on_ArchiveResultEvent__save_to_db(self, event: ArchiveResultEvent) -> None:
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first()
|
||||
snapshot = await Snapshot.objects.filter(id=event.snapshot_id).select_related("crawl", "crawl__created_by").afirst()
|
||||
if snapshot is None:
|
||||
return
|
||||
|
||||
plugin_dir = Path(snapshot.output_dir) / event.plugin
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
||||
process_started = await self.bus.find(
|
||||
ProcessStartedEvent,
|
||||
past=True,
|
||||
future=False,
|
||||
where=lambda candidate: self.bus.event_is_child_of(event, candidate),
|
||||
)
|
||||
process = None
|
||||
db_process_id = self.process_service.get_db_process_id(event.process_id)
|
||||
if db_process_id:
|
||||
process = Process.objects.filter(id=db_process_id).first()
|
||||
if process_started is not None:
|
||||
started_at = parse_event_datetime(process_started.start_ts)
|
||||
if started_at is None:
|
||||
raise ValueError("ProcessStartedEvent.start_ts is required")
|
||||
process_query = Process.objects.filter(
|
||||
pwd=process_started.output_dir,
|
||||
cmd=[process_started.hook_path, *process_started.hook_args],
|
||||
started_at=started_at,
|
||||
)
|
||||
if process_started.pid:
|
||||
process_query = process_query.filter(pid=process_started.pid)
|
||||
process = await process_query.order_by("-modified_at").afirst()
|
||||
|
||||
result, _created = ArchiveResult.objects.get_or_create(
|
||||
result, _created = await ArchiveResult.objects.aget_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=event.plugin,
|
||||
hook_name=event.hook_name,
|
||||
@@ -302,32 +263,54 @@ class ArchiveResultService(BaseService):
|
||||
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
|
||||
if event.error:
|
||||
result.notes = event.error
|
||||
result.save()
|
||||
await result.asave()
|
||||
|
||||
next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url)
|
||||
if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
|
||||
snapshot.title = next_title
|
||||
snapshot.save(update_fields=["title", "modified_at"])
|
||||
await snapshot.asave(update_fields=["title", "modified_at"])
|
||||
|
||||
def _project_from_process_completed(
|
||||
self,
|
||||
event: ProcessCompletedEvent,
|
||||
record: dict,
|
||||
output_files: dict[str, dict],
|
||||
output_size: int,
|
||||
output_mimetypes: str,
|
||||
) -> None:
|
||||
archive_result_event = ArchiveResultEvent(
|
||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
||||
plugin=record.get("plugin") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or event.hook_name,
|
||||
status=record.get("status") or "",
|
||||
process_id=event.process_id,
|
||||
output_str=record.get("output_str") or "",
|
||||
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
|
||||
output_files=event.output_files,
|
||||
start_ts=event.start_ts,
|
||||
end_ts=event.end_ts,
|
||||
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
|
||||
async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
|
||||
if not event.hook_name.startswith("on_Snapshot"):
|
||||
return
|
||||
snapshot_event = await self.bus.find(
|
||||
SnapshotEvent,
|
||||
past=True,
|
||||
future=False,
|
||||
where=lambda candidate: self.bus.event_is_child_of(event, candidate),
|
||||
)
|
||||
if snapshot_event is None:
|
||||
return
|
||||
|
||||
records = _iter_archiveresult_records(event.stdout)
|
||||
if records:
|
||||
for record in records:
|
||||
await self.bus.emit(
|
||||
ArchiveResultEvent(
|
||||
snapshot_id=record.get("snapshot_id") or snapshot_event.snapshot_id,
|
||||
plugin=record.get("plugin") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or event.hook_name,
|
||||
status=record.get("status") or "",
|
||||
output_str=record.get("output_str") or "",
|
||||
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
|
||||
output_files=event.output_files,
|
||||
start_ts=event.start_ts,
|
||||
end_ts=event.end_ts,
|
||||
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
|
||||
),
|
||||
)
|
||||
return
|
||||
|
||||
await self.bus.emit(
|
||||
ArchiveResultEvent(
|
||||
snapshot_id=snapshot_event.snapshot_id,
|
||||
plugin=event.plugin_name,
|
||||
hook_name=event.hook_name,
|
||||
status="failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
|
||||
output_str=event.stderr if event.exit_code != 0 else "",
|
||||
output_files=event.output_files,
|
||||
start_ts=event.start_ts,
|
||||
end_ts=event.end_ts,
|
||||
error=event.stderr if event.exit_code != 0 else "",
|
||||
),
|
||||
)
|
||||
self._project(archive_result_event, output_files, output_size, output_mimetypes)
|
||||
|
||||
@@ -1,20 +1,62 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
from abx_dl.events import BinaryRequestEvent, BinaryEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class BinaryService(BaseService):
|
||||
LISTENS_TO = [BinaryRequestEvent, BinaryEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_BinaryRequestEvent__Outer(self, event: BinaryRequestEvent) -> None:
|
||||
await run_db_op(self._project_binary, event)
|
||||
cached = await run_db_op(self._load_cached_binary, event)
|
||||
def __init__(self, bus):
|
||||
super().__init__(bus)
|
||||
self.bus.on(BinaryRequestEvent, self.on_BinaryRequestEvent)
|
||||
self.bus.on(BinaryEvent, self.on_BinaryEvent)
|
||||
|
||||
async def on_BinaryRequestEvent(self, event: BinaryRequestEvent) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
existing = await Binary.objects.filter(machine=machine, name=event.name).afirst()
|
||||
if existing and existing.status == Binary.StatusChoices.INSTALLED:
|
||||
changed = False
|
||||
if event.binproviders and existing.binproviders != event.binproviders:
|
||||
existing.binproviders = event.binproviders
|
||||
changed = True
|
||||
if event.overrides and existing.overrides != event.overrides:
|
||||
existing.overrides = event.overrides
|
||||
changed = True
|
||||
if changed:
|
||||
await existing.asave(update_fields=["binproviders", "overrides", "modified_at"])
|
||||
elif existing is None:
|
||||
await Binary.objects.acreate(
|
||||
machine=machine,
|
||||
name=event.name,
|
||||
binproviders=event.binproviders,
|
||||
overrides=event.overrides or {},
|
||||
status=Binary.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
installed = (
|
||||
await Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
|
||||
.exclude(abspath="")
|
||||
.exclude(abspath__isnull=True)
|
||||
.order_by("-modified_at")
|
||||
.afirst()
|
||||
)
|
||||
cached = None
|
||||
if installed is not None:
|
||||
cached = {
|
||||
"abspath": installed.abspath,
|
||||
"version": installed.version or "",
|
||||
"sha256": installed.sha256 or "",
|
||||
"binproviders": installed.binproviders or "",
|
||||
"binprovider": installed.binprovider or "",
|
||||
"machine_id": str(installed.machine_id),
|
||||
"overrides": installed.overrides or {},
|
||||
}
|
||||
if cached is not None:
|
||||
await self.bus.emit(
|
||||
BinaryEvent(
|
||||
@@ -28,126 +70,34 @@ class BinaryService(BaseService):
|
||||
binprovider=cached["binprovider"],
|
||||
overrides=event.overrides or cached["overrides"],
|
||||
binary_id=event.binary_id,
|
||||
machine_id=event.machine_id or cached["machine_id"],
|
||||
machine_id=cached["machine_id"],
|
||||
),
|
||||
)
|
||||
|
||||
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
|
||||
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
|
||||
await run_db_op(self._project_installed_binary, event, resolved)
|
||||
|
||||
def _project_binary(self, event: BinaryRequestEvent) -> None:
|
||||
async def on_BinaryEvent(self, event: BinaryEvent) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
existing = Binary.objects.filter(machine=machine, name=event.name).first()
|
||||
if existing and existing.status == Binary.StatusChoices.INSTALLED:
|
||||
changed = False
|
||||
if event.binproviders and existing.binproviders != event.binproviders:
|
||||
existing.binproviders = event.binproviders
|
||||
changed = True
|
||||
if event.overrides and existing.overrides != event.overrides:
|
||||
existing.overrides = event.overrides
|
||||
changed = True
|
||||
if changed:
|
||||
existing.save(update_fields=["binproviders", "overrides", "modified_at"])
|
||||
return
|
||||
|
||||
Binary.from_json(
|
||||
{
|
||||
"name": event.name,
|
||||
"binproviders": event.binproviders,
|
||||
"overrides": event.overrides or {},
|
||||
},
|
||||
)
|
||||
|
||||
def _load_cached_binary(self, event: BinaryRequestEvent) -> dict[str, str] | None:
    """Look up an already-installed Binary row matching ``event.name`` on the current machine.

    Only rows with status INSTALLED and a non-empty, non-null ``abspath`` are
    considered; the most recently modified one wins.

    Returns:
        A dict describing the cached binary (abspath, version, sha256,
        binproviders, binprovider, machine_id, overrides), or None when no
        such row exists.
    """
    from archivebox.machine.models import Binary, Machine

    machine = Machine.current()
    candidates = Binary.objects.filter(
        machine=machine,
        name=event.name,
        status=Binary.StatusChoices.INSTALLED,
    )
    # Rows without a usable path cannot serve as a cache hit.
    candidates = candidates.exclude(abspath="").exclude(abspath__isnull=True)
    newest = candidates.order_by("-modified_at").first()
    if newest is None:
        return None

    cached = {
        "abspath": newest.abspath,
        "version": newest.version or "",
        "sha256": newest.sha256 or "",
        "binproviders": newest.binproviders or "",
        "binprovider": newest.binprovider or "",
        "machine_id": str(newest.machine_id),
        "overrides": newest.overrides or {},
    }
    return cached
|
||||
|
||||
def _resolve_installed_binary_metadata(self, event: BinaryEvent) -> dict[str, str]:
|
||||
resolved = {
|
||||
"abspath": event.abspath or "",
|
||||
"version": event.version or "",
|
||||
"sha256": event.sha256 or "",
|
||||
"binproviders": event.binproviders or "",
|
||||
"binprovider": event.binprovider or "",
|
||||
}
|
||||
if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
|
||||
return resolved
|
||||
|
||||
if resolved["abspath"] and not resolved["version"]:
|
||||
try:
|
||||
from abx_pkg.semver import bin_version
|
||||
|
||||
detected_version = bin_version(resolved["abspath"])
|
||||
except Exception:
|
||||
detected_version = None
|
||||
if detected_version:
|
||||
resolved["version"] = str(detected_version)
|
||||
if resolved["version"] and resolved["binprovider"]:
|
||||
return resolved
|
||||
|
||||
try:
|
||||
from abx_dl.dependencies import load_binary
|
||||
|
||||
allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
|
||||
spec = {
|
||||
"name": event.name,
|
||||
"binproviders": allowed_providers,
|
||||
"overrides": event.overrides or {},
|
||||
}
|
||||
binary = load_binary(spec)
|
||||
resolved["abspath"] = str(binary.abspath or resolved["abspath"] or "")
|
||||
resolved["version"] = str(binary.version or resolved["version"] or "")
|
||||
resolved["sha256"] = str(binary.sha256 or resolved["sha256"] or "")
|
||||
if binary.loaded_binprovider is not None and binary.loaded_binprovider.name:
|
||||
resolved["binprovider"] = str(binary.loaded_binprovider.name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return resolved
|
||||
|
||||
def _project_installed_binary(self, event: BinaryEvent, resolved: dict[str, str]) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
binary, _ = Binary.objects.get_or_create(
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
binary, _ = await Binary.objects.aget_or_create(
|
||||
machine=machine,
|
||||
name=event.name,
|
||||
defaults={
|
||||
"status": Binary.StatusChoices.QUEUED,
|
||||
},
|
||||
)
|
||||
binary.abspath = resolved["abspath"] or binary.abspath
|
||||
binary.version = resolved["version"] or binary.version
|
||||
binary.sha256 = resolved["sha256"] or binary.sha256
|
||||
if resolved["binproviders"]:
|
||||
binary.binproviders = resolved["binproviders"]
|
||||
binary.binprovider = resolved["binprovider"] or binary.binprovider
|
||||
binary.abspath = event.abspath
|
||||
if event.version:
|
||||
binary.version = event.version
|
||||
if event.sha256:
|
||||
binary.sha256 = event.sha256
|
||||
if event.binproviders:
|
||||
binary.binproviders = event.binproviders
|
||||
if event.binprovider:
|
||||
binary.binprovider = event.binprovider
|
||||
if event.overrides and binary.overrides != event.overrides:
|
||||
binary.overrides = event.overrides
|
||||
binary.status = Binary.StatusChoices.INSTALLED
|
||||
binary.retry_at = None
|
||||
binary.save(
|
||||
await binary.asave(
|
||||
update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"],
|
||||
)
|
||||
|
||||
@@ -3,8 +3,6 @@ from __future__ import annotations
|
||||
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class CrawlService(BaseService):
|
||||
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
|
||||
@@ -13,32 +11,42 @@ class CrawlService(BaseService):
|
||||
def __init__(self, bus, *, crawl_id: str):
    """Store the crawl id and subscribe the four crawl lifecycle handlers on the bus.

    Args:
        bus: Event bus passed through to the base service initializer.
        crawl_id: Id of the crawl whose rows these handlers update.
    """
    # NOTE(review): crawl_id is assigned before the base initializer runs;
    # presumably the base class may already need it - confirm before reordering.
    self.crawl_id = crawl_id
    super().__init__(bus)

    subscribe = self.bus.on
    subscribe(CrawlSetupEvent, self.on_CrawlSetupEvent__save_to_db)
    subscribe(CrawlStartEvent, self.on_CrawlStartEvent__save_to_db)
    subscribe(CrawlCleanupEvent, self.on_CrawlCleanupEvent__save_to_db)
    subscribe(CrawlCompletedEvent, self.on_CrawlCompletedEvent__save_to_db)
|
||||
|
||||
async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
|
||||
await run_db_op(self._mark_completed)
|
||||
|
||||
def _mark_started(self) -> None:
|
||||
async def on_CrawlSetupEvent__save_to_db(self, event: CrawlSetupEvent) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id=self.crawl_id)
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
def _mark_completed(self) -> None:
|
||||
async def on_CrawlStartEvent__save_to_db(self, event: CrawlStartEvent) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id=self.crawl_id)
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = None
|
||||
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
async def on_CrawlCleanupEvent__save_to_db(self, event: CrawlCleanupEvent) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = None
|
||||
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
async def on_CrawlCompletedEvent__save_to_db(self, event: CrawlCompletedEvent) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.db import close_old_connections
|
||||
|
||||
|
||||
def _run_db_op(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` bracketed by ``close_old_connections()``.

    ``close_old_connections()`` is invoked once before the call and once
    afterwards, whether ``func`` returns or raises. The return value of
    ``func`` is passed through and any exception it raises propagates.
    """
    close_old_connections()
    try:
        outcome = func(*args, **kwargs)
    finally:
        # Runs on both the success and the error path.
        close_old_connections()
    return outcome
|
||||
|
||||
|
||||
async def run_db_op(func, *args, **kwargs):
    """Await ``func(*args, **kwargs)`` executed through ``_run_db_op``.

    The synchronous helper is wrapped with
    ``sync_to_async(..., thread_sensitive=True)`` and its result is returned.
    """
    threaded_call = sync_to_async(_run_db_op, thread_sensitive=True)
    return await threaded_call(func, *args, **kwargs)
|
||||
@@ -1,22 +1,23 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
from abx_dl.events import MachineEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class MachineService(BaseService):
|
||||
LISTENS_TO = [MachineEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_MachineEvent__Outer(self, event: MachineEvent) -> None:
|
||||
await run_db_op(self._project, event)
|
||||
def __init__(self, bus):
|
||||
super().__init__(bus)
|
||||
self.bus.on(MachineEvent, self.on_MachineEvent__save_to_db)
|
||||
|
||||
def _project(self, event: MachineEvent) -> None:
|
||||
async def on_MachineEvent__save_to_db(self, event: MachineEvent) -> None:
|
||||
from archivebox.machine.models import Machine, _sanitize_machine_config
|
||||
|
||||
machine = Machine.current()
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
config = dict(machine.config or {})
|
||||
|
||||
if event.config is not None:
|
||||
@@ -29,4 +30,4 @@ class MachineService(BaseService):
|
||||
return
|
||||
|
||||
machine.config = _sanitize_machine_config(config)
|
||||
machine.save(update_fields=["config", "modified_at"])
|
||||
await machine.asave(update_fields=["config", "modified_at"])
|
||||
|
||||
@@ -1,29 +1,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone as datetime_timezone
|
||||
import json
|
||||
from pathlib import Path
|
||||
import shlex
|
||||
import socket
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, ClassVar
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
from typing import ClassVar
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abxbus import BaseEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
|
||||
WORKER_READY_TIMEOUT = 10.0
|
||||
|
||||
|
||||
def parse_event_datetime(value: str | None):
|
||||
if not value:
|
||||
@@ -37,308 +23,133 @@ def parse_event_datetime(value: str | None):
|
||||
return dt
|
||||
|
||||
|
||||
def _is_port_listening(host: str, port: int) -> bool:
|
||||
if not host or not port:
|
||||
return False
|
||||
try:
|
||||
with socket.create_connection((host, port), timeout=0.5):
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
def _worker_socket_from_url(url: str) -> tuple[str, int] | None:
|
||||
if not url:
|
||||
return None
|
||||
parsed = urlparse(url)
|
||||
if parsed.scheme != "tcp" or not parsed.hostname or not parsed.port:
|
||||
return None
|
||||
return parsed.hostname, parsed.port
|
||||
|
||||
|
||||
def _supervisor_env(env: dict[str, str]) -> str:
|
||||
pairs = []
|
||||
for key, value in env.items():
|
||||
escaped = value.replace('"', '\\"')
|
||||
pairs.append(f'{key}="{escaped}"')
|
||||
return ",".join(pairs)
|
||||
|
||||
|
||||
def _iso_from_epoch(value: object) -> str:
|
||||
if not isinstance(value, (int, float)) or value <= 0:
|
||||
return ""
|
||||
return datetime.fromtimestamp(value, tz=datetime_timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _int_from_object(value: object) -> int:
|
||||
if isinstance(value, bool):
|
||||
return int(value)
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
if isinstance(value, float):
|
||||
return int(value)
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
return 0
|
||||
return 0
|
||||
|
||||
|
||||
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Return the supervisord state of the worker for ``process_event``, starting it if needed.

    The worker is named after ``process_event.hook_name``. If supervisord
    already reports it as ``RUNNING`` (and, when the event carries a
    ``tcp://`` URL, that port accepts connections), the existing state dict is
    returned. Otherwise a program definition is built from the event and
    started, and the state is polled every 0.1s for up to
    ``WORKER_READY_TIMEOUT`` seconds.

    Returns:
        The first "ready" state dict observed, else the value returned by
        ``start_worker`` if it is a dict, else an empty dict.
    """
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    workdir = Path(process_event.output_dir)
    workdir.mkdir(parents=True, exist_ok=True)
    name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)
    endpoint = _worker_socket_from_url(getattr(process_event, "url", ""))

    def is_ready(state: object) -> bool:
        # Ready = supervisord says RUNNING and, if a socket is expected, it is open.
        if not isinstance(state, dict):
            return False
        if state.get("statename") != "RUNNING":
            return False
        return endpoint is None or _is_port_listening(*endpoint)

    already_running = get_worker(supervisor, name)
    if is_ready(already_running):
        return already_running

    program = {
        "name": name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(workdir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(workdir / f"{name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        program["environment"] = _supervisor_env(process_event.env)

    started = start_worker(supervisor, program)
    give_up_at = time.monotonic() + WORKER_READY_TIMEOUT
    while time.monotonic() < give_up_at:
        observed = get_worker(supervisor, name)
        if is_ready(observed):
            return observed
        time.sleep(0.1)

    # Timed out: fall back to whatever start_worker reported.
    if isinstance(started, dict):
        return started
    return {}
|
||||
|
||||
|
||||
class ProcessService(BaseService):
|
||||
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
||||
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
||||
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStartedEvent, ProcessCompletedEvent]
|
||||
EMITS: ClassVar[list[type[BaseEvent]]] = []
|
||||
|
||||
def __init__(self, bus):
|
||||
self.process_ids: dict[str, str] = {}
|
||||
super().__init__(bus)
|
||||
self.bus.on(ProcessStartedEvent, self.on_ProcessStartedEvent__save_to_db)
|
||||
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
|
||||
|
||||
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
|
||||
try:
|
||||
record = json.loads(event.line)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return
|
||||
if not isinstance(record, dict) or record.get("type") != "ProcessEvent":
|
||||
return
|
||||
|
||||
passthrough_fields: dict[str, Any] = {
|
||||
key: value
|
||||
for key, value in record.items()
|
||||
if key
|
||||
not in {
|
||||
"type",
|
||||
"plugin_name",
|
||||
"hook_name",
|
||||
"hook_path",
|
||||
"hook_args",
|
||||
"is_background",
|
||||
"output_dir",
|
||||
"env",
|
||||
"snapshot_id",
|
||||
"process_id",
|
||||
"url",
|
||||
"timeout",
|
||||
"daemon",
|
||||
"process_type",
|
||||
"worker_type",
|
||||
"event_timeout",
|
||||
"event_handler_timeout",
|
||||
}
|
||||
}
|
||||
process_event = ProcessEvent(
|
||||
plugin_name=record.get("plugin_name") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or "process",
|
||||
hook_path=record["hook_path"],
|
||||
hook_args=[str(arg) for arg in record.get("hook_args", [])],
|
||||
is_background=bool(record.get("is_background", True)),
|
||||
output_dir=record.get("output_dir") or event.output_dir,
|
||||
env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
|
||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
||||
timeout=int(record.get("timeout") or 60),
|
||||
daemon=bool(record.get("daemon", False)),
|
||||
url=str(record.get("url") or ""),
|
||||
process_type=str(record.get("process_type") or ""),
|
||||
worker_type=str(record.get("worker_type") or ""),
|
||||
event_timeout=float(record.get("event_timeout") or 360.0),
|
||||
event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
|
||||
**passthrough_fields,
|
||||
)
|
||||
if not process_event.daemon:
|
||||
await self.bus.emit(process_event)
|
||||
return
|
||||
|
||||
proc = await asyncio.to_thread(_ensure_worker, process_event)
|
||||
process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
|
||||
start_ts = _iso_from_epoch(proc.get("start"))
|
||||
pid = _int_from_object(proc.get("pid"))
|
||||
statename = str(proc.get("statename") or "")
|
||||
exitstatus = _int_from_object(proc.get("exitstatus"))
|
||||
process_type = process_event.process_type or "worker"
|
||||
worker_type = process_event.worker_type or process_event.plugin_name
|
||||
|
||||
if statename == "RUNNING" and pid:
|
||||
await self.bus.emit(
|
||||
ProcessStartedEvent(
|
||||
plugin_name=process_event.plugin_name,
|
||||
hook_name=process_event.hook_name,
|
||||
hook_path=process_event.hook_path,
|
||||
hook_args=process_event.hook_args,
|
||||
output_dir=process_event.output_dir,
|
||||
env=process_event.env,
|
||||
timeout=process_event.timeout,
|
||||
pid=pid,
|
||||
process_id=process_id,
|
||||
snapshot_id=process_event.snapshot_id,
|
||||
is_background=True,
|
||||
url=process_event.url,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
start_ts=start_ts,
|
||||
**passthrough_fields,
|
||||
),
|
||||
)
|
||||
return
|
||||
|
||||
stderr = (
|
||||
f"Worker {process_event.hook_name} failed to start"
|
||||
if not statename
|
||||
else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
|
||||
)
|
||||
await self.bus.emit(
|
||||
ProcessCompletedEvent(
|
||||
plugin_name=process_event.plugin_name,
|
||||
hook_name=process_event.hook_name,
|
||||
hook_path=process_event.hook_path,
|
||||
hook_args=process_event.hook_args,
|
||||
env=process_event.env,
|
||||
stdout="",
|
||||
stderr=stderr,
|
||||
exit_code=exitstatus or 1,
|
||||
output_dir=process_event.output_dir,
|
||||
is_background=True,
|
||||
process_id=process_id,
|
||||
snapshot_id=process_event.snapshot_id,
|
||||
pid=pid,
|
||||
url=process_event.url,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
start_ts=start_ts,
|
||||
end_ts=datetime.now(tz=datetime_timezone.utc).isoformat(),
|
||||
**passthrough_fields,
|
||||
),
|
||||
)
|
||||
raise RuntimeError(stderr)
|
||||
|
||||
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
|
||||
await run_db_op(self._project_started, event)
|
||||
|
||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
||||
await run_db_op(self._project_completed, event)
|
||||
|
||||
def get_db_process_id(self, process_id: str) -> str | None:
|
||||
return self.process_ids.get(process_id)
|
||||
|
||||
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> Process:
|
||||
async def on_ProcessStartedEvent__save_to_db(self, event: ProcessStartedEvent) -> None:
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
|
||||
db_process_id = self.process_ids.get(event.process_id)
|
||||
iface = NetworkInterface.current(refresh=True)
|
||||
if db_process_id:
|
||||
process = Process.objects.filter(id=db_process_id).first()
|
||||
if process is not None:
|
||||
if getattr(process, "iface_id", None) != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
return process
|
||||
|
||||
process_type = getattr(event, "process_type", "") or (
|
||||
iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
|
||||
process_type = event.process_type or (
|
||||
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
|
||||
)
|
||||
worker_type = getattr(event, "worker_type", "") or ""
|
||||
if process_type == Process.TypeChoices.WORKER and worker_type:
|
||||
existing = (
|
||||
Process.objects.filter(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
)
|
||||
.order_by("-modified_at")
|
||||
.first()
|
||||
)
|
||||
if existing is not None:
|
||||
self.process_ids[event.process_id] = str(existing.id)
|
||||
return existing
|
||||
process = Process.objects.create(
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
worker_type = event.worker_type or ""
|
||||
started_at = parse_event_datetime(event.start_ts)
|
||||
if started_at is None:
|
||||
raise ValueError("ProcessStartedEvent.start_ts is required")
|
||||
process_query = Process.objects.filter(
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
env=event.env,
|
||||
timeout=getattr(event, "timeout", 60),
|
||||
pid=event.pid or None,
|
||||
url=getattr(event, "url", "") or None,
|
||||
started_at=parse_event_datetime(getattr(event, "start_ts", "")),
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
retry_at=None,
|
||||
started_at=started_at,
|
||||
)
|
||||
self.process_ids[event.process_id] = str(process.id)
|
||||
return process
|
||||
if event.pid:
|
||||
process_query = process_query.filter(pid=event.pid)
|
||||
process = await process_query.order_by("-modified_at").afirst()
|
||||
if process is None:
|
||||
process = await Process.objects.acreate(
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
env=event.env,
|
||||
timeout=event.timeout,
|
||||
pid=event.pid or None,
|
||||
url=event.url or None,
|
||||
started_at=started_at,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
retry_at=None,
|
||||
)
|
||||
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
await process.asave(update_fields=["iface", "machine", "modified_at"])
|
||||
|
||||
def _project_started(self, event: ProcessStartedEvent) -> None:
|
||||
process = self._get_or_create_process(event)
|
||||
process.pwd = event.output_dir
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.timeout = event.timeout
|
||||
process.pid = event.pid or None
|
||||
process.url = getattr(event, "url", "") or process.url
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
|
||||
process.url = event.url or process.url
|
||||
process.process_type = process_type or process.process_type
|
||||
process.worker_type = worker_type or process.worker_type
|
||||
process.started_at = started_at
|
||||
process.status = process.StatusChoices.RUNNING
|
||||
process.retry_at = None
|
||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
||||
process.save()
|
||||
await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
|
||||
plugin_name=event.plugin_name,
|
||||
hook_path=event.hook_path,
|
||||
)
|
||||
await process.asave()
|
||||
|
||||
async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
|
||||
iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
|
||||
process_type = event.process_type or (
|
||||
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
|
||||
)
|
||||
worker_type = event.worker_type or ""
|
||||
started_at = parse_event_datetime(event.start_ts)
|
||||
if started_at is None:
|
||||
raise ValueError("ProcessCompletedEvent.start_ts is required")
|
||||
process_query = Process.objects.filter(
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
started_at=started_at,
|
||||
)
|
||||
if event.pid:
|
||||
process_query = process_query.filter(pid=event.pid)
|
||||
process = await process_query.order_by("-modified_at").afirst()
|
||||
if process is None:
|
||||
process = await Process.objects.acreate(
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
env=event.env,
|
||||
timeout=event.timeout,
|
||||
pid=event.pid or None,
|
||||
url=event.url or None,
|
||||
started_at=started_at,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
retry_at=None,
|
||||
)
|
||||
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
await process.asave(update_fields=["iface", "machine", "modified_at"])
|
||||
|
||||
def _project_completed(self, event: ProcessCompletedEvent) -> None:
|
||||
process = self._get_or_create_process(event)
|
||||
process.pwd = event.output_dir
|
||||
if not process.cmd:
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.pid = event.pid or process.pid
|
||||
process.url = getattr(event, "url", "") or process.url
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
|
||||
process.url = event.url or process.url
|
||||
process.process_type = process_type or process.process_type
|
||||
process.worker_type = worker_type or process.worker_type
|
||||
process.started_at = started_at
|
||||
process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
|
||||
process.stdout = event.stdout
|
||||
process.stderr = event.stderr
|
||||
process.exit_code = event.exit_code
|
||||
process.status = process.StatusChoices.EXITED
|
||||
process.retry_at = None
|
||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
||||
process.save()
|
||||
await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
|
||||
plugin_name=event.plugin_name,
|
||||
hook_path=event.hook_path,
|
||||
)
|
||||
await process.asave()
|
||||
|
||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
@@ -13,12 +12,13 @@ from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import Any
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
from rich.console import Console
|
||||
|
||||
from abx_dl.events import BinaryRequestEvent
|
||||
from abx_dl.limits import CrawlLimitState
|
||||
from abx_dl.models import Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
|
||||
from abx_dl.models import Plugin, discover_plugins, filter_plugins
|
||||
from abx_dl.orchestrator import (
|
||||
create_bus,
|
||||
download,
|
||||
@@ -40,150 +40,9 @@ def _bus_name(prefix: str, identifier: str) -> str:
|
||||
return f"{prefix}_{normalized}"
|
||||
|
||||
|
||||
def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
|
||||
raw = str(config.get("PLUGINS") or "").strip()
|
||||
if not raw:
|
||||
return None
|
||||
return [name.strip() for name in raw.split(",") if name.strip()]
|
||||
|
||||
|
||||
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
|
||||
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
|
||||
return sum(
|
||||
1
|
||||
for plugin in selected.values()
|
||||
for hook in plugin.hooks
|
||||
if "Install" in hook.name or "CrawlSetup" in hook.name or "Snapshot" in hook.name
|
||||
)
|
||||
|
||||
|
||||
_TEMPLATE_NAME_RE = re.compile(r"^\{([A-Z0-9_]+)\}$")
|
||||
|
||||
|
||||
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str, config: dict[str, Any]) -> list[str]:
|
||||
keys: list[str] = []
|
||||
|
||||
for plugin in plugins.values():
|
||||
for spec in plugin.binaries:
|
||||
template_name = str(spec.get("name") or "").strip()
|
||||
match = _TEMPLATE_NAME_RE.fullmatch(template_name)
|
||||
if match is None:
|
||||
continue
|
||||
key = match.group(1)
|
||||
configured_value = config.get(key)
|
||||
if configured_value is not None and str(configured_value).strip() == binary_name:
|
||||
keys.append(key)
|
||||
for key, prop in plugin.config_schema.items():
|
||||
if key.endswith("_BINARY") and prop.get("default") == binary_name:
|
||||
keys.append(key)
|
||||
|
||||
return list(dict.fromkeys(keys))
|
||||
|
||||
|
||||
def _installed_binary_config_overrides(plugins: dict[str, Plugin], config: dict[str, Any] | None = None) -> dict[str, str]:
    """Build config overrides from the binaries recorded as installed on the current machine.

    Every ``Binary`` row for ``Machine.current()`` with status ``INSTALLED``
    and a non-empty ``abspath`` is considered; rows whose path is not an
    executable file are ignored. For each remaining binary, every config key
    returned by ``_binary_config_keys_for_plugins`` is set to its ``abspath``.

    Directory overrides are derived from the first binary found in each layout:
      * ``.../node_modules/.bin/<name>`` -> ``NPM_BIN_DIR``, ``NODE_MODULES_DIR``,
        ``NODE_MODULE_DIR``, ``NODE_PATH``, ``NPM_HOME``
      * ``.../pip/venv/bin/<name>`` -> ``PIP_BIN_DIR``, ``PIP_HOME``
      * either layout -> ``LIB_DIR`` (four levels above the binary) and ``LIB_BIN_DIR``
    """
    from archivebox.machine.models import Binary, Machine

    machine = Machine.current()
    active_config = dict(config or {})
    overrides: dict[str, str] = {}
    # First-seen layout directories; later binaries never replace an entry.
    layout: dict[str, Path] = {}

    installed = Binary.objects.filter(machine=machine, status=Binary.StatusChoices.INSTALLED)
    installed = installed.exclude(abspath="").exclude(abspath__isnull=True)

    for binary in installed:
        try:
            exe = Path(binary.abspath).expanduser()
        except (TypeError, ValueError):
            continue
        if not (exe.is_file() and os.access(exe, os.X_OK)):
            continue

        for key in _binary_config_keys_for_plugins(plugins, binary.name, active_config):
            overrides[key] = binary.abspath

        bin_dir = exe.parent
        container = bin_dir.parent
        if bin_dir.name == ".bin" and container.name == "node_modules":
            layout.setdefault("npm_bin", bin_dir)
            layout.setdefault("node_modules", container)
            layout.setdefault("npm_home", container.parent)
            layout.setdefault("lib", container.parent.parent)
        elif bin_dir.name == "bin" and container.name == "venv" and container.parent.name == "pip":
            layout.setdefault("pip_bin", bin_dir)
            layout.setdefault("pip_home", container.parent)
            layout.setdefault("lib", container.parent.parent)

    # Emit in a fixed order so the resulting dict is laid out deterministically.
    lib_dir = layout.get("lib")
    if lib_dir is not None:
        overrides["LIB_DIR"] = str(lib_dir)
        overrides["LIB_BIN_DIR"] = str(lib_dir / "bin")
    if "pip_home" in layout:
        overrides["PIP_HOME"] = str(layout["pip_home"])
    if "pip_bin" in layout:
        overrides["PIP_BIN_DIR"] = str(layout["pip_bin"])
    if "npm_home" in layout:
        overrides["NPM_HOME"] = str(layout["npm_home"])
    if "node_modules" in layout:
        node_modules = str(layout["node_modules"])
        overrides["NODE_MODULES_DIR"] = node_modules
        overrides["NODE_MODULE_DIR"] = node_modules
        overrides["NODE_PATH"] = node_modules
    if "npm_bin" in layout:
        overrides["NPM_BIN_DIR"] = str(layout["npm_bin"])

    return overrides
|
||||
|
||||
|
||||
def _limit_stop_reason(config: dict[str, Any]) -> str:
    """Return the stop reason reported by the ``CrawlLimitState`` built from ``config``."""
    limit_state = CrawlLimitState.from_config(config)
    return limit_state.get_stop_reason()
|
||||
|
||||
|
||||
def _attach_bus_trace(bus) -> None:
|
||||
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
|
||||
if not trace_target:
|
||||
return
|
||||
if getattr(bus, "_archivebox_trace_task", None) is not None:
|
||||
return
|
||||
|
||||
trace_path = None if trace_target in {"1", "-", "stderr"} else Path(trace_target)
|
||||
stop_event = asyncio.Event()
|
||||
|
||||
async def trace_loop() -> None:
|
||||
seen_event_ids: set[str] = set()
|
||||
while not stop_event.is_set():
|
||||
for event_id, event in list(bus.event_history.items()):
|
||||
if event_id in seen_event_ids:
|
||||
continue
|
||||
seen_event_ids.add(event_id)
|
||||
payload = event.model_dump(mode="json")
|
||||
payload["bus_name"] = bus.name
|
||||
line = json.dumps(payload, ensure_ascii=False, default=str, separators=(",", ":"))
|
||||
if trace_path is None:
|
||||
print(line, file=sys.stderr, flush=True)
|
||||
else:
|
||||
trace_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with trace_path.open("a", encoding="utf-8") as handle:
|
||||
handle.write(line + "\n")
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
bus._archivebox_trace_stop = stop_event
|
||||
bus._archivebox_trace_task = asyncio.create_task(trace_loop())
|
||||
|
||||
|
||||
async def _stop_bus_trace(bus) -> None:
|
||||
stop_event = getattr(bus, "_archivebox_trace_stop", None)
|
||||
trace_task = getattr(bus, "_archivebox_trace_task", None)
|
||||
if stop_event is None or trace_task is None:
|
||||
return
|
||||
stop_event.set()
|
||||
await asyncio.gather(trace_task, return_exceptions=True)
|
||||
bus._archivebox_trace_stop = None
|
||||
bus._archivebox_trace_task = None
|
||||
return sum(1 for plugin in selected.values() for hook in plugin.hooks if "CrawlSetup" in hook.name or "Snapshot" in hook.name)
|
||||
|
||||
|
||||
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
|
||||
@@ -235,22 +94,25 @@ class CrawlRunner:
|
||||
self.crawl = crawl
|
||||
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
|
||||
self.plugins = discover_plugins()
|
||||
self.process_service = ProcessService(self.bus)
|
||||
self.binary_service = BinaryService(self.bus)
|
||||
self.tag_service = TagService(self.bus)
|
||||
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||
ProcessService(self.bus)
|
||||
BinaryService(self.bus)
|
||||
TagService(self.bus)
|
||||
CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
|
||||
self.snapshot_service = SnapshotService(
|
||||
|
||||
async def ignore_snapshot(_snapshot_id: str) -> None:
|
||||
return None
|
||||
|
||||
SnapshotService(
|
||||
self.bus,
|
||||
crawl_id=str(crawl.id),
|
||||
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued,
|
||||
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else ignore_snapshot,
|
||||
)
|
||||
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
|
||||
ArchiveResultService(self.bus)
|
||||
self.selected_plugins = selected_plugins
|
||||
self.initial_snapshot_ids = snapshot_ids
|
||||
self.snapshot_tasks: dict[str, asyncio.Task[None]] = {}
|
||||
self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS)
|
||||
self.abx_services = None
|
||||
self.persona = None
|
||||
self.base_config: dict[str, Any] = {}
|
||||
self.derived_config: dict[str, Any] = {}
|
||||
@@ -258,15 +120,11 @@ class CrawlRunner:
|
||||
self._live_stream = None
|
||||
|
||||
async def run(self) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
try:
|
||||
await sync_to_async(self._prepare, thread_sensitive=True)()
|
||||
snapshot_ids = await sync_to_async(self.load_run_state, thread_sensitive=True)()
|
||||
live_ui = self._create_live_ui()
|
||||
with live_ui if live_ui is not None else nullcontext():
|
||||
_attach_bus_trace(self.bus)
|
||||
self.abx_services = setup_abx_services(
|
||||
setup_abx_services(
|
||||
self.bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides={
|
||||
@@ -278,18 +136,14 @@ class CrawlRunner:
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
|
||||
if snapshot_ids:
|
||||
root_snapshot_id = snapshot_ids[0]
|
||||
await self._run_crawl_setup(root_snapshot_id)
|
||||
await self.run_crawl_setup(root_snapshot_id)
|
||||
for snapshot_id in snapshot_ids:
|
||||
await self.enqueue_snapshot(snapshot_id)
|
||||
await self._wait_for_snapshot_tasks()
|
||||
await self._run_crawl_cleanup(root_snapshot_id)
|
||||
if self.abx_services is not None:
|
||||
await self.abx_services.process.wait_for_background_monitors()
|
||||
await self.wait_for_snapshot_tasks()
|
||||
await self.run_crawl_cleanup(root_snapshot_id)
|
||||
finally:
|
||||
await _stop_bus_trace(self.bus)
|
||||
await self.bus.stop()
|
||||
if self._live_stream is not None:
|
||||
try:
|
||||
@@ -297,33 +151,16 @@ class CrawlRunner:
|
||||
except Exception:
|
||||
pass
|
||||
self._live_stream = None
|
||||
await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
|
||||
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
|
||||
crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
|
||||
if crawl_is_finished:
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
||||
else:
|
||||
if crawl.status == Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
elif crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = crawl.retry_at or timezone.now()
|
||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
||||
await sync_to_async(self.finalize_run_state, thread_sensitive=True)()
|
||||
|
||||
async def enqueue_snapshot(self, snapshot_id: str) -> None:
|
||||
task = self.snapshot_tasks.get(snapshot_id)
|
||||
if task is not None and not task.done():
|
||||
return
|
||||
task = asyncio.create_task(self._run_snapshot(snapshot_id))
|
||||
task = asyncio.create_task(self.run_snapshot(snapshot_id))
|
||||
self.snapshot_tasks[snapshot_id] = task
|
||||
|
||||
async def leave_snapshot_queued(self, snapshot_id: str) -> None:
|
||||
return None
|
||||
|
||||
async def _wait_for_snapshot_tasks(self) -> None:
|
||||
async def wait_for_snapshot_tasks(self) -> None:
|
||||
while True:
|
||||
pending_tasks: list[asyncio.Task[None]] = []
|
||||
for snapshot_id, task in list(self.snapshot_tasks.items()):
|
||||
@@ -339,9 +176,9 @@ class CrawlRunner:
|
||||
for task in done:
|
||||
task.result()
|
||||
|
||||
def _prepare(self) -> None:
|
||||
def load_run_state(self) -> list[str]:
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
from archivebox.machine.models import Machine, NetworkInterface, Process
|
||||
|
||||
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
|
||||
current_iface = NetworkInterface.current(refresh=True)
|
||||
@@ -352,17 +189,42 @@ class CrawlRunner:
|
||||
current_process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
self.persona = self.crawl.resolve_persona()
|
||||
self.base_config = get_config(crawl=self.crawl)
|
||||
self.derived_config = _installed_binary_config_overrides(self.plugins, self.base_config)
|
||||
self.derived_config = dict(Machine.current().config)
|
||||
self.base_config["ABX_RUNTIME"] = "archivebox"
|
||||
if self.selected_plugins is None:
|
||||
self.selected_plugins = _selected_plugins_from_config(self.base_config)
|
||||
raw_plugins = self.base_config["PLUGINS"].strip()
|
||||
self.selected_plugins = [name.strip() for name in raw_plugins.split(",") if name.strip()] if raw_plugins else None
|
||||
if self.persona:
|
||||
chrome_binary = str(self.base_config.get("CHROME_BINARY") or "")
|
||||
self.base_config.update(self.persona.prepare_runtime_for_crawl(self.crawl, chrome_binary=chrome_binary))
|
||||
self.base_config.update(
|
||||
self.persona.prepare_runtime_for_crawl(
|
||||
self.crawl,
|
||||
chrome_binary=self.base_config["CHROME_BINARY"],
|
||||
),
|
||||
)
|
||||
if self.initial_snapshot_ids:
|
||||
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
|
||||
created = self.crawl.create_snapshots_from_urls()
|
||||
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
|
||||
return [str(snapshot.id) for snapshot in snapshots]
|
||||
|
||||
def finalize_run_state(self) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
def _cleanup_persona(self) -> None:
|
||||
if self.persona:
|
||||
self.persona.cleanup_runtime_for_crawl(self.crawl)
|
||||
crawl = Crawl.objects.get(id=self.crawl.id)
|
||||
if crawl.is_finished():
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
return
|
||||
if crawl.status == Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
elif crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = crawl.retry_at or timezone.now()
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
def _create_live_ui(self) -> LiveBusUI | None:
|
||||
stdout_is_tty = sys.stdout.isatty()
|
||||
@@ -373,7 +235,7 @@ class CrawlRunner:
|
||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||
if os.path.exists("/dev/tty"):
|
||||
try:
|
||||
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
||||
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
|
||||
stream = self._live_stream
|
||||
except OSError:
|
||||
self._live_stream = None
|
||||
@@ -399,7 +261,7 @@ class CrawlRunner:
|
||||
live_ui = LiveBusUI(
|
||||
self.bus,
|
||||
total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
|
||||
timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
|
||||
timeout_seconds=self.base_config["TIMEOUT"],
|
||||
ui_console=ui_console,
|
||||
interactive_tty=True,
|
||||
)
|
||||
@@ -410,128 +272,24 @@ class CrawlRunner:
|
||||
)
|
||||
return live_ui
|
||||
|
||||
def _create_root_snapshots(self) -> list[str]:
|
||||
created = self.crawl.create_snapshots_from_urls()
|
||||
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
|
||||
return [str(snapshot.id) for snapshot in snapshots]
|
||||
|
||||
def _initial_snapshot_ids(self) -> list[str]:
|
||||
if self.initial_snapshot_ids:
|
||||
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
|
||||
return self._create_root_snapshots()
|
||||
|
||||
def _snapshot_config(self, snapshot) -> dict[str, Any]:
|
||||
def load_snapshot_payload(self, snapshot_id: str) -> dict[str, Any]:
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
|
||||
config = get_config(crawl=self.crawl, snapshot=snapshot)
|
||||
config.update(self.base_config)
|
||||
config["CRAWL_DIR"] = str(self.crawl.output_dir)
|
||||
config["SNAP_DIR"] = str(snapshot.output_dir)
|
||||
config["SNAPSHOT_ID"] = str(snapshot.id)
|
||||
config["SNAPSHOT_DEPTH"] = snapshot.depth
|
||||
config["CRAWL_ID"] = str(self.crawl.id)
|
||||
config["SOURCE_URL"] = snapshot.url
|
||||
if snapshot.parent_snapshot_id:
|
||||
config["PARENT_SNAPSHOT_ID"] = str(snapshot.parent_snapshot_id)
|
||||
return config
|
||||
|
||||
async def _run_crawl_setup(self, snapshot_id: str) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
||||
setup_snapshot = AbxSnapshot(
|
||||
url=snapshot["url"],
|
||||
id=snapshot["id"],
|
||||
title=snapshot["title"],
|
||||
timestamp=snapshot["timestamp"],
|
||||
bookmarked_at=snapshot["bookmarked_at"],
|
||||
created_at=snapshot["created_at"],
|
||||
tags=snapshot["tags"],
|
||||
depth=snapshot["depth"],
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=setup_snapshot,
|
||||
crawl_setup_only=True,
|
||||
)
|
||||
|
||||
async def _run_crawl_cleanup(self, snapshot_id: str) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
||||
cleanup_snapshot = AbxSnapshot(
|
||||
url=snapshot["url"],
|
||||
id=snapshot["id"],
|
||||
title=snapshot["title"],
|
||||
timestamp=snapshot["timestamp"],
|
||||
bookmarked_at=snapshot["bookmarked_at"],
|
||||
created_at=snapshot["created_at"],
|
||||
tags=snapshot["tags"],
|
||||
depth=snapshot["depth"],
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=cleanup_snapshot,
|
||||
crawl_cleanup_only=True,
|
||||
)
|
||||
|
||||
async def _run_snapshot(self, snapshot_id: str) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
async with self.snapshot_semaphore:
|
||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
||||
if snapshot["status"] == "sealed":
|
||||
return
|
||||
if snapshot["depth"] > 0 and _limit_stop_reason(snapshot["config"]) == "max_size":
|
||||
await sync_to_async(self._cancel_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
|
||||
return
|
||||
abx_snapshot = AbxSnapshot(
|
||||
url=snapshot["url"],
|
||||
id=snapshot["id"],
|
||||
title=snapshot["title"],
|
||||
timestamp=snapshot["timestamp"],
|
||||
bookmarked_at=snapshot["bookmarked_at"],
|
||||
created_at=snapshot["created_at"],
|
||||
tags=snapshot["tags"],
|
||||
depth=snapshot["depth"],
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
try:
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=abx_snapshot,
|
||||
skip_crawl_setup=True,
|
||||
skip_crawl_cleanup=True,
|
||||
)
|
||||
finally:
|
||||
current_task = asyncio.current_task()
|
||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
|
||||
def _load_snapshot_run_data(self, snapshot_id: str):
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
|
||||
extra_context: dict[str, Any] = {}
|
||||
if config.get("EXTRA_CONTEXT"):
|
||||
parsed_extra_context = json.loads(str(config["EXTRA_CONTEXT"]))
|
||||
if not isinstance(parsed_extra_context, dict):
|
||||
raise TypeError("EXTRA_CONTEXT must decode to an object")
|
||||
extra_context = parsed_extra_context
|
||||
extra_context["snapshot_id"] = str(snapshot.id)
|
||||
extra_context["snapshot_depth"] = snapshot.depth
|
||||
config["EXTRA_CONTEXT"] = json.dumps(extra_context, separators=(",", ":"), sort_keys=True)
|
||||
return {
|
||||
"id": str(snapshot.id),
|
||||
"url": snapshot.url,
|
||||
@@ -542,12 +300,91 @@ class CrawlRunner:
|
||||
"tags": snapshot.tags_str(),
|
||||
"depth": snapshot.depth,
|
||||
"status": snapshot.status,
|
||||
"parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
|
||||
"output_dir": str(snapshot.output_dir),
|
||||
"config": self._snapshot_config(snapshot),
|
||||
"config": config,
|
||||
}
|
||||
|
||||
def _cancel_snapshot_due_to_limit(self, snapshot_id: str) -> None:
|
||||
async def run_crawl_setup(self, snapshot_id: str) -> None:
|
||||
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
derived_config_overrides=self.derived_config,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
install_enabled=True,
|
||||
crawl_setup_enabled=True,
|
||||
crawl_start_enabled=False,
|
||||
snapshot_cleanup_enabled=False,
|
||||
crawl_cleanup_enabled=False,
|
||||
machine_service=None,
|
||||
binary_service=None,
|
||||
process_service=None,
|
||||
archive_result_service=None,
|
||||
tag_service=None,
|
||||
)
|
||||
|
||||
async def run_crawl_cleanup(self, snapshot_id: str) -> None:
|
||||
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
|
||||
await download(
|
||||
bus=self.bus,
|
||||
url=snapshot["url"],
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
plugins=self.plugins,
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
derived_config_overrides=self.derived_config,
|
||||
emit_jsonl=False,
|
||||
install_enabled=False,
|
||||
crawl_setup_enabled=False,
|
||||
crawl_start_enabled=False,
|
||||
snapshot_cleanup_enabled=False,
|
||||
crawl_cleanup_enabled=True,
|
||||
machine_service=None,
|
||||
binary_service=None,
|
||||
process_service=None,
|
||||
archive_result_service=None,
|
||||
tag_service=None,
|
||||
)
|
||||
|
||||
async def run_snapshot(self, snapshot_id: str) -> None:
|
||||
async with self.snapshot_semaphore:
|
||||
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
|
||||
if snapshot["status"] == "sealed":
|
||||
return
|
||||
if snapshot["depth"] > 0 and CrawlLimitState.from_config(snapshot["config"]).get_stop_reason() == "max_size":
|
||||
await sync_to_async(self.seal_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
|
||||
return
|
||||
try:
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
derived_config_overrides=self.derived_config,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
install_enabled=False,
|
||||
crawl_setup_enabled=False,
|
||||
crawl_start_enabled=True,
|
||||
snapshot_cleanup_enabled=True,
|
||||
crawl_cleanup_enabled=False,
|
||||
machine_service=None,
|
||||
binary_service=None,
|
||||
process_service=None,
|
||||
archive_result_service=None,
|
||||
tag_service=None,
|
||||
)
|
||||
finally:
|
||||
current_task = asyncio.current_task()
|
||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
|
||||
def seal_snapshot_due_to_limit(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
|
||||
@@ -579,21 +416,20 @@ def run_crawl(
|
||||
|
||||
|
||||
async def _run_binary(binary_id: str) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import Binary
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
|
||||
binary = await Binary.objects.aget(id=binary_id)
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
derived_config = dict(machine.config)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
|
||||
process_service = ProcessService(bus)
|
||||
ProcessService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
ArchiveResultService(bus)
|
||||
setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
@@ -605,7 +441,6 @@ async def _run_binary(binary_id: str) -> None:
|
||||
)
|
||||
|
||||
try:
|
||||
_attach_bus_trace(bus)
|
||||
await bus.emit(
|
||||
BinaryRequestEvent(
|
||||
name=binary.name,
|
||||
@@ -619,7 +454,6 @@ async def _run_binary(binary_id: str) -> None:
|
||||
),
|
||||
)
|
||||
finally:
|
||||
await _stop_bus_trace(bus)
|
||||
await bus.stop()
|
||||
|
||||
|
||||
@@ -628,20 +462,20 @@ def run_binary(binary_id: str) -> None:
|
||||
|
||||
|
||||
async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
derived_config = dict(machine.config)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
|
||||
process_service = ProcessService(bus)
|
||||
ProcessService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
abx_services = setup_abx_services(
|
||||
ArchiveResultService(bus)
|
||||
setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
config_overrides=config,
|
||||
@@ -657,7 +491,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
if not selected_plugins:
|
||||
return
|
||||
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
|
||||
timeout_seconds = int(config.get("TIMEOUT") or 60)
|
||||
timeout_seconds = config["TIMEOUT"]
|
||||
stdout_is_tty = sys.stdout.isatty()
|
||||
stderr_is_tty = sys.stderr.isatty()
|
||||
interactive_tty = stdout_is_tty or stderr_is_tty
|
||||
@@ -668,7 +502,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||
if os.path.exists("/dev/tty"):
|
||||
try:
|
||||
live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
||||
live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
|
||||
stream = live_stream
|
||||
except OSError:
|
||||
live_stream = None
|
||||
@@ -707,20 +541,21 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
plugins_label=plugins_label,
|
||||
)
|
||||
with live_ui if live_ui is not None else nullcontext():
|
||||
_attach_bus_trace(bus)
|
||||
results = await abx_install_plugins(
|
||||
plugin_names=plugin_names,
|
||||
plugins=plugins,
|
||||
output_dir=output_dir,
|
||||
config_overrides=config,
|
||||
derived_config_overrides=derived_config,
|
||||
emit_jsonl=False,
|
||||
bus=bus,
|
||||
machine_service=None,
|
||||
binary_service=None,
|
||||
process_service=None,
|
||||
)
|
||||
await abx_services.process.wait_for_background_monitors()
|
||||
if live_ui is not None:
|
||||
live_ui.print_summary(results, output_dir=output_dir)
|
||||
finally:
|
||||
await _stop_bus_trace(bus)
|
||||
await bus.stop()
|
||||
try:
|
||||
if live_stream is not None:
|
||||
@@ -739,6 +574,12 @@ def recover_orphaned_crawls() -> int:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
active_crawl_ids: set[str] = set()
|
||||
orphaned_crawls = list(
|
||||
Crawl.objects.filter(
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at__isnull=True,
|
||||
).prefetch_related("snapshot_set"),
|
||||
)
|
||||
running_processes = Process.objects.filter(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
@@ -746,23 +587,27 @@ def recover_orphaned_crawls() -> int:
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
).only("env")
|
||||
).only("pwd")
|
||||
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
if not proc.pwd:
|
||||
continue
|
||||
crawl_id = env.get("CRAWL_ID")
|
||||
if crawl_id:
|
||||
active_crawl_ids.add(str(crawl_id))
|
||||
proc_pwd = Path(proc.pwd)
|
||||
for crawl in orphaned_crawls:
|
||||
matched_snapshot = None
|
||||
for snapshot in crawl.snapshot_set.all():
|
||||
try:
|
||||
proc_pwd.relative_to(snapshot.output_dir)
|
||||
matched_snapshot = snapshot
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
if matched_snapshot is not None:
|
||||
active_crawl_ids.add(str(crawl.id))
|
||||
break
|
||||
|
||||
recovered = 0
|
||||
now = timezone.now()
|
||||
orphaned_crawls = Crawl.objects.filter(
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at__isnull=True,
|
||||
).prefetch_related("snapshot_set")
|
||||
|
||||
for crawl in orphaned_crawls:
|
||||
if str(crawl.id) in active_crawl_ids:
|
||||
continue
|
||||
@@ -788,6 +633,11 @@ def recover_orphaned_snapshots() -> int:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
active_snapshot_ids: set[str] = set()
|
||||
orphaned_snapshots = list(
|
||||
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
||||
.select_related("crawl")
|
||||
.prefetch_related("archiveresult_set"),
|
||||
)
|
||||
running_processes = Process.objects.filter(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
@@ -795,24 +645,22 @@ def recover_orphaned_snapshots() -> int:
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
).only("env")
|
||||
).only("pwd")
|
||||
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
if not proc.pwd:
|
||||
continue
|
||||
snapshot_id = env.get("SNAPSHOT_ID")
|
||||
if snapshot_id:
|
||||
active_snapshot_ids.add(str(snapshot_id))
|
||||
proc_pwd = Path(proc.pwd)
|
||||
for snapshot in orphaned_snapshots:
|
||||
try:
|
||||
proc_pwd.relative_to(snapshot.output_dir)
|
||||
active_snapshot_ids.add(str(snapshot.id))
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
recovered = 0
|
||||
now = timezone.now()
|
||||
orphaned_snapshots = (
|
||||
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
||||
.select_related("crawl")
|
||||
.prefetch_related("archiveresult_set")
|
||||
)
|
||||
|
||||
for snapshot in orphaned_snapshots:
|
||||
if str(snapshot.id) in active_snapshot_ids:
|
||||
continue
|
||||
|
||||
@@ -7,8 +7,6 @@ from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
||||
from abx_dl.limits import CrawlLimitState
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class SnapshotService(BaseService):
|
||||
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
|
||||
@@ -18,120 +16,96 @@ class SnapshotService(BaseService):
|
||||
self.crawl_id = crawl_id
|
||||
self.schedule_snapshot = schedule_snapshot
|
||||
super().__init__(bus)
|
||||
self.bus.on(SnapshotEvent, self.on_SnapshotEvent)
|
||||
self.bus.on(SnapshotCompletedEvent, self.on_SnapshotCompletedEvent)
|
||||
|
||||
async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
|
||||
snapshot_id = await run_db_op(self._project_snapshot, event)
|
||||
if snapshot_id:
|
||||
await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
|
||||
if snapshot_id and event.depth > 0:
|
||||
await self.schedule_snapshot(snapshot_id)
|
||||
|
||||
async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
|
||||
snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
|
||||
if snapshot_id:
|
||||
await sync_to_async(self._write_snapshot_details)(snapshot_id)
|
||||
|
||||
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
|
||||
async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id=self.crawl_id)
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
snapshot_id: str | None = None
|
||||
snapshot = await Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).afirst()
|
||||
|
||||
if event.depth == 0:
|
||||
snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
|
||||
if snapshot is None:
|
||||
return None
|
||||
if snapshot is not None:
|
||||
snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
snapshot.retry_at = None
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
return str(snapshot.id)
|
||||
await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
snapshot_id = str(snapshot.id)
|
||||
elif event.depth > 0:
|
||||
if event.depth <= crawl.max_depth and self._crawl_limit_stop_reason(crawl) != "max_size":
|
||||
parent_event = await self.bus.find(
|
||||
SnapshotEvent,
|
||||
past=True,
|
||||
future=False,
|
||||
where=lambda candidate: candidate.depth == event.depth - 1 and self.bus.event_is_child_of(event, candidate),
|
||||
)
|
||||
parent_snapshot = None
|
||||
if parent_event is not None:
|
||||
parent_snapshot = await Snapshot.objects.filter(id=parent_event.snapshot_id, crawl=crawl).afirst()
|
||||
if parent_snapshot is not None and self._url_passes_filters(crawl, parent_snapshot, event.url):
|
||||
snapshot = await sync_to_async(Snapshot.from_json, thread_sensitive=True)(
|
||||
{
|
||||
"url": event.url,
|
||||
"depth": event.depth,
|
||||
"parent_snapshot_id": str(parent_snapshot.id),
|
||||
"crawl_id": str(crawl.id),
|
||||
},
|
||||
overrides={
|
||||
"crawl": crawl,
|
||||
"snapshot": parent_snapshot,
|
||||
"created_by_id": crawl.created_by_id,
|
||||
},
|
||||
queue_for_extraction=False,
|
||||
)
|
||||
if snapshot is not None and snapshot.status != Snapshot.StatusChoices.SEALED:
|
||||
snapshot.retry_at = None
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
snapshot_id = str(snapshot.id)
|
||||
|
||||
if event.depth > crawl.max_depth:
|
||||
return None
|
||||
if self._crawl_limit_stop_reason(crawl) == "max_size":
|
||||
return None
|
||||
if snapshot_id:
|
||||
snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
|
||||
if snapshot is not None:
|
||||
await sync_to_async(snapshot.ensure_crawl_symlink, thread_sensitive=True)()
|
||||
if snapshot_id and event.depth > 0:
|
||||
await self.schedule_snapshot(snapshot_id)
|
||||
|
||||
parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
|
||||
if parent_snapshot is None:
|
||||
return None
|
||||
if not self._url_passes_filters(crawl, parent_snapshot, event.url):
|
||||
return None
|
||||
async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.from_json(
|
||||
{
|
||||
"url": event.url,
|
||||
"depth": event.depth,
|
||||
"parent_snapshot_id": str(parent_snapshot.id),
|
||||
"crawl_id": str(crawl.id),
|
||||
},
|
||||
overrides={
|
||||
"crawl": crawl,
|
||||
"snapshot": parent_snapshot,
|
||||
"created_by_id": crawl.created_by_id,
|
||||
},
|
||||
queue_for_extraction=False,
|
||||
)
|
||||
if snapshot is None:
|
||||
return None
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
return None
|
||||
snapshot.retry_at = None
|
||||
if snapshot.status != Snapshot.StatusChoices.SEALED:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
return str(snapshot.id)
|
||||
snapshot = await Snapshot.objects.select_related("crawl").filter(id=event.snapshot_id).afirst()
|
||||
snapshot_id: str | None = None
|
||||
if snapshot is not None:
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
|
||||
await snapshot.asave(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
|
||||
await (
|
||||
Snapshot.objects.filter(
|
||||
crawl_id=snapshot.crawl_id,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
.exclude(id=snapshot.id)
|
||||
.aupdate(
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
)
|
||||
snapshot_id = str(snapshot.id)
|
||||
if snapshot_id:
|
||||
snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
|
||||
if snapshot is not None:
|
||||
await sync_to_async(snapshot.write_index_jsonl, thread_sensitive=True)()
|
||||
await sync_to_async(snapshot.write_json_details, thread_sensitive=True)()
|
||||
await sync_to_async(snapshot.write_html_details, thread_sensitive=True)()
|
||||
|
||||
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
|
||||
return crawl.url_passes_filters(url, snapshot=parent_snapshot)
|
||||
|
||||
def _seal_snapshot(self, snapshot_id: str) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
|
||||
if snapshot is None:
|
||||
return None
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
|
||||
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
|
||||
self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
|
||||
return str(snapshot.id)
|
||||
|
||||
def _crawl_limit_stop_reason(self, crawl) -> str:
|
||||
config = dict(crawl.config or {})
|
||||
config["CRAWL_DIR"] = str(crawl.output_dir)
|
||||
return CrawlLimitState.from_config(config).get_stop_reason()
|
||||
|
||||
def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
return (
|
||||
Snapshot.objects.filter(
|
||||
crawl_id=crawl_id,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
.exclude(id=exclude_snapshot_id)
|
||||
.update(
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
)
|
||||
|
||||
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
||||
if snapshot is not None:
|
||||
snapshot.ensure_crawl_symlink()
|
||||
|
||||
def _write_snapshot_details(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
||||
if snapshot is None:
|
||||
return
|
||||
snapshot.write_index_jsonl()
|
||||
snapshot.write_json_details()
|
||||
snapshot.write_html_details()
|
||||
|
||||
@@ -3,20 +3,20 @@ from __future__ import annotations
|
||||
from abx_dl.events import TagEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class TagService(BaseService):
|
||||
LISTENS_TO = [TagEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_TagEvent__Outer(self, event: TagEvent) -> None:
|
||||
await run_db_op(self._project, event)
|
||||
def __init__(self, bus):
|
||||
super().__init__(bus)
|
||||
self.bus.on(TagEvent, self.on_TagEvent__save_to_db)
|
||||
|
||||
def _project(self, event: TagEvent) -> None:
|
||||
from archivebox.core.models import Snapshot, Tag
|
||||
async def on_TagEvent__save_to_db(self, event: TagEvent) -> None:
|
||||
from archivebox.core.models import Snapshot, SnapshotTag, Tag
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first()
|
||||
snapshot = await Snapshot.objects.filter(id=event.snapshot_id).afirst()
|
||||
if snapshot is None:
|
||||
return
|
||||
Tag.from_json({"name": event.name}, overrides={"snapshot": snapshot})
|
||||
tag, _ = await Tag.objects.aget_or_create(name=event.name)
|
||||
await SnapshotTag.objects.aget_or_create(snapshot=snapshot, tag=tag)
|
||||
|
||||
@@ -312,7 +312,7 @@ CREATE TABLE IF NOT EXISTS machine_dependency (
|
||||
modified_at DATETIME,
|
||||
bin_name VARCHAR(63) NOT NULL UNIQUE,
|
||||
bin_providers VARCHAR(127) NOT NULL DEFAULT '*',
|
||||
custom_cmds TEXT DEFAULT '{}',
|
||||
overrides TEXT DEFAULT '{}',
|
||||
config TEXT DEFAULT '{}'
|
||||
);
|
||||
|
||||
@@ -973,7 +973,6 @@ def seed_0_8_data(db_path: Path) -> dict[str, list[dict]]:
|
||||
("machine", "0003_alter_installedbinary_options_and_more"),
|
||||
("machine", "0004_alter_installedbinary_abspath_and_more"),
|
||||
# Then the new migrations after squashing
|
||||
("machine", "0002_rename_custom_cmds_to_overrides"),
|
||||
("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"),
|
||||
("machine", "0004_drop_dependency_table"),
|
||||
# Crawls must come before core.0024 because 0024_b depends on it
|
||||
|
||||
@@ -144,13 +144,13 @@ def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
|
||||
pwd=str(snapshot.output_dir / "wget"),
|
||||
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
||||
env={
|
||||
"SOURCE_URL": "https://example.com",
|
||||
"SAFE_FLAG": "1",
|
||||
"API_KEY": "super-secret-key",
|
||||
"ACCESS_TOKEN": "super-secret-token",
|
||||
"SHARED_SECRET": "super-secret-secret",
|
||||
},
|
||||
status=Process.StatusChoices.EXITED,
|
||||
url="https://example.com",
|
||||
)
|
||||
result = ArchiveResult.objects.create(
|
||||
snapshot=snapshot,
|
||||
@@ -164,7 +164,7 @@ def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
|
||||
cmd_html = str(admin.cmd_str(result))
|
||||
|
||||
assert "SAFE_FLAG=1" in cmd_html
|
||||
assert "SOURCE_URL=https://example.com" in cmd_html
|
||||
assert "https://example.com" in cmd_html
|
||||
assert "API_KEY" not in cmd_html
|
||||
assert "ACCESS_TOKEN" not in cmd_html
|
||||
assert "SHARED_SECRET" not in cmd_html
|
||||
|
||||
@@ -8,6 +8,7 @@ Tests cover:
|
||||
- Snapshot progress statistics
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
@@ -822,7 +823,6 @@ class TestAdminSnapshotListView:
|
||||
pwd="/tmp/archivebox",
|
||||
cmd=["python", "/tmp/job.py", "--url=https://example.com"],
|
||||
env={
|
||||
"SNAPSHOT_ID": "abc123",
|
||||
"ENABLED": True,
|
||||
"API_KEY": "super-secret-key",
|
||||
"ACCESS_TOKEN": "super-secret-token",
|
||||
@@ -843,7 +843,6 @@ class TestAdminSnapshotListView:
|
||||
assert response.status_code == 200
|
||||
assert b"Kill" in response.content
|
||||
assert b"python /tmp/job.py --url=https://example.com" in response.content
|
||||
assert b"SNAPSHOT_ID=abc123" in response.content
|
||||
assert b"ENABLED=True" in response.content
|
||||
assert b"52s" in response.content
|
||||
assert b"API_KEY=" not in response.content
|
||||
@@ -1065,7 +1064,7 @@ class TestAdminSnapshotListView:
|
||||
pid=54321,
|
||||
exit_code=0,
|
||||
cmd=["/plugins/title/on_Snapshot__54_title.js", "--url=https://example.com"],
|
||||
env={"SNAPSHOT_ID": str(snapshot.id)},
|
||||
env={"EXTRA_CONTEXT": json.dumps({"snapshot_id": str(snapshot.id)})},
|
||||
started_at=timezone.now(),
|
||||
ended_at=timezone.now(),
|
||||
)
|
||||
@@ -1252,11 +1251,8 @@ class TestLiveProgressView:
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=pid,
|
||||
pwd=str(snapshot.output_dir / "chrome"),
|
||||
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js", "--url=https://example.com"],
|
||||
env={
|
||||
"CRAWL_ID": str(snapshot.crawl_id),
|
||||
"SNAPSHOT_ID": str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
@@ -1290,11 +1286,8 @@ class TestLiveProgressView:
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=pid,
|
||||
pwd=str(snapshot.output_dir / "title"),
|
||||
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
|
||||
env={
|
||||
"CRAWL_ID": str(snapshot.crawl_id),
|
||||
"SNAPSHOT_ID": str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
@@ -1327,11 +1320,8 @@ class TestLiveProgressView:
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=os.getpid(),
|
||||
pwd=str(snapshot.output_dir / "chrome"),
|
||||
cmd=["/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"],
|
||||
env={
|
||||
"CRAWL_ID": str(snapshot.crawl_id),
|
||||
"SNAPSHOT_ID": str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
ArchiveResult.objects.create(
|
||||
@@ -1369,11 +1359,8 @@ class TestLiveProgressView:
|
||||
status=Process.StatusChoices.EXITED,
|
||||
exit_code=0,
|
||||
pid=99999,
|
||||
pwd=str(snapshot.output_dir / "title"),
|
||||
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
|
||||
env={
|
||||
"CRAWL_ID": str(snapshot.crawl_id),
|
||||
"SNAPSHOT_ID": str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
ended_at=timezone.now(),
|
||||
)
|
||||
|
||||
@@ -5,12 +5,12 @@ import pytest
|
||||
from django.db import connection
|
||||
|
||||
|
||||
from abx_dl.events import BinaryRequestEvent, ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abx_dl.events import ArchiveResultEvent, BinaryRequestEvent, ProcessEvent, ProcessStartedEvent
|
||||
from abx_dl.orchestrator import create_bus
|
||||
from abx_dl.output_files import OutputFile
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
pytestmark = pytest.mark.django_db(transaction=True)
|
||||
|
||||
|
||||
def _cleanup_machine_process_rows() -> None:
|
||||
@@ -75,8 +75,8 @@ def _create_iface(machine):
|
||||
|
||||
def test_process_completed_projects_inline_archiveresult():
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
||||
from archivebox.services.process_service import ProcessService
|
||||
from archivebox.services.archive_result_service import ArchiveResultService
|
||||
import asyncio
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
plugin_dir = Path(snapshot.output_dir) / "wget"
|
||||
@@ -84,37 +84,23 @@ def test_process_completed_projects_inline_archiveresult():
|
||||
(plugin_dir / "index.html").write_text("<html>ok</html>")
|
||||
|
||||
bus = create_bus(name="test_inline_archiveresult")
|
||||
process_service = ProcessService(bus)
|
||||
service = ArchiveResultService(bus, process_service=process_service)
|
||||
service = ArchiveResultService(bus)
|
||||
|
||||
event = ProcessCompletedEvent(
|
||||
plugin_name="wget",
|
||||
hook_name="on_Snapshot__06_wget.finite.bg",
|
||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"wget/index.html"}\n' % snapshot.id,
|
||||
stderr="",
|
||||
exit_code=0,
|
||||
output_dir=str(plugin_dir),
|
||||
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
|
||||
process_id="proc-inline",
|
||||
event = ArchiveResultEvent(
|
||||
snapshot_id=str(snapshot.id),
|
||||
plugin="wget",
|
||||
hook_name="on_Snapshot__06_wget.finite.bg",
|
||||
status="succeeded",
|
||||
output_str="wget/index.html",
|
||||
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
end_ts="2026-03-22T12:00:01+00:00",
|
||||
)
|
||||
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
service._project_from_process_completed(
|
||||
event,
|
||||
{
|
||||
"snapshot_id": str(snapshot.id),
|
||||
"plugin": "wget",
|
||||
"hook_name": "on_Snapshot__06_wget.finite.bg",
|
||||
"status": "succeeded",
|
||||
"output_str": "wget/index.html",
|
||||
},
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
async def emit_event() -> None:
|
||||
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||
|
||||
asyncio.run(emit_event())
|
||||
|
||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg")
|
||||
assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
|
||||
@@ -127,45 +113,31 @@ def test_process_completed_projects_inline_archiveresult():
|
||||
|
||||
def test_process_completed_projects_synthetic_failed_archiveresult():
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
||||
from archivebox.services.process_service import ProcessService
|
||||
from archivebox.services.archive_result_service import ArchiveResultService
|
||||
import asyncio
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
plugin_dir = Path(snapshot.output_dir) / "chrome"
|
||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
bus = create_bus(name="test_synthetic_archiveresult")
|
||||
process_service = ProcessService(bus)
|
||||
service = ArchiveResultService(bus, process_service=process_service)
|
||||
service = ArchiveResultService(bus)
|
||||
|
||||
event = ProcessCompletedEvent(
|
||||
plugin_name="chrome",
|
||||
hook_name="on_Snapshot__11_chrome_wait",
|
||||
stdout="",
|
||||
stderr="Hook timed out after 60 seconds",
|
||||
exit_code=-1,
|
||||
output_dir=str(plugin_dir),
|
||||
output_files=[],
|
||||
process_id="proc-failed",
|
||||
event = ArchiveResultEvent(
|
||||
snapshot_id=str(snapshot.id),
|
||||
plugin="chrome",
|
||||
hook_name="on_Snapshot__11_chrome_wait",
|
||||
status="failed",
|
||||
output_str="Hook timed out after 60 seconds",
|
||||
error="Hook timed out after 60 seconds",
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
end_ts="2026-03-22T12:01:00+00:00",
|
||||
)
|
||||
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
service._project_from_process_completed(
|
||||
event,
|
||||
{
|
||||
"plugin": "chrome",
|
||||
"hook_name": "on_Snapshot__11_chrome_wait",
|
||||
"status": "failed",
|
||||
"output_str": "Hook timed out after 60 seconds",
|
||||
"error": "Hook timed out after 60 seconds",
|
||||
},
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
async def emit_event() -> None:
|
||||
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||
|
||||
asyncio.run(emit_event())
|
||||
|
||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait")
|
||||
assert result.status == ArchiveResult.StatusChoices.FAILED
|
||||
@@ -176,45 +148,30 @@ def test_process_completed_projects_synthetic_failed_archiveresult():
|
||||
|
||||
def test_process_completed_projects_noresults_archiveresult():
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
||||
from archivebox.services.process_service import ProcessService
|
||||
from archivebox.services.archive_result_service import ArchiveResultService
|
||||
import asyncio
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
plugin_dir = Path(snapshot.output_dir) / "title"
|
||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
bus = create_bus(name="test_noresults_archiveresult")
|
||||
process_service = ProcessService(bus)
|
||||
service = ArchiveResultService(bus, process_service=process_service)
|
||||
service = ArchiveResultService(bus)
|
||||
|
||||
event = ProcessCompletedEvent(
|
||||
plugin_name="title",
|
||||
hook_name="on_Snapshot__54_title.js",
|
||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
|
||||
stderr="",
|
||||
exit_code=0,
|
||||
output_dir=str(plugin_dir),
|
||||
output_files=[],
|
||||
process_id="proc-noresults",
|
||||
event = ArchiveResultEvent(
|
||||
snapshot_id=str(snapshot.id),
|
||||
plugin="title",
|
||||
hook_name="on_Snapshot__54_title.js",
|
||||
status="noresults",
|
||||
output_str="No title found",
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
end_ts="2026-03-22T12:00:01+00:00",
|
||||
)
|
||||
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
service._project_from_process_completed(
|
||||
event,
|
||||
{
|
||||
"snapshot_id": str(snapshot.id),
|
||||
"plugin": "title",
|
||||
"hook_name": "on_Snapshot__54_title.js",
|
||||
"status": "noresults",
|
||||
"output_str": "No title found",
|
||||
},
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
async def emit_event() -> None:
|
||||
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||
|
||||
asyncio.run(emit_event())
|
||||
|
||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
|
||||
assert result.status == ArchiveResult.StatusChoices.NORESULTS
|
||||
@@ -258,45 +215,30 @@ def test_retry_failed_archiveresults_requeues_snapshot_in_queued_state():
|
||||
|
||||
|
||||
def test_process_completed_projects_snapshot_title_from_output_str():
|
||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
||||
from archivebox.services.process_service import ProcessService
|
||||
from archivebox.services.archive_result_service import ArchiveResultService
|
||||
import asyncio
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
plugin_dir = Path(snapshot.output_dir) / "title"
|
||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
bus = create_bus(name="test_snapshot_title_output_str")
|
||||
process_service = ProcessService(bus)
|
||||
service = ArchiveResultService(bus, process_service=process_service)
|
||||
service = ArchiveResultService(bus)
|
||||
|
||||
event = ProcessCompletedEvent(
|
||||
plugin_name="title",
|
||||
hook_name="on_Snapshot__54_title.js",
|
||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"Example Domain"}\n' % snapshot.id,
|
||||
stderr="",
|
||||
exit_code=0,
|
||||
output_dir=str(plugin_dir),
|
||||
output_files=[],
|
||||
process_id="proc-title-output-str",
|
||||
event = ArchiveResultEvent(
|
||||
snapshot_id=str(snapshot.id),
|
||||
plugin="title",
|
||||
hook_name="on_Snapshot__54_title.js",
|
||||
status="succeeded",
|
||||
output_str="Example Domain",
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
end_ts="2026-03-22T12:00:01+00:00",
|
||||
)
|
||||
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
service._project_from_process_completed(
|
||||
event,
|
||||
{
|
||||
"snapshot_id": str(snapshot.id),
|
||||
"plugin": "title",
|
||||
"hook_name": "on_Snapshot__54_title.js",
|
||||
"status": "succeeded",
|
||||
"output_str": "Example Domain",
|
||||
},
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
async def emit_event() -> None:
|
||||
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||
|
||||
asyncio.run(emit_event())
|
||||
|
||||
snapshot.refresh_from_db()
|
||||
assert snapshot.title == "Example Domain"
|
||||
@@ -304,8 +246,8 @@ def test_process_completed_projects_snapshot_title_from_output_str():
|
||||
|
||||
|
||||
def test_process_completed_projects_snapshot_title_from_title_file():
|
||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
||||
from archivebox.services.process_service import ProcessService
|
||||
from archivebox.services.archive_result_service import ArchiveResultService
|
||||
import asyncio
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
plugin_dir = Path(snapshot.output_dir) / "title"
|
||||
@@ -313,37 +255,23 @@ def test_process_completed_projects_snapshot_title_from_title_file():
|
||||
(plugin_dir / "title.txt").write_text("Example Domain")
|
||||
|
||||
bus = create_bus(name="test_snapshot_title_file")
|
||||
process_service = ProcessService(bus)
|
||||
service = ArchiveResultService(bus, process_service=process_service)
|
||||
service = ArchiveResultService(bus)
|
||||
|
||||
event = ProcessCompletedEvent(
|
||||
plugin_name="title",
|
||||
hook_name="on_Snapshot__54_title.js",
|
||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
|
||||
stderr="",
|
||||
exit_code=0,
|
||||
output_dir=str(plugin_dir),
|
||||
output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)],
|
||||
process_id="proc-title-file",
|
||||
event = ArchiveResultEvent(
|
||||
snapshot_id=str(snapshot.id),
|
||||
plugin="title",
|
||||
hook_name="on_Snapshot__54_title.js",
|
||||
status="noresults",
|
||||
output_str="No title found",
|
||||
output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)],
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
end_ts="2026-03-22T12:00:01+00:00",
|
||||
)
|
||||
|
||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
||||
service._project_from_process_completed(
|
||||
event,
|
||||
{
|
||||
"snapshot_id": str(snapshot.id),
|
||||
"plugin": "title",
|
||||
"hook_name": "on_Snapshot__54_title.js",
|
||||
"status": "noresults",
|
||||
"output_str": "No title found",
|
||||
},
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
async def emit_event() -> None:
|
||||
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||
|
||||
asyncio.run(emit_event())
|
||||
|
||||
snapshot.refresh_from_db()
|
||||
assert snapshot.title == "Example Domain"
|
||||
@@ -410,9 +338,12 @@ def test_collect_output_metadata_detects_warc_gz_mimetype(tmp_path):
|
||||
assert output_mimetypes == "application/warc"
|
||||
|
||||
|
||||
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch):
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch, tmp_path):
|
||||
from archivebox.machine.models import Binary, NetworkInterface
|
||||
from archivebox.services.process_service import ProcessService
|
||||
from archivebox.machine.models import Process as MachineProcess
|
||||
from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService
|
||||
from abx_dl.services.process_service import ProcessService as DlProcessService
|
||||
|
||||
machine = _create_machine()
|
||||
iface = _create_iface(machine)
|
||||
@@ -428,35 +359,60 @@ def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
hook_path = tmp_path / "on_Snapshot__57_mercury.py"
|
||||
hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
|
||||
hook_path.chmod(0o755)
|
||||
output_dir = tmp_path / "mercury"
|
||||
output_dir.mkdir()
|
||||
|
||||
bus = create_bus(name="test_process_started_binary_hydration")
|
||||
service = ProcessService(bus)
|
||||
event = ProcessStartedEvent(
|
||||
plugin_name="mercury",
|
||||
hook_name="on_Snapshot__57_mercury.py",
|
||||
hook_path="/plugins/mercury/on_Snapshot__57_mercury.py",
|
||||
hook_args=["--url=https://example.com"],
|
||||
output_dir="/tmp/mercury",
|
||||
env={
|
||||
"MERCURY_BINARY": binary.abspath,
|
||||
"NODE_BINARY": "/tmp/node",
|
||||
},
|
||||
timeout=60,
|
||||
pid=4321,
|
||||
process_id="proc-mercury",
|
||||
snapshot_id="",
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
DlProcessService(bus, emit_jsonl=False, stderr_is_tty=False)
|
||||
ArchiveBoxProcessService(bus)
|
||||
|
||||
async def run_test() -> None:
|
||||
await bus.emit(
|
||||
ProcessEvent(
|
||||
plugin_name="mercury",
|
||||
hook_name="on_Snapshot__57_mercury.py",
|
||||
hook_path=str(hook_path),
|
||||
hook_args=["--url=https://example.com"],
|
||||
is_background=False,
|
||||
output_dir=str(output_dir),
|
||||
env={
|
||||
"MERCURY_BINARY": binary.abspath,
|
||||
"NODE_BINARY": "/tmp/node",
|
||||
},
|
||||
timeout=60,
|
||||
url="https://example.com",
|
||||
),
|
||||
)
|
||||
started = await bus.find(
|
||||
ProcessStartedEvent,
|
||||
past=True,
|
||||
future=False,
|
||||
hook_name="on_Snapshot__57_mercury.py",
|
||||
output_dir=str(output_dir),
|
||||
)
|
||||
assert started is not None
|
||||
|
||||
import asyncio
|
||||
|
||||
asyncio.run(run_test())
|
||||
|
||||
process = MachineProcess.objects.get(
|
||||
pwd=str(output_dir),
|
||||
cmd=[str(hook_path), "--url=https://example.com"],
|
||||
)
|
||||
|
||||
service._project_started(event)
|
||||
|
||||
process = service._get_or_create_process(event)
|
||||
assert process.binary_id == binary.id
|
||||
assert process.iface_id == iface.id
|
||||
|
||||
|
||||
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch):
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch, tmp_path):
|
||||
from archivebox.machine.models import Binary, NetworkInterface
|
||||
from archivebox.services.process_service import ProcessService
|
||||
from archivebox.machine.models import Process as MachineProcess
|
||||
from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService
|
||||
from abx_dl.services.process_service import ProcessService as DlProcessService
|
||||
|
||||
machine = _create_machine()
|
||||
iface = _create_iface(machine)
|
||||
@@ -472,27 +428,47 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
hook_path = tmp_path / "on_Snapshot__75_parse_dom_outlinks.js"
|
||||
hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
|
||||
hook_path.chmod(0o755)
|
||||
output_dir = tmp_path / "parse-dom-outlinks"
|
||||
output_dir.mkdir()
|
||||
|
||||
bus = create_bus(name="test_process_started_node_fallback")
|
||||
service = ProcessService(bus)
|
||||
event = ProcessStartedEvent(
|
||||
plugin_name="parse_dom_outlinks",
|
||||
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
|
||||
hook_path="/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js",
|
||||
hook_args=["--url=https://example.com"],
|
||||
output_dir="/tmp/parse-dom-outlinks",
|
||||
env={
|
||||
"NODE_BINARY": node.abspath,
|
||||
},
|
||||
timeout=60,
|
||||
pid=9876,
|
||||
process_id="proc-parse-dom-outlinks",
|
||||
snapshot_id="",
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
DlProcessService(bus, emit_jsonl=False, stderr_is_tty=False)
|
||||
ArchiveBoxProcessService(bus)
|
||||
|
||||
async def run_test() -> None:
|
||||
await bus.emit(
|
||||
ProcessEvent(
|
||||
plugin_name="parse_dom_outlinks",
|
||||
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
|
||||
hook_path=str(hook_path),
|
||||
hook_args=["--url=https://example.com"],
|
||||
is_background=False,
|
||||
output_dir=str(output_dir),
|
||||
env={"NODE_BINARY": node.abspath},
|
||||
timeout=60,
|
||||
url="https://example.com",
|
||||
),
|
||||
)
|
||||
started = await bus.find(
|
||||
ProcessStartedEvent,
|
||||
past=True,
|
||||
future=False,
|
||||
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
|
||||
output_dir=str(output_dir),
|
||||
)
|
||||
assert started is not None
|
||||
|
||||
import asyncio
|
||||
|
||||
asyncio.run(run_test())
|
||||
|
||||
process = MachineProcess.objects.get(
|
||||
pwd=str(output_dir),
|
||||
cmd=[str(hook_path), "--url=https://example.com"],
|
||||
)
|
||||
|
||||
service._project_started(event)
|
||||
|
||||
process = service._get_or_create_process(event)
|
||||
assert process.binary_id == node.id
|
||||
assert process.iface_id == iface.id
|
||||
|
||||
@@ -500,6 +476,7 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
|
||||
def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
from archivebox.services.binary_service import BinaryService as ArchiveBoxBinaryService
|
||||
import asyncio
|
||||
|
||||
machine = _create_machine()
|
||||
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
|
||||
@@ -522,7 +499,7 @@ def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
|
||||
binproviders="provider",
|
||||
)
|
||||
|
||||
service._project_binary(event)
|
||||
asyncio.run(service.on_BinaryRequestEvent(event))
|
||||
|
||||
binary.refresh_from_db()
|
||||
assert Binary.objects.filter(machine=machine, name="wget").count() == 1
|
||||
|
||||
@@ -378,11 +378,8 @@ class TestRecoverOrphanedCrawls:
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pwd=str(snapshot.output_dir / "chrome"),
|
||||
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js"],
|
||||
env={
|
||||
"CRAWL_ID": str(crawl.id),
|
||||
"SNAPSHOT_ID": str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
|
||||
@@ -464,23 +464,24 @@ class TestDependencyRecordOutput(unittest.TestCase):
|
||||
self.assertEqual(data["name"], "wget")
|
||||
self.assertTrue(data["abspath"].startswith("/"))
|
||||
|
||||
def test_dependency_record_outputs_machine_config(self):
|
||||
"""Dependency resolution should output Machine config update JSONL."""
|
||||
def test_dependency_record_outputs_binary_jsonl(self):
|
||||
"""Dependency resolution should output Binary JSONL."""
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "Machine",
|
||||
"config": {
|
||||
"WGET_BINARY": "/usr/bin/wget",
|
||||
},
|
||||
"type": "Binary",
|
||||
"name": "wget",
|
||||
"abspath": "/usr/bin/wget",
|
||||
"version": "1.21.3",
|
||||
"binprovider": "env",
|
||||
},
|
||||
)
|
||||
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
data = Process.parse_records_from_text(hook_output)[0]
|
||||
self.assertEqual(data["type"], "Machine")
|
||||
self.assertIn("config", data)
|
||||
self.assertEqual(data["config"]["WGET_BINARY"], "/usr/bin/wget")
|
||||
self.assertEqual(data["type"], "Binary")
|
||||
self.assertEqual(data["name"], "wget")
|
||||
self.assertEqual(data["abspath"], "/usr/bin/wget")
|
||||
|
||||
|
||||
class TestSnapshotHookOutput(unittest.TestCase):
|
||||
|
||||
@@ -269,12 +269,12 @@ class TestBinaryModel(TestCase):
|
||||
self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
|
||||
self.assertGreater(binary.modified_at, old_modified)
|
||||
|
||||
def test_binary_from_json_preserves_install_args_overrides(self):
|
||||
"""Binary.from_json() should persist canonical install_args overrides unchanged."""
|
||||
def test_binary_from_json_preserves_provider_overrides(self):
|
||||
"""Binary.from_json() should persist provider overrides unchanged."""
|
||||
overrides = {
|
||||
"apt": {"install_args": ["chromium"]},
|
||||
"npm": {"install_args": "puppeteer"},
|
||||
"custom": {"install_args": ["bash", "-lc", "echo ok"]},
|
||||
"custom": {"install": "bash -lc 'echo ok'"},
|
||||
}
|
||||
|
||||
binary = Binary.from_json(
|
||||
|
||||
@@ -1,69 +1,4 @@
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from abx_dl.events import ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.orchestrator import create_bus
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
def test_process_service_emits_process_started_from_inline_process_event(monkeypatch):
|
||||
from archivebox.services import process_service as process_service_module
|
||||
from archivebox.services.process_service import ProcessService
|
||||
|
||||
bus = create_bus(name="test_process_service_inline_process_event")
|
||||
ProcessService(bus)
|
||||
|
||||
monkeypatch.setattr(
|
||||
process_service_module,
|
||||
"_ensure_worker",
|
||||
lambda event: {
|
||||
"pid": 4321,
|
||||
"start": 1711111111.0,
|
||||
"statename": "RUNNING",
|
||||
"exitstatus": 0,
|
||||
},
|
||||
)
|
||||
|
||||
async def run_test():
|
||||
await bus.emit(
|
||||
ProcessStdoutEvent(
|
||||
line=json.dumps(
|
||||
{
|
||||
"type": "ProcessEvent",
|
||||
"plugin_name": "search_backend_sonic",
|
||||
"hook_name": "worker_sonic",
|
||||
"hook_path": "/usr/bin/sonic",
|
||||
"hook_args": ["-c", "/tmp/sonic/config.cfg"],
|
||||
"is_background": True,
|
||||
"daemon": True,
|
||||
"url": "tcp://127.0.0.1:1491",
|
||||
"output_dir": "/tmp/sonic",
|
||||
"env": {},
|
||||
"process_type": "worker",
|
||||
"worker_type": "sonic",
|
||||
"process_id": "worker:sonic",
|
||||
"output_str": "127.0.0.1:1491",
|
||||
},
|
||||
),
|
||||
plugin_name="search_backend_sonic",
|
||||
hook_name="on_CrawlSetup__55_sonic_start.py",
|
||||
output_dir="/tmp/search_backend_sonic",
|
||||
snapshot_id="snap-1",
|
||||
process_id="proc-hook",
|
||||
),
|
||||
)
|
||||
started = await bus.find(ProcessStartedEvent, process_id="worker:sonic")
|
||||
await bus.stop()
|
||||
return started
|
||||
|
||||
started = asyncio.run(run_test())
|
||||
assert started is not None
|
||||
assert started.hook_name == "worker_sonic"
|
||||
assert started.process_type == "worker"
|
||||
assert started.worker_type == "sonic"
|
||||
assert getattr(started, "url", "") == "tcp://127.0.0.1:1491"
|
||||
assert getattr(started, "output_str", "") == "127.0.0.1:1491"
|
||||
|
||||
@@ -34,18 +34,6 @@ class _DummyService:
|
||||
pass
|
||||
|
||||
|
||||
class _DummyAbxServices:
|
||||
def __init__(self):
|
||||
self.process = SimpleNamespace(wait_for_background_monitors=self._wait)
|
||||
|
||||
async def _wait(self):
|
||||
return None
|
||||
|
||||
|
||||
async def _call_sync(func, *args, **kwargs):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
@@ -82,18 +70,18 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
|
||||
|
||||
download_calls = []
|
||||
|
||||
async def fake_download(*, url, bus, snapshot, **kwargs):
|
||||
async def fake_download(*, url, bus, config_overrides, **kwargs):
|
||||
extra_context = json.loads(config_overrides["EXTRA_CONTEXT"])
|
||||
download_calls.append(
|
||||
{
|
||||
"url": url,
|
||||
"bus": bus,
|
||||
"snapshot_id": snapshot.id,
|
||||
"source_url": snapshot.url,
|
||||
"abx_snapshot_id": snapshot.id,
|
||||
"snapshot_id": extra_context["snapshot_id"],
|
||||
"source_url": url,
|
||||
},
|
||||
)
|
||||
await asyncio.sleep(0)
|
||||
@@ -113,9 +101,8 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
||||
"created_at": snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
|
||||
"tags": snapshot_a.tags_str(),
|
||||
"depth": snapshot_a.depth,
|
||||
"parent_snapshot_id": str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
|
||||
"output_dir": str(snapshot_a.output_dir),
|
||||
"config": crawl_runner._snapshot_config(snapshot_a),
|
||||
"config": crawl_runner.load_snapshot_payload(str(snapshot_a.id))["config"],
|
||||
},
|
||||
str(snapshot_b.id): {
|
||||
"id": str(snapshot_b.id),
|
||||
@@ -127,17 +114,16 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
||||
"created_at": snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
|
||||
"tags": snapshot_b.tags_str(),
|
||||
"depth": snapshot_b.depth,
|
||||
"parent_snapshot_id": str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
|
||||
"output_dir": str(snapshot_b.output_dir),
|
||||
"config": crawl_runner._snapshot_config(snapshot_b),
|
||||
"config": crawl_runner.load_snapshot_payload(str(snapshot_b.id))["config"],
|
||||
},
|
||||
}
|
||||
monkeypatch.setattr(crawl_runner, "_load_snapshot_run_data", lambda snapshot_id: snapshot_data[snapshot_id])
|
||||
monkeypatch.setattr(crawl_runner, "load_snapshot_payload", lambda snapshot_id: snapshot_data[snapshot_id])
|
||||
|
||||
async def run_both():
|
||||
await asyncio.gather(
|
||||
crawl_runner._run_snapshot(str(snapshot_a.id)),
|
||||
crawl_runner._run_snapshot(str(snapshot_b.id)),
|
||||
crawl_runner.run_snapshot(str(snapshot_a.id)),
|
||||
crawl_runner.run_snapshot(str(snapshot_b.id)),
|
||||
)
|
||||
|
||||
asyncio.run(run_both())
|
||||
@@ -243,10 +229,10 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
|
||||
refresh_calls = []
|
||||
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
|
||||
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
|
||||
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {})
|
||||
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
|
||||
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
crawl_runner._prepare()
|
||||
crawl_runner.load_run_state()
|
||||
|
||||
assert refresh_calls == [True]
|
||||
assert proc.iface is not None
|
||||
@@ -254,10 +240,12 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
|
||||
assert saved_updates == [("iface", "machine", "modified_at")]
|
||||
|
||||
|
||||
def test_installed_binary_config_overrides_include_valid_installed_binaries(monkeypatch):
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
def test_load_run_state_uses_machine_config_as_derived_config(monkeypatch):
|
||||
from archivebox.machine.models import Machine, NetworkInterface, Process
|
||||
from archivebox.services import runner as runner_module
|
||||
from abx_dl.models import Plugin
|
||||
from archivebox.config import configset as configset_module
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
machine = Machine.objects.create(
|
||||
guid="test-guid-runner-overrides",
|
||||
@@ -273,143 +261,30 @@ def test_installed_binary_config_overrides_include_valid_installed_binaries(monk
|
||||
os_release="14.0",
|
||||
os_kernel="Darwin",
|
||||
stats={},
|
||||
config={},
|
||||
config={"WGET_BINARY": "/tmp/wget", "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}},
|
||||
)
|
||||
mercury_binary = Binary.objects.create(
|
||||
machine=machine,
|
||||
name="postlight-parser",
|
||||
abspath=sys.executable,
|
||||
version="2.0.0",
|
||||
binprovider="pip",
|
||||
binproviders="env,pip",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
wget_binary = Binary.objects.create(
|
||||
machine=machine,
|
||||
name="wget",
|
||||
abspath="/tmp/not-an-executable",
|
||||
version="1.0.0",
|
||||
binprovider="env",
|
||||
binproviders="env",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
puppeteer_binary = Binary.objects.create(
|
||||
machine=machine,
|
||||
name="puppeteer",
|
||||
abspath="/tmp/shared-lib/npm/node_modules/.bin/puppeteer",
|
||||
version="24.40.0",
|
||||
binprovider="npm",
|
||||
binproviders="npm",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
ytdlp_binary = Binary.objects.create(
|
||||
machine=machine,
|
||||
name="yt-dlp",
|
||||
abspath="/tmp/shared-lib/pip/venv/bin/yt-dlp",
|
||||
version="2026.3.17",
|
||||
binprovider="pip",
|
||||
binproviders="pip",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
crawl = Crawl.objects.create(
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
proc = SimpleNamespace(iface_id=str(machine.id), machine_id=str(machine.id), iface=None, machine=machine, save=lambda **kwargs: None)
|
||||
|
||||
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
|
||||
monkeypatch.setattr(
|
||||
Path,
|
||||
"is_file",
|
||||
lambda self: (
|
||||
str(self) in {sys.executable, mercury_binary.abspath, wget_binary.abspath, puppeteer_binary.abspath, ytdlp_binary.abspath}
|
||||
),
|
||||
NetworkInterface,
|
||||
"current",
|
||||
classmethod(lambda cls, refresh=False: SimpleNamespace(id=machine.id, machine=machine)),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
runner_module.os,
|
||||
"access",
|
||||
lambda path, mode: str(path) in {sys.executable, puppeteer_binary.abspath, ytdlp_binary.abspath},
|
||||
)
|
||||
|
||||
overrides = runner_module._installed_binary_config_overrides(
|
||||
{
|
||||
"mercury": Plugin(
|
||||
name="mercury",
|
||||
path=Path("."),
|
||||
hooks=[],
|
||||
config_schema={"MERCURY_BINARY": {"type": "string", "default": "postlight-parser"}},
|
||||
),
|
||||
},
|
||||
)
|
||||
|
||||
assert overrides["MERCURY_BINARY"] == sys.executable
|
||||
assert "POSTLIGHT_PARSER_BINARY" not in overrides
|
||||
assert "WGET_BINARY" not in overrides
|
||||
assert overrides["LIB_DIR"] == "/tmp/shared-lib"
|
||||
assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
|
||||
assert overrides["PIP_HOME"] == "/tmp/shared-lib/pip"
|
||||
assert overrides["PIP_BIN_DIR"] == "/tmp/shared-lib/pip/venv/bin"
|
||||
assert overrides["NPM_HOME"] == "/tmp/shared-lib/npm"
|
||||
assert overrides["NPM_BIN_DIR"] == "/tmp/shared-lib/npm/node_modules/.bin"
|
||||
assert overrides["NODE_MODULES_DIR"] == "/tmp/shared-lib/npm/node_modules"
|
||||
assert overrides["NODE_MODULE_DIR"] == "/tmp/shared-lib/npm/node_modules"
|
||||
assert overrides["NODE_PATH"] == "/tmp/shared-lib/npm/node_modules"
|
||||
|
||||
|
||||
def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_configurable_binary_keys(monkeypatch):
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
from archivebox.services import runner as runner_module
|
||||
from abx_dl.models import Plugin
|
||||
|
||||
machine = Machine.objects.create(
|
||||
guid="test-guid-runner-singlefile-cache",
|
||||
hostname="runner-host-singlefile",
|
||||
hw_in_docker=False,
|
||||
hw_in_vm=False,
|
||||
hw_manufacturer="Test",
|
||||
hw_product="Test Product",
|
||||
hw_uuid="test-hw-runner-singlefile-cache",
|
||||
os_arch="arm64",
|
||||
os_family="darwin",
|
||||
os_platform="macOS",
|
||||
os_release="14.0",
|
||||
os_kernel="Darwin",
|
||||
stats={},
|
||||
config={},
|
||||
)
|
||||
singlefile_extension = Binary.objects.create(
|
||||
machine=machine,
|
||||
name="singlefile",
|
||||
abspath="/tmp/shared-lib/bin/singlefile",
|
||||
version="1.0.0",
|
||||
binprovider="chromewebstore",
|
||||
binproviders="chromewebstore",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
|
||||
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
|
||||
monkeypatch.setattr(Path, "is_file", lambda self: str(self) == singlefile_extension.abspath)
|
||||
monkeypatch.setattr(runner_module.os, "access", lambda path, mode: str(path) == singlefile_extension.abspath)
|
||||
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
|
||||
|
||||
overrides = runner_module._installed_binary_config_overrides(
|
||||
{
|
||||
"singlefile": Plugin(
|
||||
name="singlefile",
|
||||
path=Path("."),
|
||||
hooks=[],
|
||||
config_schema={"SINGLEFILE_BINARY": {"type": "string", "default": "single-file"}},
|
||||
binaries=[
|
||||
{"name": "{SINGLEFILE_BINARY}", "binproviders": "env,npm"},
|
||||
{"name": "singlefile", "binproviders": "chromewebstore"},
|
||||
],
|
||||
),
|
||||
},
|
||||
config={"SINGLEFILE_BINARY": "single-file"},
|
||||
)
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
crawl_runner.load_run_state()
|
||||
|
||||
assert "SINGLEFILE_BINARY" not in overrides
|
||||
assert "LIB_DIR" not in overrides
|
||||
assert "LIB_BIN_DIR" not in overrides
|
||||
assert crawl_runner.derived_config == machine.config
|
||||
|
||||
|
||||
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
|
||||
import asgiref.sync
|
||||
|
||||
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch, tmp_path):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.services import runner as runner_module
|
||||
@@ -428,12 +303,6 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "_limit_stop_reason", lambda config: "max_size")
|
||||
monkeypatch.setattr(
|
||||
asgiref.sync,
|
||||
"sync_to_async",
|
||||
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
runner_module,
|
||||
"download",
|
||||
@@ -441,8 +310,21 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
|
||||
)
|
||||
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
state_dir = tmp_path / ".abx-dl"
|
||||
state_dir.mkdir(parents=True, exist_ok=True)
|
||||
(state_dir / "limits.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"admitted_snapshot_ids": ["child-1"],
|
||||
"counted_process_ids": ["proc-1"],
|
||||
"total_size": 32,
|
||||
"stop_reason": "max_size",
|
||||
},
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
cancelled: list[str] = []
|
||||
crawl_runner._load_snapshot_run_data = lambda snapshot_id: {
|
||||
crawl_runner.load_snapshot_payload = lambda snapshot_id: {
|
||||
"id": snapshot_id,
|
||||
"url": "https://example.com/child",
|
||||
"title": "",
|
||||
@@ -452,22 +334,23 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
|
||||
"tags": "",
|
||||
"depth": 1,
|
||||
"status": "queued",
|
||||
"parent_snapshot_id": None,
|
||||
"output_dir": "/tmp/child",
|
||||
"config": {"CRAWL_DIR": "/tmp/crawl", "MAX_SIZE": 16},
|
||||
"config": {"CRAWL_DIR": str(tmp_path), "MAX_SIZE": 16},
|
||||
}
|
||||
crawl_runner._cancel_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id)
|
||||
crawl_runner.seal_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id)
|
||||
|
||||
asyncio.run(crawl_runner._run_snapshot("child-1"))
|
||||
asyncio.run(crawl_runner.run_snapshot("child-1"))
|
||||
|
||||
assert cancelled == ["child-1"]
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
|
||||
def test_seal_snapshot_cancels_queued_descendants_after_max_size():
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.services.snapshot_service import SnapshotService
|
||||
from abx_dl.events import SnapshotCompletedEvent
|
||||
from abx_dl.orchestrator import create_bus
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
@@ -505,13 +388,22 @@ def test_seal_snapshot_cancels_queued_descendants_after_max_size():
|
||||
bus = create_bus(name="test_snapshot_limit_cancel")
|
||||
service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None)
|
||||
try:
|
||||
sealed_id = service._seal_snapshot(str(root.id))
|
||||
|
||||
async def emit_event() -> None:
|
||||
await service.on_SnapshotCompletedEvent(
|
||||
SnapshotCompletedEvent(
|
||||
url=root.url,
|
||||
snapshot_id=str(root.id),
|
||||
output_dir=str(root.output_dir),
|
||||
),
|
||||
)
|
||||
|
||||
asyncio.run(emit_event())
|
||||
finally:
|
||||
asyncio.run(bus.stop())
|
||||
|
||||
root.refresh_from_db()
|
||||
child.refresh_from_db()
|
||||
assert sealed_id == str(root.id)
|
||||
assert root.status == Snapshot.StatusChoices.SEALED
|
||||
assert child.status == Snapshot.StatusChoices.SEALED
|
||||
assert child.retry_at is None
|
||||
@@ -548,7 +440,6 @@ def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
|
||||
|
||||
|
||||
def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
|
||||
import asgiref.sync
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -565,35 +456,23 @@ def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
||||
monkeypatch.setattr(
|
||||
asgiref.sync,
|
||||
"sync_to_async",
|
||||
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
|
||||
)
|
||||
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, "is_finished", lambda: False)
|
||||
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert crawl.status != Crawl.StatusChoices.SEALED
|
||||
assert crawl.retry_at is not None
|
||||
|
||||
|
||||
def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
|
||||
import asgiref.sync
|
||||
def test_crawl_runner_calls_load_and_finalize_run_state(monkeypatch):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -618,50 +497,34 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(crawl, "cleanup", lambda: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
||||
|
||||
sync_to_async_wrapped: list[str] = []
|
||||
sync_to_async_active = False
|
||||
method_calls: list[str] = []
|
||||
|
||||
def fake_sync_to_async(func, thread_sensitive=True):
|
||||
async def wrapper(*args, **kwargs):
|
||||
nonlocal sync_to_async_active
|
||||
sync_to_async_wrapped.append(getattr(func, "__name__", repr(func)))
|
||||
previous = sync_to_async_active
|
||||
sync_to_async_active = True
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
finally:
|
||||
sync_to_async_active = previous
|
||||
def wrapped_finalize(self):
|
||||
method_calls.append("finalize_run_state")
|
||||
return None
|
||||
|
||||
return wrapper
|
||||
def wrapped_load(self):
|
||||
method_calls.append("load_run_state")
|
||||
return [str(snapshot.id)]
|
||||
|
||||
def guarded_is_finished():
|
||||
assert sync_to_async_active is True
|
||||
return False
|
||||
|
||||
monkeypatch.setattr(asgiref.sync, "sync_to_async", fake_sync_to_async)
|
||||
monkeypatch.setattr(crawl, "is_finished", guarded_is_finished)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", wrapped_finalize)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", wrapped_load)
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert crawl.status == Crawl.StatusChoices.STARTED
|
||||
assert crawl.retry_at is not None
|
||||
assert "guarded_is_finished" in sync_to_async_wrapped
|
||||
assert method_calls == ["load_run_state", "finalize_run_state"]
|
||||
|
||||
|
||||
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
|
||||
@@ -680,7 +543,7 @@ def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
|
||||
task.set_exception(RuntimeError("snapshot failed"))
|
||||
crawl_runner.snapshot_tasks["snap-1"] = task
|
||||
with pytest.raises(RuntimeError, match="snapshot failed"):
|
||||
await crawl_runner._wait_for_snapshot_tasks()
|
||||
await crawl_runner.wait_for_snapshot_tasks()
|
||||
|
||||
asyncio.run(run_test())
|
||||
|
||||
@@ -702,14 +565,13 @@ def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
|
||||
async def run_test():
|
||||
task = asyncio.create_task(finish_snapshot())
|
||||
crawl_runner.snapshot_tasks["snap-1"] = task
|
||||
await asyncio.wait_for(crawl_runner._wait_for_snapshot_tasks(), timeout=0.5)
|
||||
await asyncio.wait_for(crawl_runner.wait_for_snapshot_tasks(), timeout=0.5)
|
||||
assert crawl_runner.snapshot_tasks == {}
|
||||
|
||||
asyncio.run(run_test())
|
||||
|
||||
|
||||
def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
||||
import asgiref.sync
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -726,30 +588,18 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
||||
monkeypatch.setattr(
|
||||
asgiref.sync,
|
||||
"sync_to_async",
|
||||
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
|
||||
)
|
||||
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, "is_finished", lambda: False)
|
||||
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
|
||||
|
||||
cleanup_calls = []
|
||||
monkeypatch.setattr(
|
||||
runner_module.CrawlRunner,
|
||||
"_run_crawl_cleanup",
|
||||
"run_crawl_cleanup",
|
||||
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
|
||||
)
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
@@ -757,17 +607,20 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
||||
assert cleanup_calls == ["abx_cleanup"]
|
||||
|
||||
|
||||
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
|
||||
def test_abx_process_service_background_process_finishes_after_process_exit(monkeypatch, tmp_path):
|
||||
from abx_dl.models import Process as AbxProcess, now_iso
|
||||
from abx_dl.services.process_service import ProcessService
|
||||
from abx_dl.events import ProcessCompletedEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
|
||||
service = object.__new__(ProcessService)
|
||||
service.emit_jsonl = False
|
||||
emitted_events = []
|
||||
|
||||
async def fake_emit_event(event, *, detach_from_parent):
|
||||
emitted_events.append((event, detach_from_parent))
|
||||
class FakeBus:
|
||||
async def emit(self, event):
|
||||
emitted_events.append(event)
|
||||
|
||||
service.bus = FakeBus()
|
||||
|
||||
async def fake_stream_stdout(**kwargs):
|
||||
try:
|
||||
@@ -775,19 +628,8 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
|
||||
except asyncio.CancelledError:
|
||||
return ["daemon output\n"]
|
||||
|
||||
service._emit_event = fake_emit_event
|
||||
monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout)
|
||||
|
||||
class FakeAsyncProcess:
|
||||
def __init__(self):
|
||||
self.pid = 42424
|
||||
self.returncode = None
|
||||
|
||||
async def wait(self):
|
||||
await asyncio.sleep(0)
|
||||
self.returncode = 0
|
||||
return 0
|
||||
|
||||
plugin_output_dir = tmp_path / "chrome"
|
||||
plugin_output_dir.mkdir()
|
||||
stdout_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.stdout.log"
|
||||
@@ -804,41 +646,45 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
|
||||
plugin="chrome",
|
||||
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
|
||||
)
|
||||
process = FakeAsyncProcess()
|
||||
event = SimpleNamespace(
|
||||
plugin_name="chrome",
|
||||
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
|
||||
hook_path="hook",
|
||||
hook_args=["--url=https://example.org/"],
|
||||
env={},
|
||||
output_dir=str(plugin_output_dir),
|
||||
timeout=60,
|
||||
snapshot_id="snap-1",
|
||||
is_background=True,
|
||||
url="https://example.org/",
|
||||
process_type="hook",
|
||||
worker_type="hook",
|
||||
)
|
||||
|
||||
async def run_test():
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
sys.executable,
|
||||
"-c",
|
||||
"pass",
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
event = ProcessStartedEvent(
|
||||
plugin_name="chrome",
|
||||
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
|
||||
hook_path="hook",
|
||||
hook_args=["--url=https://example.org/"],
|
||||
env={},
|
||||
output_dir=str(plugin_output_dir),
|
||||
timeout=60,
|
||||
pid=process.pid,
|
||||
is_background=True,
|
||||
url="https://example.org/",
|
||||
process_type="hook",
|
||||
worker_type="hook",
|
||||
start_ts=proc.started_at or "",
|
||||
subprocess=process,
|
||||
stdout_file=stdout_file,
|
||||
stderr_file=stderr_file,
|
||||
pid_file=pid_file,
|
||||
cmd_file=plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.sh",
|
||||
files_before=set(),
|
||||
)
|
||||
await asyncio.wait_for(
|
||||
service._monitor_background_process(
|
||||
event=event,
|
||||
proc=proc,
|
||||
process=process,
|
||||
plugin_output_dir=plugin_output_dir,
|
||||
stdout_file=stdout_file,
|
||||
stderr_file=stderr_file,
|
||||
pid_file=pid_file,
|
||||
files_before=set(),
|
||||
),
|
||||
service.on_ProcessStartedEvent(event),
|
||||
timeout=0.5,
|
||||
)
|
||||
|
||||
asyncio.run(run_test())
|
||||
|
||||
assert pid_file.exists() is False
|
||||
assert any(isinstance(event, ProcessCompletedEvent) for event, _ in emitted_events)
|
||||
assert any(isinstance(event, ProcessCompletedEvent) for event in emitted_events)
|
||||
|
||||
|
||||
def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
|
||||
|
||||
48
archivebox/tests/test_tag_service.py
Normal file
48
archivebox/tests/test_tag_service.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
|
||||
from abx_dl.events import TagEvent
|
||||
from abx_dl.orchestrator import create_bus
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db(transaction=True)
|
||||
|
||||
|
||||
def _create_snapshot():
    """Create and return a STARTED Snapshot attached to a freshly created Crawl.

    Both rows use ``https://example.com``; the crawl's ``created_by_id`` is the
    value returned by ``get_or_create_system_user_pk()``.
    """
    # Imports stay function-local as in the original file
    # (presumably so the models are only loaded once Django is ready — confirm).
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot

    owner_pk = get_or_create_system_user_pk()
    parent_crawl = Crawl.objects.create(urls="https://example.com", created_by_id=owner_pk)

    new_snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=parent_crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    return new_snapshot
|
||||
|
||||
|
||||
def test_tag_event_projects_tag_to_snapshot():
    """Emitting a TagEvent for a snapshot leaves a matching Tag linked to it.

    A TagService is constructed with a fresh bus, a TagEvent named "example"
    is emitted for a newly created snapshot, and the test then checks that the
    Tag row exists and that the snapshot's tags include it.
    """
    from archivebox.core.models import Tag
    from archivebox.services.tag_service import TagService

    snap = _create_snapshot()
    bus = create_bus(name="test_tag_service")
    # Instantiated only for its side effect; the return value is unused
    # (presumably the constructor subscribes its handlers on the bus — confirm).
    TagService(bus)
    # NOTE(review): the bus is never stopped in this test — confirm whether
    # create_bus() resources need an explicit bus.stop() afterwards.

    async def _emit() -> None:
        tag_event = TagEvent(name="example", snapshot_id=str(snap.id))
        await bus.emit(tag_event)

    asyncio.run(_emit())

    # Reload the row before inspecting what was written for it.
    snap.refresh_from_db()
    assert snap.tags.filter(name="example").exists()
    assert Tag.objects.filter(name="example").exists()
|
||||
2
docs
2
docs
Submodule docs updated: be25d9bfa2...7244076ece
@@ -42,7 +42,7 @@ Crawl.run()
|
||||
{'type': 'Dependency', 'bin_name': 'wget', 'bin_providers': 'apt,brew', 'overrides': {...}}
|
||||
|
||||
# ❌ WRONG - uses different field names
|
||||
{'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'custom_cmds': {...}}
|
||||
{'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'overrides': {...}}
|
||||
```
|
||||
|
||||
4. **No hardcoding** - Never hardcode binary names, provider names, or anything else. Use discovery.
|
||||
@@ -84,7 +84,7 @@ Crawl.run()
|
||||
# ❌ WRONG - complex transformation logic
|
||||
if obj.get('type') == 'Dependency':
|
||||
dep = Dependency.objects.create(name=obj['bin_name']) # renaming fields
|
||||
dep.custom_commands = transform_overrides(obj['overrides']) # transforming data
|
||||
dep.overrides = transform_overrides(obj['overrides']) # transforming data
|
||||
```
|
||||
|
||||
### Pattern Consistency
|
||||
|
||||
@@ -159,6 +159,11 @@ environments = ["sys_platform == 'darwin'", "sys_platform == 'linux'"]
|
||||
package = true
|
||||
# compile-bytecode = true
|
||||
|
||||
[tool.uv.sources]
|
||||
abx-pkg = { path = "../abx-pkg", editable = true }
|
||||
abx-plugins = { path = "../abx-plugins", editable = true }
|
||||
abx-dl = { path = "../abx-dl", editable = true }
|
||||
|
||||
[build-system]
|
||||
requires = ["pdm-backend"]
|
||||
build-backend = "pdm.backend"
|
||||
|
||||
Reference in New Issue
Block a user