mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
update working changes
This commit is contained in:
@@ -26,7 +26,7 @@ EVENT_FLOW_DIAGRAM = """
|
|||||||
│ CrawlStartEvent │
|
│ CrawlStartEvent │
|
||||||
│ └─ SnapshotEvent │
|
│ └─ SnapshotEvent │
|
||||||
│ └─ on_Snapshot__* │
|
│ └─ on_Snapshot__* │
|
||||||
│ └─ Snapshot / ArchiveResult / Tag / Machine / BinaryRequest │
|
│ └─ ArchiveResult / Snapshot / Tag │
|
||||||
│ │
|
│ │
|
||||||
│ SnapshotCleanupEvent -> internal cleanup, no direct hook family │
|
│ SnapshotCleanupEvent -> internal cleanup, no direct hook family │
|
||||||
│ CrawlCleanupEvent -> internal cleanup, no direct hook family │
|
│ CrawlCleanupEvent -> internal cleanup, no direct hook family │
|
||||||
@@ -89,8 +89,8 @@ def pluginmap(
|
|||||||
"emits": ["ProcessEvent"],
|
"emits": ["ProcessEvent"],
|
||||||
},
|
},
|
||||||
"SnapshotEvent": {
|
"SnapshotEvent": {
|
||||||
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, Tag, and BinaryRequest records.",
|
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, and Tag records.",
|
||||||
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "BinaryRequestEvent", "ProcessEvent"],
|
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "ProcessEvent"],
|
||||||
},
|
},
|
||||||
"SnapshotCleanupEvent": {
|
"SnapshotCleanupEvent": {
|
||||||
"description": "Internal snapshot cleanup phase.",
|
"description": "Internal snapshot cleanup phase.",
|
||||||
|
|||||||
@@ -267,19 +267,13 @@ def get_config(
|
|||||||
if crawl and hasattr(crawl, "output_dir"):
|
if crawl and hasattr(crawl, "output_dir"):
|
||||||
config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir)
|
config["CRAWL_OUTPUT_DIR"] = str(crawl.output_dir)
|
||||||
config["CRAWL_DIR"] = str(crawl.output_dir)
|
config["CRAWL_DIR"] = str(crawl.output_dir)
|
||||||
config["CRAWL_ID"] = str(getattr(crawl, "id", "")) if getattr(crawl, "id", None) else config.get("CRAWL_ID")
|
|
||||||
|
|
||||||
# Apply snapshot config overrides (highest priority)
|
# Apply snapshot config overrides (highest priority)
|
||||||
if snapshot and hasattr(snapshot, "config") and snapshot.config:
|
if snapshot and hasattr(snapshot, "config") and snapshot.config:
|
||||||
config.update(snapshot.config)
|
config.update(snapshot.config)
|
||||||
|
|
||||||
if snapshot:
|
if snapshot and hasattr(snapshot, "output_dir"):
|
||||||
config["SNAPSHOT_ID"] = str(getattr(snapshot, "id", "")) if getattr(snapshot, "id", None) else config.get("SNAPSHOT_ID")
|
config["SNAP_DIR"] = str(snapshot.output_dir)
|
||||||
config["SNAPSHOT_DEPTH"] = int(getattr(snapshot, "depth", 0) or 0)
|
|
||||||
if hasattr(snapshot, "output_dir"):
|
|
||||||
config["SNAP_DIR"] = str(snapshot.output_dir)
|
|
||||||
if getattr(snapshot, "crawl_id", None):
|
|
||||||
config["CRAWL_ID"] = str(snapshot.crawl_id)
|
|
||||||
|
|
||||||
# Normalize all aliases to canonical names (after all sources merged)
|
# Normalize all aliases to canonical names (after all sources merged)
|
||||||
# This handles aliases that came from user/crawl/snapshot configs, not just env
|
# This handles aliases that came from user/crawl/snapshot configs, not just env
|
||||||
|
|||||||
@@ -38,8 +38,8 @@ def _quote_shell_string(value: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _get_replay_source_url(result: ArchiveResult) -> str:
|
def _get_replay_source_url(result: ArchiveResult) -> str:
|
||||||
process_env = getattr(getattr(result, "process", None), "env", None) or {}
|
process = getattr(result, "process", None)
|
||||||
return str(process_env.get("SOURCE_URL") or result.snapshot.url or "")
|
return str(getattr(process, "url", None) or result.snapshot.url or "")
|
||||||
|
|
||||||
|
|
||||||
def build_abx_dl_display_command(result: ArchiveResult) -> str:
|
def build_abx_dl_display_command(result: ArchiveResult) -> str:
|
||||||
|
|||||||
@@ -1322,6 +1322,17 @@ def live_progress_view(request):
|
|||||||
|
|
||||||
# Build hierarchical active crawls with nested snapshots and archive results
|
# Build hierarchical active crawls with nested snapshots and archive results
|
||||||
|
|
||||||
|
active_crawls_qs = (
|
||||||
|
Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
|
||||||
|
.prefetch_related(
|
||||||
|
"snapshot_set",
|
||||||
|
"snapshot_set__archiveresult_set",
|
||||||
|
"snapshot_set__archiveresult_set__process",
|
||||||
|
)
|
||||||
|
.distinct()
|
||||||
|
.order_by("-modified_at")[:10]
|
||||||
|
)
|
||||||
|
|
||||||
running_processes = Process.objects.filter(
|
running_processes = Process.objects.filter(
|
||||||
machine=machine,
|
machine=machine,
|
||||||
status=Process.StatusChoices.RUNNING,
|
status=Process.StatusChoices.RUNNING,
|
||||||
@@ -1343,28 +1354,45 @@ def live_progress_view(request):
|
|||||||
process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {}
|
process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {}
|
||||||
process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {}
|
process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {}
|
||||||
seen_process_records: set[str] = set()
|
seen_process_records: set[str] = set()
|
||||||
|
snapshots = [snapshot for crawl in active_crawls_qs for snapshot in crawl.snapshot_set.all()]
|
||||||
for proc in running_processes:
|
for proc in running_processes:
|
||||||
env = proc.env or {}
|
if not proc.pwd:
|
||||||
if not isinstance(env, dict):
|
continue
|
||||||
env = {}
|
proc_pwd = Path(proc.pwd)
|
||||||
|
matched_snapshot = None
|
||||||
crawl_id = env.get("CRAWL_ID")
|
for snapshot in snapshots:
|
||||||
snapshot_id = env.get("SNAPSHOT_ID")
|
try:
|
||||||
|
proc_pwd.relative_to(snapshot.output_dir)
|
||||||
|
matched_snapshot = snapshot
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
if matched_snapshot is None:
|
||||||
|
continue
|
||||||
|
crawl_id = str(matched_snapshot.crawl_id)
|
||||||
|
snapshot_id = str(matched_snapshot.id)
|
||||||
_plugin, _label, phase, _hook_name = process_label(proc.cmd)
|
_plugin, _label, phase, _hook_name = process_label(proc.cmd)
|
||||||
if crawl_id and proc.pid:
|
if crawl_id and proc.pid:
|
||||||
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
|
crawl_process_pids.setdefault(crawl_id, proc.pid)
|
||||||
if phase == "snapshot" and snapshot_id and proc.pid:
|
if phase == "snapshot" and snapshot_id and proc.pid:
|
||||||
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
|
snapshot_process_pids.setdefault(snapshot_id, proc.pid)
|
||||||
|
|
||||||
for proc in recent_processes:
|
for proc in recent_processes:
|
||||||
env = proc.env or {}
|
if not proc.pwd:
|
||||||
if not isinstance(env, dict):
|
|
||||||
env = {}
|
|
||||||
|
|
||||||
crawl_id = env.get("CRAWL_ID")
|
|
||||||
snapshot_id = env.get("SNAPSHOT_ID")
|
|
||||||
if not crawl_id and not snapshot_id:
|
|
||||||
continue
|
continue
|
||||||
|
proc_pwd = Path(proc.pwd)
|
||||||
|
matched_snapshot = None
|
||||||
|
for snapshot in snapshots:
|
||||||
|
try:
|
||||||
|
proc_pwd.relative_to(snapshot.output_dir)
|
||||||
|
matched_snapshot = snapshot
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
if matched_snapshot is None:
|
||||||
|
continue
|
||||||
|
crawl_id = str(matched_snapshot.crawl_id)
|
||||||
|
snapshot_id = str(matched_snapshot.id)
|
||||||
|
|
||||||
plugin, label, phase, hook_name = process_label(proc.cmd)
|
plugin, label, phase, hook_name = process_label(proc.cmd)
|
||||||
|
|
||||||
@@ -1393,20 +1421,9 @@ def live_progress_view(request):
|
|||||||
payload["pid"] = proc.pid
|
payload["pid"] = proc.pid
|
||||||
proc_started_at = proc.started_at or proc.modified_at
|
proc_started_at = proc.started_at or proc.modified_at
|
||||||
if phase == "snapshot" and snapshot_id:
|
if phase == "snapshot" and snapshot_id:
|
||||||
process_records_by_snapshot.setdefault(str(snapshot_id), []).append((payload, proc_started_at))
|
process_records_by_snapshot.setdefault(snapshot_id, []).append((payload, proc_started_at))
|
||||||
elif crawl_id:
|
elif crawl_id:
|
||||||
process_records_by_crawl.setdefault(str(crawl_id), []).append((payload, proc_started_at))
|
process_records_by_crawl.setdefault(crawl_id, []).append((payload, proc_started_at))
|
||||||
|
|
||||||
active_crawls_qs = (
|
|
||||||
Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
|
|
||||||
.prefetch_related(
|
|
||||||
"snapshot_set",
|
|
||||||
"snapshot_set__archiveresult_set",
|
|
||||||
"snapshot_set__archiveresult_set__process",
|
|
||||||
)
|
|
||||||
.distinct()
|
|
||||||
.order_by("-modified_at")[:10]
|
|
||||||
)
|
|
||||||
|
|
||||||
active_crawls = []
|
active_crawls = []
|
||||||
total_workers = 0
|
total_workers = 0
|
||||||
|
|||||||
@@ -827,7 +827,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
for record in records[:3]:
|
for record in records[:3]:
|
||||||
print(f" Record: type={record.get('type')}, keys={list(record.keys())[:5]}")
|
print(f" Record: type={record.get('type')}, keys={list(record.keys())[:5]}")
|
||||||
if system_task:
|
if system_task:
|
||||||
records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary", "Machine")]
|
records = [record for record in records if record.get("type") in ("BinaryRequest", "Binary")]
|
||||||
overrides = {"crawl": self}
|
overrides = {"crawl": self}
|
||||||
stats = process_hook_records(records, overrides=overrides)
|
stats = process_hook_records(records, overrides=overrides)
|
||||||
if stats:
|
if stats:
|
||||||
|
|||||||
@@ -13,13 +13,9 @@ Hook-backed event families are discovered from filenames like:
|
|||||||
on_CrawlSetup__*
|
on_CrawlSetup__*
|
||||||
on_Snapshot__*
|
on_Snapshot__*
|
||||||
|
|
||||||
InstallEvent itself is still part of the runtime lifecycle, but it has no
|
Internal bus event names are normalized to the corresponding
|
||||||
corresponding hook family. Its dependency declarations come directly from each
|
`on_{EventFamily}__*` prefix by a simple string transform. If no scripts exist
|
||||||
plugin's `config.json > required_binaries`.
|
for that prefix, discovery returns `[]`.
|
||||||
|
|
||||||
Lifecycle event names like `InstallEvent` or `SnapshotCleanupEvent` are
|
|
||||||
normalized to the corresponding `on_{EventFamily}__*` prefix by a simple
|
|
||||||
string transform. If no scripts exist for that prefix, discovery returns `[]`.
|
|
||||||
|
|
||||||
Directory structure:
|
Directory structure:
|
||||||
abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package)
|
abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package)
|
||||||
@@ -120,7 +116,6 @@ def normalize_hook_event_name(event_name: str) -> str | None:
|
|||||||
Normalize a hook event family or event class name to its on_* prefix.
|
Normalize a hook event family or event class name to its on_* prefix.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
InstallEvent -> Install
|
|
||||||
BinaryRequestEvent -> BinaryRequest
|
BinaryRequestEvent -> BinaryRequest
|
||||||
CrawlSetupEvent -> CrawlSetup
|
CrawlSetupEvent -> CrawlSetup
|
||||||
SnapshotEvent -> Snapshot
|
SnapshotEvent -> Snapshot
|
||||||
@@ -171,7 +166,7 @@ def discover_hooks(
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
event_name: Hook event family or event class name.
|
event_name: Hook event family or event class name.
|
||||||
Examples: 'Install', 'InstallEvent', 'BinaryRequestEvent', 'Snapshot'.
|
Examples: 'BinaryRequestEvent', 'Snapshot'.
|
||||||
Event names are normalized by stripping a trailing `Event`.
|
Event names are normalized by stripping a trailing `Event`.
|
||||||
If no matching `on_{EventFamily}__*` scripts exist, returns [].
|
If no matching `on_{EventFamily}__*` scripts exist, returns [].
|
||||||
filter_disabled: If True, skip hooks from disabled plugins (default: True)
|
filter_disabled: If True, skip hooks from disabled plugins (default: True)
|
||||||
@@ -1070,9 +1065,8 @@ def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any
|
|||||||
Process JSONL records emitted by hook stdout.
|
Process JSONL records emitted by hook stdout.
|
||||||
|
|
||||||
This handles hook-emitted record types such as Snapshot, Tag, BinaryRequest,
|
This handles hook-emitted record types such as Snapshot, Tag, BinaryRequest,
|
||||||
Binary, and Machine. It does not process bus lifecycle events like
|
and Binary. It does not process internal bus lifecycle events, since those
|
||||||
InstallEvent, CrawlEvent, CrawlCleanupEvent, or SnapshotCleanupEvent, since
|
are not emitted as JSONL records by hook subprocesses.
|
||||||
those are not emitted as JSONL records by hook subprocesses.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
records: List of JSONL record dicts from result['records']
|
records: List of JSONL record dicts from result['records']
|
||||||
@@ -1131,13 +1125,6 @@ def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any
|
|||||||
if obj:
|
if obj:
|
||||||
stats[record_type] = stats.get(record_type, 0) + 1
|
stats[record_type] = stats.get(record_type, 0) + 1
|
||||||
|
|
||||||
elif record_type == "Machine":
|
|
||||||
from archivebox.machine.models import Machine
|
|
||||||
|
|
||||||
obj = Machine.from_json(record.copy(), overrides)
|
|
||||||
if obj:
|
|
||||||
stats["Machine"] = stats.get("Machine", 0) + 1
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|||||||
@@ -566,33 +566,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
|||||||
return None
|
return None
|
||||||
return {provider.strip() for provider in providers.split(",") if provider.strip()}
|
return {provider.strip() for provider in providers.split(",") if provider.strip()}
|
||||||
|
|
||||||
def _get_custom_install_command(self) -> str | None:
|
|
||||||
"""Extract a custom install command from overrides when the custom provider is used."""
|
|
||||||
import shlex
|
|
||||||
|
|
||||||
if not isinstance(self.overrides, dict):
|
|
||||||
return None
|
|
||||||
|
|
||||||
for key in ("custom_cmd", "cmd", "command"):
|
|
||||||
value = self.overrides.get(key)
|
|
||||||
if isinstance(value, str) and value.strip():
|
|
||||||
return value.strip()
|
|
||||||
|
|
||||||
custom_overrides = self.overrides.get("custom")
|
|
||||||
if isinstance(custom_overrides, dict):
|
|
||||||
for key in ("custom_cmd", "cmd", "command"):
|
|
||||||
value = custom_overrides.get(key)
|
|
||||||
if isinstance(value, str) and value.strip():
|
|
||||||
return value.strip()
|
|
||||||
|
|
||||||
install_args = custom_overrides.get("install_args")
|
|
||||||
if isinstance(install_args, str) and install_args.strip():
|
|
||||||
return install_args.strip()
|
|
||||||
if isinstance(install_args, list) and install_args:
|
|
||||||
return " ".join(shlex.quote(str(arg)) for arg in install_args if str(arg).strip())
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
"""
|
"""
|
||||||
Execute binary installation by running on_BinaryRequest__* hooks.
|
Execute binary installation by running on_BinaryRequest__* hooks.
|
||||||
@@ -637,13 +610,8 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
|||||||
plugin_output_dir = output_dir / plugin_name
|
plugin_output_dir = output_dir / plugin_name
|
||||||
plugin_output_dir.mkdir(parents=True, exist_ok=True)
|
plugin_output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
custom_cmd = None
|
|
||||||
overrides_json = None
|
overrides_json = None
|
||||||
if plugin_name == "custom":
|
if self.overrides:
|
||||||
custom_cmd = self._get_custom_install_command()
|
|
||||||
if not custom_cmd:
|
|
||||||
continue
|
|
||||||
elif self.overrides:
|
|
||||||
overrides_json = json.dumps(self.overrides)
|
overrides_json = json.dumps(self.overrides)
|
||||||
|
|
||||||
# Run the hook
|
# Run the hook
|
||||||
@@ -656,7 +624,6 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
|||||||
machine_id=str(self.machine_id),
|
machine_id=str(self.machine_id),
|
||||||
name=self.name,
|
name=self.name,
|
||||||
binproviders=self.binproviders,
|
binproviders=self.binproviders,
|
||||||
custom_cmd=custom_cmd,
|
|
||||||
overrides=overrides_json,
|
overrides=overrides_json,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -9,12 +9,11 @@ from typing import Any
|
|||||||
from asgiref.sync import sync_to_async
|
from asgiref.sync import sync_to_async
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
|
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent, ProcessStartedEvent, SnapshotEvent
|
||||||
from abx_dl.output_files import guess_mimetype
|
from abx_dl.output_files import guess_mimetype
|
||||||
from abx_dl.services.base import BaseService
|
from abx_dl.services.base import BaseService
|
||||||
|
|
||||||
from .db import run_db_op
|
from .process_service import parse_event_datetime
|
||||||
from .process_service import ProcessService, parse_event_datetime
|
|
||||||
|
|
||||||
|
|
||||||
def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
|
def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
|
||||||
@@ -209,79 +208,41 @@ class ArchiveResultService(BaseService):
|
|||||||
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
|
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
|
||||||
EMITS = []
|
EMITS = []
|
||||||
|
|
||||||
def __init__(self, bus, *, process_service: ProcessService):
|
def __init__(self, bus):
|
||||||
self.process_service = process_service
|
|
||||||
super().__init__(bus)
|
super().__init__(bus)
|
||||||
|
self.bus.on(ArchiveResultEvent, self.on_ArchiveResultEvent__save_to_db)
|
||||||
|
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
|
||||||
|
|
||||||
async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None:
|
async def on_ArchiveResultEvent__save_to_db(self, event: ArchiveResultEvent) -> None:
|
||||||
snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
|
|
||||||
if snapshot_output_dir is None:
|
|
||||||
return
|
|
||||||
plugin_dir = Path(snapshot_output_dir) / event.plugin
|
|
||||||
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
|
||||||
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
|
|
||||||
|
|
||||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
|
||||||
if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
|
|
||||||
return
|
|
||||||
|
|
||||||
plugin_dir = Path(event.output_dir)
|
|
||||||
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
|
||||||
records = _iter_archiveresult_records(event.stdout)
|
|
||||||
if records:
|
|
||||||
for record in records:
|
|
||||||
await run_db_op(
|
|
||||||
self._project_from_process_completed,
|
|
||||||
event,
|
|
||||||
record,
|
|
||||||
output_files,
|
|
||||||
output_size,
|
|
||||||
output_mimetypes,
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
synthetic_record = {
|
|
||||||
"plugin": event.plugin_name,
|
|
||||||
"hook_name": event.hook_name,
|
|
||||||
"status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
|
|
||||||
"output_str": event.stderr if event.exit_code != 0 else "",
|
|
||||||
"error": event.stderr if event.exit_code != 0 else "",
|
|
||||||
}
|
|
||||||
await run_db_op(
|
|
||||||
self._project_from_process_completed,
|
|
||||||
event,
|
|
||||||
synthetic_record,
|
|
||||||
output_files,
|
|
||||||
output_size,
|
|
||||||
output_mimetypes,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
|
|
||||||
from archivebox.core.models import Snapshot
|
|
||||||
|
|
||||||
snapshot = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first()
|
|
||||||
return str(snapshot.output_dir) if snapshot is not None else None
|
|
||||||
|
|
||||||
def _project(
|
|
||||||
self,
|
|
||||||
event: ArchiveResultEvent,
|
|
||||||
output_files: dict[str, dict],
|
|
||||||
output_size: int,
|
|
||||||
output_mimetypes: str,
|
|
||||||
) -> None:
|
|
||||||
from archivebox.core.models import ArchiveResult, Snapshot
|
from archivebox.core.models import ArchiveResult, Snapshot
|
||||||
from archivebox.machine.models import Process
|
from archivebox.machine.models import Process
|
||||||
|
|
||||||
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first()
|
snapshot = await Snapshot.objects.filter(id=event.snapshot_id).select_related("crawl", "crawl__created_by").afirst()
|
||||||
if snapshot is None:
|
if snapshot is None:
|
||||||
return
|
return
|
||||||
|
plugin_dir = Path(snapshot.output_dir) / event.plugin
|
||||||
|
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
||||||
|
process_started = await self.bus.find(
|
||||||
|
ProcessStartedEvent,
|
||||||
|
past=True,
|
||||||
|
future=False,
|
||||||
|
where=lambda candidate: self.bus.event_is_child_of(event, candidate),
|
||||||
|
)
|
||||||
process = None
|
process = None
|
||||||
db_process_id = self.process_service.get_db_process_id(event.process_id)
|
if process_started is not None:
|
||||||
if db_process_id:
|
started_at = parse_event_datetime(process_started.start_ts)
|
||||||
process = Process.objects.filter(id=db_process_id).first()
|
if started_at is None:
|
||||||
|
raise ValueError("ProcessStartedEvent.start_ts is required")
|
||||||
|
process_query = Process.objects.filter(
|
||||||
|
pwd=process_started.output_dir,
|
||||||
|
cmd=[process_started.hook_path, *process_started.hook_args],
|
||||||
|
started_at=started_at,
|
||||||
|
)
|
||||||
|
if process_started.pid:
|
||||||
|
process_query = process_query.filter(pid=process_started.pid)
|
||||||
|
process = await process_query.order_by("-modified_at").afirst()
|
||||||
|
|
||||||
result, _created = ArchiveResult.objects.get_or_create(
|
result, _created = await ArchiveResult.objects.aget_or_create(
|
||||||
snapshot=snapshot,
|
snapshot=snapshot,
|
||||||
plugin=event.plugin,
|
plugin=event.plugin,
|
||||||
hook_name=event.hook_name,
|
hook_name=event.hook_name,
|
||||||
@@ -302,32 +263,54 @@ class ArchiveResultService(BaseService):
|
|||||||
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
|
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
|
||||||
if event.error:
|
if event.error:
|
||||||
result.notes = event.error
|
result.notes = event.error
|
||||||
result.save()
|
await result.asave()
|
||||||
|
|
||||||
next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url)
|
next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url)
|
||||||
if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
|
if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
|
||||||
snapshot.title = next_title
|
snapshot.title = next_title
|
||||||
snapshot.save(update_fields=["title", "modified_at"])
|
await snapshot.asave(update_fields=["title", "modified_at"])
|
||||||
|
|
||||||
def _project_from_process_completed(
|
async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
|
||||||
self,
|
if not event.hook_name.startswith("on_Snapshot"):
|
||||||
event: ProcessCompletedEvent,
|
return
|
||||||
record: dict,
|
snapshot_event = await self.bus.find(
|
||||||
output_files: dict[str, dict],
|
SnapshotEvent,
|
||||||
output_size: int,
|
past=True,
|
||||||
output_mimetypes: str,
|
future=False,
|
||||||
) -> None:
|
where=lambda candidate: self.bus.event_is_child_of(event, candidate),
|
||||||
archive_result_event = ArchiveResultEvent(
|
)
|
||||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
if snapshot_event is None:
|
||||||
plugin=record.get("plugin") or event.plugin_name,
|
return
|
||||||
hook_name=record.get("hook_name") or event.hook_name,
|
|
||||||
status=record.get("status") or "",
|
records = _iter_archiveresult_records(event.stdout)
|
||||||
process_id=event.process_id,
|
if records:
|
||||||
output_str=record.get("output_str") or "",
|
for record in records:
|
||||||
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
|
await self.bus.emit(
|
||||||
output_files=event.output_files,
|
ArchiveResultEvent(
|
||||||
start_ts=event.start_ts,
|
snapshot_id=record.get("snapshot_id") or snapshot_event.snapshot_id,
|
||||||
end_ts=event.end_ts,
|
plugin=record.get("plugin") or event.plugin_name,
|
||||||
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
|
hook_name=record.get("hook_name") or event.hook_name,
|
||||||
|
status=record.get("status") or "",
|
||||||
|
output_str=record.get("output_str") or "",
|
||||||
|
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
|
||||||
|
output_files=event.output_files,
|
||||||
|
start_ts=event.start_ts,
|
||||||
|
end_ts=event.end_ts,
|
||||||
|
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
await self.bus.emit(
|
||||||
|
ArchiveResultEvent(
|
||||||
|
snapshot_id=snapshot_event.snapshot_id,
|
||||||
|
plugin=event.plugin_name,
|
||||||
|
hook_name=event.hook_name,
|
||||||
|
status="failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
|
||||||
|
output_str=event.stderr if event.exit_code != 0 else "",
|
||||||
|
output_files=event.output_files,
|
||||||
|
start_ts=event.start_ts,
|
||||||
|
end_ts=event.end_ts,
|
||||||
|
error=event.stderr if event.exit_code != 0 else "",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
self._project(archive_result_event, output_files, output_size, output_mimetypes)
|
|
||||||
|
|||||||
@@ -1,20 +1,62 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
from asgiref.sync import sync_to_async
|
||||||
|
|
||||||
from abx_dl.events import BinaryRequestEvent, BinaryEvent
|
from abx_dl.events import BinaryRequestEvent, BinaryEvent
|
||||||
from abx_dl.services.base import BaseService
|
from abx_dl.services.base import BaseService
|
||||||
|
|
||||||
from .db import run_db_op
|
|
||||||
|
|
||||||
|
|
||||||
class BinaryService(BaseService):
|
class BinaryService(BaseService):
|
||||||
LISTENS_TO = [BinaryRequestEvent, BinaryEvent]
|
LISTENS_TO = [BinaryRequestEvent, BinaryEvent]
|
||||||
EMITS = []
|
EMITS = []
|
||||||
|
|
||||||
async def on_BinaryRequestEvent__Outer(self, event: BinaryRequestEvent) -> None:
|
def __init__(self, bus):
|
||||||
await run_db_op(self._project_binary, event)
|
super().__init__(bus)
|
||||||
cached = await run_db_op(self._load_cached_binary, event)
|
self.bus.on(BinaryRequestEvent, self.on_BinaryRequestEvent)
|
||||||
|
self.bus.on(BinaryEvent, self.on_BinaryEvent)
|
||||||
|
|
||||||
|
async def on_BinaryRequestEvent(self, event: BinaryRequestEvent) -> None:
|
||||||
|
from archivebox.machine.models import Binary, Machine
|
||||||
|
|
||||||
|
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||||
|
existing = await Binary.objects.filter(machine=machine, name=event.name).afirst()
|
||||||
|
if existing and existing.status == Binary.StatusChoices.INSTALLED:
|
||||||
|
changed = False
|
||||||
|
if event.binproviders and existing.binproviders != event.binproviders:
|
||||||
|
existing.binproviders = event.binproviders
|
||||||
|
changed = True
|
||||||
|
if event.overrides and existing.overrides != event.overrides:
|
||||||
|
existing.overrides = event.overrides
|
||||||
|
changed = True
|
||||||
|
if changed:
|
||||||
|
await existing.asave(update_fields=["binproviders", "overrides", "modified_at"])
|
||||||
|
elif existing is None:
|
||||||
|
await Binary.objects.acreate(
|
||||||
|
machine=machine,
|
||||||
|
name=event.name,
|
||||||
|
binproviders=event.binproviders,
|
||||||
|
overrides=event.overrides or {},
|
||||||
|
status=Binary.StatusChoices.QUEUED,
|
||||||
|
)
|
||||||
|
|
||||||
|
installed = (
|
||||||
|
await Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
|
||||||
|
.exclude(abspath="")
|
||||||
|
.exclude(abspath__isnull=True)
|
||||||
|
.order_by("-modified_at")
|
||||||
|
.afirst()
|
||||||
|
)
|
||||||
|
cached = None
|
||||||
|
if installed is not None:
|
||||||
|
cached = {
|
||||||
|
"abspath": installed.abspath,
|
||||||
|
"version": installed.version or "",
|
||||||
|
"sha256": installed.sha256 or "",
|
||||||
|
"binproviders": installed.binproviders or "",
|
||||||
|
"binprovider": installed.binprovider or "",
|
||||||
|
"machine_id": str(installed.machine_id),
|
||||||
|
"overrides": installed.overrides or {},
|
||||||
|
}
|
||||||
if cached is not None:
|
if cached is not None:
|
||||||
await self.bus.emit(
|
await self.bus.emit(
|
||||||
BinaryEvent(
|
BinaryEvent(
|
||||||
@@ -28,126 +70,34 @@ class BinaryService(BaseService):
|
|||||||
binprovider=cached["binprovider"],
|
binprovider=cached["binprovider"],
|
||||||
overrides=event.overrides or cached["overrides"],
|
overrides=event.overrides or cached["overrides"],
|
||||||
binary_id=event.binary_id,
|
binary_id=event.binary_id,
|
||||||
machine_id=event.machine_id or cached["machine_id"],
|
machine_id=cached["machine_id"],
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
|
async def on_BinaryEvent(self, event: BinaryEvent) -> None:
|
||||||
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
|
|
||||||
await run_db_op(self._project_installed_binary, event, resolved)
|
|
||||||
|
|
||||||
def _project_binary(self, event: BinaryRequestEvent) -> None:
|
|
||||||
from archivebox.machine.models import Binary, Machine
|
from archivebox.machine.models import Binary, Machine
|
||||||
|
|
||||||
machine = Machine.current()
|
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||||
existing = Binary.objects.filter(machine=machine, name=event.name).first()
|
binary, _ = await Binary.objects.aget_or_create(
|
||||||
if existing and existing.status == Binary.StatusChoices.INSTALLED:
|
|
||||||
changed = False
|
|
||||||
if event.binproviders and existing.binproviders != event.binproviders:
|
|
||||||
existing.binproviders = event.binproviders
|
|
||||||
changed = True
|
|
||||||
if event.overrides and existing.overrides != event.overrides:
|
|
||||||
existing.overrides = event.overrides
|
|
||||||
changed = True
|
|
||||||
if changed:
|
|
||||||
existing.save(update_fields=["binproviders", "overrides", "modified_at"])
|
|
||||||
return
|
|
||||||
|
|
||||||
Binary.from_json(
|
|
||||||
{
|
|
||||||
"name": event.name,
|
|
||||||
"binproviders": event.binproviders,
|
|
||||||
"overrides": event.overrides or {},
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
def _load_cached_binary(self, event: BinaryRequestEvent) -> dict[str, str] | None:
|
|
||||||
from archivebox.machine.models import Binary, Machine
|
|
||||||
|
|
||||||
machine = Machine.current()
|
|
||||||
installed = (
|
|
||||||
Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
|
|
||||||
.exclude(abspath="")
|
|
||||||
.exclude(abspath__isnull=True)
|
|
||||||
.order_by("-modified_at")
|
|
||||||
.first()
|
|
||||||
)
|
|
||||||
if installed is None:
|
|
||||||
return None
|
|
||||||
return {
|
|
||||||
"abspath": installed.abspath,
|
|
||||||
"version": installed.version or "",
|
|
||||||
"sha256": installed.sha256 or "",
|
|
||||||
"binproviders": installed.binproviders or "",
|
|
||||||
"binprovider": installed.binprovider or "",
|
|
||||||
"machine_id": str(installed.machine_id),
|
|
||||||
"overrides": installed.overrides or {},
|
|
||||||
}
|
|
||||||
|
|
||||||
def _resolve_installed_binary_metadata(self, event: BinaryEvent) -> dict[str, str]:
|
|
||||||
resolved = {
|
|
||||||
"abspath": event.abspath or "",
|
|
||||||
"version": event.version or "",
|
|
||||||
"sha256": event.sha256 or "",
|
|
||||||
"binproviders": event.binproviders or "",
|
|
||||||
"binprovider": event.binprovider or "",
|
|
||||||
}
|
|
||||||
if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
|
|
||||||
return resolved
|
|
||||||
|
|
||||||
if resolved["abspath"] and not resolved["version"]:
|
|
||||||
try:
|
|
||||||
from abx_pkg.semver import bin_version
|
|
||||||
|
|
||||||
detected_version = bin_version(resolved["abspath"])
|
|
||||||
except Exception:
|
|
||||||
detected_version = None
|
|
||||||
if detected_version:
|
|
||||||
resolved["version"] = str(detected_version)
|
|
||||||
if resolved["version"] and resolved["binprovider"]:
|
|
||||||
return resolved
|
|
||||||
|
|
||||||
try:
|
|
||||||
from abx_dl.dependencies import load_binary
|
|
||||||
|
|
||||||
allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
|
|
||||||
spec = {
|
|
||||||
"name": event.name,
|
|
||||||
"binproviders": allowed_providers,
|
|
||||||
"overrides": event.overrides or {},
|
|
||||||
}
|
|
||||||
binary = load_binary(spec)
|
|
||||||
resolved["abspath"] = str(binary.abspath or resolved["abspath"] or "")
|
|
||||||
resolved["version"] = str(binary.version or resolved["version"] or "")
|
|
||||||
resolved["sha256"] = str(binary.sha256 or resolved["sha256"] or "")
|
|
||||||
if binary.loaded_binprovider is not None and binary.loaded_binprovider.name:
|
|
||||||
resolved["binprovider"] = str(binary.loaded_binprovider.name)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return resolved
|
|
||||||
|
|
||||||
def _project_installed_binary(self, event: BinaryEvent, resolved: dict[str, str]) -> None:
|
|
||||||
from archivebox.machine.models import Binary, Machine
|
|
||||||
|
|
||||||
machine = Machine.current()
|
|
||||||
binary, _ = Binary.objects.get_or_create(
|
|
||||||
machine=machine,
|
machine=machine,
|
||||||
name=event.name,
|
name=event.name,
|
||||||
defaults={
|
defaults={
|
||||||
"status": Binary.StatusChoices.QUEUED,
|
"status": Binary.StatusChoices.QUEUED,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
binary.abspath = resolved["abspath"] or binary.abspath
|
binary.abspath = event.abspath
|
||||||
binary.version = resolved["version"] or binary.version
|
if event.version:
|
||||||
binary.sha256 = resolved["sha256"] or binary.sha256
|
binary.version = event.version
|
||||||
if resolved["binproviders"]:
|
if event.sha256:
|
||||||
binary.binproviders = resolved["binproviders"]
|
binary.sha256 = event.sha256
|
||||||
binary.binprovider = resolved["binprovider"] or binary.binprovider
|
if event.binproviders:
|
||||||
|
binary.binproviders = event.binproviders
|
||||||
|
if event.binprovider:
|
||||||
|
binary.binprovider = event.binprovider
|
||||||
if event.overrides and binary.overrides != event.overrides:
|
if event.overrides and binary.overrides != event.overrides:
|
||||||
binary.overrides = event.overrides
|
binary.overrides = event.overrides
|
||||||
binary.status = Binary.StatusChoices.INSTALLED
|
binary.status = Binary.StatusChoices.INSTALLED
|
||||||
binary.retry_at = None
|
binary.retry_at = None
|
||||||
binary.save(
|
await binary.asave(
|
||||||
update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"],
|
update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"],
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -3,8 +3,6 @@ from __future__ import annotations
|
|||||||
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
|
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
|
||||||
from abx_dl.services.base import BaseService
|
from abx_dl.services.base import BaseService
|
||||||
|
|
||||||
from .db import run_db_op
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlService(BaseService):
|
class CrawlService(BaseService):
|
||||||
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
|
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
|
||||||
@@ -13,32 +11,42 @@ class CrawlService(BaseService):
|
|||||||
def __init__(self, bus, *, crawl_id: str):
|
def __init__(self, bus, *, crawl_id: str):
|
||||||
self.crawl_id = crawl_id
|
self.crawl_id = crawl_id
|
||||||
super().__init__(bus)
|
super().__init__(bus)
|
||||||
|
self.bus.on(CrawlSetupEvent, self.on_CrawlSetupEvent__save_to_db)
|
||||||
|
self.bus.on(CrawlStartEvent, self.on_CrawlStartEvent__save_to_db)
|
||||||
|
self.bus.on(CrawlCleanupEvent, self.on_CrawlCleanupEvent__save_to_db)
|
||||||
|
self.bus.on(CrawlCompletedEvent, self.on_CrawlCompletedEvent__save_to_db)
|
||||||
|
|
||||||
async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None:
|
async def on_CrawlSetupEvent__save_to_db(self, event: CrawlSetupEvent) -> None:
|
||||||
await run_db_op(self._mark_started)
|
|
||||||
|
|
||||||
async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
|
|
||||||
await run_db_op(self._mark_started)
|
|
||||||
|
|
||||||
async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
|
|
||||||
await run_db_op(self._mark_started)
|
|
||||||
|
|
||||||
async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
|
|
||||||
await run_db_op(self._mark_completed)
|
|
||||||
|
|
||||||
def _mark_started(self) -> None:
|
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
|
|
||||||
crawl = Crawl.objects.get(id=self.crawl_id)
|
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||||
crawl.status = Crawl.StatusChoices.STARTED
|
crawl.status = Crawl.StatusChoices.STARTED
|
||||||
crawl.retry_at = None
|
crawl.retry_at = None
|
||||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||||
|
|
||||||
def _mark_completed(self) -> None:
|
async def on_CrawlStartEvent__save_to_db(self, event: CrawlStartEvent) -> None:
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
|
|
||||||
crawl = Crawl.objects.get(id=self.crawl_id)
|
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||||
|
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||||
|
crawl.status = Crawl.StatusChoices.STARTED
|
||||||
|
crawl.retry_at = None
|
||||||
|
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||||
|
|
||||||
|
async def on_CrawlCleanupEvent__save_to_db(self, event: CrawlCleanupEvent) -> None:
|
||||||
|
from archivebox.crawls.models import Crawl
|
||||||
|
|
||||||
|
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||||
|
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||||
|
crawl.status = Crawl.StatusChoices.STARTED
|
||||||
|
crawl.retry_at = None
|
||||||
|
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||||
|
|
||||||
|
async def on_CrawlCompletedEvent__save_to_db(self, event: CrawlCompletedEvent) -> None:
|
||||||
|
from archivebox.crawls.models import Crawl
|
||||||
|
|
||||||
|
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||||
crawl.status = Crawl.StatusChoices.SEALED
|
crawl.status = Crawl.StatusChoices.SEALED
|
||||||
crawl.retry_at = None
|
crawl.retry_at = None
|
||||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||||
|
|||||||
@@ -1,16 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from asgiref.sync import sync_to_async
|
|
||||||
from django.db import close_old_connections
|
|
||||||
|
|
||||||
|
|
||||||
def _run_db_op(func, *args, **kwargs):
|
|
||||||
close_old_connections()
|
|
||||||
try:
|
|
||||||
return func(*args, **kwargs)
|
|
||||||
finally:
|
|
||||||
close_old_connections()
|
|
||||||
|
|
||||||
|
|
||||||
async def run_db_op(func, *args, **kwargs):
|
|
||||||
return await sync_to_async(_run_db_op, thread_sensitive=True)(func, *args, **kwargs)
|
|
||||||
@@ -1,22 +1,23 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from asgiref.sync import sync_to_async
|
||||||
|
|
||||||
from abx_dl.events import MachineEvent
|
from abx_dl.events import MachineEvent
|
||||||
from abx_dl.services.base import BaseService
|
from abx_dl.services.base import BaseService
|
||||||
|
|
||||||
from .db import run_db_op
|
|
||||||
|
|
||||||
|
|
||||||
class MachineService(BaseService):
|
class MachineService(BaseService):
|
||||||
LISTENS_TO = [MachineEvent]
|
LISTENS_TO = [MachineEvent]
|
||||||
EMITS = []
|
EMITS = []
|
||||||
|
|
||||||
async def on_MachineEvent__Outer(self, event: MachineEvent) -> None:
|
def __init__(self, bus):
|
||||||
await run_db_op(self._project, event)
|
super().__init__(bus)
|
||||||
|
self.bus.on(MachineEvent, self.on_MachineEvent__save_to_db)
|
||||||
|
|
||||||
def _project(self, event: MachineEvent) -> None:
|
async def on_MachineEvent__save_to_db(self, event: MachineEvent) -> None:
|
||||||
from archivebox.machine.models import Machine, _sanitize_machine_config
|
from archivebox.machine.models import Machine, _sanitize_machine_config
|
||||||
|
|
||||||
machine = Machine.current()
|
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||||
config = dict(machine.config or {})
|
config = dict(machine.config or {})
|
||||||
|
|
||||||
if event.config is not None:
|
if event.config is not None:
|
||||||
@@ -29,4 +30,4 @@ class MachineService(BaseService):
|
|||||||
return
|
return
|
||||||
|
|
||||||
machine.config = _sanitize_machine_config(config)
|
machine.config = _sanitize_machine_config(config)
|
||||||
machine.save(update_fields=["config", "modified_at"])
|
await machine.asave(update_fields=["config", "modified_at"])
|
||||||
|
|||||||
@@ -1,29 +1,15 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
from datetime import datetime
|
||||||
from datetime import datetime, timezone as datetime_timezone
|
from typing import ClassVar
|
||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
import shlex
|
|
||||||
import socket
|
|
||||||
import time
|
|
||||||
from typing import TYPE_CHECKING, Any, ClassVar
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
|
from asgiref.sync import sync_to_async
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from abxbus import BaseEvent
|
from abxbus import BaseEvent
|
||||||
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
|
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||||
from abx_dl.services.base import BaseService
|
from abx_dl.services.base import BaseService
|
||||||
|
|
||||||
from .db import run_db_op
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from archivebox.machine.models import Process
|
|
||||||
|
|
||||||
|
|
||||||
WORKER_READY_TIMEOUT = 10.0
|
|
||||||
|
|
||||||
|
|
||||||
def parse_event_datetime(value: str | None):
|
def parse_event_datetime(value: str | None):
|
||||||
if not value:
|
if not value:
|
||||||
@@ -37,308 +23,133 @@ def parse_event_datetime(value: str | None):
|
|||||||
return dt
|
return dt
|
||||||
|
|
||||||
|
|
||||||
def _is_port_listening(host: str, port: int) -> bool:
|
|
||||||
if not host or not port:
|
|
||||||
return False
|
|
||||||
try:
|
|
||||||
with socket.create_connection((host, port), timeout=0.5):
|
|
||||||
return True
|
|
||||||
except OSError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def _worker_socket_from_url(url: str) -> tuple[str, int] | None:
|
|
||||||
if not url:
|
|
||||||
return None
|
|
||||||
parsed = urlparse(url)
|
|
||||||
if parsed.scheme != "tcp" or not parsed.hostname or not parsed.port:
|
|
||||||
return None
|
|
||||||
return parsed.hostname, parsed.port
|
|
||||||
|
|
||||||
|
|
||||||
def _supervisor_env(env: dict[str, str]) -> str:
|
|
||||||
pairs = []
|
|
||||||
for key, value in env.items():
|
|
||||||
escaped = value.replace('"', '\\"')
|
|
||||||
pairs.append(f'{key}="{escaped}"')
|
|
||||||
return ",".join(pairs)
|
|
||||||
|
|
||||||
|
|
||||||
def _iso_from_epoch(value: object) -> str:
|
|
||||||
if not isinstance(value, (int, float)) or value <= 0:
|
|
||||||
return ""
|
|
||||||
return datetime.fromtimestamp(value, tz=datetime_timezone.utc).isoformat()
|
|
||||||
|
|
||||||
|
|
||||||
def _int_from_object(value: object) -> int:
|
|
||||||
if isinstance(value, bool):
|
|
||||||
return int(value)
|
|
||||||
if isinstance(value, int):
|
|
||||||
return value
|
|
||||||
if isinstance(value, float):
|
|
||||||
return int(value)
|
|
||||||
if isinstance(value, str):
|
|
||||||
try:
|
|
||||||
return int(value)
|
|
||||||
except ValueError:
|
|
||||||
return 0
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
|
|
||||||
from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker
|
|
||||||
|
|
||||||
output_dir = Path(process_event.output_dir)
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
worker_name = process_event.hook_name
|
|
||||||
supervisor = get_or_create_supervisord_process(daemonize=True)
|
|
||||||
worker_socket = _worker_socket_from_url(getattr(process_event, "url", ""))
|
|
||||||
|
|
||||||
existing = get_worker(supervisor, worker_name)
|
|
||||||
if (
|
|
||||||
isinstance(existing, dict)
|
|
||||||
and existing.get("statename") == "RUNNING"
|
|
||||||
and (worker_socket is None or _is_port_listening(*worker_socket))
|
|
||||||
):
|
|
||||||
return existing
|
|
||||||
|
|
||||||
daemon = {
|
|
||||||
"name": worker_name,
|
|
||||||
"command": shlex.join([process_event.hook_path, *process_event.hook_args]),
|
|
||||||
"directory": str(output_dir),
|
|
||||||
"autostart": "false",
|
|
||||||
"autorestart": "true",
|
|
||||||
"stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
|
|
||||||
"redirect_stderr": "true",
|
|
||||||
}
|
|
||||||
if process_event.env:
|
|
||||||
daemon["environment"] = _supervisor_env(process_event.env)
|
|
||||||
|
|
||||||
proc = start_worker(supervisor, daemon)
|
|
||||||
deadline = time.monotonic() + WORKER_READY_TIMEOUT
|
|
||||||
while time.monotonic() < deadline:
|
|
||||||
current = get_worker(supervisor, worker_name)
|
|
||||||
if isinstance(current, dict) and current.get("statename") == "RUNNING":
|
|
||||||
if worker_socket is None or _is_port_listening(*worker_socket):
|
|
||||||
return current
|
|
||||||
time.sleep(0.1)
|
|
||||||
return proc if isinstance(proc, dict) else {}
|
|
||||||
|
|
||||||
|
|
||||||
class ProcessService(BaseService):
|
class ProcessService(BaseService):
|
||||||
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStartedEvent, ProcessCompletedEvent]
|
||||||
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
EMITS: ClassVar[list[type[BaseEvent]]] = []
|
||||||
|
|
||||||
def __init__(self, bus):
|
def __init__(self, bus):
|
||||||
self.process_ids: dict[str, str] = {}
|
|
||||||
super().__init__(bus)
|
super().__init__(bus)
|
||||||
|
self.bus.on(ProcessStartedEvent, self.on_ProcessStartedEvent__save_to_db)
|
||||||
|
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
|
||||||
|
|
||||||
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
|
async def on_ProcessStartedEvent__save_to_db(self, event: ProcessStartedEvent) -> None:
|
||||||
try:
|
|
||||||
record = json.loads(event.line)
|
|
||||||
except (json.JSONDecodeError, ValueError):
|
|
||||||
return
|
|
||||||
if not isinstance(record, dict) or record.get("type") != "ProcessEvent":
|
|
||||||
return
|
|
||||||
|
|
||||||
passthrough_fields: dict[str, Any] = {
|
|
||||||
key: value
|
|
||||||
for key, value in record.items()
|
|
||||||
if key
|
|
||||||
not in {
|
|
||||||
"type",
|
|
||||||
"plugin_name",
|
|
||||||
"hook_name",
|
|
||||||
"hook_path",
|
|
||||||
"hook_args",
|
|
||||||
"is_background",
|
|
||||||
"output_dir",
|
|
||||||
"env",
|
|
||||||
"snapshot_id",
|
|
||||||
"process_id",
|
|
||||||
"url",
|
|
||||||
"timeout",
|
|
||||||
"daemon",
|
|
||||||
"process_type",
|
|
||||||
"worker_type",
|
|
||||||
"event_timeout",
|
|
||||||
"event_handler_timeout",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
process_event = ProcessEvent(
|
|
||||||
plugin_name=record.get("plugin_name") or event.plugin_name,
|
|
||||||
hook_name=record.get("hook_name") or "process",
|
|
||||||
hook_path=record["hook_path"],
|
|
||||||
hook_args=[str(arg) for arg in record.get("hook_args", [])],
|
|
||||||
is_background=bool(record.get("is_background", True)),
|
|
||||||
output_dir=record.get("output_dir") or event.output_dir,
|
|
||||||
env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
|
|
||||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
|
||||||
timeout=int(record.get("timeout") or 60),
|
|
||||||
daemon=bool(record.get("daemon", False)),
|
|
||||||
url=str(record.get("url") or ""),
|
|
||||||
process_type=str(record.get("process_type") or ""),
|
|
||||||
worker_type=str(record.get("worker_type") or ""),
|
|
||||||
event_timeout=float(record.get("event_timeout") or 360.0),
|
|
||||||
event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
|
|
||||||
**passthrough_fields,
|
|
||||||
)
|
|
||||||
if not process_event.daemon:
|
|
||||||
await self.bus.emit(process_event)
|
|
||||||
return
|
|
||||||
|
|
||||||
proc = await asyncio.to_thread(_ensure_worker, process_event)
|
|
||||||
process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
|
|
||||||
start_ts = _iso_from_epoch(proc.get("start"))
|
|
||||||
pid = _int_from_object(proc.get("pid"))
|
|
||||||
statename = str(proc.get("statename") or "")
|
|
||||||
exitstatus = _int_from_object(proc.get("exitstatus"))
|
|
||||||
process_type = process_event.process_type or "worker"
|
|
||||||
worker_type = process_event.worker_type or process_event.plugin_name
|
|
||||||
|
|
||||||
if statename == "RUNNING" and pid:
|
|
||||||
await self.bus.emit(
|
|
||||||
ProcessStartedEvent(
|
|
||||||
plugin_name=process_event.plugin_name,
|
|
||||||
hook_name=process_event.hook_name,
|
|
||||||
hook_path=process_event.hook_path,
|
|
||||||
hook_args=process_event.hook_args,
|
|
||||||
output_dir=process_event.output_dir,
|
|
||||||
env=process_event.env,
|
|
||||||
timeout=process_event.timeout,
|
|
||||||
pid=pid,
|
|
||||||
process_id=process_id,
|
|
||||||
snapshot_id=process_event.snapshot_id,
|
|
||||||
is_background=True,
|
|
||||||
url=process_event.url,
|
|
||||||
process_type=process_type,
|
|
||||||
worker_type=worker_type,
|
|
||||||
start_ts=start_ts,
|
|
||||||
**passthrough_fields,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
stderr = (
|
|
||||||
f"Worker {process_event.hook_name} failed to start"
|
|
||||||
if not statename
|
|
||||||
else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
|
|
||||||
)
|
|
||||||
await self.bus.emit(
|
|
||||||
ProcessCompletedEvent(
|
|
||||||
plugin_name=process_event.plugin_name,
|
|
||||||
hook_name=process_event.hook_name,
|
|
||||||
hook_path=process_event.hook_path,
|
|
||||||
hook_args=process_event.hook_args,
|
|
||||||
env=process_event.env,
|
|
||||||
stdout="",
|
|
||||||
stderr=stderr,
|
|
||||||
exit_code=exitstatus or 1,
|
|
||||||
output_dir=process_event.output_dir,
|
|
||||||
is_background=True,
|
|
||||||
process_id=process_id,
|
|
||||||
snapshot_id=process_event.snapshot_id,
|
|
||||||
pid=pid,
|
|
||||||
url=process_event.url,
|
|
||||||
process_type=process_type,
|
|
||||||
worker_type=worker_type,
|
|
||||||
start_ts=start_ts,
|
|
||||||
end_ts=datetime.now(tz=datetime_timezone.utc).isoformat(),
|
|
||||||
**passthrough_fields,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
raise RuntimeError(stderr)
|
|
||||||
|
|
||||||
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
|
|
||||||
await run_db_op(self._project_started, event)
|
|
||||||
|
|
||||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
|
||||||
await run_db_op(self._project_completed, event)
|
|
||||||
|
|
||||||
def get_db_process_id(self, process_id: str) -> str | None:
|
|
||||||
return self.process_ids.get(process_id)
|
|
||||||
|
|
||||||
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> Process:
|
|
||||||
from archivebox.machine.models import NetworkInterface, Process
|
from archivebox.machine.models import NetworkInterface, Process
|
||||||
|
|
||||||
db_process_id = self.process_ids.get(event.process_id)
|
iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
|
||||||
iface = NetworkInterface.current(refresh=True)
|
process_type = event.process_type or (
|
||||||
if db_process_id:
|
|
||||||
process = Process.objects.filter(id=db_process_id).first()
|
|
||||||
if process is not None:
|
|
||||||
if getattr(process, "iface_id", None) != iface.id or process.machine_id != iface.machine_id:
|
|
||||||
process.iface = iface
|
|
||||||
process.machine = iface.machine
|
|
||||||
process.save(update_fields=["iface", "machine", "modified_at"])
|
|
||||||
return process
|
|
||||||
|
|
||||||
process_type = getattr(event, "process_type", "") or (
|
|
||||||
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
|
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
|
||||||
)
|
)
|
||||||
worker_type = getattr(event, "worker_type", "") or ""
|
worker_type = event.worker_type or ""
|
||||||
if process_type == Process.TypeChoices.WORKER and worker_type:
|
started_at = parse_event_datetime(event.start_ts)
|
||||||
existing = (
|
if started_at is None:
|
||||||
Process.objects.filter(
|
raise ValueError("ProcessStartedEvent.start_ts is required")
|
||||||
process_type=Process.TypeChoices.WORKER,
|
process_query = Process.objects.filter(
|
||||||
worker_type=worker_type,
|
|
||||||
pwd=event.output_dir,
|
|
||||||
)
|
|
||||||
.order_by("-modified_at")
|
|
||||||
.first()
|
|
||||||
)
|
|
||||||
if existing is not None:
|
|
||||||
self.process_ids[event.process_id] = str(existing.id)
|
|
||||||
return existing
|
|
||||||
process = Process.objects.create(
|
|
||||||
machine=iface.machine,
|
|
||||||
iface=iface,
|
|
||||||
process_type=process_type,
|
process_type=process_type,
|
||||||
worker_type=worker_type,
|
worker_type=worker_type,
|
||||||
pwd=event.output_dir,
|
pwd=event.output_dir,
|
||||||
cmd=[event.hook_path, *event.hook_args],
|
cmd=[event.hook_path, *event.hook_args],
|
||||||
env=event.env,
|
started_at=started_at,
|
||||||
timeout=getattr(event, "timeout", 60),
|
|
||||||
pid=event.pid or None,
|
|
||||||
url=getattr(event, "url", "") or None,
|
|
||||||
started_at=parse_event_datetime(getattr(event, "start_ts", "")),
|
|
||||||
status=Process.StatusChoices.RUNNING,
|
|
||||||
retry_at=None,
|
|
||||||
)
|
)
|
||||||
self.process_ids[event.process_id] = str(process.id)
|
if event.pid:
|
||||||
return process
|
process_query = process_query.filter(pid=event.pid)
|
||||||
|
process = await process_query.order_by("-modified_at").afirst()
|
||||||
|
if process is None:
|
||||||
|
process = await Process.objects.acreate(
|
||||||
|
machine=iface.machine,
|
||||||
|
iface=iface,
|
||||||
|
process_type=process_type,
|
||||||
|
worker_type=worker_type,
|
||||||
|
pwd=event.output_dir,
|
||||||
|
cmd=[event.hook_path, *event.hook_args],
|
||||||
|
env=event.env,
|
||||||
|
timeout=event.timeout,
|
||||||
|
pid=event.pid or None,
|
||||||
|
url=event.url or None,
|
||||||
|
started_at=started_at,
|
||||||
|
status=Process.StatusChoices.RUNNING,
|
||||||
|
retry_at=None,
|
||||||
|
)
|
||||||
|
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||||
|
process.iface = iface
|
||||||
|
process.machine = iface.machine
|
||||||
|
await process.asave(update_fields=["iface", "machine", "modified_at"])
|
||||||
|
|
||||||
def _project_started(self, event: ProcessStartedEvent) -> None:
|
|
||||||
process = self._get_or_create_process(event)
|
|
||||||
process.pwd = event.output_dir
|
process.pwd = event.output_dir
|
||||||
process.cmd = [event.hook_path, *event.hook_args]
|
process.cmd = [event.hook_path, *event.hook_args]
|
||||||
process.env = event.env
|
process.env = event.env
|
||||||
process.timeout = event.timeout
|
process.timeout = event.timeout
|
||||||
process.pid = event.pid or None
|
process.pid = event.pid or None
|
||||||
process.url = getattr(event, "url", "") or process.url
|
process.url = event.url or process.url
|
||||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
process.process_type = process_type or process.process_type
|
||||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
process.worker_type = worker_type or process.worker_type
|
||||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
|
process.started_at = started_at
|
||||||
process.status = process.StatusChoices.RUNNING
|
process.status = process.StatusChoices.RUNNING
|
||||||
process.retry_at = None
|
process.retry_at = None
|
||||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
|
||||||
process.save()
|
plugin_name=event.plugin_name,
|
||||||
|
hook_path=event.hook_path,
|
||||||
|
)
|
||||||
|
await process.asave()
|
||||||
|
|
||||||
|
async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
|
||||||
|
from archivebox.machine.models import NetworkInterface, Process
|
||||||
|
|
||||||
|
iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
|
||||||
|
process_type = event.process_type or (
|
||||||
|
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
|
||||||
|
)
|
||||||
|
worker_type = event.worker_type or ""
|
||||||
|
started_at = parse_event_datetime(event.start_ts)
|
||||||
|
if started_at is None:
|
||||||
|
raise ValueError("ProcessCompletedEvent.start_ts is required")
|
||||||
|
process_query = Process.objects.filter(
|
||||||
|
process_type=process_type,
|
||||||
|
worker_type=worker_type,
|
||||||
|
pwd=event.output_dir,
|
||||||
|
cmd=[event.hook_path, *event.hook_args],
|
||||||
|
started_at=started_at,
|
||||||
|
)
|
||||||
|
if event.pid:
|
||||||
|
process_query = process_query.filter(pid=event.pid)
|
||||||
|
process = await process_query.order_by("-modified_at").afirst()
|
||||||
|
if process is None:
|
||||||
|
process = await Process.objects.acreate(
|
||||||
|
machine=iface.machine,
|
||||||
|
iface=iface,
|
||||||
|
process_type=process_type,
|
||||||
|
worker_type=worker_type,
|
||||||
|
pwd=event.output_dir,
|
||||||
|
cmd=[event.hook_path, *event.hook_args],
|
||||||
|
env=event.env,
|
||||||
|
timeout=event.timeout,
|
||||||
|
pid=event.pid or None,
|
||||||
|
url=event.url or None,
|
||||||
|
started_at=started_at,
|
||||||
|
status=Process.StatusChoices.RUNNING,
|
||||||
|
retry_at=None,
|
||||||
|
)
|
||||||
|
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||||
|
process.iface = iface
|
||||||
|
process.machine = iface.machine
|
||||||
|
await process.asave(update_fields=["iface", "machine", "modified_at"])
|
||||||
|
|
||||||
def _project_completed(self, event: ProcessCompletedEvent) -> None:
|
|
||||||
process = self._get_or_create_process(event)
|
|
||||||
process.pwd = event.output_dir
|
process.pwd = event.output_dir
|
||||||
if not process.cmd:
|
if not process.cmd:
|
||||||
process.cmd = [event.hook_path, *event.hook_args]
|
process.cmd = [event.hook_path, *event.hook_args]
|
||||||
process.env = event.env
|
process.env = event.env
|
||||||
process.pid = event.pid or process.pid
|
process.pid = event.pid or process.pid
|
||||||
process.url = getattr(event, "url", "") or process.url
|
process.url = event.url or process.url
|
||||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
process.process_type = process_type or process.process_type
|
||||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
process.worker_type = worker_type or process.worker_type
|
||||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
|
process.started_at = started_at
|
||||||
process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
|
process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
|
||||||
process.stdout = event.stdout
|
process.stdout = event.stdout
|
||||||
process.stderr = event.stderr
|
process.stderr = event.stderr
|
||||||
process.exit_code = event.exit_code
|
process.exit_code = event.exit_code
|
||||||
process.status = process.StatusChoices.EXITED
|
process.status = process.StatusChoices.EXITED
|
||||||
process.retry_at = None
|
process.retry_at = None
|
||||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
|
||||||
process.save()
|
plugin_name=event.plugin_name,
|
||||||
|
hook_path=event.hook_path,
|
||||||
|
)
|
||||||
|
await process.asave()
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
@@ -13,12 +12,13 @@ from pathlib import Path
|
|||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from asgiref.sync import sync_to_async
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
|
|
||||||
from abx_dl.events import BinaryRequestEvent
|
from abx_dl.events import BinaryRequestEvent
|
||||||
from abx_dl.limits import CrawlLimitState
|
from abx_dl.limits import CrawlLimitState
|
||||||
from abx_dl.models import Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
|
from abx_dl.models import Plugin, discover_plugins, filter_plugins
|
||||||
from abx_dl.orchestrator import (
|
from abx_dl.orchestrator import (
|
||||||
create_bus,
|
create_bus,
|
||||||
download,
|
download,
|
||||||
@@ -40,150 +40,9 @@ def _bus_name(prefix: str, identifier: str) -> str:
|
|||||||
return f"{prefix}_{normalized}"
|
return f"{prefix}_{normalized}"
|
||||||
|
|
||||||
|
|
||||||
def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
|
|
||||||
raw = str(config.get("PLUGINS") or "").strip()
|
|
||||||
if not raw:
|
|
||||||
return None
|
|
||||||
return [name.strip() for name in raw.split(",") if name.strip()]
|
|
||||||
|
|
||||||
|
|
||||||
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
|
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
|
||||||
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
|
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
|
||||||
return sum(
|
return sum(1 for plugin in selected.values() for hook in plugin.hooks if "CrawlSetup" in hook.name or "Snapshot" in hook.name)
|
||||||
1
|
|
||||||
for plugin in selected.values()
|
|
||||||
for hook in plugin.hooks
|
|
||||||
if "Install" in hook.name or "CrawlSetup" in hook.name or "Snapshot" in hook.name
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
_TEMPLATE_NAME_RE = re.compile(r"^\{([A-Z0-9_]+)\}$")
|
|
||||||
|
|
||||||
|
|
||||||
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str, config: dict[str, Any]) -> list[str]:
|
|
||||||
keys: list[str] = []
|
|
||||||
|
|
||||||
for plugin in plugins.values():
|
|
||||||
for spec in plugin.binaries:
|
|
||||||
template_name = str(spec.get("name") or "").strip()
|
|
||||||
match = _TEMPLATE_NAME_RE.fullmatch(template_name)
|
|
||||||
if match is None:
|
|
||||||
continue
|
|
||||||
key = match.group(1)
|
|
||||||
configured_value = config.get(key)
|
|
||||||
if configured_value is not None and str(configured_value).strip() == binary_name:
|
|
||||||
keys.append(key)
|
|
||||||
for key, prop in plugin.config_schema.items():
|
|
||||||
if key.endswith("_BINARY") and prop.get("default") == binary_name:
|
|
||||||
keys.append(key)
|
|
||||||
|
|
||||||
return list(dict.fromkeys(keys))
|
|
||||||
|
|
||||||
|
|
||||||
def _installed_binary_config_overrides(plugins: dict[str, Plugin], config: dict[str, Any] | None = None) -> dict[str, str]:
|
|
||||||
from archivebox.machine.models import Binary, Machine
|
|
||||||
|
|
||||||
machine = Machine.current()
|
|
||||||
active_config = dict(config or {})
|
|
||||||
overrides: dict[str, str] = {}
|
|
||||||
shared_lib_dir: Path | None = None
|
|
||||||
pip_home: Path | None = None
|
|
||||||
pip_bin_dir: Path | None = None
|
|
||||||
npm_home: Path | None = None
|
|
||||||
node_modules_dir: Path | None = None
|
|
||||||
npm_bin_dir: Path | None = None
|
|
||||||
binaries = (
|
|
||||||
Binary.objects.filter(machine=machine, status=Binary.StatusChoices.INSTALLED).exclude(abspath="").exclude(abspath__isnull=True)
|
|
||||||
)
|
|
||||||
|
|
||||||
for binary in binaries:
|
|
||||||
try:
|
|
||||||
resolved_path = Path(binary.abspath).expanduser()
|
|
||||||
except (TypeError, ValueError):
|
|
||||||
continue
|
|
||||||
if not resolved_path.is_file() or not os.access(resolved_path, os.X_OK):
|
|
||||||
continue
|
|
||||||
for key in _binary_config_keys_for_plugins(plugins, binary.name, active_config):
|
|
||||||
overrides[key] = binary.abspath
|
|
||||||
|
|
||||||
if resolved_path.parent.name == ".bin" and resolved_path.parent.parent.name == "node_modules":
|
|
||||||
npm_bin_dir = npm_bin_dir or resolved_path.parent
|
|
||||||
node_modules_dir = node_modules_dir or resolved_path.parent.parent
|
|
||||||
npm_home = npm_home or resolved_path.parent.parent.parent
|
|
||||||
shared_lib_dir = shared_lib_dir or resolved_path.parent.parent.parent.parent
|
|
||||||
elif (
|
|
||||||
resolved_path.parent.name == "bin"
|
|
||||||
and resolved_path.parent.parent.name == "venv"
|
|
||||||
and resolved_path.parent.parent.parent.name == "pip"
|
|
||||||
):
|
|
||||||
pip_bin_dir = pip_bin_dir or resolved_path.parent
|
|
||||||
pip_home = pip_home or resolved_path.parent.parent.parent
|
|
||||||
shared_lib_dir = shared_lib_dir or resolved_path.parent.parent.parent.parent
|
|
||||||
|
|
||||||
if shared_lib_dir is not None:
|
|
||||||
overrides["LIB_DIR"] = str(shared_lib_dir)
|
|
||||||
overrides["LIB_BIN_DIR"] = str(shared_lib_dir / "bin")
|
|
||||||
if pip_home is not None:
|
|
||||||
overrides["PIP_HOME"] = str(pip_home)
|
|
||||||
if pip_bin_dir is not None:
|
|
||||||
overrides["PIP_BIN_DIR"] = str(pip_bin_dir)
|
|
||||||
if npm_home is not None:
|
|
||||||
overrides["NPM_HOME"] = str(npm_home)
|
|
||||||
if node_modules_dir is not None:
|
|
||||||
overrides["NODE_MODULES_DIR"] = str(node_modules_dir)
|
|
||||||
overrides["NODE_MODULE_DIR"] = str(node_modules_dir)
|
|
||||||
overrides["NODE_PATH"] = str(node_modules_dir)
|
|
||||||
if npm_bin_dir is not None:
|
|
||||||
overrides["NPM_BIN_DIR"] = str(npm_bin_dir)
|
|
||||||
|
|
||||||
return overrides
|
|
||||||
|
|
||||||
|
|
||||||
def _limit_stop_reason(config: dict[str, Any]) -> str:
|
|
||||||
return CrawlLimitState.from_config(config).get_stop_reason()
|
|
||||||
|
|
||||||
|
|
||||||
def _attach_bus_trace(bus) -> None:
|
|
||||||
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
|
|
||||||
if not trace_target:
|
|
||||||
return
|
|
||||||
if getattr(bus, "_archivebox_trace_task", None) is not None:
|
|
||||||
return
|
|
||||||
|
|
||||||
trace_path = None if trace_target in {"1", "-", "stderr"} else Path(trace_target)
|
|
||||||
stop_event = asyncio.Event()
|
|
||||||
|
|
||||||
async def trace_loop() -> None:
|
|
||||||
seen_event_ids: set[str] = set()
|
|
||||||
while not stop_event.is_set():
|
|
||||||
for event_id, event in list(bus.event_history.items()):
|
|
||||||
if event_id in seen_event_ids:
|
|
||||||
continue
|
|
||||||
seen_event_ids.add(event_id)
|
|
||||||
payload = event.model_dump(mode="json")
|
|
||||||
payload["bus_name"] = bus.name
|
|
||||||
line = json.dumps(payload, ensure_ascii=False, default=str, separators=(",", ":"))
|
|
||||||
if trace_path is None:
|
|
||||||
print(line, file=sys.stderr, flush=True)
|
|
||||||
else:
|
|
||||||
trace_path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
with trace_path.open("a", encoding="utf-8") as handle:
|
|
||||||
handle.write(line + "\n")
|
|
||||||
await asyncio.sleep(0.05)
|
|
||||||
|
|
||||||
bus._archivebox_trace_stop = stop_event
|
|
||||||
bus._archivebox_trace_task = asyncio.create_task(trace_loop())
|
|
||||||
|
|
||||||
|
|
||||||
async def _stop_bus_trace(bus) -> None:
|
|
||||||
stop_event = getattr(bus, "_archivebox_trace_stop", None)
|
|
||||||
trace_task = getattr(bus, "_archivebox_trace_task", None)
|
|
||||||
if stop_event is None or trace_task is None:
|
|
||||||
return
|
|
||||||
stop_event.set()
|
|
||||||
await asyncio.gather(trace_task, return_exceptions=True)
|
|
||||||
bus._archivebox_trace_stop = None
|
|
||||||
bus._archivebox_trace_task = None
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
|
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
|
||||||
@@ -235,22 +94,25 @@ class CrawlRunner:
|
|||||||
self.crawl = crawl
|
self.crawl = crawl
|
||||||
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
|
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
|
||||||
self.plugins = discover_plugins()
|
self.plugins = discover_plugins()
|
||||||
self.process_service = ProcessService(self.bus)
|
ProcessService(self.bus)
|
||||||
self.binary_service = BinaryService(self.bus)
|
BinaryService(self.bus)
|
||||||
self.tag_service = TagService(self.bus)
|
TagService(self.bus)
|
||||||
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
|
CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||||
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
|
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
|
||||||
self.snapshot_service = SnapshotService(
|
|
||||||
|
async def ignore_snapshot(_snapshot_id: str) -> None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
SnapshotService(
|
||||||
self.bus,
|
self.bus,
|
||||||
crawl_id=str(crawl.id),
|
crawl_id=str(crawl.id),
|
||||||
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued,
|
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else ignore_snapshot,
|
||||||
)
|
)
|
||||||
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
|
ArchiveResultService(self.bus)
|
||||||
self.selected_plugins = selected_plugins
|
self.selected_plugins = selected_plugins
|
||||||
self.initial_snapshot_ids = snapshot_ids
|
self.initial_snapshot_ids = snapshot_ids
|
||||||
self.snapshot_tasks: dict[str, asyncio.Task[None]] = {}
|
self.snapshot_tasks: dict[str, asyncio.Task[None]] = {}
|
||||||
self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS)
|
self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS)
|
||||||
self.abx_services = None
|
|
||||||
self.persona = None
|
self.persona = None
|
||||||
self.base_config: dict[str, Any] = {}
|
self.base_config: dict[str, Any] = {}
|
||||||
self.derived_config: dict[str, Any] = {}
|
self.derived_config: dict[str, Any] = {}
|
||||||
@@ -258,15 +120,11 @@ class CrawlRunner:
|
|||||||
self._live_stream = None
|
self._live_stream = None
|
||||||
|
|
||||||
async def run(self) -> None:
|
async def run(self) -> None:
|
||||||
from asgiref.sync import sync_to_async
|
|
||||||
from archivebox.crawls.models import Crawl
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await sync_to_async(self._prepare, thread_sensitive=True)()
|
snapshot_ids = await sync_to_async(self.load_run_state, thread_sensitive=True)()
|
||||||
live_ui = self._create_live_ui()
|
live_ui = self._create_live_ui()
|
||||||
with live_ui if live_ui is not None else nullcontext():
|
with live_ui if live_ui is not None else nullcontext():
|
||||||
_attach_bus_trace(self.bus)
|
setup_abx_services(
|
||||||
self.abx_services = setup_abx_services(
|
|
||||||
self.bus,
|
self.bus,
|
||||||
plugins=self.plugins,
|
plugins=self.plugins,
|
||||||
config_overrides={
|
config_overrides={
|
||||||
@@ -278,18 +136,14 @@ class CrawlRunner:
|
|||||||
auto_install=True,
|
auto_install=True,
|
||||||
emit_jsonl=False,
|
emit_jsonl=False,
|
||||||
)
|
)
|
||||||
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
|
|
||||||
if snapshot_ids:
|
if snapshot_ids:
|
||||||
root_snapshot_id = snapshot_ids[0]
|
root_snapshot_id = snapshot_ids[0]
|
||||||
await self._run_crawl_setup(root_snapshot_id)
|
await self.run_crawl_setup(root_snapshot_id)
|
||||||
for snapshot_id in snapshot_ids:
|
for snapshot_id in snapshot_ids:
|
||||||
await self.enqueue_snapshot(snapshot_id)
|
await self.enqueue_snapshot(snapshot_id)
|
||||||
await self._wait_for_snapshot_tasks()
|
await self.wait_for_snapshot_tasks()
|
||||||
await self._run_crawl_cleanup(root_snapshot_id)
|
await self.run_crawl_cleanup(root_snapshot_id)
|
||||||
if self.abx_services is not None:
|
|
||||||
await self.abx_services.process.wait_for_background_monitors()
|
|
||||||
finally:
|
finally:
|
||||||
await _stop_bus_trace(self.bus)
|
|
||||||
await self.bus.stop()
|
await self.bus.stop()
|
||||||
if self._live_stream is not None:
|
if self._live_stream is not None:
|
||||||
try:
|
try:
|
||||||
@@ -297,33 +151,16 @@ class CrawlRunner:
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
self._live_stream = None
|
self._live_stream = None
|
||||||
await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
|
await sync_to_async(self.finalize_run_state, thread_sensitive=True)()
|
||||||
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
|
|
||||||
crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
|
|
||||||
if crawl_is_finished:
|
|
||||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
|
||||||
crawl.status = Crawl.StatusChoices.SEALED
|
|
||||||
crawl.retry_at = None
|
|
||||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
|
||||||
else:
|
|
||||||
if crawl.status == Crawl.StatusChoices.SEALED:
|
|
||||||
crawl.status = Crawl.StatusChoices.QUEUED
|
|
||||||
elif crawl.status != Crawl.StatusChoices.STARTED:
|
|
||||||
crawl.status = Crawl.StatusChoices.STARTED
|
|
||||||
crawl.retry_at = crawl.retry_at or timezone.now()
|
|
||||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
|
||||||
|
|
||||||
async def enqueue_snapshot(self, snapshot_id: str) -> None:
|
async def enqueue_snapshot(self, snapshot_id: str) -> None:
|
||||||
task = self.snapshot_tasks.get(snapshot_id)
|
task = self.snapshot_tasks.get(snapshot_id)
|
||||||
if task is not None and not task.done():
|
if task is not None and not task.done():
|
||||||
return
|
return
|
||||||
task = asyncio.create_task(self._run_snapshot(snapshot_id))
|
task = asyncio.create_task(self.run_snapshot(snapshot_id))
|
||||||
self.snapshot_tasks[snapshot_id] = task
|
self.snapshot_tasks[snapshot_id] = task
|
||||||
|
|
||||||
async def leave_snapshot_queued(self, snapshot_id: str) -> None:
|
async def wait_for_snapshot_tasks(self) -> None:
|
||||||
return None
|
|
||||||
|
|
||||||
async def _wait_for_snapshot_tasks(self) -> None:
|
|
||||||
while True:
|
while True:
|
||||||
pending_tasks: list[asyncio.Task[None]] = []
|
pending_tasks: list[asyncio.Task[None]] = []
|
||||||
for snapshot_id, task in list(self.snapshot_tasks.items()):
|
for snapshot_id, task in list(self.snapshot_tasks.items()):
|
||||||
@@ -339,9 +176,9 @@ class CrawlRunner:
|
|||||||
for task in done:
|
for task in done:
|
||||||
task.result()
|
task.result()
|
||||||
|
|
||||||
def _prepare(self) -> None:
|
def load_run_state(self) -> list[str]:
|
||||||
from archivebox.config.configset import get_config
|
from archivebox.config.configset import get_config
|
||||||
from archivebox.machine.models import NetworkInterface, Process
|
from archivebox.machine.models import Machine, NetworkInterface, Process
|
||||||
|
|
||||||
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
|
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
|
||||||
current_iface = NetworkInterface.current(refresh=True)
|
current_iface = NetworkInterface.current(refresh=True)
|
||||||
@@ -352,17 +189,42 @@ class CrawlRunner:
|
|||||||
current_process.save(update_fields=["iface", "machine", "modified_at"])
|
current_process.save(update_fields=["iface", "machine", "modified_at"])
|
||||||
self.persona = self.crawl.resolve_persona()
|
self.persona = self.crawl.resolve_persona()
|
||||||
self.base_config = get_config(crawl=self.crawl)
|
self.base_config = get_config(crawl=self.crawl)
|
||||||
self.derived_config = _installed_binary_config_overrides(self.plugins, self.base_config)
|
self.derived_config = dict(Machine.current().config)
|
||||||
self.base_config["ABX_RUNTIME"] = "archivebox"
|
self.base_config["ABX_RUNTIME"] = "archivebox"
|
||||||
if self.selected_plugins is None:
|
if self.selected_plugins is None:
|
||||||
self.selected_plugins = _selected_plugins_from_config(self.base_config)
|
raw_plugins = self.base_config["PLUGINS"].strip()
|
||||||
|
self.selected_plugins = [name.strip() for name in raw_plugins.split(",") if name.strip()] if raw_plugins else None
|
||||||
if self.persona:
|
if self.persona:
|
||||||
chrome_binary = str(self.base_config.get("CHROME_BINARY") or "")
|
self.base_config.update(
|
||||||
self.base_config.update(self.persona.prepare_runtime_for_crawl(self.crawl, chrome_binary=chrome_binary))
|
self.persona.prepare_runtime_for_crawl(
|
||||||
|
self.crawl,
|
||||||
|
chrome_binary=self.base_config["CHROME_BINARY"],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
if self.initial_snapshot_ids:
|
||||||
|
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
|
||||||
|
created = self.crawl.create_snapshots_from_urls()
|
||||||
|
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
|
||||||
|
return [str(snapshot.id) for snapshot in snapshots]
|
||||||
|
|
||||||
|
def finalize_run_state(self) -> None:
|
||||||
|
from archivebox.crawls.models import Crawl
|
||||||
|
|
||||||
def _cleanup_persona(self) -> None:
|
|
||||||
if self.persona:
|
if self.persona:
|
||||||
self.persona.cleanup_runtime_for_crawl(self.crawl)
|
self.persona.cleanup_runtime_for_crawl(self.crawl)
|
||||||
|
crawl = Crawl.objects.get(id=self.crawl.id)
|
||||||
|
if crawl.is_finished():
|
||||||
|
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||||
|
crawl.status = Crawl.StatusChoices.SEALED
|
||||||
|
crawl.retry_at = None
|
||||||
|
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||||
|
return
|
||||||
|
if crawl.status == Crawl.StatusChoices.SEALED:
|
||||||
|
crawl.status = Crawl.StatusChoices.QUEUED
|
||||||
|
elif crawl.status != Crawl.StatusChoices.STARTED:
|
||||||
|
crawl.status = Crawl.StatusChoices.STARTED
|
||||||
|
crawl.retry_at = crawl.retry_at or timezone.now()
|
||||||
|
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||||
|
|
||||||
def _create_live_ui(self) -> LiveBusUI | None:
|
def _create_live_ui(self) -> LiveBusUI | None:
|
||||||
stdout_is_tty = sys.stdout.isatty()
|
stdout_is_tty = sys.stdout.isatty()
|
||||||
@@ -373,7 +235,7 @@ class CrawlRunner:
|
|||||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||||
if os.path.exists("/dev/tty"):
|
if os.path.exists("/dev/tty"):
|
||||||
try:
|
try:
|
||||||
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
|
||||||
stream = self._live_stream
|
stream = self._live_stream
|
||||||
except OSError:
|
except OSError:
|
||||||
self._live_stream = None
|
self._live_stream = None
|
||||||
@@ -399,7 +261,7 @@ class CrawlRunner:
|
|||||||
live_ui = LiveBusUI(
|
live_ui = LiveBusUI(
|
||||||
self.bus,
|
self.bus,
|
||||||
total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
|
total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
|
||||||
timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
|
timeout_seconds=self.base_config["TIMEOUT"],
|
||||||
ui_console=ui_console,
|
ui_console=ui_console,
|
||||||
interactive_tty=True,
|
interactive_tty=True,
|
||||||
)
|
)
|
||||||
@@ -410,128 +272,24 @@ class CrawlRunner:
|
|||||||
)
|
)
|
||||||
return live_ui
|
return live_ui
|
||||||
|
|
||||||
def _create_root_snapshots(self) -> list[str]:
|
def load_snapshot_payload(self, snapshot_id: str) -> dict[str, Any]:
|
||||||
created = self.crawl.create_snapshots_from_urls()
|
from archivebox.core.models import Snapshot
|
||||||
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
|
|
||||||
return [str(snapshot.id) for snapshot in snapshots]
|
|
||||||
|
|
||||||
def _initial_snapshot_ids(self) -> list[str]:
|
|
||||||
if self.initial_snapshot_ids:
|
|
||||||
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
|
|
||||||
return self._create_root_snapshots()
|
|
||||||
|
|
||||||
def _snapshot_config(self, snapshot) -> dict[str, Any]:
|
|
||||||
from archivebox.config.configset import get_config
|
from archivebox.config.configset import get_config
|
||||||
|
|
||||||
|
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
|
||||||
config = get_config(crawl=self.crawl, snapshot=snapshot)
|
config = get_config(crawl=self.crawl, snapshot=snapshot)
|
||||||
config.update(self.base_config)
|
config.update(self.base_config)
|
||||||
config["CRAWL_DIR"] = str(self.crawl.output_dir)
|
config["CRAWL_DIR"] = str(self.crawl.output_dir)
|
||||||
config["SNAP_DIR"] = str(snapshot.output_dir)
|
config["SNAP_DIR"] = str(snapshot.output_dir)
|
||||||
config["SNAPSHOT_ID"] = str(snapshot.id)
|
extra_context: dict[str, Any] = {}
|
||||||
config["SNAPSHOT_DEPTH"] = snapshot.depth
|
if config.get("EXTRA_CONTEXT"):
|
||||||
config["CRAWL_ID"] = str(self.crawl.id)
|
parsed_extra_context = json.loads(str(config["EXTRA_CONTEXT"]))
|
||||||
config["SOURCE_URL"] = snapshot.url
|
if not isinstance(parsed_extra_context, dict):
|
||||||
if snapshot.parent_snapshot_id:
|
raise TypeError("EXTRA_CONTEXT must decode to an object")
|
||||||
config["PARENT_SNAPSHOT_ID"] = str(snapshot.parent_snapshot_id)
|
extra_context = parsed_extra_context
|
||||||
return config
|
extra_context["snapshot_id"] = str(snapshot.id)
|
||||||
|
extra_context["snapshot_depth"] = snapshot.depth
|
||||||
async def _run_crawl_setup(self, snapshot_id: str) -> None:
|
config["EXTRA_CONTEXT"] = json.dumps(extra_context, separators=(",", ":"), sort_keys=True)
|
||||||
from asgiref.sync import sync_to_async
|
|
||||||
|
|
||||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
|
||||||
setup_snapshot = AbxSnapshot(
|
|
||||||
url=snapshot["url"],
|
|
||||||
id=snapshot["id"],
|
|
||||||
title=snapshot["title"],
|
|
||||||
timestamp=snapshot["timestamp"],
|
|
||||||
bookmarked_at=snapshot["bookmarked_at"],
|
|
||||||
created_at=snapshot["created_at"],
|
|
||||||
tags=snapshot["tags"],
|
|
||||||
depth=snapshot["depth"],
|
|
||||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
|
||||||
crawl_id=str(self.crawl.id),
|
|
||||||
)
|
|
||||||
await download(
|
|
||||||
url=snapshot["url"],
|
|
||||||
plugins=self.plugins,
|
|
||||||
output_dir=Path(snapshot["output_dir"]),
|
|
||||||
selected_plugins=self.selected_plugins,
|
|
||||||
bus=self.bus,
|
|
||||||
emit_jsonl=False,
|
|
||||||
snapshot=setup_snapshot,
|
|
||||||
crawl_setup_only=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
async def _run_crawl_cleanup(self, snapshot_id: str) -> None:
|
|
||||||
from asgiref.sync import sync_to_async
|
|
||||||
|
|
||||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
|
||||||
cleanup_snapshot = AbxSnapshot(
|
|
||||||
url=snapshot["url"],
|
|
||||||
id=snapshot["id"],
|
|
||||||
title=snapshot["title"],
|
|
||||||
timestamp=snapshot["timestamp"],
|
|
||||||
bookmarked_at=snapshot["bookmarked_at"],
|
|
||||||
created_at=snapshot["created_at"],
|
|
||||||
tags=snapshot["tags"],
|
|
||||||
depth=snapshot["depth"],
|
|
||||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
|
||||||
crawl_id=str(self.crawl.id),
|
|
||||||
)
|
|
||||||
await download(
|
|
||||||
url=snapshot["url"],
|
|
||||||
plugins=self.plugins,
|
|
||||||
output_dir=Path(snapshot["output_dir"]),
|
|
||||||
selected_plugins=self.selected_plugins,
|
|
||||||
bus=self.bus,
|
|
||||||
emit_jsonl=False,
|
|
||||||
snapshot=cleanup_snapshot,
|
|
||||||
crawl_cleanup_only=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
async def _run_snapshot(self, snapshot_id: str) -> None:
|
|
||||||
from asgiref.sync import sync_to_async
|
|
||||||
|
|
||||||
async with self.snapshot_semaphore:
|
|
||||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
|
||||||
if snapshot["status"] == "sealed":
|
|
||||||
return
|
|
||||||
if snapshot["depth"] > 0 and _limit_stop_reason(snapshot["config"]) == "max_size":
|
|
||||||
await sync_to_async(self._cancel_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
|
|
||||||
return
|
|
||||||
abx_snapshot = AbxSnapshot(
|
|
||||||
url=snapshot["url"],
|
|
||||||
id=snapshot["id"],
|
|
||||||
title=snapshot["title"],
|
|
||||||
timestamp=snapshot["timestamp"],
|
|
||||||
bookmarked_at=snapshot["bookmarked_at"],
|
|
||||||
created_at=snapshot["created_at"],
|
|
||||||
tags=snapshot["tags"],
|
|
||||||
depth=snapshot["depth"],
|
|
||||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
|
||||||
crawl_id=str(self.crawl.id),
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
await download(
|
|
||||||
url=snapshot["url"],
|
|
||||||
plugins=self.plugins,
|
|
||||||
output_dir=Path(snapshot["output_dir"]),
|
|
||||||
selected_plugins=self.selected_plugins,
|
|
||||||
bus=self.bus,
|
|
||||||
emit_jsonl=False,
|
|
||||||
snapshot=abx_snapshot,
|
|
||||||
skip_crawl_setup=True,
|
|
||||||
skip_crawl_cleanup=True,
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
current_task = asyncio.current_task()
|
|
||||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
|
||||||
self.snapshot_tasks.pop(snapshot_id, None)
|
|
||||||
|
|
||||||
def _load_snapshot_run_data(self, snapshot_id: str):
|
|
||||||
from archivebox.core.models import Snapshot
|
|
||||||
|
|
||||||
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
|
|
||||||
return {
|
return {
|
||||||
"id": str(snapshot.id),
|
"id": str(snapshot.id),
|
||||||
"url": snapshot.url,
|
"url": snapshot.url,
|
||||||
@@ -542,12 +300,91 @@ class CrawlRunner:
|
|||||||
"tags": snapshot.tags_str(),
|
"tags": snapshot.tags_str(),
|
||||||
"depth": snapshot.depth,
|
"depth": snapshot.depth,
|
||||||
"status": snapshot.status,
|
"status": snapshot.status,
|
||||||
"parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
|
|
||||||
"output_dir": str(snapshot.output_dir),
|
"output_dir": str(snapshot.output_dir),
|
||||||
"config": self._snapshot_config(snapshot),
|
"config": config,
|
||||||
}
|
}
|
||||||
|
|
||||||
def _cancel_snapshot_due_to_limit(self, snapshot_id: str) -> None:
|
async def run_crawl_setup(self, snapshot_id: str) -> None:
|
||||||
|
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
|
||||||
|
await download(
|
||||||
|
url=snapshot["url"],
|
||||||
|
plugins=self.plugins,
|
||||||
|
output_dir=Path(snapshot["output_dir"]),
|
||||||
|
selected_plugins=self.selected_plugins,
|
||||||
|
config_overrides=snapshot["config"],
|
||||||
|
derived_config_overrides=self.derived_config,
|
||||||
|
bus=self.bus,
|
||||||
|
emit_jsonl=False,
|
||||||
|
install_enabled=True,
|
||||||
|
crawl_setup_enabled=True,
|
||||||
|
crawl_start_enabled=False,
|
||||||
|
snapshot_cleanup_enabled=False,
|
||||||
|
crawl_cleanup_enabled=False,
|
||||||
|
machine_service=None,
|
||||||
|
binary_service=None,
|
||||||
|
process_service=None,
|
||||||
|
archive_result_service=None,
|
||||||
|
tag_service=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run_crawl_cleanup(self, snapshot_id: str) -> None:
|
||||||
|
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
|
||||||
|
await download(
|
||||||
|
bus=self.bus,
|
||||||
|
url=snapshot["url"],
|
||||||
|
output_dir=Path(snapshot["output_dir"]),
|
||||||
|
plugins=self.plugins,
|
||||||
|
selected_plugins=self.selected_plugins,
|
||||||
|
config_overrides=snapshot["config"],
|
||||||
|
derived_config_overrides=self.derived_config,
|
||||||
|
emit_jsonl=False,
|
||||||
|
install_enabled=False,
|
||||||
|
crawl_setup_enabled=False,
|
||||||
|
crawl_start_enabled=False,
|
||||||
|
snapshot_cleanup_enabled=False,
|
||||||
|
crawl_cleanup_enabled=True,
|
||||||
|
machine_service=None,
|
||||||
|
binary_service=None,
|
||||||
|
process_service=None,
|
||||||
|
archive_result_service=None,
|
||||||
|
tag_service=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run_snapshot(self, snapshot_id: str) -> None:
|
||||||
|
async with self.snapshot_semaphore:
|
||||||
|
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
|
||||||
|
if snapshot["status"] == "sealed":
|
||||||
|
return
|
||||||
|
if snapshot["depth"] > 0 and CrawlLimitState.from_config(snapshot["config"]).get_stop_reason() == "max_size":
|
||||||
|
await sync_to_async(self.seal_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
await download(
|
||||||
|
url=snapshot["url"],
|
||||||
|
plugins=self.plugins,
|
||||||
|
output_dir=Path(snapshot["output_dir"]),
|
||||||
|
selected_plugins=self.selected_plugins,
|
||||||
|
config_overrides=snapshot["config"],
|
||||||
|
derived_config_overrides=self.derived_config,
|
||||||
|
bus=self.bus,
|
||||||
|
emit_jsonl=False,
|
||||||
|
install_enabled=False,
|
||||||
|
crawl_setup_enabled=False,
|
||||||
|
crawl_start_enabled=True,
|
||||||
|
snapshot_cleanup_enabled=True,
|
||||||
|
crawl_cleanup_enabled=False,
|
||||||
|
machine_service=None,
|
||||||
|
binary_service=None,
|
||||||
|
process_service=None,
|
||||||
|
archive_result_service=None,
|
||||||
|
tag_service=None,
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
current_task = asyncio.current_task()
|
||||||
|
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||||
|
self.snapshot_tasks.pop(snapshot_id, None)
|
||||||
|
|
||||||
|
def seal_snapshot_due_to_limit(self, snapshot_id: str) -> None:
|
||||||
from archivebox.core.models import Snapshot
|
from archivebox.core.models import Snapshot
|
||||||
|
|
||||||
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
|
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
|
||||||
@@ -579,21 +416,20 @@ def run_crawl(
|
|||||||
|
|
||||||
|
|
||||||
async def _run_binary(binary_id: str) -> None:
|
async def _run_binary(binary_id: str) -> None:
|
||||||
from asgiref.sync import sync_to_async
|
|
||||||
|
|
||||||
from archivebox.config.configset import get_config
|
from archivebox.config.configset import get_config
|
||||||
from archivebox.machine.models import Binary
|
from archivebox.machine.models import Binary, Machine
|
||||||
|
|
||||||
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
|
binary = await Binary.objects.aget(id=binary_id)
|
||||||
plugins = discover_plugins()
|
plugins = discover_plugins()
|
||||||
config = get_config()
|
config = get_config()
|
||||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||||
|
derived_config = dict(machine.config)
|
||||||
config["ABX_RUNTIME"] = "archivebox"
|
config["ABX_RUNTIME"] = "archivebox"
|
||||||
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
|
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
|
||||||
process_service = ProcessService(bus)
|
ProcessService(bus)
|
||||||
BinaryService(bus)
|
BinaryService(bus)
|
||||||
TagService(bus)
|
TagService(bus)
|
||||||
ArchiveResultService(bus, process_service=process_service)
|
ArchiveResultService(bus)
|
||||||
setup_abx_services(
|
setup_abx_services(
|
||||||
bus,
|
bus,
|
||||||
plugins=plugins,
|
plugins=plugins,
|
||||||
@@ -605,7 +441,6 @@ async def _run_binary(binary_id: str) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
_attach_bus_trace(bus)
|
|
||||||
await bus.emit(
|
await bus.emit(
|
||||||
BinaryRequestEvent(
|
BinaryRequestEvent(
|
||||||
name=binary.name,
|
name=binary.name,
|
||||||
@@ -619,7 +454,6 @@ async def _run_binary(binary_id: str) -> None:
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
finally:
|
finally:
|
||||||
await _stop_bus_trace(bus)
|
|
||||||
await bus.stop()
|
await bus.stop()
|
||||||
|
|
||||||
|
|
||||||
@@ -628,20 +462,20 @@ def run_binary(binary_id: str) -> None:
|
|||||||
|
|
||||||
|
|
||||||
async def _run_install(plugin_names: list[str] | None = None) -> None:
|
async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||||
from asgiref.sync import sync_to_async
|
|
||||||
|
|
||||||
from archivebox.config.configset import get_config
|
from archivebox.config.configset import get_config
|
||||||
|
from archivebox.machine.models import Machine
|
||||||
|
|
||||||
plugins = discover_plugins()
|
plugins = discover_plugins()
|
||||||
config = get_config()
|
config = get_config()
|
||||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||||
|
derived_config = dict(machine.config)
|
||||||
config["ABX_RUNTIME"] = "archivebox"
|
config["ABX_RUNTIME"] = "archivebox"
|
||||||
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
|
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
|
||||||
process_service = ProcessService(bus)
|
ProcessService(bus)
|
||||||
BinaryService(bus)
|
BinaryService(bus)
|
||||||
TagService(bus)
|
TagService(bus)
|
||||||
ArchiveResultService(bus, process_service=process_service)
|
ArchiveResultService(bus)
|
||||||
abx_services = setup_abx_services(
|
setup_abx_services(
|
||||||
bus,
|
bus,
|
||||||
plugins=plugins,
|
plugins=plugins,
|
||||||
config_overrides=config,
|
config_overrides=config,
|
||||||
@@ -657,7 +491,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
|||||||
if not selected_plugins:
|
if not selected_plugins:
|
||||||
return
|
return
|
||||||
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
|
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
|
||||||
timeout_seconds = int(config.get("TIMEOUT") or 60)
|
timeout_seconds = config["TIMEOUT"]
|
||||||
stdout_is_tty = sys.stdout.isatty()
|
stdout_is_tty = sys.stdout.isatty()
|
||||||
stderr_is_tty = sys.stderr.isatty()
|
stderr_is_tty = sys.stderr.isatty()
|
||||||
interactive_tty = stdout_is_tty or stderr_is_tty
|
interactive_tty = stdout_is_tty or stderr_is_tty
|
||||||
@@ -668,7 +502,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
|||||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||||
if os.path.exists("/dev/tty"):
|
if os.path.exists("/dev/tty"):
|
||||||
try:
|
try:
|
||||||
live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
|
||||||
stream = live_stream
|
stream = live_stream
|
||||||
except OSError:
|
except OSError:
|
||||||
live_stream = None
|
live_stream = None
|
||||||
@@ -707,20 +541,21 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
|||||||
plugins_label=plugins_label,
|
plugins_label=plugins_label,
|
||||||
)
|
)
|
||||||
with live_ui if live_ui is not None else nullcontext():
|
with live_ui if live_ui is not None else nullcontext():
|
||||||
_attach_bus_trace(bus)
|
|
||||||
results = await abx_install_plugins(
|
results = await abx_install_plugins(
|
||||||
plugin_names=plugin_names,
|
plugin_names=plugin_names,
|
||||||
plugins=plugins,
|
plugins=plugins,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
config_overrides=config,
|
config_overrides=config,
|
||||||
|
derived_config_overrides=derived_config,
|
||||||
emit_jsonl=False,
|
emit_jsonl=False,
|
||||||
bus=bus,
|
bus=bus,
|
||||||
|
machine_service=None,
|
||||||
|
binary_service=None,
|
||||||
|
process_service=None,
|
||||||
)
|
)
|
||||||
await abx_services.process.wait_for_background_monitors()
|
|
||||||
if live_ui is not None:
|
if live_ui is not None:
|
||||||
live_ui.print_summary(results, output_dir=output_dir)
|
live_ui.print_summary(results, output_dir=output_dir)
|
||||||
finally:
|
finally:
|
||||||
await _stop_bus_trace(bus)
|
|
||||||
await bus.stop()
|
await bus.stop()
|
||||||
try:
|
try:
|
||||||
if live_stream is not None:
|
if live_stream is not None:
|
||||||
@@ -739,6 +574,12 @@ def recover_orphaned_crawls() -> int:
|
|||||||
from archivebox.machine.models import Process
|
from archivebox.machine.models import Process
|
||||||
|
|
||||||
active_crawl_ids: set[str] = set()
|
active_crawl_ids: set[str] = set()
|
||||||
|
orphaned_crawls = list(
|
||||||
|
Crawl.objects.filter(
|
||||||
|
status=Crawl.StatusChoices.STARTED,
|
||||||
|
retry_at__isnull=True,
|
||||||
|
).prefetch_related("snapshot_set"),
|
||||||
|
)
|
||||||
running_processes = Process.objects.filter(
|
running_processes = Process.objects.filter(
|
||||||
status=Process.StatusChoices.RUNNING,
|
status=Process.StatusChoices.RUNNING,
|
||||||
process_type__in=[
|
process_type__in=[
|
||||||
@@ -746,23 +587,27 @@ def recover_orphaned_crawls() -> int:
|
|||||||
Process.TypeChoices.HOOK,
|
Process.TypeChoices.HOOK,
|
||||||
Process.TypeChoices.BINARY,
|
Process.TypeChoices.BINARY,
|
||||||
],
|
],
|
||||||
).only("env")
|
).only("pwd")
|
||||||
|
|
||||||
for proc in running_processes:
|
for proc in running_processes:
|
||||||
env = proc.env or {}
|
if not proc.pwd:
|
||||||
if not isinstance(env, dict):
|
|
||||||
continue
|
continue
|
||||||
crawl_id = env.get("CRAWL_ID")
|
proc_pwd = Path(proc.pwd)
|
||||||
if crawl_id:
|
for crawl in orphaned_crawls:
|
||||||
active_crawl_ids.add(str(crawl_id))
|
matched_snapshot = None
|
||||||
|
for snapshot in crawl.snapshot_set.all():
|
||||||
|
try:
|
||||||
|
proc_pwd.relative_to(snapshot.output_dir)
|
||||||
|
matched_snapshot = snapshot
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
if matched_snapshot is not None:
|
||||||
|
active_crawl_ids.add(str(crawl.id))
|
||||||
|
break
|
||||||
|
|
||||||
recovered = 0
|
recovered = 0
|
||||||
now = timezone.now()
|
now = timezone.now()
|
||||||
orphaned_crawls = Crawl.objects.filter(
|
|
||||||
status=Crawl.StatusChoices.STARTED,
|
|
||||||
retry_at__isnull=True,
|
|
||||||
).prefetch_related("snapshot_set")
|
|
||||||
|
|
||||||
for crawl in orphaned_crawls:
|
for crawl in orphaned_crawls:
|
||||||
if str(crawl.id) in active_crawl_ids:
|
if str(crawl.id) in active_crawl_ids:
|
||||||
continue
|
continue
|
||||||
@@ -788,6 +633,11 @@ def recover_orphaned_snapshots() -> int:
|
|||||||
from archivebox.machine.models import Process
|
from archivebox.machine.models import Process
|
||||||
|
|
||||||
active_snapshot_ids: set[str] = set()
|
active_snapshot_ids: set[str] = set()
|
||||||
|
orphaned_snapshots = list(
|
||||||
|
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
||||||
|
.select_related("crawl")
|
||||||
|
.prefetch_related("archiveresult_set"),
|
||||||
|
)
|
||||||
running_processes = Process.objects.filter(
|
running_processes = Process.objects.filter(
|
||||||
status=Process.StatusChoices.RUNNING,
|
status=Process.StatusChoices.RUNNING,
|
||||||
process_type__in=[
|
process_type__in=[
|
||||||
@@ -795,24 +645,22 @@ def recover_orphaned_snapshots() -> int:
|
|||||||
Process.TypeChoices.HOOK,
|
Process.TypeChoices.HOOK,
|
||||||
Process.TypeChoices.BINARY,
|
Process.TypeChoices.BINARY,
|
||||||
],
|
],
|
||||||
).only("env")
|
).only("pwd")
|
||||||
|
|
||||||
for proc in running_processes:
|
for proc in running_processes:
|
||||||
env = proc.env or {}
|
if not proc.pwd:
|
||||||
if not isinstance(env, dict):
|
|
||||||
continue
|
continue
|
||||||
snapshot_id = env.get("SNAPSHOT_ID")
|
proc_pwd = Path(proc.pwd)
|
||||||
if snapshot_id:
|
for snapshot in orphaned_snapshots:
|
||||||
active_snapshot_ids.add(str(snapshot_id))
|
try:
|
||||||
|
proc_pwd.relative_to(snapshot.output_dir)
|
||||||
|
active_snapshot_ids.add(str(snapshot.id))
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
recovered = 0
|
recovered = 0
|
||||||
now = timezone.now()
|
now = timezone.now()
|
||||||
orphaned_snapshots = (
|
|
||||||
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
|
||||||
.select_related("crawl")
|
|
||||||
.prefetch_related("archiveresult_set")
|
|
||||||
)
|
|
||||||
|
|
||||||
for snapshot in orphaned_snapshots:
|
for snapshot in orphaned_snapshots:
|
||||||
if str(snapshot.id) in active_snapshot_ids:
|
if str(snapshot.id) in active_snapshot_ids:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -7,8 +7,6 @@ from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
|||||||
from abx_dl.limits import CrawlLimitState
|
from abx_dl.limits import CrawlLimitState
|
||||||
from abx_dl.services.base import BaseService
|
from abx_dl.services.base import BaseService
|
||||||
|
|
||||||
from .db import run_db_op
|
|
||||||
|
|
||||||
|
|
||||||
class SnapshotService(BaseService):
|
class SnapshotService(BaseService):
|
||||||
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
|
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
|
||||||
@@ -18,120 +16,96 @@ class SnapshotService(BaseService):
|
|||||||
self.crawl_id = crawl_id
|
self.crawl_id = crawl_id
|
||||||
self.schedule_snapshot = schedule_snapshot
|
self.schedule_snapshot = schedule_snapshot
|
||||||
super().__init__(bus)
|
super().__init__(bus)
|
||||||
|
self.bus.on(SnapshotEvent, self.on_SnapshotEvent)
|
||||||
|
self.bus.on(SnapshotCompletedEvent, self.on_SnapshotCompletedEvent)
|
||||||
|
|
||||||
async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
|
async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
|
||||||
snapshot_id = await run_db_op(self._project_snapshot, event)
|
|
||||||
if snapshot_id:
|
|
||||||
await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
|
|
||||||
if snapshot_id and event.depth > 0:
|
|
||||||
await self.schedule_snapshot(snapshot_id)
|
|
||||||
|
|
||||||
async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
|
|
||||||
snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
|
|
||||||
if snapshot_id:
|
|
||||||
await sync_to_async(self._write_snapshot_details)(snapshot_id)
|
|
||||||
|
|
||||||
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
|
|
||||||
from archivebox.core.models import Snapshot
|
from archivebox.core.models import Snapshot
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
|
|
||||||
crawl = Crawl.objects.get(id=self.crawl_id)
|
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||||
|
snapshot_id: str | None = None
|
||||||
|
snapshot = await Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).afirst()
|
||||||
|
|
||||||
if event.depth == 0:
|
if snapshot is not None:
|
||||||
snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
|
|
||||||
if snapshot is None:
|
|
||||||
return None
|
|
||||||
snapshot.status = Snapshot.StatusChoices.STARTED
|
snapshot.status = Snapshot.StatusChoices.STARTED
|
||||||
snapshot.retry_at = None
|
snapshot.retry_at = None
|
||||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||||
return str(snapshot.id)
|
snapshot_id = str(snapshot.id)
|
||||||
|
elif event.depth > 0:
|
||||||
|
if event.depth <= crawl.max_depth and self._crawl_limit_stop_reason(crawl) != "max_size":
|
||||||
|
parent_event = await self.bus.find(
|
||||||
|
SnapshotEvent,
|
||||||
|
past=True,
|
||||||
|
future=False,
|
||||||
|
where=lambda candidate: candidate.depth == event.depth - 1 and self.bus.event_is_child_of(event, candidate),
|
||||||
|
)
|
||||||
|
parent_snapshot = None
|
||||||
|
if parent_event is not None:
|
||||||
|
parent_snapshot = await Snapshot.objects.filter(id=parent_event.snapshot_id, crawl=crawl).afirst()
|
||||||
|
if parent_snapshot is not None and self._url_passes_filters(crawl, parent_snapshot, event.url):
|
||||||
|
snapshot = await sync_to_async(Snapshot.from_json, thread_sensitive=True)(
|
||||||
|
{
|
||||||
|
"url": event.url,
|
||||||
|
"depth": event.depth,
|
||||||
|
"parent_snapshot_id": str(parent_snapshot.id),
|
||||||
|
"crawl_id": str(crawl.id),
|
||||||
|
},
|
||||||
|
overrides={
|
||||||
|
"crawl": crawl,
|
||||||
|
"snapshot": parent_snapshot,
|
||||||
|
"created_by_id": crawl.created_by_id,
|
||||||
|
},
|
||||||
|
queue_for_extraction=False,
|
||||||
|
)
|
||||||
|
if snapshot is not None and snapshot.status != Snapshot.StatusChoices.SEALED:
|
||||||
|
snapshot.retry_at = None
|
||||||
|
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||||
|
await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||||
|
snapshot_id = str(snapshot.id)
|
||||||
|
|
||||||
if event.depth > crawl.max_depth:
|
if snapshot_id:
|
||||||
return None
|
snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
|
||||||
if self._crawl_limit_stop_reason(crawl) == "max_size":
|
if snapshot is not None:
|
||||||
return None
|
await sync_to_async(snapshot.ensure_crawl_symlink, thread_sensitive=True)()
|
||||||
|
if snapshot_id and event.depth > 0:
|
||||||
|
await self.schedule_snapshot(snapshot_id)
|
||||||
|
|
||||||
parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
|
async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
|
||||||
if parent_snapshot is None:
|
from archivebox.core.models import Snapshot
|
||||||
return None
|
|
||||||
if not self._url_passes_filters(crawl, parent_snapshot, event.url):
|
|
||||||
return None
|
|
||||||
|
|
||||||
snapshot = Snapshot.from_json(
|
snapshot = await Snapshot.objects.select_related("crawl").filter(id=event.snapshot_id).afirst()
|
||||||
{
|
snapshot_id: str | None = None
|
||||||
"url": event.url,
|
if snapshot is not None:
|
||||||
"depth": event.depth,
|
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||||
"parent_snapshot_id": str(parent_snapshot.id),
|
snapshot.retry_at = None
|
||||||
"crawl_id": str(crawl.id),
|
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
|
||||||
},
|
await snapshot.asave(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||||
overrides={
|
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
|
||||||
"crawl": crawl,
|
await (
|
||||||
"snapshot": parent_snapshot,
|
Snapshot.objects.filter(
|
||||||
"created_by_id": crawl.created_by_id,
|
crawl_id=snapshot.crawl_id,
|
||||||
},
|
status=Snapshot.StatusChoices.QUEUED,
|
||||||
queue_for_extraction=False,
|
)
|
||||||
)
|
.exclude(id=snapshot.id)
|
||||||
if snapshot is None:
|
.aupdate(
|
||||||
return None
|
status=Snapshot.StatusChoices.SEALED,
|
||||||
if snapshot.status == Snapshot.StatusChoices.SEALED:
|
retry_at=None,
|
||||||
return None
|
modified_at=timezone.now(),
|
||||||
snapshot.retry_at = None
|
)
|
||||||
if snapshot.status != Snapshot.StatusChoices.SEALED:
|
)
|
||||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
snapshot_id = str(snapshot.id)
|
||||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
if snapshot_id:
|
||||||
return str(snapshot.id)
|
snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
|
||||||
|
if snapshot is not None:
|
||||||
|
await sync_to_async(snapshot.write_index_jsonl, thread_sensitive=True)()
|
||||||
|
await sync_to_async(snapshot.write_json_details, thread_sensitive=True)()
|
||||||
|
await sync_to_async(snapshot.write_html_details, thread_sensitive=True)()
|
||||||
|
|
||||||
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
|
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
|
||||||
return crawl.url_passes_filters(url, snapshot=parent_snapshot)
|
return crawl.url_passes_filters(url, snapshot=parent_snapshot)
|
||||||
|
|
||||||
def _seal_snapshot(self, snapshot_id: str) -> str | None:
|
|
||||||
from archivebox.core.models import Snapshot
|
|
||||||
|
|
||||||
snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
|
|
||||||
if snapshot is None:
|
|
||||||
return None
|
|
||||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
|
||||||
snapshot.retry_at = None
|
|
||||||
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
|
|
||||||
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
|
||||||
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
|
|
||||||
self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
|
|
||||||
return str(snapshot.id)
|
|
||||||
|
|
||||||
def _crawl_limit_stop_reason(self, crawl) -> str:
|
def _crawl_limit_stop_reason(self, crawl) -> str:
|
||||||
config = dict(crawl.config or {})
|
config = dict(crawl.config or {})
|
||||||
config["CRAWL_DIR"] = str(crawl.output_dir)
|
config["CRAWL_DIR"] = str(crawl.output_dir)
|
||||||
return CrawlLimitState.from_config(config).get_stop_reason()
|
return CrawlLimitState.from_config(config).get_stop_reason()
|
||||||
|
|
||||||
def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
|
|
||||||
from archivebox.core.models import Snapshot
|
|
||||||
|
|
||||||
return (
|
|
||||||
Snapshot.objects.filter(
|
|
||||||
crawl_id=crawl_id,
|
|
||||||
status=Snapshot.StatusChoices.QUEUED,
|
|
||||||
)
|
|
||||||
.exclude(id=exclude_snapshot_id)
|
|
||||||
.update(
|
|
||||||
status=Snapshot.StatusChoices.SEALED,
|
|
||||||
retry_at=None,
|
|
||||||
modified_at=timezone.now(),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
|
|
||||||
from archivebox.core.models import Snapshot
|
|
||||||
|
|
||||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
|
||||||
if snapshot is not None:
|
|
||||||
snapshot.ensure_crawl_symlink()
|
|
||||||
|
|
||||||
def _write_snapshot_details(self, snapshot_id: str) -> None:
|
|
||||||
from archivebox.core.models import Snapshot
|
|
||||||
|
|
||||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
|
||||||
if snapshot is None:
|
|
||||||
return
|
|
||||||
snapshot.write_index_jsonl()
|
|
||||||
snapshot.write_json_details()
|
|
||||||
snapshot.write_html_details()
|
|
||||||
|
|||||||
@@ -3,20 +3,20 @@ from __future__ import annotations
|
|||||||
from abx_dl.events import TagEvent
|
from abx_dl.events import TagEvent
|
||||||
from abx_dl.services.base import BaseService
|
from abx_dl.services.base import BaseService
|
||||||
|
|
||||||
from .db import run_db_op
|
|
||||||
|
|
||||||
|
|
||||||
class TagService(BaseService):
|
class TagService(BaseService):
|
||||||
LISTENS_TO = [TagEvent]
|
LISTENS_TO = [TagEvent]
|
||||||
EMITS = []
|
EMITS = []
|
||||||
|
|
||||||
async def on_TagEvent__Outer(self, event: TagEvent) -> None:
|
def __init__(self, bus):
|
||||||
await run_db_op(self._project, event)
|
super().__init__(bus)
|
||||||
|
self.bus.on(TagEvent, self.on_TagEvent__save_to_db)
|
||||||
|
|
||||||
def _project(self, event: TagEvent) -> None:
|
async def on_TagEvent__save_to_db(self, event: TagEvent) -> None:
|
||||||
from archivebox.core.models import Snapshot, Tag
|
from archivebox.core.models import Snapshot, SnapshotTag, Tag
|
||||||
|
|
||||||
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first()
|
snapshot = await Snapshot.objects.filter(id=event.snapshot_id).afirst()
|
||||||
if snapshot is None:
|
if snapshot is None:
|
||||||
return
|
return
|
||||||
Tag.from_json({"name": event.name}, overrides={"snapshot": snapshot})
|
tag, _ = await Tag.objects.aget_or_create(name=event.name)
|
||||||
|
await SnapshotTag.objects.aget_or_create(snapshot=snapshot, tag=tag)
|
||||||
|
|||||||
@@ -312,7 +312,7 @@ CREATE TABLE IF NOT EXISTS machine_dependency (
|
|||||||
modified_at DATETIME,
|
modified_at DATETIME,
|
||||||
bin_name VARCHAR(63) NOT NULL UNIQUE,
|
bin_name VARCHAR(63) NOT NULL UNIQUE,
|
||||||
bin_providers VARCHAR(127) NOT NULL DEFAULT '*',
|
bin_providers VARCHAR(127) NOT NULL DEFAULT '*',
|
||||||
custom_cmds TEXT DEFAULT '{}',
|
overrides TEXT DEFAULT '{}',
|
||||||
config TEXT DEFAULT '{}'
|
config TEXT DEFAULT '{}'
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -973,7 +973,6 @@ def seed_0_8_data(db_path: Path) -> dict[str, list[dict]]:
|
|||||||
("machine", "0003_alter_installedbinary_options_and_more"),
|
("machine", "0003_alter_installedbinary_options_and_more"),
|
||||||
("machine", "0004_alter_installedbinary_abspath_and_more"),
|
("machine", "0004_alter_installedbinary_abspath_and_more"),
|
||||||
# Then the new migrations after squashing
|
# Then the new migrations after squashing
|
||||||
("machine", "0002_rename_custom_cmds_to_overrides"),
|
|
||||||
("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"),
|
("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"),
|
||||||
("machine", "0004_drop_dependency_table"),
|
("machine", "0004_drop_dependency_table"),
|
||||||
# Crawls must come before core.0024 because 0024_b depends on it
|
# Crawls must come before core.0024 because 0024_b depends on it
|
||||||
|
|||||||
@@ -144,13 +144,13 @@ def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
|
|||||||
pwd=str(snapshot.output_dir / "wget"),
|
pwd=str(snapshot.output_dir / "wget"),
|
||||||
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
||||||
env={
|
env={
|
||||||
"SOURCE_URL": "https://example.com",
|
|
||||||
"SAFE_FLAG": "1",
|
"SAFE_FLAG": "1",
|
||||||
"API_KEY": "super-secret-key",
|
"API_KEY": "super-secret-key",
|
||||||
"ACCESS_TOKEN": "super-secret-token",
|
"ACCESS_TOKEN": "super-secret-token",
|
||||||
"SHARED_SECRET": "super-secret-secret",
|
"SHARED_SECRET": "super-secret-secret",
|
||||||
},
|
},
|
||||||
status=Process.StatusChoices.EXITED,
|
status=Process.StatusChoices.EXITED,
|
||||||
|
url="https://example.com",
|
||||||
)
|
)
|
||||||
result = ArchiveResult.objects.create(
|
result = ArchiveResult.objects.create(
|
||||||
snapshot=snapshot,
|
snapshot=snapshot,
|
||||||
@@ -164,7 +164,7 @@ def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
|
|||||||
cmd_html = str(admin.cmd_str(result))
|
cmd_html = str(admin.cmd_str(result))
|
||||||
|
|
||||||
assert "SAFE_FLAG=1" in cmd_html
|
assert "SAFE_FLAG=1" in cmd_html
|
||||||
assert "SOURCE_URL=https://example.com" in cmd_html
|
assert "https://example.com" in cmd_html
|
||||||
assert "API_KEY" not in cmd_html
|
assert "API_KEY" not in cmd_html
|
||||||
assert "ACCESS_TOKEN" not in cmd_html
|
assert "ACCESS_TOKEN" not in cmd_html
|
||||||
assert "SHARED_SECRET" not in cmd_html
|
assert "SHARED_SECRET" not in cmd_html
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ Tests cover:
|
|||||||
- Snapshot progress statistics
|
- Snapshot progress statistics
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
import pytest
|
import pytest
|
||||||
import uuid
|
import uuid
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -822,7 +823,6 @@ class TestAdminSnapshotListView:
|
|||||||
pwd="/tmp/archivebox",
|
pwd="/tmp/archivebox",
|
||||||
cmd=["python", "/tmp/job.py", "--url=https://example.com"],
|
cmd=["python", "/tmp/job.py", "--url=https://example.com"],
|
||||||
env={
|
env={
|
||||||
"SNAPSHOT_ID": "abc123",
|
|
||||||
"ENABLED": True,
|
"ENABLED": True,
|
||||||
"API_KEY": "super-secret-key",
|
"API_KEY": "super-secret-key",
|
||||||
"ACCESS_TOKEN": "super-secret-token",
|
"ACCESS_TOKEN": "super-secret-token",
|
||||||
@@ -843,7 +843,6 @@ class TestAdminSnapshotListView:
|
|||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
assert b"Kill" in response.content
|
assert b"Kill" in response.content
|
||||||
assert b"python /tmp/job.py --url=https://example.com" in response.content
|
assert b"python /tmp/job.py --url=https://example.com" in response.content
|
||||||
assert b"SNAPSHOT_ID=abc123" in response.content
|
|
||||||
assert b"ENABLED=True" in response.content
|
assert b"ENABLED=True" in response.content
|
||||||
assert b"52s" in response.content
|
assert b"52s" in response.content
|
||||||
assert b"API_KEY=" not in response.content
|
assert b"API_KEY=" not in response.content
|
||||||
@@ -1065,7 +1064,7 @@ class TestAdminSnapshotListView:
|
|||||||
pid=54321,
|
pid=54321,
|
||||||
exit_code=0,
|
exit_code=0,
|
||||||
cmd=["/plugins/title/on_Snapshot__54_title.js", "--url=https://example.com"],
|
cmd=["/plugins/title/on_Snapshot__54_title.js", "--url=https://example.com"],
|
||||||
env={"SNAPSHOT_ID": str(snapshot.id)},
|
env={"EXTRA_CONTEXT": json.dumps({"snapshot_id": str(snapshot.id)})},
|
||||||
started_at=timezone.now(),
|
started_at=timezone.now(),
|
||||||
ended_at=timezone.now(),
|
ended_at=timezone.now(),
|
||||||
)
|
)
|
||||||
@@ -1252,11 +1251,8 @@ class TestLiveProgressView:
|
|||||||
process_type=Process.TypeChoices.HOOK,
|
process_type=Process.TypeChoices.HOOK,
|
||||||
status=Process.StatusChoices.RUNNING,
|
status=Process.StatusChoices.RUNNING,
|
||||||
pid=pid,
|
pid=pid,
|
||||||
|
pwd=str(snapshot.output_dir / "chrome"),
|
||||||
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js", "--url=https://example.com"],
|
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js", "--url=https://example.com"],
|
||||||
env={
|
|
||||||
"CRAWL_ID": str(snapshot.crawl_id),
|
|
||||||
"SNAPSHOT_ID": str(snapshot.id),
|
|
||||||
},
|
|
||||||
started_at=timezone.now(),
|
started_at=timezone.now(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1290,11 +1286,8 @@ class TestLiveProgressView:
|
|||||||
process_type=Process.TypeChoices.HOOK,
|
process_type=Process.TypeChoices.HOOK,
|
||||||
status=Process.StatusChoices.RUNNING,
|
status=Process.StatusChoices.RUNNING,
|
||||||
pid=pid,
|
pid=pid,
|
||||||
|
pwd=str(snapshot.output_dir / "title"),
|
||||||
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
|
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
|
||||||
env={
|
|
||||||
"CRAWL_ID": str(snapshot.crawl_id),
|
|
||||||
"SNAPSHOT_ID": str(snapshot.id),
|
|
||||||
},
|
|
||||||
started_at=timezone.now(),
|
started_at=timezone.now(),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1327,11 +1320,8 @@ class TestLiveProgressView:
|
|||||||
process_type=Process.TypeChoices.HOOK,
|
process_type=Process.TypeChoices.HOOK,
|
||||||
status=Process.StatusChoices.RUNNING,
|
status=Process.StatusChoices.RUNNING,
|
||||||
pid=os.getpid(),
|
pid=os.getpid(),
|
||||||
|
pwd=str(snapshot.output_dir / "chrome"),
|
||||||
cmd=["/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"],
|
cmd=["/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"],
|
||||||
env={
|
|
||||||
"CRAWL_ID": str(snapshot.crawl_id),
|
|
||||||
"SNAPSHOT_ID": str(snapshot.id),
|
|
||||||
},
|
|
||||||
started_at=timezone.now(),
|
started_at=timezone.now(),
|
||||||
)
|
)
|
||||||
ArchiveResult.objects.create(
|
ArchiveResult.objects.create(
|
||||||
@@ -1369,11 +1359,8 @@ class TestLiveProgressView:
|
|||||||
status=Process.StatusChoices.EXITED,
|
status=Process.StatusChoices.EXITED,
|
||||||
exit_code=0,
|
exit_code=0,
|
||||||
pid=99999,
|
pid=99999,
|
||||||
|
pwd=str(snapshot.output_dir / "title"),
|
||||||
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
|
cmd=["/plugins/title/on_Snapshot__10_title.py", "--url=https://example.com"],
|
||||||
env={
|
|
||||||
"CRAWL_ID": str(snapshot.crawl_id),
|
|
||||||
"SNAPSHOT_ID": str(snapshot.id),
|
|
||||||
},
|
|
||||||
started_at=timezone.now(),
|
started_at=timezone.now(),
|
||||||
ended_at=timezone.now(),
|
ended_at=timezone.now(),
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -5,12 +5,12 @@ import pytest
|
|||||||
from django.db import connection
|
from django.db import connection
|
||||||
|
|
||||||
|
|
||||||
from abx_dl.events import BinaryRequestEvent, ProcessCompletedEvent, ProcessStartedEvent
|
from abx_dl.events import ArchiveResultEvent, BinaryRequestEvent, ProcessEvent, ProcessStartedEvent
|
||||||
from abx_dl.orchestrator import create_bus
|
from abx_dl.orchestrator import create_bus
|
||||||
from abx_dl.output_files import OutputFile
|
from abx_dl.output_files import OutputFile
|
||||||
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.django_db
|
pytestmark = pytest.mark.django_db(transaction=True)
|
||||||
|
|
||||||
|
|
||||||
def _cleanup_machine_process_rows() -> None:
|
def _cleanup_machine_process_rows() -> None:
|
||||||
@@ -75,8 +75,8 @@ def _create_iface(machine):
|
|||||||
|
|
||||||
def test_process_completed_projects_inline_archiveresult():
|
def test_process_completed_projects_inline_archiveresult():
|
||||||
from archivebox.core.models import ArchiveResult
|
from archivebox.core.models import ArchiveResult
|
||||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
from archivebox.services.archive_result_service import ArchiveResultService
|
||||||
from archivebox.services.process_service import ProcessService
|
import asyncio
|
||||||
|
|
||||||
snapshot = _create_snapshot()
|
snapshot = _create_snapshot()
|
||||||
plugin_dir = Path(snapshot.output_dir) / "wget"
|
plugin_dir = Path(snapshot.output_dir) / "wget"
|
||||||
@@ -84,37 +84,23 @@ def test_process_completed_projects_inline_archiveresult():
|
|||||||
(plugin_dir / "index.html").write_text("<html>ok</html>")
|
(plugin_dir / "index.html").write_text("<html>ok</html>")
|
||||||
|
|
||||||
bus = create_bus(name="test_inline_archiveresult")
|
bus = create_bus(name="test_inline_archiveresult")
|
||||||
process_service = ProcessService(bus)
|
service = ArchiveResultService(bus)
|
||||||
service = ArchiveResultService(bus, process_service=process_service)
|
|
||||||
|
|
||||||
event = ProcessCompletedEvent(
|
event = ArchiveResultEvent(
|
||||||
plugin_name="wget",
|
|
||||||
hook_name="on_Snapshot__06_wget.finite.bg",
|
|
||||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"wget/index.html"}\n' % snapshot.id,
|
|
||||||
stderr="",
|
|
||||||
exit_code=0,
|
|
||||||
output_dir=str(plugin_dir),
|
|
||||||
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
|
|
||||||
process_id="proc-inline",
|
|
||||||
snapshot_id=str(snapshot.id),
|
snapshot_id=str(snapshot.id),
|
||||||
|
plugin="wget",
|
||||||
|
hook_name="on_Snapshot__06_wget.finite.bg",
|
||||||
|
status="succeeded",
|
||||||
|
output_str="wget/index.html",
|
||||||
|
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
|
||||||
start_ts="2026-03-22T12:00:00+00:00",
|
start_ts="2026-03-22T12:00:00+00:00",
|
||||||
end_ts="2026-03-22T12:00:01+00:00",
|
end_ts="2026-03-22T12:00:01+00:00",
|
||||||
)
|
)
|
||||||
|
|
||||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
async def emit_event() -> None:
|
||||||
service._project_from_process_completed(
|
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||||
event,
|
|
||||||
{
|
asyncio.run(emit_event())
|
||||||
"snapshot_id": str(snapshot.id),
|
|
||||||
"plugin": "wget",
|
|
||||||
"hook_name": "on_Snapshot__06_wget.finite.bg",
|
|
||||||
"status": "succeeded",
|
|
||||||
"output_str": "wget/index.html",
|
|
||||||
},
|
|
||||||
output_files,
|
|
||||||
output_size,
|
|
||||||
output_mimetypes,
|
|
||||||
)
|
|
||||||
|
|
||||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg")
|
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg")
|
||||||
assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
|
assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
|
||||||
@@ -127,45 +113,31 @@ def test_process_completed_projects_inline_archiveresult():
|
|||||||
|
|
||||||
def test_process_completed_projects_synthetic_failed_archiveresult():
|
def test_process_completed_projects_synthetic_failed_archiveresult():
|
||||||
from archivebox.core.models import ArchiveResult
|
from archivebox.core.models import ArchiveResult
|
||||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
from archivebox.services.archive_result_service import ArchiveResultService
|
||||||
from archivebox.services.process_service import ProcessService
|
import asyncio
|
||||||
|
|
||||||
snapshot = _create_snapshot()
|
snapshot = _create_snapshot()
|
||||||
plugin_dir = Path(snapshot.output_dir) / "chrome"
|
plugin_dir = Path(snapshot.output_dir) / "chrome"
|
||||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
bus = create_bus(name="test_synthetic_archiveresult")
|
bus = create_bus(name="test_synthetic_archiveresult")
|
||||||
process_service = ProcessService(bus)
|
service = ArchiveResultService(bus)
|
||||||
service = ArchiveResultService(bus, process_service=process_service)
|
|
||||||
|
|
||||||
event = ProcessCompletedEvent(
|
event = ArchiveResultEvent(
|
||||||
plugin_name="chrome",
|
|
||||||
hook_name="on_Snapshot__11_chrome_wait",
|
|
||||||
stdout="",
|
|
||||||
stderr="Hook timed out after 60 seconds",
|
|
||||||
exit_code=-1,
|
|
||||||
output_dir=str(plugin_dir),
|
|
||||||
output_files=[],
|
|
||||||
process_id="proc-failed",
|
|
||||||
snapshot_id=str(snapshot.id),
|
snapshot_id=str(snapshot.id),
|
||||||
|
plugin="chrome",
|
||||||
|
hook_name="on_Snapshot__11_chrome_wait",
|
||||||
|
status="failed",
|
||||||
|
output_str="Hook timed out after 60 seconds",
|
||||||
|
error="Hook timed out after 60 seconds",
|
||||||
start_ts="2026-03-22T12:00:00+00:00",
|
start_ts="2026-03-22T12:00:00+00:00",
|
||||||
end_ts="2026-03-22T12:01:00+00:00",
|
end_ts="2026-03-22T12:01:00+00:00",
|
||||||
)
|
)
|
||||||
|
|
||||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
async def emit_event() -> None:
|
||||||
service._project_from_process_completed(
|
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||||
event,
|
|
||||||
{
|
asyncio.run(emit_event())
|
||||||
"plugin": "chrome",
|
|
||||||
"hook_name": "on_Snapshot__11_chrome_wait",
|
|
||||||
"status": "failed",
|
|
||||||
"output_str": "Hook timed out after 60 seconds",
|
|
||||||
"error": "Hook timed out after 60 seconds",
|
|
||||||
},
|
|
||||||
output_files,
|
|
||||||
output_size,
|
|
||||||
output_mimetypes,
|
|
||||||
)
|
|
||||||
|
|
||||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait")
|
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait")
|
||||||
assert result.status == ArchiveResult.StatusChoices.FAILED
|
assert result.status == ArchiveResult.StatusChoices.FAILED
|
||||||
@@ -176,45 +148,30 @@ def test_process_completed_projects_synthetic_failed_archiveresult():
|
|||||||
|
|
||||||
def test_process_completed_projects_noresults_archiveresult():
|
def test_process_completed_projects_noresults_archiveresult():
|
||||||
from archivebox.core.models import ArchiveResult
|
from archivebox.core.models import ArchiveResult
|
||||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
from archivebox.services.archive_result_service import ArchiveResultService
|
||||||
from archivebox.services.process_service import ProcessService
|
import asyncio
|
||||||
|
|
||||||
snapshot = _create_snapshot()
|
snapshot = _create_snapshot()
|
||||||
plugin_dir = Path(snapshot.output_dir) / "title"
|
plugin_dir = Path(snapshot.output_dir) / "title"
|
||||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
bus = create_bus(name="test_noresults_archiveresult")
|
bus = create_bus(name="test_noresults_archiveresult")
|
||||||
process_service = ProcessService(bus)
|
service = ArchiveResultService(bus)
|
||||||
service = ArchiveResultService(bus, process_service=process_service)
|
|
||||||
|
|
||||||
event = ProcessCompletedEvent(
|
event = ArchiveResultEvent(
|
||||||
plugin_name="title",
|
|
||||||
hook_name="on_Snapshot__54_title.js",
|
|
||||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
|
|
||||||
stderr="",
|
|
||||||
exit_code=0,
|
|
||||||
output_dir=str(plugin_dir),
|
|
||||||
output_files=[],
|
|
||||||
process_id="proc-noresults",
|
|
||||||
snapshot_id=str(snapshot.id),
|
snapshot_id=str(snapshot.id),
|
||||||
|
plugin="title",
|
||||||
|
hook_name="on_Snapshot__54_title.js",
|
||||||
|
status="noresults",
|
||||||
|
output_str="No title found",
|
||||||
start_ts="2026-03-22T12:00:00+00:00",
|
start_ts="2026-03-22T12:00:00+00:00",
|
||||||
end_ts="2026-03-22T12:00:01+00:00",
|
end_ts="2026-03-22T12:00:01+00:00",
|
||||||
)
|
)
|
||||||
|
|
||||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
async def emit_event() -> None:
|
||||||
service._project_from_process_completed(
|
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||||
event,
|
|
||||||
{
|
asyncio.run(emit_event())
|
||||||
"snapshot_id": str(snapshot.id),
|
|
||||||
"plugin": "title",
|
|
||||||
"hook_name": "on_Snapshot__54_title.js",
|
|
||||||
"status": "noresults",
|
|
||||||
"output_str": "No title found",
|
|
||||||
},
|
|
||||||
output_files,
|
|
||||||
output_size,
|
|
||||||
output_mimetypes,
|
|
||||||
)
|
|
||||||
|
|
||||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
|
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
|
||||||
assert result.status == ArchiveResult.StatusChoices.NORESULTS
|
assert result.status == ArchiveResult.StatusChoices.NORESULTS
|
||||||
@@ -258,45 +215,30 @@ def test_retry_failed_archiveresults_requeues_snapshot_in_queued_state():
|
|||||||
|
|
||||||
|
|
||||||
def test_process_completed_projects_snapshot_title_from_output_str():
|
def test_process_completed_projects_snapshot_title_from_output_str():
|
||||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
from archivebox.services.archive_result_service import ArchiveResultService
|
||||||
from archivebox.services.process_service import ProcessService
|
import asyncio
|
||||||
|
|
||||||
snapshot = _create_snapshot()
|
snapshot = _create_snapshot()
|
||||||
plugin_dir = Path(snapshot.output_dir) / "title"
|
plugin_dir = Path(snapshot.output_dir) / "title"
|
||||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
bus = create_bus(name="test_snapshot_title_output_str")
|
bus = create_bus(name="test_snapshot_title_output_str")
|
||||||
process_service = ProcessService(bus)
|
service = ArchiveResultService(bus)
|
||||||
service = ArchiveResultService(bus, process_service=process_service)
|
|
||||||
|
|
||||||
event = ProcessCompletedEvent(
|
event = ArchiveResultEvent(
|
||||||
plugin_name="title",
|
|
||||||
hook_name="on_Snapshot__54_title.js",
|
|
||||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"Example Domain"}\n' % snapshot.id,
|
|
||||||
stderr="",
|
|
||||||
exit_code=0,
|
|
||||||
output_dir=str(plugin_dir),
|
|
||||||
output_files=[],
|
|
||||||
process_id="proc-title-output-str",
|
|
||||||
snapshot_id=str(snapshot.id),
|
snapshot_id=str(snapshot.id),
|
||||||
|
plugin="title",
|
||||||
|
hook_name="on_Snapshot__54_title.js",
|
||||||
|
status="succeeded",
|
||||||
|
output_str="Example Domain",
|
||||||
start_ts="2026-03-22T12:00:00+00:00",
|
start_ts="2026-03-22T12:00:00+00:00",
|
||||||
end_ts="2026-03-22T12:00:01+00:00",
|
end_ts="2026-03-22T12:00:01+00:00",
|
||||||
)
|
)
|
||||||
|
|
||||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
async def emit_event() -> None:
|
||||||
service._project_from_process_completed(
|
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||||
event,
|
|
||||||
{
|
asyncio.run(emit_event())
|
||||||
"snapshot_id": str(snapshot.id),
|
|
||||||
"plugin": "title",
|
|
||||||
"hook_name": "on_Snapshot__54_title.js",
|
|
||||||
"status": "succeeded",
|
|
||||||
"output_str": "Example Domain",
|
|
||||||
},
|
|
||||||
output_files,
|
|
||||||
output_size,
|
|
||||||
output_mimetypes,
|
|
||||||
)
|
|
||||||
|
|
||||||
snapshot.refresh_from_db()
|
snapshot.refresh_from_db()
|
||||||
assert snapshot.title == "Example Domain"
|
assert snapshot.title == "Example Domain"
|
||||||
@@ -304,8 +246,8 @@ def test_process_completed_projects_snapshot_title_from_output_str():
|
|||||||
|
|
||||||
|
|
||||||
def test_process_completed_projects_snapshot_title_from_title_file():
|
def test_process_completed_projects_snapshot_title_from_title_file():
|
||||||
from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
|
from archivebox.services.archive_result_service import ArchiveResultService
|
||||||
from archivebox.services.process_service import ProcessService
|
import asyncio
|
||||||
|
|
||||||
snapshot = _create_snapshot()
|
snapshot = _create_snapshot()
|
||||||
plugin_dir = Path(snapshot.output_dir) / "title"
|
plugin_dir = Path(snapshot.output_dir) / "title"
|
||||||
@@ -313,37 +255,23 @@ def test_process_completed_projects_snapshot_title_from_title_file():
|
|||||||
(plugin_dir / "title.txt").write_text("Example Domain")
|
(plugin_dir / "title.txt").write_text("Example Domain")
|
||||||
|
|
||||||
bus = create_bus(name="test_snapshot_title_file")
|
bus = create_bus(name="test_snapshot_title_file")
|
||||||
process_service = ProcessService(bus)
|
service = ArchiveResultService(bus)
|
||||||
service = ArchiveResultService(bus, process_service=process_service)
|
|
||||||
|
|
||||||
event = ProcessCompletedEvent(
|
event = ArchiveResultEvent(
|
||||||
plugin_name="title",
|
|
||||||
hook_name="on_Snapshot__54_title.js",
|
|
||||||
stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
|
|
||||||
stderr="",
|
|
||||||
exit_code=0,
|
|
||||||
output_dir=str(plugin_dir),
|
|
||||||
output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)],
|
|
||||||
process_id="proc-title-file",
|
|
||||||
snapshot_id=str(snapshot.id),
|
snapshot_id=str(snapshot.id),
|
||||||
|
plugin="title",
|
||||||
|
hook_name="on_Snapshot__54_title.js",
|
||||||
|
status="noresults",
|
||||||
|
output_str="No title found",
|
||||||
|
output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)],
|
||||||
start_ts="2026-03-22T12:00:00+00:00",
|
start_ts="2026-03-22T12:00:00+00:00",
|
||||||
end_ts="2026-03-22T12:00:01+00:00",
|
end_ts="2026-03-22T12:00:01+00:00",
|
||||||
)
|
)
|
||||||
|
|
||||||
output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
|
async def emit_event() -> None:
|
||||||
service._project_from_process_completed(
|
await service.on_ArchiveResultEvent__save_to_db(event)
|
||||||
event,
|
|
||||||
{
|
asyncio.run(emit_event())
|
||||||
"snapshot_id": str(snapshot.id),
|
|
||||||
"plugin": "title",
|
|
||||||
"hook_name": "on_Snapshot__54_title.js",
|
|
||||||
"status": "noresults",
|
|
||||||
"output_str": "No title found",
|
|
||||||
},
|
|
||||||
output_files,
|
|
||||||
output_size,
|
|
||||||
output_mimetypes,
|
|
||||||
)
|
|
||||||
|
|
||||||
snapshot.refresh_from_db()
|
snapshot.refresh_from_db()
|
||||||
assert snapshot.title == "Example Domain"
|
assert snapshot.title == "Example Domain"
|
||||||
@@ -410,9 +338,12 @@ def test_collect_output_metadata_detects_warc_gz_mimetype(tmp_path):
|
|||||||
assert output_mimetypes == "application/warc"
|
assert output_mimetypes == "application/warc"
|
||||||
|
|
||||||
|
|
||||||
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch):
|
@pytest.mark.django_db(transaction=True)
|
||||||
|
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch, tmp_path):
|
||||||
from archivebox.machine.models import Binary, NetworkInterface
|
from archivebox.machine.models import Binary, NetworkInterface
|
||||||
from archivebox.services.process_service import ProcessService
|
from archivebox.machine.models import Process as MachineProcess
|
||||||
|
from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService
|
||||||
|
from abx_dl.services.process_service import ProcessService as DlProcessService
|
||||||
|
|
||||||
machine = _create_machine()
|
machine = _create_machine()
|
||||||
iface = _create_iface(machine)
|
iface = _create_iface(machine)
|
||||||
@@ -428,35 +359,60 @@ def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(
|
|||||||
status=Binary.StatusChoices.INSTALLED,
|
status=Binary.StatusChoices.INSTALLED,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
hook_path = tmp_path / "on_Snapshot__57_mercury.py"
|
||||||
|
hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
|
||||||
|
hook_path.chmod(0o755)
|
||||||
|
output_dir = tmp_path / "mercury"
|
||||||
|
output_dir.mkdir()
|
||||||
|
|
||||||
bus = create_bus(name="test_process_started_binary_hydration")
|
bus = create_bus(name="test_process_started_binary_hydration")
|
||||||
service = ProcessService(bus)
|
DlProcessService(bus, emit_jsonl=False, stderr_is_tty=False)
|
||||||
event = ProcessStartedEvent(
|
ArchiveBoxProcessService(bus)
|
||||||
plugin_name="mercury",
|
|
||||||
hook_name="on_Snapshot__57_mercury.py",
|
async def run_test() -> None:
|
||||||
hook_path="/plugins/mercury/on_Snapshot__57_mercury.py",
|
await bus.emit(
|
||||||
hook_args=["--url=https://example.com"],
|
ProcessEvent(
|
||||||
output_dir="/tmp/mercury",
|
plugin_name="mercury",
|
||||||
env={
|
hook_name="on_Snapshot__57_mercury.py",
|
||||||
"MERCURY_BINARY": binary.abspath,
|
hook_path=str(hook_path),
|
||||||
"NODE_BINARY": "/tmp/node",
|
hook_args=["--url=https://example.com"],
|
||||||
},
|
is_background=False,
|
||||||
timeout=60,
|
output_dir=str(output_dir),
|
||||||
pid=4321,
|
env={
|
||||||
process_id="proc-mercury",
|
"MERCURY_BINARY": binary.abspath,
|
||||||
snapshot_id="",
|
"NODE_BINARY": "/tmp/node",
|
||||||
start_ts="2026-03-22T12:00:00+00:00",
|
},
|
||||||
|
timeout=60,
|
||||||
|
url="https://example.com",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
started = await bus.find(
|
||||||
|
ProcessStartedEvent,
|
||||||
|
past=True,
|
||||||
|
future=False,
|
||||||
|
hook_name="on_Snapshot__57_mercury.py",
|
||||||
|
output_dir=str(output_dir),
|
||||||
|
)
|
||||||
|
assert started is not None
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
asyncio.run(run_test())
|
||||||
|
|
||||||
|
process = MachineProcess.objects.get(
|
||||||
|
pwd=str(output_dir),
|
||||||
|
cmd=[str(hook_path), "--url=https://example.com"],
|
||||||
)
|
)
|
||||||
|
|
||||||
service._project_started(event)
|
|
||||||
|
|
||||||
process = service._get_or_create_process(event)
|
|
||||||
assert process.binary_id == binary.id
|
assert process.binary_id == binary.id
|
||||||
assert process.iface_id == iface.id
|
assert process.iface_id == iface.id
|
||||||
|
|
||||||
|
|
||||||
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch):
|
@pytest.mark.django_db(transaction=True)
|
||||||
|
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch, tmp_path):
|
||||||
from archivebox.machine.models import Binary, NetworkInterface
|
from archivebox.machine.models import Binary, NetworkInterface
|
||||||
from archivebox.services.process_service import ProcessService
|
from archivebox.machine.models import Process as MachineProcess
|
||||||
|
from archivebox.services.process_service import ProcessService as ArchiveBoxProcessService
|
||||||
|
from abx_dl.services.process_service import ProcessService as DlProcessService
|
||||||
|
|
||||||
machine = _create_machine()
|
machine = _create_machine()
|
||||||
iface = _create_iface(machine)
|
iface = _create_iface(machine)
|
||||||
@@ -472,27 +428,47 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
|
|||||||
status=Binary.StatusChoices.INSTALLED,
|
status=Binary.StatusChoices.INSTALLED,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
hook_path = tmp_path / "on_Snapshot__75_parse_dom_outlinks.js"
|
||||||
|
hook_path.write_text("#!/bin/bash\nexit 0\n", encoding="utf-8")
|
||||||
|
hook_path.chmod(0o755)
|
||||||
|
output_dir = tmp_path / "parse-dom-outlinks"
|
||||||
|
output_dir.mkdir()
|
||||||
|
|
||||||
bus = create_bus(name="test_process_started_node_fallback")
|
bus = create_bus(name="test_process_started_node_fallback")
|
||||||
service = ProcessService(bus)
|
DlProcessService(bus, emit_jsonl=False, stderr_is_tty=False)
|
||||||
event = ProcessStartedEvent(
|
ArchiveBoxProcessService(bus)
|
||||||
plugin_name="parse_dom_outlinks",
|
|
||||||
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
|
async def run_test() -> None:
|
||||||
hook_path="/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js",
|
await bus.emit(
|
||||||
hook_args=["--url=https://example.com"],
|
ProcessEvent(
|
||||||
output_dir="/tmp/parse-dom-outlinks",
|
plugin_name="parse_dom_outlinks",
|
||||||
env={
|
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
|
||||||
"NODE_BINARY": node.abspath,
|
hook_path=str(hook_path),
|
||||||
},
|
hook_args=["--url=https://example.com"],
|
||||||
timeout=60,
|
is_background=False,
|
||||||
pid=9876,
|
output_dir=str(output_dir),
|
||||||
process_id="proc-parse-dom-outlinks",
|
env={"NODE_BINARY": node.abspath},
|
||||||
snapshot_id="",
|
timeout=60,
|
||||||
start_ts="2026-03-22T12:00:00+00:00",
|
url="https://example.com",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
started = await bus.find(
|
||||||
|
ProcessStartedEvent,
|
||||||
|
past=True,
|
||||||
|
future=False,
|
||||||
|
hook_name="on_Snapshot__75_parse_dom_outlinks.js",
|
||||||
|
output_dir=str(output_dir),
|
||||||
|
)
|
||||||
|
assert started is not None
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
asyncio.run(run_test())
|
||||||
|
|
||||||
|
process = MachineProcess.objects.get(
|
||||||
|
pwd=str(output_dir),
|
||||||
|
cmd=[str(hook_path), "--url=https://example.com"],
|
||||||
)
|
)
|
||||||
|
|
||||||
service._project_started(event)
|
|
||||||
|
|
||||||
process = service._get_or_create_process(event)
|
|
||||||
assert process.binary_id == node.id
|
assert process.binary_id == node.id
|
||||||
assert process.iface_id == iface.id
|
assert process.iface_id == iface.id
|
||||||
|
|
||||||
@@ -500,6 +476,7 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
|
|||||||
def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
|
def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
|
||||||
from archivebox.machine.models import Binary, Machine
|
from archivebox.machine.models import Binary, Machine
|
||||||
from archivebox.services.binary_service import BinaryService as ArchiveBoxBinaryService
|
from archivebox.services.binary_service import BinaryService as ArchiveBoxBinaryService
|
||||||
|
import asyncio
|
||||||
|
|
||||||
machine = _create_machine()
|
machine = _create_machine()
|
||||||
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
|
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
|
||||||
@@ -522,7 +499,7 @@ def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
|
|||||||
binproviders="provider",
|
binproviders="provider",
|
||||||
)
|
)
|
||||||
|
|
||||||
service._project_binary(event)
|
asyncio.run(service.on_BinaryRequestEvent(event))
|
||||||
|
|
||||||
binary.refresh_from_db()
|
binary.refresh_from_db()
|
||||||
assert Binary.objects.filter(machine=machine, name="wget").count() == 1
|
assert Binary.objects.filter(machine=machine, name="wget").count() == 1
|
||||||
|
|||||||
@@ -378,11 +378,8 @@ class TestRecoverOrphanedCrawls:
|
|||||||
machine=machine,
|
machine=machine,
|
||||||
process_type=Process.TypeChoices.HOOK,
|
process_type=Process.TypeChoices.HOOK,
|
||||||
status=Process.StatusChoices.RUNNING,
|
status=Process.StatusChoices.RUNNING,
|
||||||
|
pwd=str(snapshot.output_dir / "chrome"),
|
||||||
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js"],
|
cmd=["/plugins/chrome/on_CrawlSetup__91_chrome_wait.js"],
|
||||||
env={
|
|
||||||
"CRAWL_ID": str(crawl.id),
|
|
||||||
"SNAPSHOT_ID": str(snapshot.id),
|
|
||||||
},
|
|
||||||
started_at=timezone.now(),
|
started_at=timezone.now(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -464,23 +464,24 @@ class TestDependencyRecordOutput(unittest.TestCase):
|
|||||||
self.assertEqual(data["name"], "wget")
|
self.assertEqual(data["name"], "wget")
|
||||||
self.assertTrue(data["abspath"].startswith("/"))
|
self.assertTrue(data["abspath"].startswith("/"))
|
||||||
|
|
||||||
def test_dependency_record_outputs_machine_config(self):
|
def test_dependency_record_outputs_binary_jsonl(self):
|
||||||
"""Dependency resolution should output Machine config update JSONL."""
|
"""Dependency resolution should output Binary JSONL."""
|
||||||
hook_output = json.dumps(
|
hook_output = json.dumps(
|
||||||
{
|
{
|
||||||
"type": "Machine",
|
"type": "Binary",
|
||||||
"config": {
|
"name": "wget",
|
||||||
"WGET_BINARY": "/usr/bin/wget",
|
"abspath": "/usr/bin/wget",
|
||||||
},
|
"version": "1.21.3",
|
||||||
|
"binprovider": "env",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
from archivebox.machine.models import Process
|
from archivebox.machine.models import Process
|
||||||
|
|
||||||
data = Process.parse_records_from_text(hook_output)[0]
|
data = Process.parse_records_from_text(hook_output)[0]
|
||||||
self.assertEqual(data["type"], "Machine")
|
self.assertEqual(data["type"], "Binary")
|
||||||
self.assertIn("config", data)
|
self.assertEqual(data["name"], "wget")
|
||||||
self.assertEqual(data["config"]["WGET_BINARY"], "/usr/bin/wget")
|
self.assertEqual(data["abspath"], "/usr/bin/wget")
|
||||||
|
|
||||||
|
|
||||||
class TestSnapshotHookOutput(unittest.TestCase):
|
class TestSnapshotHookOutput(unittest.TestCase):
|
||||||
|
|||||||
@@ -269,12 +269,12 @@ class TestBinaryModel(TestCase):
|
|||||||
self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
|
self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
|
||||||
self.assertGreater(binary.modified_at, old_modified)
|
self.assertGreater(binary.modified_at, old_modified)
|
||||||
|
|
||||||
def test_binary_from_json_preserves_install_args_overrides(self):
|
def test_binary_from_json_preserves_provider_overrides(self):
|
||||||
"""Binary.from_json() should persist canonical install_args overrides unchanged."""
|
"""Binary.from_json() should persist provider overrides unchanged."""
|
||||||
overrides = {
|
overrides = {
|
||||||
"apt": {"install_args": ["chromium"]},
|
"apt": {"install_args": ["chromium"]},
|
||||||
"npm": {"install_args": "puppeteer"},
|
"npm": {"install_args": "puppeteer"},
|
||||||
"custom": {"install_args": ["bash", "-lc", "echo ok"]},
|
"custom": {"install": "bash -lc 'echo ok'"},
|
||||||
}
|
}
|
||||||
|
|
||||||
binary = Binary.from_json(
|
binary = Binary.from_json(
|
||||||
|
|||||||
@@ -1,69 +1,4 @@
|
|||||||
import asyncio
|
|
||||||
import json
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from abx_dl.events import ProcessStartedEvent, ProcessStdoutEvent
|
|
||||||
from abx_dl.orchestrator import create_bus
|
|
||||||
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.django_db
|
pytestmark = pytest.mark.django_db
|
||||||
|
|
||||||
|
|
||||||
def test_process_service_emits_process_started_from_inline_process_event(monkeypatch):
|
|
||||||
from archivebox.services import process_service as process_service_module
|
|
||||||
from archivebox.services.process_service import ProcessService
|
|
||||||
|
|
||||||
bus = create_bus(name="test_process_service_inline_process_event")
|
|
||||||
ProcessService(bus)
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
process_service_module,
|
|
||||||
"_ensure_worker",
|
|
||||||
lambda event: {
|
|
||||||
"pid": 4321,
|
|
||||||
"start": 1711111111.0,
|
|
||||||
"statename": "RUNNING",
|
|
||||||
"exitstatus": 0,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
async def run_test():
|
|
||||||
await bus.emit(
|
|
||||||
ProcessStdoutEvent(
|
|
||||||
line=json.dumps(
|
|
||||||
{
|
|
||||||
"type": "ProcessEvent",
|
|
||||||
"plugin_name": "search_backend_sonic",
|
|
||||||
"hook_name": "worker_sonic",
|
|
||||||
"hook_path": "/usr/bin/sonic",
|
|
||||||
"hook_args": ["-c", "/tmp/sonic/config.cfg"],
|
|
||||||
"is_background": True,
|
|
||||||
"daemon": True,
|
|
||||||
"url": "tcp://127.0.0.1:1491",
|
|
||||||
"output_dir": "/tmp/sonic",
|
|
||||||
"env": {},
|
|
||||||
"process_type": "worker",
|
|
||||||
"worker_type": "sonic",
|
|
||||||
"process_id": "worker:sonic",
|
|
||||||
"output_str": "127.0.0.1:1491",
|
|
||||||
},
|
|
||||||
),
|
|
||||||
plugin_name="search_backend_sonic",
|
|
||||||
hook_name="on_CrawlSetup__55_sonic_start.py",
|
|
||||||
output_dir="/tmp/search_backend_sonic",
|
|
||||||
snapshot_id="snap-1",
|
|
||||||
process_id="proc-hook",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
started = await bus.find(ProcessStartedEvent, process_id="worker:sonic")
|
|
||||||
await bus.stop()
|
|
||||||
return started
|
|
||||||
|
|
||||||
started = asyncio.run(run_test())
|
|
||||||
assert started is not None
|
|
||||||
assert started.hook_name == "worker_sonic"
|
|
||||||
assert started.process_type == "worker"
|
|
||||||
assert started.worker_type == "sonic"
|
|
||||||
assert getattr(started, "url", "") == "tcp://127.0.0.1:1491"
|
|
||||||
assert getattr(started, "output_str", "") == "127.0.0.1:1491"
|
|
||||||
|
|||||||
@@ -34,18 +34,6 @@ class _DummyService:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class _DummyAbxServices:
|
|
||||||
def __init__(self):
|
|
||||||
self.process = SimpleNamespace(wait_for_background_monitors=self._wait)
|
|
||||||
|
|
||||||
async def _wait(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
async def _call_sync(func, *args, **kwargs):
|
|
||||||
return func(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
@@ -82,18 +70,18 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
|||||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||||
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
||||||
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
||||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
|
||||||
|
|
||||||
download_calls = []
|
download_calls = []
|
||||||
|
|
||||||
async def fake_download(*, url, bus, snapshot, **kwargs):
|
async def fake_download(*, url, bus, config_overrides, **kwargs):
|
||||||
|
extra_context = json.loads(config_overrides["EXTRA_CONTEXT"])
|
||||||
download_calls.append(
|
download_calls.append(
|
||||||
{
|
{
|
||||||
"url": url,
|
"url": url,
|
||||||
"bus": bus,
|
"bus": bus,
|
||||||
"snapshot_id": snapshot.id,
|
"snapshot_id": extra_context["snapshot_id"],
|
||||||
"source_url": snapshot.url,
|
"source_url": url,
|
||||||
"abx_snapshot_id": snapshot.id,
|
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
await asyncio.sleep(0)
|
await asyncio.sleep(0)
|
||||||
@@ -113,9 +101,8 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
|||||||
"created_at": snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
|
"created_at": snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
|
||||||
"tags": snapshot_a.tags_str(),
|
"tags": snapshot_a.tags_str(),
|
||||||
"depth": snapshot_a.depth,
|
"depth": snapshot_a.depth,
|
||||||
"parent_snapshot_id": str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
|
|
||||||
"output_dir": str(snapshot_a.output_dir),
|
"output_dir": str(snapshot_a.output_dir),
|
||||||
"config": crawl_runner._snapshot_config(snapshot_a),
|
"config": crawl_runner.load_snapshot_payload(str(snapshot_a.id))["config"],
|
||||||
},
|
},
|
||||||
str(snapshot_b.id): {
|
str(snapshot_b.id): {
|
||||||
"id": str(snapshot_b.id),
|
"id": str(snapshot_b.id),
|
||||||
@@ -127,17 +114,16 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
|||||||
"created_at": snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
|
"created_at": snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
|
||||||
"tags": snapshot_b.tags_str(),
|
"tags": snapshot_b.tags_str(),
|
||||||
"depth": snapshot_b.depth,
|
"depth": snapshot_b.depth,
|
||||||
"parent_snapshot_id": str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
|
|
||||||
"output_dir": str(snapshot_b.output_dir),
|
"output_dir": str(snapshot_b.output_dir),
|
||||||
"config": crawl_runner._snapshot_config(snapshot_b),
|
"config": crawl_runner.load_snapshot_payload(str(snapshot_b.id))["config"],
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
monkeypatch.setattr(crawl_runner, "_load_snapshot_run_data", lambda snapshot_id: snapshot_data[snapshot_id])
|
monkeypatch.setattr(crawl_runner, "load_snapshot_payload", lambda snapshot_id: snapshot_data[snapshot_id])
|
||||||
|
|
||||||
async def run_both():
|
async def run_both():
|
||||||
await asyncio.gather(
|
await asyncio.gather(
|
||||||
crawl_runner._run_snapshot(str(snapshot_a.id)),
|
crawl_runner.run_snapshot(str(snapshot_a.id)),
|
||||||
crawl_runner._run_snapshot(str(snapshot_b.id)),
|
crawl_runner.run_snapshot(str(snapshot_b.id)),
|
||||||
)
|
)
|
||||||
|
|
||||||
asyncio.run(run_both())
|
asyncio.run(run_both())
|
||||||
@@ -243,10 +229,10 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
|
|||||||
refresh_calls = []
|
refresh_calls = []
|
||||||
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
|
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
|
||||||
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
|
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
|
||||||
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {})
|
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
|
||||||
|
|
||||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||||
crawl_runner._prepare()
|
crawl_runner.load_run_state()
|
||||||
|
|
||||||
assert refresh_calls == [True]
|
assert refresh_calls == [True]
|
||||||
assert proc.iface is not None
|
assert proc.iface is not None
|
||||||
@@ -254,10 +240,12 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
|
|||||||
assert saved_updates == [("iface", "machine", "modified_at")]
|
assert saved_updates == [("iface", "machine", "modified_at")]
|
||||||
|
|
||||||
|
|
||||||
def test_installed_binary_config_overrides_include_valid_installed_binaries(monkeypatch):
|
def test_load_run_state_uses_machine_config_as_derived_config(monkeypatch):
|
||||||
from archivebox.machine.models import Binary, Machine
|
from archivebox.machine.models import Machine, NetworkInterface, Process
|
||||||
from archivebox.services import runner as runner_module
|
from archivebox.services import runner as runner_module
|
||||||
from abx_dl.models import Plugin
|
from archivebox.config import configset as configset_module
|
||||||
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
|
from archivebox.crawls.models import Crawl
|
||||||
|
|
||||||
machine = Machine.objects.create(
|
machine = Machine.objects.create(
|
||||||
guid="test-guid-runner-overrides",
|
guid="test-guid-runner-overrides",
|
||||||
@@ -273,143 +261,30 @@ def test_installed_binary_config_overrides_include_valid_installed_binaries(monk
|
|||||||
os_release="14.0",
|
os_release="14.0",
|
||||||
os_kernel="Darwin",
|
os_kernel="Darwin",
|
||||||
stats={},
|
stats={},
|
||||||
config={},
|
config={"WGET_BINARY": "/tmp/wget", "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}},
|
||||||
)
|
)
|
||||||
mercury_binary = Binary.objects.create(
|
crawl = Crawl.objects.create(
|
||||||
machine=machine,
|
urls="https://example.com",
|
||||||
name="postlight-parser",
|
created_by_id=get_or_create_system_user_pk(),
|
||||||
abspath=sys.executable,
|
|
||||||
version="2.0.0",
|
|
||||||
binprovider="pip",
|
|
||||||
binproviders="env,pip",
|
|
||||||
status=Binary.StatusChoices.INSTALLED,
|
|
||||||
)
|
|
||||||
wget_binary = Binary.objects.create(
|
|
||||||
machine=machine,
|
|
||||||
name="wget",
|
|
||||||
abspath="/tmp/not-an-executable",
|
|
||||||
version="1.0.0",
|
|
||||||
binprovider="env",
|
|
||||||
binproviders="env",
|
|
||||||
status=Binary.StatusChoices.INSTALLED,
|
|
||||||
)
|
|
||||||
puppeteer_binary = Binary.objects.create(
|
|
||||||
machine=machine,
|
|
||||||
name="puppeteer",
|
|
||||||
abspath="/tmp/shared-lib/npm/node_modules/.bin/puppeteer",
|
|
||||||
version="24.40.0",
|
|
||||||
binprovider="npm",
|
|
||||||
binproviders="npm",
|
|
||||||
status=Binary.StatusChoices.INSTALLED,
|
|
||||||
)
|
|
||||||
ytdlp_binary = Binary.objects.create(
|
|
||||||
machine=machine,
|
|
||||||
name="yt-dlp",
|
|
||||||
abspath="/tmp/shared-lib/pip/venv/bin/yt-dlp",
|
|
||||||
version="2026.3.17",
|
|
||||||
binprovider="pip",
|
|
||||||
binproviders="pip",
|
|
||||||
status=Binary.StatusChoices.INSTALLED,
|
|
||||||
)
|
)
|
||||||
|
proc = SimpleNamespace(iface_id=str(machine.id), machine_id=str(machine.id), iface=None, machine=machine, save=lambda **kwargs: None)
|
||||||
|
|
||||||
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
Path,
|
NetworkInterface,
|
||||||
"is_file",
|
"current",
|
||||||
lambda self: (
|
classmethod(lambda cls, refresh=False: SimpleNamespace(id=machine.id, machine=machine)),
|
||||||
str(self) in {sys.executable, mercury_binary.abspath, wget_binary.abspath, puppeteer_binary.abspath, ytdlp_binary.abspath}
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
|
||||||
runner_module.os,
|
|
||||||
"access",
|
|
||||||
lambda path, mode: str(path) in {sys.executable, puppeteer_binary.abspath, ytdlp_binary.abspath},
|
|
||||||
)
|
|
||||||
|
|
||||||
overrides = runner_module._installed_binary_config_overrides(
|
|
||||||
{
|
|
||||||
"mercury": Plugin(
|
|
||||||
name="mercury",
|
|
||||||
path=Path("."),
|
|
||||||
hooks=[],
|
|
||||||
config_schema={"MERCURY_BINARY": {"type": "string", "default": "postlight-parser"}},
|
|
||||||
),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
assert overrides["MERCURY_BINARY"] == sys.executable
|
|
||||||
assert "POSTLIGHT_PARSER_BINARY" not in overrides
|
|
||||||
assert "WGET_BINARY" not in overrides
|
|
||||||
assert overrides["LIB_DIR"] == "/tmp/shared-lib"
|
|
||||||
assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
|
|
||||||
assert overrides["PIP_HOME"] == "/tmp/shared-lib/pip"
|
|
||||||
assert overrides["PIP_BIN_DIR"] == "/tmp/shared-lib/pip/venv/bin"
|
|
||||||
assert overrides["NPM_HOME"] == "/tmp/shared-lib/npm"
|
|
||||||
assert overrides["NPM_BIN_DIR"] == "/tmp/shared-lib/npm/node_modules/.bin"
|
|
||||||
assert overrides["NODE_MODULES_DIR"] == "/tmp/shared-lib/npm/node_modules"
|
|
||||||
assert overrides["NODE_MODULE_DIR"] == "/tmp/shared-lib/npm/node_modules"
|
|
||||||
assert overrides["NODE_PATH"] == "/tmp/shared-lib/npm/node_modules"
|
|
||||||
|
|
||||||
|
|
||||||
def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_configurable_binary_keys(monkeypatch):
|
|
||||||
from archivebox.machine.models import Binary, Machine
|
|
||||||
from archivebox.services import runner as runner_module
|
|
||||||
from abx_dl.models import Plugin
|
|
||||||
|
|
||||||
machine = Machine.objects.create(
|
|
||||||
guid="test-guid-runner-singlefile-cache",
|
|
||||||
hostname="runner-host-singlefile",
|
|
||||||
hw_in_docker=False,
|
|
||||||
hw_in_vm=False,
|
|
||||||
hw_manufacturer="Test",
|
|
||||||
hw_product="Test Product",
|
|
||||||
hw_uuid="test-hw-runner-singlefile-cache",
|
|
||||||
os_arch="arm64",
|
|
||||||
os_family="darwin",
|
|
||||||
os_platform="macOS",
|
|
||||||
os_release="14.0",
|
|
||||||
os_kernel="Darwin",
|
|
||||||
stats={},
|
|
||||||
config={},
|
|
||||||
)
|
|
||||||
singlefile_extension = Binary.objects.create(
|
|
||||||
machine=machine,
|
|
||||||
name="singlefile",
|
|
||||||
abspath="/tmp/shared-lib/bin/singlefile",
|
|
||||||
version="1.0.0",
|
|
||||||
binprovider="chromewebstore",
|
|
||||||
binproviders="chromewebstore",
|
|
||||||
status=Binary.StatusChoices.INSTALLED,
|
|
||||||
)
|
|
||||||
|
|
||||||
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
|
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
|
||||||
monkeypatch.setattr(Path, "is_file", lambda self: str(self) == singlefile_extension.abspath)
|
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})
|
||||||
monkeypatch.setattr(runner_module.os, "access", lambda path, mode: str(path) == singlefile_extension.abspath)
|
|
||||||
|
|
||||||
overrides = runner_module._installed_binary_config_overrides(
|
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||||
{
|
crawl_runner.load_run_state()
|
||||||
"singlefile": Plugin(
|
|
||||||
name="singlefile",
|
|
||||||
path=Path("."),
|
|
||||||
hooks=[],
|
|
||||||
config_schema={"SINGLEFILE_BINARY": {"type": "string", "default": "single-file"}},
|
|
||||||
binaries=[
|
|
||||||
{"name": "{SINGLEFILE_BINARY}", "binproviders": "env,npm"},
|
|
||||||
{"name": "singlefile", "binproviders": "chromewebstore"},
|
|
||||||
],
|
|
||||||
),
|
|
||||||
},
|
|
||||||
config={"SINGLEFILE_BINARY": "single-file"},
|
|
||||||
)
|
|
||||||
|
|
||||||
assert "SINGLEFILE_BINARY" not in overrides
|
assert crawl_runner.derived_config == machine.config
|
||||||
assert "LIB_DIR" not in overrides
|
|
||||||
assert "LIB_BIN_DIR" not in overrides
|
|
||||||
|
|
||||||
|
|
||||||
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
|
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch, tmp_path):
|
||||||
import asgiref.sync
|
|
||||||
|
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
from archivebox.services import runner as runner_module
|
from archivebox.services import runner as runner_module
|
||||||
@@ -428,12 +303,6 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
|
|||||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||||
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
||||||
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
||||||
monkeypatch.setattr(runner_module, "_limit_stop_reason", lambda config: "max_size")
|
|
||||||
monkeypatch.setattr(
|
|
||||||
asgiref.sync,
|
|
||||||
"sync_to_async",
|
|
||||||
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
|
|
||||||
)
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
runner_module,
|
runner_module,
|
||||||
"download",
|
"download",
|
||||||
@@ -441,8 +310,21 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
|
|||||||
)
|
)
|
||||||
|
|
||||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||||
|
state_dir = tmp_path / ".abx-dl"
|
||||||
|
state_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
(state_dir / "limits.json").write_text(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"admitted_snapshot_ids": ["child-1"],
|
||||||
|
"counted_process_ids": ["proc-1"],
|
||||||
|
"total_size": 32,
|
||||||
|
"stop_reason": "max_size",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
cancelled: list[str] = []
|
cancelled: list[str] = []
|
||||||
crawl_runner._load_snapshot_run_data = lambda snapshot_id: {
|
crawl_runner.load_snapshot_payload = lambda snapshot_id: {
|
||||||
"id": snapshot_id,
|
"id": snapshot_id,
|
||||||
"url": "https://example.com/child",
|
"url": "https://example.com/child",
|
||||||
"title": "",
|
"title": "",
|
||||||
@@ -452,22 +334,23 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
|
|||||||
"tags": "",
|
"tags": "",
|
||||||
"depth": 1,
|
"depth": 1,
|
||||||
"status": "queued",
|
"status": "queued",
|
||||||
"parent_snapshot_id": None,
|
|
||||||
"output_dir": "/tmp/child",
|
"output_dir": "/tmp/child",
|
||||||
"config": {"CRAWL_DIR": "/tmp/crawl", "MAX_SIZE": 16},
|
"config": {"CRAWL_DIR": str(tmp_path), "MAX_SIZE": 16},
|
||||||
}
|
}
|
||||||
crawl_runner._cancel_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id)
|
crawl_runner.seal_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id)
|
||||||
|
|
||||||
asyncio.run(crawl_runner._run_snapshot("child-1"))
|
asyncio.run(crawl_runner.run_snapshot("child-1"))
|
||||||
|
|
||||||
assert cancelled == ["child-1"]
|
assert cancelled == ["child-1"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db(transaction=True)
|
||||||
def test_seal_snapshot_cancels_queued_descendants_after_max_size():
|
def test_seal_snapshot_cancels_queued_descendants_after_max_size():
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
from archivebox.core.models import Snapshot
|
from archivebox.core.models import Snapshot
|
||||||
from archivebox.services.snapshot_service import SnapshotService
|
from archivebox.services.snapshot_service import SnapshotService
|
||||||
|
from abx_dl.events import SnapshotCompletedEvent
|
||||||
from abx_dl.orchestrator import create_bus
|
from abx_dl.orchestrator import create_bus
|
||||||
|
|
||||||
crawl = Crawl.objects.create(
|
crawl = Crawl.objects.create(
|
||||||
@@ -505,13 +388,22 @@ def test_seal_snapshot_cancels_queued_descendants_after_max_size():
|
|||||||
bus = create_bus(name="test_snapshot_limit_cancel")
|
bus = create_bus(name="test_snapshot_limit_cancel")
|
||||||
service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None)
|
service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None)
|
||||||
try:
|
try:
|
||||||
sealed_id = service._seal_snapshot(str(root.id))
|
|
||||||
|
async def emit_event() -> None:
|
||||||
|
await service.on_SnapshotCompletedEvent(
|
||||||
|
SnapshotCompletedEvent(
|
||||||
|
url=root.url,
|
||||||
|
snapshot_id=str(root.id),
|
||||||
|
output_dir=str(root.output_dir),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
asyncio.run(emit_event())
|
||||||
finally:
|
finally:
|
||||||
asyncio.run(bus.stop())
|
asyncio.run(bus.stop())
|
||||||
|
|
||||||
root.refresh_from_db()
|
root.refresh_from_db()
|
||||||
child.refresh_from_db()
|
child.refresh_from_db()
|
||||||
assert sealed_id == str(root.id)
|
|
||||||
assert root.status == Snapshot.StatusChoices.SEALED
|
assert root.status == Snapshot.StatusChoices.SEALED
|
||||||
assert child.status == Snapshot.StatusChoices.SEALED
|
assert child.status == Snapshot.StatusChoices.SEALED
|
||||||
assert child.retry_at is None
|
assert child.retry_at is None
|
||||||
@@ -548,7 +440,6 @@ def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
|
|||||||
|
|
||||||
|
|
||||||
def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
|
def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
|
||||||
import asgiref.sync
|
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
from archivebox.core.models import Snapshot
|
from archivebox.core.models import Snapshot
|
||||||
@@ -565,35 +456,23 @@ def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
|
|||||||
status=Snapshot.StatusChoices.STARTED,
|
status=Snapshot.StatusChoices.STARTED,
|
||||||
)
|
)
|
||||||
|
|
||||||
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
|
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
|
||||||
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
|
||||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
|
||||||
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
|
||||||
monkeypatch.setattr(
|
|
||||||
asgiref.sync,
|
|
||||||
"sync_to_async",
|
|
||||||
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
|
|
||||||
)
|
|
||||||
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
|
|
||||||
monkeypatch.setattr(crawl, "is_finished", lambda: False)
|
|
||||||
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
|
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
|
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
|
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
|
monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
|
||||||
|
|
||||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||||
|
|
||||||
|
crawl.refresh_from_db()
|
||||||
assert crawl.status != Crawl.StatusChoices.SEALED
|
assert crawl.status != Crawl.StatusChoices.SEALED
|
||||||
assert crawl.retry_at is not None
|
assert crawl.retry_at is not None
|
||||||
|
|
||||||
|
|
||||||
def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
|
def test_crawl_runner_calls_load_and_finalize_run_state(monkeypatch):
|
||||||
import asgiref.sync
|
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
from archivebox.core.models import Snapshot
|
from archivebox.core.models import Snapshot
|
||||||
@@ -618,50 +497,34 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
|
|||||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||||
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
||||||
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
||||||
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
|
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
|
||||||
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
|
||||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
|
||||||
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
|
|
||||||
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
|
|
||||||
monkeypatch.setattr(crawl, "cleanup", lambda: None)
|
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
|
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
|
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
|
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
||||||
|
|
||||||
sync_to_async_wrapped: list[str] = []
|
method_calls: list[str] = []
|
||||||
sync_to_async_active = False
|
|
||||||
|
|
||||||
def fake_sync_to_async(func, thread_sensitive=True):
|
def wrapped_finalize(self):
|
||||||
async def wrapper(*args, **kwargs):
|
method_calls.append("finalize_run_state")
|
||||||
nonlocal sync_to_async_active
|
return None
|
||||||
sync_to_async_wrapped.append(getattr(func, "__name__", repr(func)))
|
|
||||||
previous = sync_to_async_active
|
|
||||||
sync_to_async_active = True
|
|
||||||
try:
|
|
||||||
return func(*args, **kwargs)
|
|
||||||
finally:
|
|
||||||
sync_to_async_active = previous
|
|
||||||
|
|
||||||
return wrapper
|
def wrapped_load(self):
|
||||||
|
method_calls.append("load_run_state")
|
||||||
|
return [str(snapshot.id)]
|
||||||
|
|
||||||
def guarded_is_finished():
|
monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", wrapped_finalize)
|
||||||
assert sync_to_async_active is True
|
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", wrapped_load)
|
||||||
return False
|
|
||||||
|
|
||||||
monkeypatch.setattr(asgiref.sync, "sync_to_async", fake_sync_to_async)
|
|
||||||
monkeypatch.setattr(crawl, "is_finished", guarded_is_finished)
|
|
||||||
|
|
||||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||||
|
|
||||||
crawl.refresh_from_db()
|
crawl.refresh_from_db()
|
||||||
assert crawl.status == Crawl.StatusChoices.STARTED
|
assert crawl.status == Crawl.StatusChoices.STARTED
|
||||||
assert crawl.retry_at is not None
|
assert crawl.retry_at is not None
|
||||||
assert "guarded_is_finished" in sync_to_async_wrapped
|
assert method_calls == ["load_run_state", "finalize_run_state"]
|
||||||
|
|
||||||
|
|
||||||
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
|
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
|
||||||
@@ -680,7 +543,7 @@ def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
|
|||||||
task.set_exception(RuntimeError("snapshot failed"))
|
task.set_exception(RuntimeError("snapshot failed"))
|
||||||
crawl_runner.snapshot_tasks["snap-1"] = task
|
crawl_runner.snapshot_tasks["snap-1"] = task
|
||||||
with pytest.raises(RuntimeError, match="snapshot failed"):
|
with pytest.raises(RuntimeError, match="snapshot failed"):
|
||||||
await crawl_runner._wait_for_snapshot_tasks()
|
await crawl_runner.wait_for_snapshot_tasks()
|
||||||
|
|
||||||
asyncio.run(run_test())
|
asyncio.run(run_test())
|
||||||
|
|
||||||
@@ -702,14 +565,13 @@ def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
|
|||||||
async def run_test():
|
async def run_test():
|
||||||
task = asyncio.create_task(finish_snapshot())
|
task = asyncio.create_task(finish_snapshot())
|
||||||
crawl_runner.snapshot_tasks["snap-1"] = task
|
crawl_runner.snapshot_tasks["snap-1"] = task
|
||||||
await asyncio.wait_for(crawl_runner._wait_for_snapshot_tasks(), timeout=0.5)
|
await asyncio.wait_for(crawl_runner.wait_for_snapshot_tasks(), timeout=0.5)
|
||||||
assert crawl_runner.snapshot_tasks == {}
|
assert crawl_runner.snapshot_tasks == {}
|
||||||
|
|
||||||
asyncio.run(run_test())
|
asyncio.run(run_test())
|
||||||
|
|
||||||
|
|
||||||
def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
||||||
import asgiref.sync
|
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
from archivebox.core.models import Snapshot
|
from archivebox.core.models import Snapshot
|
||||||
@@ -726,30 +588,18 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
|||||||
status=Snapshot.StatusChoices.STARTED,
|
status=Snapshot.StatusChoices.STARTED,
|
||||||
)
|
)
|
||||||
|
|
||||||
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
|
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
|
||||||
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
|
||||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
|
||||||
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
|
||||||
monkeypatch.setattr(
|
|
||||||
asgiref.sync,
|
|
||||||
"sync_to_async",
|
|
||||||
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
|
|
||||||
)
|
|
||||||
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
|
|
||||||
monkeypatch.setattr(crawl, "is_finished", lambda: False)
|
|
||||||
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
|
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
|
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
|
monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||||
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
|
monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)
|
||||||
|
|
||||||
cleanup_calls = []
|
cleanup_calls = []
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
runner_module.CrawlRunner,
|
runner_module.CrawlRunner,
|
||||||
"_run_crawl_cleanup",
|
"run_crawl_cleanup",
|
||||||
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
|
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
|
||||||
)
|
)
|
||||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||||
@@ -757,17 +607,20 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
|||||||
assert cleanup_calls == ["abx_cleanup"]
|
assert cleanup_calls == ["abx_cleanup"]
|
||||||
|
|
||||||
|
|
||||||
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
|
def test_abx_process_service_background_process_finishes_after_process_exit(monkeypatch, tmp_path):
|
||||||
from abx_dl.models import Process as AbxProcess, now_iso
|
from abx_dl.models import Process as AbxProcess, now_iso
|
||||||
from abx_dl.services.process_service import ProcessService
|
from abx_dl.services.process_service import ProcessService
|
||||||
from abx_dl.events import ProcessCompletedEvent
|
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||||
|
|
||||||
service = object.__new__(ProcessService)
|
service = object.__new__(ProcessService)
|
||||||
service.emit_jsonl = False
|
service.emit_jsonl = False
|
||||||
emitted_events = []
|
emitted_events = []
|
||||||
|
|
||||||
async def fake_emit_event(event, *, detach_from_parent):
|
class FakeBus:
|
||||||
emitted_events.append((event, detach_from_parent))
|
async def emit(self, event):
|
||||||
|
emitted_events.append(event)
|
||||||
|
|
||||||
|
service.bus = FakeBus()
|
||||||
|
|
||||||
async def fake_stream_stdout(**kwargs):
|
async def fake_stream_stdout(**kwargs):
|
||||||
try:
|
try:
|
||||||
@@ -775,19 +628,8 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
|
|||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
return ["daemon output\n"]
|
return ["daemon output\n"]
|
||||||
|
|
||||||
service._emit_event = fake_emit_event
|
|
||||||
monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout)
|
monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout)
|
||||||
|
|
||||||
class FakeAsyncProcess:
|
|
||||||
def __init__(self):
|
|
||||||
self.pid = 42424
|
|
||||||
self.returncode = None
|
|
||||||
|
|
||||||
async def wait(self):
|
|
||||||
await asyncio.sleep(0)
|
|
||||||
self.returncode = 0
|
|
||||||
return 0
|
|
||||||
|
|
||||||
plugin_output_dir = tmp_path / "chrome"
|
plugin_output_dir = tmp_path / "chrome"
|
||||||
plugin_output_dir.mkdir()
|
plugin_output_dir.mkdir()
|
||||||
stdout_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.stdout.log"
|
stdout_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.stdout.log"
|
||||||
@@ -804,41 +646,45 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
|
|||||||
plugin="chrome",
|
plugin="chrome",
|
||||||
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
|
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
|
||||||
)
|
)
|
||||||
process = FakeAsyncProcess()
|
|
||||||
event = SimpleNamespace(
|
|
||||||
plugin_name="chrome",
|
|
||||||
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
|
|
||||||
hook_path="hook",
|
|
||||||
hook_args=["--url=https://example.org/"],
|
|
||||||
env={},
|
|
||||||
output_dir=str(plugin_output_dir),
|
|
||||||
timeout=60,
|
|
||||||
snapshot_id="snap-1",
|
|
||||||
is_background=True,
|
|
||||||
url="https://example.org/",
|
|
||||||
process_type="hook",
|
|
||||||
worker_type="hook",
|
|
||||||
)
|
|
||||||
|
|
||||||
async def run_test():
|
async def run_test():
|
||||||
|
process = await asyncio.create_subprocess_exec(
|
||||||
|
sys.executable,
|
||||||
|
"-c",
|
||||||
|
"pass",
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.PIPE,
|
||||||
|
)
|
||||||
|
event = ProcessStartedEvent(
|
||||||
|
plugin_name="chrome",
|
||||||
|
hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
|
||||||
|
hook_path="hook",
|
||||||
|
hook_args=["--url=https://example.org/"],
|
||||||
|
env={},
|
||||||
|
output_dir=str(plugin_output_dir),
|
||||||
|
timeout=60,
|
||||||
|
pid=process.pid,
|
||||||
|
is_background=True,
|
||||||
|
url="https://example.org/",
|
||||||
|
process_type="hook",
|
||||||
|
worker_type="hook",
|
||||||
|
start_ts=proc.started_at or "",
|
||||||
|
subprocess=process,
|
||||||
|
stdout_file=stdout_file,
|
||||||
|
stderr_file=stderr_file,
|
||||||
|
pid_file=pid_file,
|
||||||
|
cmd_file=plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.sh",
|
||||||
|
files_before=set(),
|
||||||
|
)
|
||||||
await asyncio.wait_for(
|
await asyncio.wait_for(
|
||||||
service._monitor_background_process(
|
service.on_ProcessStartedEvent(event),
|
||||||
event=event,
|
|
||||||
proc=proc,
|
|
||||||
process=process,
|
|
||||||
plugin_output_dir=plugin_output_dir,
|
|
||||||
stdout_file=stdout_file,
|
|
||||||
stderr_file=stderr_file,
|
|
||||||
pid_file=pid_file,
|
|
||||||
files_before=set(),
|
|
||||||
),
|
|
||||||
timeout=0.5,
|
timeout=0.5,
|
||||||
)
|
)
|
||||||
|
|
||||||
asyncio.run(run_test())
|
asyncio.run(run_test())
|
||||||
|
|
||||||
assert pid_file.exists() is False
|
assert pid_file.exists() is False
|
||||||
assert any(isinstance(event, ProcessCompletedEvent) for event, _ in emitted_events)
|
assert any(isinstance(event, ProcessCompletedEvent) for event in emitted_events)
|
||||||
|
|
||||||
|
|
||||||
def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
|
def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
|
||||||
|
|||||||
48
archivebox/tests/test_tag_service.py
Normal file
48
archivebox/tests/test_tag_service.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
import asyncio
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from abx_dl.events import TagEvent
|
||||||
|
from abx_dl.orchestrator import create_bus
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.django_db(transaction=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_snapshot():
|
||||||
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
|
from archivebox.crawls.models import Crawl
|
||||||
|
from archivebox.core.models import Snapshot
|
||||||
|
|
||||||
|
crawl = Crawl.objects.create(
|
||||||
|
urls="https://example.com",
|
||||||
|
created_by_id=get_or_create_system_user_pk(),
|
||||||
|
)
|
||||||
|
return Snapshot.objects.create(
|
||||||
|
url="https://example.com",
|
||||||
|
crawl=crawl,
|
||||||
|
status=Snapshot.StatusChoices.STARTED,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tag_event_projects_tag_to_snapshot():
|
||||||
|
from archivebox.core.models import Tag
|
||||||
|
from archivebox.services.tag_service import TagService
|
||||||
|
|
||||||
|
snapshot = _create_snapshot()
|
||||||
|
bus = create_bus(name="test_tag_service")
|
||||||
|
TagService(bus)
|
||||||
|
|
||||||
|
async def emit_tag_event() -> None:
|
||||||
|
await bus.emit(
|
||||||
|
TagEvent(
|
||||||
|
name="example",
|
||||||
|
snapshot_id=str(snapshot.id),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
asyncio.run(emit_tag_event())
|
||||||
|
|
||||||
|
snapshot.refresh_from_db()
|
||||||
|
assert snapshot.tags.filter(name="example").exists()
|
||||||
|
assert Tag.objects.filter(name="example").exists()
|
||||||
2
docs
2
docs
Submodule docs updated: be25d9bfa2...7244076ece
@@ -42,7 +42,7 @@ Crawl.run()
|
|||||||
{'type': 'Dependency', 'bin_name': 'wget', 'bin_providers': 'apt,brew', 'overrides': {...}}
|
{'type': 'Dependency', 'bin_name': 'wget', 'bin_providers': 'apt,brew', 'overrides': {...}}
|
||||||
|
|
||||||
# ❌ WRONG - uses different field names
|
# ❌ WRONG - uses different field names
|
||||||
{'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'custom_cmds': {...}}
|
{'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'overrides': {...}}
|
||||||
```
|
```
|
||||||
|
|
||||||
4. **No hardcoding** - Never hardcode binary names, provider names, or anything else. Use discovery.
|
4. **No hardcoding** - Never hardcode binary names, provider names, or anything else. Use discovery.
|
||||||
@@ -84,7 +84,7 @@ Crawl.run()
|
|||||||
# ❌ WRONG - complex transformation logic
|
# ❌ WRONG - complex transformation logic
|
||||||
if obj.get('type') == 'Dependency':
|
if obj.get('type') == 'Dependency':
|
||||||
dep = Dependency.objects.create(name=obj['bin_name']) # renaming fields
|
dep = Dependency.objects.create(name=obj['bin_name']) # renaming fields
|
||||||
dep.custom_commands = transform_overrides(obj['overrides']) # transforming data
|
dep.overrides = transform_overrides(obj['overrides']) # transforming data
|
||||||
```
|
```
|
||||||
|
|
||||||
### Pattern Consistency
|
### Pattern Consistency
|
||||||
|
|||||||
@@ -159,6 +159,11 @@ environments = ["sys_platform == 'darwin'", "sys_platform == 'linux'"]
|
|||||||
package = true
|
package = true
|
||||||
# compile-bytecode = true
|
# compile-bytecode = true
|
||||||
|
|
||||||
|
[tool.uv.sources]
|
||||||
|
abx-pkg = { path = "../abx-pkg", editable = true }
|
||||||
|
abx-plugins = { path = "../abx-plugins", editable = true }
|
||||||
|
abx-dl = { path = "../abx-dl", editable = true }
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["pdm-backend"]
|
requires = ["pdm-backend"]
|
||||||
build-backend = "pdm.backend"
|
build-backend = "pdm.backend"
|
||||||
|
|||||||
Reference in New Issue
Block a user