mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
update working changes
This commit is contained in:
@@ -9,12 +9,11 @@ from typing import Any
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent
|
||||
from abx_dl.events import ArchiveResultEvent, ProcessCompletedEvent, ProcessStartedEvent, SnapshotEvent
|
||||
from abx_dl.output_files import guess_mimetype
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
from .process_service import ProcessService, parse_event_datetime
|
||||
from .process_service import parse_event_datetime
|
||||
|
||||
|
||||
def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
|
||||
@@ -209,79 +208,41 @@ class ArchiveResultService(BaseService):
|
||||
LISTENS_TO = [ArchiveResultEvent, ProcessCompletedEvent]
|
||||
EMITS = []
|
||||
|
||||
def __init__(self, bus, *, process_service: ProcessService):
|
||||
self.process_service = process_service
|
||||
def __init__(self, bus):
|
||||
super().__init__(bus)
|
||||
self.bus.on(ArchiveResultEvent, self.on_ArchiveResultEvent__save_to_db)
|
||||
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
|
||||
|
||||
async def on_ArchiveResultEvent__Outer(self, event: ArchiveResultEvent) -> None:
|
||||
snapshot_output_dir = await run_db_op(self._get_snapshot_output_dir, event.snapshot_id)
|
||||
if snapshot_output_dir is None:
|
||||
return
|
||||
plugin_dir = Path(snapshot_output_dir) / event.plugin
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
||||
await run_db_op(self._project, event, output_files, output_size, output_mimetypes)
|
||||
|
||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
||||
if not event.snapshot_id or not event.hook_name.startswith("on_Snapshot"):
|
||||
return
|
||||
|
||||
plugin_dir = Path(event.output_dir)
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
||||
records = _iter_archiveresult_records(event.stdout)
|
||||
if records:
|
||||
for record in records:
|
||||
await run_db_op(
|
||||
self._project_from_process_completed,
|
||||
event,
|
||||
record,
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
return
|
||||
|
||||
synthetic_record = {
|
||||
"plugin": event.plugin_name,
|
||||
"hook_name": event.hook_name,
|
||||
"status": "failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
|
||||
"output_str": event.stderr if event.exit_code != 0 else "",
|
||||
"error": event.stderr if event.exit_code != 0 else "",
|
||||
}
|
||||
await run_db_op(
|
||||
self._project_from_process_completed,
|
||||
event,
|
||||
synthetic_record,
|
||||
output_files,
|
||||
output_size,
|
||||
output_mimetypes,
|
||||
)
|
||||
|
||||
def _get_snapshot_output_dir(self, snapshot_id: str) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).only("output_dir").first()
|
||||
return str(snapshot.output_dir) if snapshot is not None else None
|
||||
|
||||
def _project(
|
||||
self,
|
||||
event: ArchiveResultEvent,
|
||||
output_files: dict[str, dict],
|
||||
output_size: int,
|
||||
output_mimetypes: str,
|
||||
) -> None:
|
||||
async def on_ArchiveResultEvent__save_to_db(self, event: ArchiveResultEvent) -> None:
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first()
|
||||
snapshot = await Snapshot.objects.filter(id=event.snapshot_id).select_related("crawl", "crawl__created_by").afirst()
|
||||
if snapshot is None:
|
||||
return
|
||||
|
||||
plugin_dir = Path(snapshot.output_dir) / event.plugin
|
||||
output_files, output_size, output_mimetypes = await sync_to_async(_resolve_output_metadata)(event.output_files, plugin_dir)
|
||||
process_started = await self.bus.find(
|
||||
ProcessStartedEvent,
|
||||
past=True,
|
||||
future=False,
|
||||
where=lambda candidate: self.bus.event_is_child_of(event, candidate),
|
||||
)
|
||||
process = None
|
||||
db_process_id = self.process_service.get_db_process_id(event.process_id)
|
||||
if db_process_id:
|
||||
process = Process.objects.filter(id=db_process_id).first()
|
||||
if process_started is not None:
|
||||
started_at = parse_event_datetime(process_started.start_ts)
|
||||
if started_at is None:
|
||||
raise ValueError("ProcessStartedEvent.start_ts is required")
|
||||
process_query = Process.objects.filter(
|
||||
pwd=process_started.output_dir,
|
||||
cmd=[process_started.hook_path, *process_started.hook_args],
|
||||
started_at=started_at,
|
||||
)
|
||||
if process_started.pid:
|
||||
process_query = process_query.filter(pid=process_started.pid)
|
||||
process = await process_query.order_by("-modified_at").afirst()
|
||||
|
||||
result, _created = ArchiveResult.objects.get_or_create(
|
||||
result, _created = await ArchiveResult.objects.aget_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=event.plugin,
|
||||
hook_name=event.hook_name,
|
||||
@@ -302,32 +263,54 @@ class ArchiveResultService(BaseService):
|
||||
result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
|
||||
if event.error:
|
||||
result.notes = event.error
|
||||
result.save()
|
||||
await result.asave()
|
||||
|
||||
next_title = _extract_snapshot_title(str(snapshot.output_dir), event.plugin, result.output_str, snapshot_url=snapshot.url)
|
||||
if next_title and _should_update_snapshot_title(snapshot.title or "", next_title, snapshot_url=snapshot.url):
|
||||
snapshot.title = next_title
|
||||
snapshot.save(update_fields=["title", "modified_at"])
|
||||
await snapshot.asave(update_fields=["title", "modified_at"])
|
||||
|
||||
def _project_from_process_completed(
|
||||
self,
|
||||
event: ProcessCompletedEvent,
|
||||
record: dict,
|
||||
output_files: dict[str, dict],
|
||||
output_size: int,
|
||||
output_mimetypes: str,
|
||||
) -> None:
|
||||
archive_result_event = ArchiveResultEvent(
|
||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
||||
plugin=record.get("plugin") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or event.hook_name,
|
||||
status=record.get("status") or "",
|
||||
process_id=event.process_id,
|
||||
output_str=record.get("output_str") or "",
|
||||
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
|
||||
output_files=event.output_files,
|
||||
start_ts=event.start_ts,
|
||||
end_ts=event.end_ts,
|
||||
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
|
||||
async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
|
||||
if not event.hook_name.startswith("on_Snapshot"):
|
||||
return
|
||||
snapshot_event = await self.bus.find(
|
||||
SnapshotEvent,
|
||||
past=True,
|
||||
future=False,
|
||||
where=lambda candidate: self.bus.event_is_child_of(event, candidate),
|
||||
)
|
||||
if snapshot_event is None:
|
||||
return
|
||||
|
||||
records = _iter_archiveresult_records(event.stdout)
|
||||
if records:
|
||||
for record in records:
|
||||
await self.bus.emit(
|
||||
ArchiveResultEvent(
|
||||
snapshot_id=record.get("snapshot_id") or snapshot_event.snapshot_id,
|
||||
plugin=record.get("plugin") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or event.hook_name,
|
||||
status=record.get("status") or "",
|
||||
output_str=record.get("output_str") or "",
|
||||
output_json=record.get("output_json") if isinstance(record.get("output_json"), dict) else None,
|
||||
output_files=event.output_files,
|
||||
start_ts=event.start_ts,
|
||||
end_ts=event.end_ts,
|
||||
error=record.get("error") or (event.stderr if event.exit_code != 0 else ""),
|
||||
),
|
||||
)
|
||||
return
|
||||
|
||||
await self.bus.emit(
|
||||
ArchiveResultEvent(
|
||||
snapshot_id=snapshot_event.snapshot_id,
|
||||
plugin=event.plugin_name,
|
||||
hook_name=event.hook_name,
|
||||
status="failed" if event.exit_code != 0 else ("succeeded" if _has_content_files(event.output_files) else "skipped"),
|
||||
output_str=event.stderr if event.exit_code != 0 else "",
|
||||
output_files=event.output_files,
|
||||
start_ts=event.start_ts,
|
||||
end_ts=event.end_ts,
|
||||
error=event.stderr if event.exit_code != 0 else "",
|
||||
),
|
||||
)
|
||||
self._project(archive_result_event, output_files, output_size, output_mimetypes)
|
||||
|
||||
@@ -1,20 +1,62 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
from abx_dl.events import BinaryRequestEvent, BinaryEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class BinaryService(BaseService):
|
||||
LISTENS_TO = [BinaryRequestEvent, BinaryEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_BinaryRequestEvent__Outer(self, event: BinaryRequestEvent) -> None:
|
||||
await run_db_op(self._project_binary, event)
|
||||
cached = await run_db_op(self._load_cached_binary, event)
|
||||
def __init__(self, bus):
|
||||
super().__init__(bus)
|
||||
self.bus.on(BinaryRequestEvent, self.on_BinaryRequestEvent)
|
||||
self.bus.on(BinaryEvent, self.on_BinaryEvent)
|
||||
|
||||
async def on_BinaryRequestEvent(self, event: BinaryRequestEvent) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
existing = await Binary.objects.filter(machine=machine, name=event.name).afirst()
|
||||
if existing and existing.status == Binary.StatusChoices.INSTALLED:
|
||||
changed = False
|
||||
if event.binproviders and existing.binproviders != event.binproviders:
|
||||
existing.binproviders = event.binproviders
|
||||
changed = True
|
||||
if event.overrides and existing.overrides != event.overrides:
|
||||
existing.overrides = event.overrides
|
||||
changed = True
|
||||
if changed:
|
||||
await existing.asave(update_fields=["binproviders", "overrides", "modified_at"])
|
||||
elif existing is None:
|
||||
await Binary.objects.acreate(
|
||||
machine=machine,
|
||||
name=event.name,
|
||||
binproviders=event.binproviders,
|
||||
overrides=event.overrides or {},
|
||||
status=Binary.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
installed = (
|
||||
await Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
|
||||
.exclude(abspath="")
|
||||
.exclude(abspath__isnull=True)
|
||||
.order_by("-modified_at")
|
||||
.afirst()
|
||||
)
|
||||
cached = None
|
||||
if installed is not None:
|
||||
cached = {
|
||||
"abspath": installed.abspath,
|
||||
"version": installed.version or "",
|
||||
"sha256": installed.sha256 or "",
|
||||
"binproviders": installed.binproviders or "",
|
||||
"binprovider": installed.binprovider or "",
|
||||
"machine_id": str(installed.machine_id),
|
||||
"overrides": installed.overrides or {},
|
||||
}
|
||||
if cached is not None:
|
||||
await self.bus.emit(
|
||||
BinaryEvent(
|
||||
@@ -28,126 +70,34 @@ class BinaryService(BaseService):
|
||||
binprovider=cached["binprovider"],
|
||||
overrides=event.overrides or cached["overrides"],
|
||||
binary_id=event.binary_id,
|
||||
machine_id=event.machine_id or cached["machine_id"],
|
||||
machine_id=cached["machine_id"],
|
||||
),
|
||||
)
|
||||
|
||||
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
|
||||
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
|
||||
await run_db_op(self._project_installed_binary, event, resolved)
|
||||
|
||||
def _project_binary(self, event: BinaryRequestEvent) -> None:
|
||||
async def on_BinaryEvent(self, event: BinaryEvent) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
existing = Binary.objects.filter(machine=machine, name=event.name).first()
|
||||
if existing and existing.status == Binary.StatusChoices.INSTALLED:
|
||||
changed = False
|
||||
if event.binproviders and existing.binproviders != event.binproviders:
|
||||
existing.binproviders = event.binproviders
|
||||
changed = True
|
||||
if event.overrides and existing.overrides != event.overrides:
|
||||
existing.overrides = event.overrides
|
||||
changed = True
|
||||
if changed:
|
||||
existing.save(update_fields=["binproviders", "overrides", "modified_at"])
|
||||
return
|
||||
|
||||
Binary.from_json(
|
||||
{
|
||||
"name": event.name,
|
||||
"binproviders": event.binproviders,
|
||||
"overrides": event.overrides or {},
|
||||
},
|
||||
)
|
||||
|
||||
def _load_cached_binary(self, event: BinaryRequestEvent) -> dict[str, str] | None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
installed = (
|
||||
Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
|
||||
.exclude(abspath="")
|
||||
.exclude(abspath__isnull=True)
|
||||
.order_by("-modified_at")
|
||||
.first()
|
||||
)
|
||||
if installed is None:
|
||||
return None
|
||||
return {
|
||||
"abspath": installed.abspath,
|
||||
"version": installed.version or "",
|
||||
"sha256": installed.sha256 or "",
|
||||
"binproviders": installed.binproviders or "",
|
||||
"binprovider": installed.binprovider or "",
|
||||
"machine_id": str(installed.machine_id),
|
||||
"overrides": installed.overrides or {},
|
||||
}
|
||||
|
||||
def _resolve_installed_binary_metadata(self, event: BinaryEvent) -> dict[str, str]:
|
||||
resolved = {
|
||||
"abspath": event.abspath or "",
|
||||
"version": event.version or "",
|
||||
"sha256": event.sha256 or "",
|
||||
"binproviders": event.binproviders or "",
|
||||
"binprovider": event.binprovider or "",
|
||||
}
|
||||
if resolved["abspath"] and resolved["version"] and resolved["binprovider"]:
|
||||
return resolved
|
||||
|
||||
if resolved["abspath"] and not resolved["version"]:
|
||||
try:
|
||||
from abx_pkg.semver import bin_version
|
||||
|
||||
detected_version = bin_version(resolved["abspath"])
|
||||
except Exception:
|
||||
detected_version = None
|
||||
if detected_version:
|
||||
resolved["version"] = str(detected_version)
|
||||
if resolved["version"] and resolved["binprovider"]:
|
||||
return resolved
|
||||
|
||||
try:
|
||||
from abx_dl.dependencies import load_binary
|
||||
|
||||
allowed_providers = resolved["binproviders"] or resolved["binprovider"] or "env,pip,npm,brew,apt"
|
||||
spec = {
|
||||
"name": event.name,
|
||||
"binproviders": allowed_providers,
|
||||
"overrides": event.overrides or {},
|
||||
}
|
||||
binary = load_binary(spec)
|
||||
resolved["abspath"] = str(binary.abspath or resolved["abspath"] or "")
|
||||
resolved["version"] = str(binary.version or resolved["version"] or "")
|
||||
resolved["sha256"] = str(binary.sha256 or resolved["sha256"] or "")
|
||||
if binary.loaded_binprovider is not None and binary.loaded_binprovider.name:
|
||||
resolved["binprovider"] = str(binary.loaded_binprovider.name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return resolved
|
||||
|
||||
def _project_installed_binary(self, event: BinaryEvent, resolved: dict[str, str]) -> None:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
binary, _ = Binary.objects.get_or_create(
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
binary, _ = await Binary.objects.aget_or_create(
|
||||
machine=machine,
|
||||
name=event.name,
|
||||
defaults={
|
||||
"status": Binary.StatusChoices.QUEUED,
|
||||
},
|
||||
)
|
||||
binary.abspath = resolved["abspath"] or binary.abspath
|
||||
binary.version = resolved["version"] or binary.version
|
||||
binary.sha256 = resolved["sha256"] or binary.sha256
|
||||
if resolved["binproviders"]:
|
||||
binary.binproviders = resolved["binproviders"]
|
||||
binary.binprovider = resolved["binprovider"] or binary.binprovider
|
||||
binary.abspath = event.abspath
|
||||
if event.version:
|
||||
binary.version = event.version
|
||||
if event.sha256:
|
||||
binary.sha256 = event.sha256
|
||||
if event.binproviders:
|
||||
binary.binproviders = event.binproviders
|
||||
if event.binprovider:
|
||||
binary.binprovider = event.binprovider
|
||||
if event.overrides and binary.overrides != event.overrides:
|
||||
binary.overrides = event.overrides
|
||||
binary.status = Binary.StatusChoices.INSTALLED
|
||||
binary.retry_at = None
|
||||
binary.save(
|
||||
await binary.asave(
|
||||
update_fields=["abspath", "version", "sha256", "binproviders", "binprovider", "overrides", "status", "retry_at", "modified_at"],
|
||||
)
|
||||
|
||||
@@ -3,8 +3,6 @@ from __future__ import annotations
|
||||
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class CrawlService(BaseService):
|
||||
LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
|
||||
@@ -13,32 +11,42 @@ class CrawlService(BaseService):
|
||||
def __init__(self, bus, *, crawl_id: str):
|
||||
self.crawl_id = crawl_id
|
||||
super().__init__(bus)
|
||||
self.bus.on(CrawlSetupEvent, self.on_CrawlSetupEvent__save_to_db)
|
||||
self.bus.on(CrawlStartEvent, self.on_CrawlStartEvent__save_to_db)
|
||||
self.bus.on(CrawlCleanupEvent, self.on_CrawlCleanupEvent__save_to_db)
|
||||
self.bus.on(CrawlCompletedEvent, self.on_CrawlCompletedEvent__save_to_db)
|
||||
|
||||
async def on_CrawlSetupEvent__Outer(self, event: CrawlSetupEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlStartEvent__Outer(self, event: CrawlStartEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlCleanupEvent__Outer(self, event: CrawlCleanupEvent) -> None:
|
||||
await run_db_op(self._mark_started)
|
||||
|
||||
async def on_CrawlCompletedEvent__Outer(self, event: CrawlCompletedEvent) -> None:
|
||||
await run_db_op(self._mark_completed)
|
||||
|
||||
def _mark_started(self) -> None:
|
||||
async def on_CrawlSetupEvent__save_to_db(self, event: CrawlSetupEvent) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id=self.crawl_id)
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
def _mark_completed(self) -> None:
|
||||
async def on_CrawlStartEvent__save_to_db(self, event: CrawlStartEvent) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id=self.crawl_id)
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = None
|
||||
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
async def on_CrawlCleanupEvent__save_to_db(self, event: CrawlCleanupEvent) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = None
|
||||
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
async def on_CrawlCompletedEvent__save_to_db(self, event: CrawlCompletedEvent) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
await crawl.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.db import close_old_connections
|
||||
|
||||
|
||||
def _run_db_op(func, *args, **kwargs):
|
||||
close_old_connections()
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
finally:
|
||||
close_old_connections()
|
||||
|
||||
|
||||
async def run_db_op(func, *args, **kwargs):
|
||||
return await sync_to_async(_run_db_op, thread_sensitive=True)(func, *args, **kwargs)
|
||||
@@ -1,22 +1,23 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
from abx_dl.events import MachineEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class MachineService(BaseService):
|
||||
LISTENS_TO = [MachineEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_MachineEvent__Outer(self, event: MachineEvent) -> None:
|
||||
await run_db_op(self._project, event)
|
||||
def __init__(self, bus):
|
||||
super().__init__(bus)
|
||||
self.bus.on(MachineEvent, self.on_MachineEvent__save_to_db)
|
||||
|
||||
def _project(self, event: MachineEvent) -> None:
|
||||
async def on_MachineEvent__save_to_db(self, event: MachineEvent) -> None:
|
||||
from archivebox.machine.models import Machine, _sanitize_machine_config
|
||||
|
||||
machine = Machine.current()
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
config = dict(machine.config or {})
|
||||
|
||||
if event.config is not None:
|
||||
@@ -29,4 +30,4 @@ class MachineService(BaseService):
|
||||
return
|
||||
|
||||
machine.config = _sanitize_machine_config(config)
|
||||
machine.save(update_fields=["config", "modified_at"])
|
||||
await machine.asave(update_fields=["config", "modified_at"])
|
||||
|
||||
@@ -1,29 +1,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone as datetime_timezone
|
||||
import json
|
||||
from pathlib import Path
|
||||
import shlex
|
||||
import socket
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, ClassVar
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
from typing import ClassVar
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abxbus import BaseEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
|
||||
WORKER_READY_TIMEOUT = 10.0
|
||||
|
||||
|
||||
def parse_event_datetime(value: str | None):
|
||||
if not value:
|
||||
@@ -37,308 +23,133 @@ def parse_event_datetime(value: str | None):
|
||||
return dt
|
||||
|
||||
|
||||
def _is_port_listening(host: str, port: int) -> bool:
|
||||
if not host or not port:
|
||||
return False
|
||||
try:
|
||||
with socket.create_connection((host, port), timeout=0.5):
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
def _worker_socket_from_url(url: str) -> tuple[str, int] | None:
|
||||
if not url:
|
||||
return None
|
||||
parsed = urlparse(url)
|
||||
if parsed.scheme != "tcp" or not parsed.hostname or not parsed.port:
|
||||
return None
|
||||
return parsed.hostname, parsed.port
|
||||
|
||||
|
||||
def _supervisor_env(env: dict[str, str]) -> str:
|
||||
pairs = []
|
||||
for key, value in env.items():
|
||||
escaped = value.replace('"', '\\"')
|
||||
pairs.append(f'{key}="{escaped}"')
|
||||
return ",".join(pairs)
|
||||
|
||||
|
||||
def _iso_from_epoch(value: object) -> str:
|
||||
if not isinstance(value, (int, float)) or value <= 0:
|
||||
return ""
|
||||
return datetime.fromtimestamp(value, tz=datetime_timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _int_from_object(value: object) -> int:
|
||||
if isinstance(value, bool):
|
||||
return int(value)
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
if isinstance(value, float):
|
||||
return int(value)
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
return 0
|
||||
return 0
|
||||
|
||||
|
||||
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
|
||||
from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker
|
||||
|
||||
output_dir = Path(process_event.output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
worker_name = process_event.hook_name
|
||||
supervisor = get_or_create_supervisord_process(daemonize=True)
|
||||
worker_socket = _worker_socket_from_url(getattr(process_event, "url", ""))
|
||||
|
||||
existing = get_worker(supervisor, worker_name)
|
||||
if (
|
||||
isinstance(existing, dict)
|
||||
and existing.get("statename") == "RUNNING"
|
||||
and (worker_socket is None or _is_port_listening(*worker_socket))
|
||||
):
|
||||
return existing
|
||||
|
||||
daemon = {
|
||||
"name": worker_name,
|
||||
"command": shlex.join([process_event.hook_path, *process_event.hook_args]),
|
||||
"directory": str(output_dir),
|
||||
"autostart": "false",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
|
||||
"redirect_stderr": "true",
|
||||
}
|
||||
if process_event.env:
|
||||
daemon["environment"] = _supervisor_env(process_event.env)
|
||||
|
||||
proc = start_worker(supervisor, daemon)
|
||||
deadline = time.monotonic() + WORKER_READY_TIMEOUT
|
||||
while time.monotonic() < deadline:
|
||||
current = get_worker(supervisor, worker_name)
|
||||
if isinstance(current, dict) and current.get("statename") == "RUNNING":
|
||||
if worker_socket is None or _is_port_listening(*worker_socket):
|
||||
return current
|
||||
time.sleep(0.1)
|
||||
return proc if isinstance(proc, dict) else {}
|
||||
|
||||
|
||||
class ProcessService(BaseService):
|
||||
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
||||
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
||||
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStartedEvent, ProcessCompletedEvent]
|
||||
EMITS: ClassVar[list[type[BaseEvent]]] = []
|
||||
|
||||
def __init__(self, bus):
|
||||
self.process_ids: dict[str, str] = {}
|
||||
super().__init__(bus)
|
||||
self.bus.on(ProcessStartedEvent, self.on_ProcessStartedEvent__save_to_db)
|
||||
self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)
|
||||
|
||||
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
|
||||
try:
|
||||
record = json.loads(event.line)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
return
|
||||
if not isinstance(record, dict) or record.get("type") != "ProcessEvent":
|
||||
return
|
||||
|
||||
passthrough_fields: dict[str, Any] = {
|
||||
key: value
|
||||
for key, value in record.items()
|
||||
if key
|
||||
not in {
|
||||
"type",
|
||||
"plugin_name",
|
||||
"hook_name",
|
||||
"hook_path",
|
||||
"hook_args",
|
||||
"is_background",
|
||||
"output_dir",
|
||||
"env",
|
||||
"snapshot_id",
|
||||
"process_id",
|
||||
"url",
|
||||
"timeout",
|
||||
"daemon",
|
||||
"process_type",
|
||||
"worker_type",
|
||||
"event_timeout",
|
||||
"event_handler_timeout",
|
||||
}
|
||||
}
|
||||
process_event = ProcessEvent(
|
||||
plugin_name=record.get("plugin_name") or event.plugin_name,
|
||||
hook_name=record.get("hook_name") or "process",
|
||||
hook_path=record["hook_path"],
|
||||
hook_args=[str(arg) for arg in record.get("hook_args", [])],
|
||||
is_background=bool(record.get("is_background", True)),
|
||||
output_dir=record.get("output_dir") or event.output_dir,
|
||||
env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
|
||||
snapshot_id=record.get("snapshot_id") or event.snapshot_id,
|
||||
timeout=int(record.get("timeout") or 60),
|
||||
daemon=bool(record.get("daemon", False)),
|
||||
url=str(record.get("url") or ""),
|
||||
process_type=str(record.get("process_type") or ""),
|
||||
worker_type=str(record.get("worker_type") or ""),
|
||||
event_timeout=float(record.get("event_timeout") or 360.0),
|
||||
event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
|
||||
**passthrough_fields,
|
||||
)
|
||||
if not process_event.daemon:
|
||||
await self.bus.emit(process_event)
|
||||
return
|
||||
|
||||
proc = await asyncio.to_thread(_ensure_worker, process_event)
|
||||
process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
|
||||
start_ts = _iso_from_epoch(proc.get("start"))
|
||||
pid = _int_from_object(proc.get("pid"))
|
||||
statename = str(proc.get("statename") or "")
|
||||
exitstatus = _int_from_object(proc.get("exitstatus"))
|
||||
process_type = process_event.process_type or "worker"
|
||||
worker_type = process_event.worker_type or process_event.plugin_name
|
||||
|
||||
if statename == "RUNNING" and pid:
|
||||
await self.bus.emit(
|
||||
ProcessStartedEvent(
|
||||
plugin_name=process_event.plugin_name,
|
||||
hook_name=process_event.hook_name,
|
||||
hook_path=process_event.hook_path,
|
||||
hook_args=process_event.hook_args,
|
||||
output_dir=process_event.output_dir,
|
||||
env=process_event.env,
|
||||
timeout=process_event.timeout,
|
||||
pid=pid,
|
||||
process_id=process_id,
|
||||
snapshot_id=process_event.snapshot_id,
|
||||
is_background=True,
|
||||
url=process_event.url,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
start_ts=start_ts,
|
||||
**passthrough_fields,
|
||||
),
|
||||
)
|
||||
return
|
||||
|
||||
stderr = (
|
||||
f"Worker {process_event.hook_name} failed to start"
|
||||
if not statename
|
||||
else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
|
||||
)
|
||||
await self.bus.emit(
|
||||
ProcessCompletedEvent(
|
||||
plugin_name=process_event.plugin_name,
|
||||
hook_name=process_event.hook_name,
|
||||
hook_path=process_event.hook_path,
|
||||
hook_args=process_event.hook_args,
|
||||
env=process_event.env,
|
||||
stdout="",
|
||||
stderr=stderr,
|
||||
exit_code=exitstatus or 1,
|
||||
output_dir=process_event.output_dir,
|
||||
is_background=True,
|
||||
process_id=process_id,
|
||||
snapshot_id=process_event.snapshot_id,
|
||||
pid=pid,
|
||||
url=process_event.url,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
start_ts=start_ts,
|
||||
end_ts=datetime.now(tz=datetime_timezone.utc).isoformat(),
|
||||
**passthrough_fields,
|
||||
),
|
||||
)
|
||||
raise RuntimeError(stderr)
|
||||
|
||||
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
|
||||
await run_db_op(self._project_started, event)
|
||||
|
||||
async def on_ProcessCompletedEvent__Outer(self, event: ProcessCompletedEvent) -> None:
|
||||
await run_db_op(self._project_completed, event)
|
||||
|
||||
def get_db_process_id(self, process_id: str) -> str | None:
|
||||
return self.process_ids.get(process_id)
|
||||
|
||||
def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> Process:
|
||||
async def on_ProcessStartedEvent__save_to_db(self, event: ProcessStartedEvent) -> None:
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
|
||||
db_process_id = self.process_ids.get(event.process_id)
|
||||
iface = NetworkInterface.current(refresh=True)
|
||||
if db_process_id:
|
||||
process = Process.objects.filter(id=db_process_id).first()
|
||||
if process is not None:
|
||||
if getattr(process, "iface_id", None) != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
return process
|
||||
|
||||
process_type = getattr(event, "process_type", "") or (
|
||||
iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
|
||||
process_type = event.process_type or (
|
||||
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
|
||||
)
|
||||
worker_type = getattr(event, "worker_type", "") or ""
|
||||
if process_type == Process.TypeChoices.WORKER and worker_type:
|
||||
existing = (
|
||||
Process.objects.filter(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
)
|
||||
.order_by("-modified_at")
|
||||
.first()
|
||||
)
|
||||
if existing is not None:
|
||||
self.process_ids[event.process_id] = str(existing.id)
|
||||
return existing
|
||||
process = Process.objects.create(
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
worker_type = event.worker_type or ""
|
||||
started_at = parse_event_datetime(event.start_ts)
|
||||
if started_at is None:
|
||||
raise ValueError("ProcessStartedEvent.start_ts is required")
|
||||
process_query = Process.objects.filter(
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
env=event.env,
|
||||
timeout=getattr(event, "timeout", 60),
|
||||
pid=event.pid or None,
|
||||
url=getattr(event, "url", "") or None,
|
||||
started_at=parse_event_datetime(getattr(event, "start_ts", "")),
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
retry_at=None,
|
||||
started_at=started_at,
|
||||
)
|
||||
self.process_ids[event.process_id] = str(process.id)
|
||||
return process
|
||||
if event.pid:
|
||||
process_query = process_query.filter(pid=event.pid)
|
||||
process = await process_query.order_by("-modified_at").afirst()
|
||||
if process is None:
|
||||
process = await Process.objects.acreate(
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
env=event.env,
|
||||
timeout=event.timeout,
|
||||
pid=event.pid or None,
|
||||
url=event.url or None,
|
||||
started_at=started_at,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
retry_at=None,
|
||||
)
|
||||
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
await process.asave(update_fields=["iface", "machine", "modified_at"])
|
||||
|
||||
def _project_started(self, event: ProcessStartedEvent) -> None:
|
||||
process = self._get_or_create_process(event)
|
||||
process.pwd = event.output_dir
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.timeout = event.timeout
|
||||
process.pid = event.pid or None
|
||||
process.url = getattr(event, "url", "") or process.url
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
|
||||
process.url = event.url or process.url
|
||||
process.process_type = process_type or process.process_type
|
||||
process.worker_type = worker_type or process.worker_type
|
||||
process.started_at = started_at
|
||||
process.status = process.StatusChoices.RUNNING
|
||||
process.retry_at = None
|
||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
||||
process.save()
|
||||
await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
|
||||
plugin_name=event.plugin_name,
|
||||
hook_path=event.hook_path,
|
||||
)
|
||||
await process.asave()
|
||||
|
||||
async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
|
||||
iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)
|
||||
process_type = event.process_type or (
|
||||
Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
|
||||
)
|
||||
worker_type = event.worker_type or ""
|
||||
started_at = parse_event_datetime(event.start_ts)
|
||||
if started_at is None:
|
||||
raise ValueError("ProcessCompletedEvent.start_ts is required")
|
||||
process_query = Process.objects.filter(
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
started_at=started_at,
|
||||
)
|
||||
if event.pid:
|
||||
process_query = process_query.filter(pid=event.pid)
|
||||
process = await process_query.order_by("-modified_at").afirst()
|
||||
if process is None:
|
||||
process = await Process.objects.acreate(
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=process_type,
|
||||
worker_type=worker_type,
|
||||
pwd=event.output_dir,
|
||||
cmd=[event.hook_path, *event.hook_args],
|
||||
env=event.env,
|
||||
timeout=event.timeout,
|
||||
pid=event.pid or None,
|
||||
url=event.url or None,
|
||||
started_at=started_at,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
retry_at=None,
|
||||
)
|
||||
elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
await process.asave(update_fields=["iface", "machine", "modified_at"])
|
||||
|
||||
def _project_completed(self, event: ProcessCompletedEvent) -> None:
|
||||
process = self._get_or_create_process(event)
|
||||
process.pwd = event.output_dir
|
||||
if not process.cmd:
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.pid = event.pid or process.pid
|
||||
process.url = getattr(event, "url", "") or process.url
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
|
||||
process.url = event.url or process.url
|
||||
process.process_type = process_type or process.process_type
|
||||
process.worker_type = worker_type or process.worker_type
|
||||
process.started_at = started_at
|
||||
process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
|
||||
process.stdout = event.stdout
|
||||
process.stderr = event.stderr
|
||||
process.exit_code = event.exit_code
|
||||
process.status = process.StatusChoices.EXITED
|
||||
process.retry_at = None
|
||||
process.hydrate_binary_from_context(plugin_name=event.plugin_name, hook_path=event.hook_path)
|
||||
process.save()
|
||||
await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
|
||||
plugin_name=event.plugin_name,
|
||||
hook_path=event.hook_path,
|
||||
)
|
||||
await process.asave()
|
||||
|
||||
@@ -3,7 +3,6 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
@@ -13,12 +12,13 @@ from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import Any
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
from rich.console import Console
|
||||
|
||||
from abx_dl.events import BinaryRequestEvent
|
||||
from abx_dl.limits import CrawlLimitState
|
||||
from abx_dl.models import Plugin, Snapshot as AbxSnapshot, discover_plugins, filter_plugins
|
||||
from abx_dl.models import Plugin, discover_plugins, filter_plugins
|
||||
from abx_dl.orchestrator import (
|
||||
create_bus,
|
||||
download,
|
||||
@@ -40,150 +40,9 @@ def _bus_name(prefix: str, identifier: str) -> str:
|
||||
return f"{prefix}_{normalized}"
|
||||
|
||||
|
||||
def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
|
||||
raw = str(config.get("PLUGINS") or "").strip()
|
||||
if not raw:
|
||||
return None
|
||||
return [name.strip() for name in raw.split(",") if name.strip()]
|
||||
|
||||
|
||||
def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str] | None) -> int:
|
||||
selected = filter_plugins(plugins, selected_plugins) if selected_plugins else plugins
|
||||
return sum(
|
||||
1
|
||||
for plugin in selected.values()
|
||||
for hook in plugin.hooks
|
||||
if "Install" in hook.name or "CrawlSetup" in hook.name or "Snapshot" in hook.name
|
||||
)
|
||||
|
||||
|
||||
_TEMPLATE_NAME_RE = re.compile(r"^\{([A-Z0-9_]+)\}$")
|
||||
|
||||
|
||||
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str, config: dict[str, Any]) -> list[str]:
|
||||
keys: list[str] = []
|
||||
|
||||
for plugin in plugins.values():
|
||||
for spec in plugin.binaries:
|
||||
template_name = str(spec.get("name") or "").strip()
|
||||
match = _TEMPLATE_NAME_RE.fullmatch(template_name)
|
||||
if match is None:
|
||||
continue
|
||||
key = match.group(1)
|
||||
configured_value = config.get(key)
|
||||
if configured_value is not None and str(configured_value).strip() == binary_name:
|
||||
keys.append(key)
|
||||
for key, prop in plugin.config_schema.items():
|
||||
if key.endswith("_BINARY") and prop.get("default") == binary_name:
|
||||
keys.append(key)
|
||||
|
||||
return list(dict.fromkeys(keys))
|
||||
|
||||
|
||||
def _installed_binary_config_overrides(plugins: dict[str, Plugin], config: dict[str, Any] | None = None) -> dict[str, str]:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
active_config = dict(config or {})
|
||||
overrides: dict[str, str] = {}
|
||||
shared_lib_dir: Path | None = None
|
||||
pip_home: Path | None = None
|
||||
pip_bin_dir: Path | None = None
|
||||
npm_home: Path | None = None
|
||||
node_modules_dir: Path | None = None
|
||||
npm_bin_dir: Path | None = None
|
||||
binaries = (
|
||||
Binary.objects.filter(machine=machine, status=Binary.StatusChoices.INSTALLED).exclude(abspath="").exclude(abspath__isnull=True)
|
||||
)
|
||||
|
||||
for binary in binaries:
|
||||
try:
|
||||
resolved_path = Path(binary.abspath).expanduser()
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if not resolved_path.is_file() or not os.access(resolved_path, os.X_OK):
|
||||
continue
|
||||
for key in _binary_config_keys_for_plugins(plugins, binary.name, active_config):
|
||||
overrides[key] = binary.abspath
|
||||
|
||||
if resolved_path.parent.name == ".bin" and resolved_path.parent.parent.name == "node_modules":
|
||||
npm_bin_dir = npm_bin_dir or resolved_path.parent
|
||||
node_modules_dir = node_modules_dir or resolved_path.parent.parent
|
||||
npm_home = npm_home or resolved_path.parent.parent.parent
|
||||
shared_lib_dir = shared_lib_dir or resolved_path.parent.parent.parent.parent
|
||||
elif (
|
||||
resolved_path.parent.name == "bin"
|
||||
and resolved_path.parent.parent.name == "venv"
|
||||
and resolved_path.parent.parent.parent.name == "pip"
|
||||
):
|
||||
pip_bin_dir = pip_bin_dir or resolved_path.parent
|
||||
pip_home = pip_home or resolved_path.parent.parent.parent
|
||||
shared_lib_dir = shared_lib_dir or resolved_path.parent.parent.parent.parent
|
||||
|
||||
if shared_lib_dir is not None:
|
||||
overrides["LIB_DIR"] = str(shared_lib_dir)
|
||||
overrides["LIB_BIN_DIR"] = str(shared_lib_dir / "bin")
|
||||
if pip_home is not None:
|
||||
overrides["PIP_HOME"] = str(pip_home)
|
||||
if pip_bin_dir is not None:
|
||||
overrides["PIP_BIN_DIR"] = str(pip_bin_dir)
|
||||
if npm_home is not None:
|
||||
overrides["NPM_HOME"] = str(npm_home)
|
||||
if node_modules_dir is not None:
|
||||
overrides["NODE_MODULES_DIR"] = str(node_modules_dir)
|
||||
overrides["NODE_MODULE_DIR"] = str(node_modules_dir)
|
||||
overrides["NODE_PATH"] = str(node_modules_dir)
|
||||
if npm_bin_dir is not None:
|
||||
overrides["NPM_BIN_DIR"] = str(npm_bin_dir)
|
||||
|
||||
return overrides
|
||||
|
||||
|
||||
def _limit_stop_reason(config: dict[str, Any]) -> str:
|
||||
return CrawlLimitState.from_config(config).get_stop_reason()
|
||||
|
||||
|
||||
def _attach_bus_trace(bus) -> None:
|
||||
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
|
||||
if not trace_target:
|
||||
return
|
||||
if getattr(bus, "_archivebox_trace_task", None) is not None:
|
||||
return
|
||||
|
||||
trace_path = None if trace_target in {"1", "-", "stderr"} else Path(trace_target)
|
||||
stop_event = asyncio.Event()
|
||||
|
||||
async def trace_loop() -> None:
|
||||
seen_event_ids: set[str] = set()
|
||||
while not stop_event.is_set():
|
||||
for event_id, event in list(bus.event_history.items()):
|
||||
if event_id in seen_event_ids:
|
||||
continue
|
||||
seen_event_ids.add(event_id)
|
||||
payload = event.model_dump(mode="json")
|
||||
payload["bus_name"] = bus.name
|
||||
line = json.dumps(payload, ensure_ascii=False, default=str, separators=(",", ":"))
|
||||
if trace_path is None:
|
||||
print(line, file=sys.stderr, flush=True)
|
||||
else:
|
||||
trace_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with trace_path.open("a", encoding="utf-8") as handle:
|
||||
handle.write(line + "\n")
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
bus._archivebox_trace_stop = stop_event
|
||||
bus._archivebox_trace_task = asyncio.create_task(trace_loop())
|
||||
|
||||
|
||||
async def _stop_bus_trace(bus) -> None:
|
||||
stop_event = getattr(bus, "_archivebox_trace_stop", None)
|
||||
trace_task = getattr(bus, "_archivebox_trace_task", None)
|
||||
if stop_event is None or trace_task is None:
|
||||
return
|
||||
stop_event.set()
|
||||
await asyncio.gather(trace_task, return_exceptions=True)
|
||||
bus._archivebox_trace_stop = None
|
||||
bus._archivebox_trace_task = None
|
||||
return sum(1 for plugin in selected.values() for hook in plugin.hooks if "CrawlSetup" in hook.name or "Snapshot" in hook.name)
|
||||
|
||||
|
||||
def ensure_background_runner(*, allow_under_pytest: bool = False) -> bool:
|
||||
@@ -235,22 +94,25 @@ class CrawlRunner:
|
||||
self.crawl = crawl
|
||||
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
|
||||
self.plugins = discover_plugins()
|
||||
self.process_service = ProcessService(self.bus)
|
||||
self.binary_service = BinaryService(self.bus)
|
||||
self.tag_service = TagService(self.bus)
|
||||
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||
ProcessService(self.bus)
|
||||
BinaryService(self.bus)
|
||||
TagService(self.bus)
|
||||
CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
|
||||
self.snapshot_service = SnapshotService(
|
||||
|
||||
async def ignore_snapshot(_snapshot_id: str) -> None:
|
||||
return None
|
||||
|
||||
SnapshotService(
|
||||
self.bus,
|
||||
crawl_id=str(crawl.id),
|
||||
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else self.leave_snapshot_queued,
|
||||
schedule_snapshot=self.enqueue_snapshot if process_discovered_snapshots_inline else ignore_snapshot,
|
||||
)
|
||||
self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
|
||||
ArchiveResultService(self.bus)
|
||||
self.selected_plugins = selected_plugins
|
||||
self.initial_snapshot_ids = snapshot_ids
|
||||
self.snapshot_tasks: dict[str, asyncio.Task[None]] = {}
|
||||
self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS)
|
||||
self.abx_services = None
|
||||
self.persona = None
|
||||
self.base_config: dict[str, Any] = {}
|
||||
self.derived_config: dict[str, Any] = {}
|
||||
@@ -258,15 +120,11 @@ class CrawlRunner:
|
||||
self._live_stream = None
|
||||
|
||||
async def run(self) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
try:
|
||||
await sync_to_async(self._prepare, thread_sensitive=True)()
|
||||
snapshot_ids = await sync_to_async(self.load_run_state, thread_sensitive=True)()
|
||||
live_ui = self._create_live_ui()
|
||||
with live_ui if live_ui is not None else nullcontext():
|
||||
_attach_bus_trace(self.bus)
|
||||
self.abx_services = setup_abx_services(
|
||||
setup_abx_services(
|
||||
self.bus,
|
||||
plugins=self.plugins,
|
||||
config_overrides={
|
||||
@@ -278,18 +136,14 @@ class CrawlRunner:
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
|
||||
if snapshot_ids:
|
||||
root_snapshot_id = snapshot_ids[0]
|
||||
await self._run_crawl_setup(root_snapshot_id)
|
||||
await self.run_crawl_setup(root_snapshot_id)
|
||||
for snapshot_id in snapshot_ids:
|
||||
await self.enqueue_snapshot(snapshot_id)
|
||||
await self._wait_for_snapshot_tasks()
|
||||
await self._run_crawl_cleanup(root_snapshot_id)
|
||||
if self.abx_services is not None:
|
||||
await self.abx_services.process.wait_for_background_monitors()
|
||||
await self.wait_for_snapshot_tasks()
|
||||
await self.run_crawl_cleanup(root_snapshot_id)
|
||||
finally:
|
||||
await _stop_bus_trace(self.bus)
|
||||
await self.bus.stop()
|
||||
if self._live_stream is not None:
|
||||
try:
|
||||
@@ -297,33 +151,16 @@ class CrawlRunner:
|
||||
except Exception:
|
||||
pass
|
||||
self._live_stream = None
|
||||
await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
|
||||
crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
|
||||
crawl_is_finished = await sync_to_async(crawl.is_finished, thread_sensitive=True)()
|
||||
if crawl_is_finished:
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
||||
else:
|
||||
if crawl.status == Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
elif crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = crawl.retry_at or timezone.now()
|
||||
await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])
|
||||
await sync_to_async(self.finalize_run_state, thread_sensitive=True)()
|
||||
|
||||
async def enqueue_snapshot(self, snapshot_id: str) -> None:
|
||||
task = self.snapshot_tasks.get(snapshot_id)
|
||||
if task is not None and not task.done():
|
||||
return
|
||||
task = asyncio.create_task(self._run_snapshot(snapshot_id))
|
||||
task = asyncio.create_task(self.run_snapshot(snapshot_id))
|
||||
self.snapshot_tasks[snapshot_id] = task
|
||||
|
||||
async def leave_snapshot_queued(self, snapshot_id: str) -> None:
|
||||
return None
|
||||
|
||||
async def _wait_for_snapshot_tasks(self) -> None:
|
||||
async def wait_for_snapshot_tasks(self) -> None:
|
||||
while True:
|
||||
pending_tasks: list[asyncio.Task[None]] = []
|
||||
for snapshot_id, task in list(self.snapshot_tasks.items()):
|
||||
@@ -339,9 +176,9 @@ class CrawlRunner:
|
||||
for task in done:
|
||||
task.result()
|
||||
|
||||
def _prepare(self) -> None:
|
||||
def load_run_state(self) -> list[str]:
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
from archivebox.machine.models import Machine, NetworkInterface, Process
|
||||
|
||||
self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
|
||||
current_iface = NetworkInterface.current(refresh=True)
|
||||
@@ -352,17 +189,42 @@ class CrawlRunner:
|
||||
current_process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
self.persona = self.crawl.resolve_persona()
|
||||
self.base_config = get_config(crawl=self.crawl)
|
||||
self.derived_config = _installed_binary_config_overrides(self.plugins, self.base_config)
|
||||
self.derived_config = dict(Machine.current().config)
|
||||
self.base_config["ABX_RUNTIME"] = "archivebox"
|
||||
if self.selected_plugins is None:
|
||||
self.selected_plugins = _selected_plugins_from_config(self.base_config)
|
||||
raw_plugins = self.base_config["PLUGINS"].strip()
|
||||
self.selected_plugins = [name.strip() for name in raw_plugins.split(",") if name.strip()] if raw_plugins else None
|
||||
if self.persona:
|
||||
chrome_binary = str(self.base_config.get("CHROME_BINARY") or "")
|
||||
self.base_config.update(self.persona.prepare_runtime_for_crawl(self.crawl, chrome_binary=chrome_binary))
|
||||
self.base_config.update(
|
||||
self.persona.prepare_runtime_for_crawl(
|
||||
self.crawl,
|
||||
chrome_binary=self.base_config["CHROME_BINARY"],
|
||||
),
|
||||
)
|
||||
if self.initial_snapshot_ids:
|
||||
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
|
||||
created = self.crawl.create_snapshots_from_urls()
|
||||
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
|
||||
return [str(snapshot.id) for snapshot in snapshots]
|
||||
|
||||
def finalize_run_state(self) -> None:
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
def _cleanup_persona(self) -> None:
|
||||
if self.persona:
|
||||
self.persona.cleanup_runtime_for_crawl(self.crawl)
|
||||
crawl = Crawl.objects.get(id=self.crawl.id)
|
||||
if crawl.is_finished():
|
||||
if crawl.status != Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.SEALED
|
||||
crawl.retry_at = None
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
return
|
||||
if crawl.status == Crawl.StatusChoices.SEALED:
|
||||
crawl.status = Crawl.StatusChoices.QUEUED
|
||||
elif crawl.status != Crawl.StatusChoices.STARTED:
|
||||
crawl.status = Crawl.StatusChoices.STARTED
|
||||
crawl.retry_at = crawl.retry_at or timezone.now()
|
||||
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
def _create_live_ui(self) -> LiveBusUI | None:
|
||||
stdout_is_tty = sys.stdout.isatty()
|
||||
@@ -373,7 +235,7 @@ class CrawlRunner:
|
||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||
if os.path.exists("/dev/tty"):
|
||||
try:
|
||||
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
||||
self._live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
|
||||
stream = self._live_stream
|
||||
except OSError:
|
||||
self._live_stream = None
|
||||
@@ -399,7 +261,7 @@ class CrawlRunner:
|
||||
live_ui = LiveBusUI(
|
||||
self.bus,
|
||||
total_hooks=_count_selected_hooks(self.plugins, self.selected_plugins),
|
||||
timeout_seconds=int(self.base_config.get("TIMEOUT") or 60),
|
||||
timeout_seconds=self.base_config["TIMEOUT"],
|
||||
ui_console=ui_console,
|
||||
interactive_tty=True,
|
||||
)
|
||||
@@ -410,128 +272,24 @@ class CrawlRunner:
|
||||
)
|
||||
return live_ui
|
||||
|
||||
def _create_root_snapshots(self) -> list[str]:
|
||||
created = self.crawl.create_snapshots_from_urls()
|
||||
snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
|
||||
return [str(snapshot.id) for snapshot in snapshots]
|
||||
|
||||
def _initial_snapshot_ids(self) -> list[str]:
|
||||
if self.initial_snapshot_ids:
|
||||
return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
|
||||
return self._create_root_snapshots()
|
||||
|
||||
def _snapshot_config(self, snapshot) -> dict[str, Any]:
|
||||
def load_snapshot_payload(self, snapshot_id: str) -> dict[str, Any]:
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
|
||||
config = get_config(crawl=self.crawl, snapshot=snapshot)
|
||||
config.update(self.base_config)
|
||||
config["CRAWL_DIR"] = str(self.crawl.output_dir)
|
||||
config["SNAP_DIR"] = str(snapshot.output_dir)
|
||||
config["SNAPSHOT_ID"] = str(snapshot.id)
|
||||
config["SNAPSHOT_DEPTH"] = snapshot.depth
|
||||
config["CRAWL_ID"] = str(self.crawl.id)
|
||||
config["SOURCE_URL"] = snapshot.url
|
||||
if snapshot.parent_snapshot_id:
|
||||
config["PARENT_SNAPSHOT_ID"] = str(snapshot.parent_snapshot_id)
|
||||
return config
|
||||
|
||||
async def _run_crawl_setup(self, snapshot_id: str) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
||||
setup_snapshot = AbxSnapshot(
|
||||
url=snapshot["url"],
|
||||
id=snapshot["id"],
|
||||
title=snapshot["title"],
|
||||
timestamp=snapshot["timestamp"],
|
||||
bookmarked_at=snapshot["bookmarked_at"],
|
||||
created_at=snapshot["created_at"],
|
||||
tags=snapshot["tags"],
|
||||
depth=snapshot["depth"],
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=setup_snapshot,
|
||||
crawl_setup_only=True,
|
||||
)
|
||||
|
||||
async def _run_crawl_cleanup(self, snapshot_id: str) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
||||
cleanup_snapshot = AbxSnapshot(
|
||||
url=snapshot["url"],
|
||||
id=snapshot["id"],
|
||||
title=snapshot["title"],
|
||||
timestamp=snapshot["timestamp"],
|
||||
bookmarked_at=snapshot["bookmarked_at"],
|
||||
created_at=snapshot["created_at"],
|
||||
tags=snapshot["tags"],
|
||||
depth=snapshot["depth"],
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=cleanup_snapshot,
|
||||
crawl_cleanup_only=True,
|
||||
)
|
||||
|
||||
async def _run_snapshot(self, snapshot_id: str) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
async with self.snapshot_semaphore:
|
||||
snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
|
||||
if snapshot["status"] == "sealed":
|
||||
return
|
||||
if snapshot["depth"] > 0 and _limit_stop_reason(snapshot["config"]) == "max_size":
|
||||
await sync_to_async(self._cancel_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
|
||||
return
|
||||
abx_snapshot = AbxSnapshot(
|
||||
url=snapshot["url"],
|
||||
id=snapshot["id"],
|
||||
title=snapshot["title"],
|
||||
timestamp=snapshot["timestamp"],
|
||||
bookmarked_at=snapshot["bookmarked_at"],
|
||||
created_at=snapshot["created_at"],
|
||||
tags=snapshot["tags"],
|
||||
depth=snapshot["depth"],
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
try:
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=abx_snapshot,
|
||||
skip_crawl_setup=True,
|
||||
skip_crawl_cleanup=True,
|
||||
)
|
||||
finally:
|
||||
current_task = asyncio.current_task()
|
||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
|
||||
def _load_snapshot_run_data(self, snapshot_id: str):
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
|
||||
extra_context: dict[str, Any] = {}
|
||||
if config.get("EXTRA_CONTEXT"):
|
||||
parsed_extra_context = json.loads(str(config["EXTRA_CONTEXT"]))
|
||||
if not isinstance(parsed_extra_context, dict):
|
||||
raise TypeError("EXTRA_CONTEXT must decode to an object")
|
||||
extra_context = parsed_extra_context
|
||||
extra_context["snapshot_id"] = str(snapshot.id)
|
||||
extra_context["snapshot_depth"] = snapshot.depth
|
||||
config["EXTRA_CONTEXT"] = json.dumps(extra_context, separators=(",", ":"), sort_keys=True)
|
||||
return {
|
||||
"id": str(snapshot.id),
|
||||
"url": snapshot.url,
|
||||
@@ -542,12 +300,91 @@ class CrawlRunner:
|
||||
"tags": snapshot.tags_str(),
|
||||
"depth": snapshot.depth,
|
||||
"status": snapshot.status,
|
||||
"parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
|
||||
"output_dir": str(snapshot.output_dir),
|
||||
"config": self._snapshot_config(snapshot),
|
||||
"config": config,
|
||||
}
|
||||
|
||||
def _cancel_snapshot_due_to_limit(self, snapshot_id: str) -> None:
|
||||
async def run_crawl_setup(self, snapshot_id: str) -> None:
|
||||
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
derived_config_overrides=self.derived_config,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
install_enabled=True,
|
||||
crawl_setup_enabled=True,
|
||||
crawl_start_enabled=False,
|
||||
snapshot_cleanup_enabled=False,
|
||||
crawl_cleanup_enabled=False,
|
||||
machine_service=None,
|
||||
binary_service=None,
|
||||
process_service=None,
|
||||
archive_result_service=None,
|
||||
tag_service=None,
|
||||
)
|
||||
|
||||
async def run_crawl_cleanup(self, snapshot_id: str) -> None:
|
||||
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
|
||||
await download(
|
||||
bus=self.bus,
|
||||
url=snapshot["url"],
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
plugins=self.plugins,
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
derived_config_overrides=self.derived_config,
|
||||
emit_jsonl=False,
|
||||
install_enabled=False,
|
||||
crawl_setup_enabled=False,
|
||||
crawl_start_enabled=False,
|
||||
snapshot_cleanup_enabled=False,
|
||||
crawl_cleanup_enabled=True,
|
||||
machine_service=None,
|
||||
binary_service=None,
|
||||
process_service=None,
|
||||
archive_result_service=None,
|
||||
tag_service=None,
|
||||
)
|
||||
|
||||
async def run_snapshot(self, snapshot_id: str) -> None:
|
||||
async with self.snapshot_semaphore:
|
||||
snapshot = await sync_to_async(self.load_snapshot_payload, thread_sensitive=True)(snapshot_id)
|
||||
if snapshot["status"] == "sealed":
|
||||
return
|
||||
if snapshot["depth"] > 0 and CrawlLimitState.from_config(snapshot["config"]).get_stop_reason() == "max_size":
|
||||
await sync_to_async(self.seal_snapshot_due_to_limit, thread_sensitive=True)(snapshot_id)
|
||||
return
|
||||
try:
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
derived_config_overrides=self.derived_config,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
install_enabled=False,
|
||||
crawl_setup_enabled=False,
|
||||
crawl_start_enabled=True,
|
||||
snapshot_cleanup_enabled=True,
|
||||
crawl_cleanup_enabled=False,
|
||||
machine_service=None,
|
||||
binary_service=None,
|
||||
process_service=None,
|
||||
archive_result_service=None,
|
||||
tag_service=None,
|
||||
)
|
||||
finally:
|
||||
current_task = asyncio.current_task()
|
||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
|
||||
def seal_snapshot_due_to_limit(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).first()
|
||||
@@ -579,21 +416,20 @@ def run_crawl(
|
||||
|
||||
|
||||
async def _run_binary(binary_id: str) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import Binary
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
|
||||
binary = await Binary.objects.aget(id=binary_id)
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
derived_config = dict(machine.config)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
|
||||
process_service = ProcessService(bus)
|
||||
ProcessService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
ArchiveResultService(bus)
|
||||
setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
@@ -605,7 +441,6 @@ async def _run_binary(binary_id: str) -> None:
|
||||
)
|
||||
|
||||
try:
|
||||
_attach_bus_trace(bus)
|
||||
await bus.emit(
|
||||
BinaryRequestEvent(
|
||||
name=binary.name,
|
||||
@@ -619,7 +454,6 @@ async def _run_binary(binary_id: str) -> None:
|
||||
),
|
||||
)
|
||||
finally:
|
||||
await _stop_bus_trace(bus)
|
||||
await bus.stop()
|
||||
|
||||
|
||||
@@ -628,20 +462,20 @@ def run_binary(binary_id: str) -> None:
|
||||
|
||||
|
||||
async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
||||
machine = await sync_to_async(Machine.current, thread_sensitive=True)()
|
||||
derived_config = dict(machine.config)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
|
||||
process_service = ProcessService(bus)
|
||||
ProcessService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
abx_services = setup_abx_services(
|
||||
ArchiveResultService(bus)
|
||||
setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
config_overrides=config,
|
||||
@@ -657,7 +491,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
if not selected_plugins:
|
||||
return
|
||||
plugins_label = ", ".join(plugin_names) if plugin_names else f"all ({len(plugins)} available)"
|
||||
timeout_seconds = int(config.get("TIMEOUT") or 60)
|
||||
timeout_seconds = config["TIMEOUT"]
|
||||
stdout_is_tty = sys.stdout.isatty()
|
||||
stderr_is_tty = sys.stderr.isatty()
|
||||
interactive_tty = stdout_is_tty or stderr_is_tty
|
||||
@@ -668,7 +502,7 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
stream = sys.stderr if stderr_is_tty else sys.stdout
|
||||
if os.path.exists("/dev/tty"):
|
||||
try:
|
||||
live_stream = open("/dev/tty", "w", buffering=1, encoding=getattr(stream, "encoding", None) or "utf-8")
|
||||
live_stream = open("/dev/tty", "w", buffering=1, encoding=stream.encoding or "utf-8")
|
||||
stream = live_stream
|
||||
except OSError:
|
||||
live_stream = None
|
||||
@@ -707,20 +541,21 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
plugins_label=plugins_label,
|
||||
)
|
||||
with live_ui if live_ui is not None else nullcontext():
|
||||
_attach_bus_trace(bus)
|
||||
results = await abx_install_plugins(
|
||||
plugin_names=plugin_names,
|
||||
plugins=plugins,
|
||||
output_dir=output_dir,
|
||||
config_overrides=config,
|
||||
derived_config_overrides=derived_config,
|
||||
emit_jsonl=False,
|
||||
bus=bus,
|
||||
machine_service=None,
|
||||
binary_service=None,
|
||||
process_service=None,
|
||||
)
|
||||
await abx_services.process.wait_for_background_monitors()
|
||||
if live_ui is not None:
|
||||
live_ui.print_summary(results, output_dir=output_dir)
|
||||
finally:
|
||||
await _stop_bus_trace(bus)
|
||||
await bus.stop()
|
||||
try:
|
||||
if live_stream is not None:
|
||||
@@ -739,6 +574,12 @@ def recover_orphaned_crawls() -> int:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
active_crawl_ids: set[str] = set()
|
||||
orphaned_crawls = list(
|
||||
Crawl.objects.filter(
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at__isnull=True,
|
||||
).prefetch_related("snapshot_set"),
|
||||
)
|
||||
running_processes = Process.objects.filter(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
@@ -746,23 +587,27 @@ def recover_orphaned_crawls() -> int:
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
).only("env")
|
||||
).only("pwd")
|
||||
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
if not proc.pwd:
|
||||
continue
|
||||
crawl_id = env.get("CRAWL_ID")
|
||||
if crawl_id:
|
||||
active_crawl_ids.add(str(crawl_id))
|
||||
proc_pwd = Path(proc.pwd)
|
||||
for crawl in orphaned_crawls:
|
||||
matched_snapshot = None
|
||||
for snapshot in crawl.snapshot_set.all():
|
||||
try:
|
||||
proc_pwd.relative_to(snapshot.output_dir)
|
||||
matched_snapshot = snapshot
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
if matched_snapshot is not None:
|
||||
active_crawl_ids.add(str(crawl.id))
|
||||
break
|
||||
|
||||
recovered = 0
|
||||
now = timezone.now()
|
||||
orphaned_crawls = Crawl.objects.filter(
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at__isnull=True,
|
||||
).prefetch_related("snapshot_set")
|
||||
|
||||
for crawl in orphaned_crawls:
|
||||
if str(crawl.id) in active_crawl_ids:
|
||||
continue
|
||||
@@ -788,6 +633,11 @@ def recover_orphaned_snapshots() -> int:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
active_snapshot_ids: set[str] = set()
|
||||
orphaned_snapshots = list(
|
||||
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
||||
.select_related("crawl")
|
||||
.prefetch_related("archiveresult_set"),
|
||||
)
|
||||
running_processes = Process.objects.filter(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
@@ -795,24 +645,22 @@ def recover_orphaned_snapshots() -> int:
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
).only("env")
|
||||
).only("pwd")
|
||||
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
if not proc.pwd:
|
||||
continue
|
||||
snapshot_id = env.get("SNAPSHOT_ID")
|
||||
if snapshot_id:
|
||||
active_snapshot_ids.add(str(snapshot_id))
|
||||
proc_pwd = Path(proc.pwd)
|
||||
for snapshot in orphaned_snapshots:
|
||||
try:
|
||||
proc_pwd.relative_to(snapshot.output_dir)
|
||||
active_snapshot_ids.add(str(snapshot.id))
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
recovered = 0
|
||||
now = timezone.now()
|
||||
orphaned_snapshots = (
|
||||
Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED, retry_at__isnull=True)
|
||||
.select_related("crawl")
|
||||
.prefetch_related("archiveresult_set")
|
||||
)
|
||||
|
||||
for snapshot in orphaned_snapshots:
|
||||
if str(snapshot.id) in active_snapshot_ids:
|
||||
continue
|
||||
|
||||
@@ -7,8 +7,6 @@ from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
||||
from abx_dl.limits import CrawlLimitState
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class SnapshotService(BaseService):
|
||||
LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
|
||||
@@ -18,120 +16,96 @@ class SnapshotService(BaseService):
|
||||
self.crawl_id = crawl_id
|
||||
self.schedule_snapshot = schedule_snapshot
|
||||
super().__init__(bus)
|
||||
self.bus.on(SnapshotEvent, self.on_SnapshotEvent)
|
||||
self.bus.on(SnapshotCompletedEvent, self.on_SnapshotCompletedEvent)
|
||||
|
||||
async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
|
||||
snapshot_id = await run_db_op(self._project_snapshot, event)
|
||||
if snapshot_id:
|
||||
await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
|
||||
if snapshot_id and event.depth > 0:
|
||||
await self.schedule_snapshot(snapshot_id)
|
||||
|
||||
async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
|
||||
snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
|
||||
if snapshot_id:
|
||||
await sync_to_async(self._write_snapshot_details)(snapshot_id)
|
||||
|
||||
def _project_snapshot(self, event: SnapshotEvent) -> str | None:
|
||||
async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
crawl = Crawl.objects.get(id=self.crawl_id)
|
||||
crawl = await Crawl.objects.aget(id=self.crawl_id)
|
||||
snapshot_id: str | None = None
|
||||
snapshot = await Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).afirst()
|
||||
|
||||
if event.depth == 0:
|
||||
snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
|
||||
if snapshot is None:
|
||||
return None
|
||||
if snapshot is not None:
|
||||
snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
snapshot.retry_at = None
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
return str(snapshot.id)
|
||||
await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
snapshot_id = str(snapshot.id)
|
||||
elif event.depth > 0:
|
||||
if event.depth <= crawl.max_depth and self._crawl_limit_stop_reason(crawl) != "max_size":
|
||||
parent_event = await self.bus.find(
|
||||
SnapshotEvent,
|
||||
past=True,
|
||||
future=False,
|
||||
where=lambda candidate: candidate.depth == event.depth - 1 and self.bus.event_is_child_of(event, candidate),
|
||||
)
|
||||
parent_snapshot = None
|
||||
if parent_event is not None:
|
||||
parent_snapshot = await Snapshot.objects.filter(id=parent_event.snapshot_id, crawl=crawl).afirst()
|
||||
if parent_snapshot is not None and self._url_passes_filters(crawl, parent_snapshot, event.url):
|
||||
snapshot = await sync_to_async(Snapshot.from_json, thread_sensitive=True)(
|
||||
{
|
||||
"url": event.url,
|
||||
"depth": event.depth,
|
||||
"parent_snapshot_id": str(parent_snapshot.id),
|
||||
"crawl_id": str(crawl.id),
|
||||
},
|
||||
overrides={
|
||||
"crawl": crawl,
|
||||
"snapshot": parent_snapshot,
|
||||
"created_by_id": crawl.created_by_id,
|
||||
},
|
||||
queue_for_extraction=False,
|
||||
)
|
||||
if snapshot is not None and snapshot.status != Snapshot.StatusChoices.SEALED:
|
||||
snapshot.retry_at = None
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
|
||||
snapshot_id = str(snapshot.id)
|
||||
|
||||
if event.depth > crawl.max_depth:
|
||||
return None
|
||||
if self._crawl_limit_stop_reason(crawl) == "max_size":
|
||||
return None
|
||||
if snapshot_id:
|
||||
snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
|
||||
if snapshot is not None:
|
||||
await sync_to_async(snapshot.ensure_crawl_symlink, thread_sensitive=True)()
|
||||
if snapshot_id and event.depth > 0:
|
||||
await self.schedule_snapshot(snapshot_id)
|
||||
|
||||
parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
|
||||
if parent_snapshot is None:
|
||||
return None
|
||||
if not self._url_passes_filters(crawl, parent_snapshot, event.url):
|
||||
return None
|
||||
async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.from_json(
|
||||
{
|
||||
"url": event.url,
|
||||
"depth": event.depth,
|
||||
"parent_snapshot_id": str(parent_snapshot.id),
|
||||
"crawl_id": str(crawl.id),
|
||||
},
|
||||
overrides={
|
||||
"crawl": crawl,
|
||||
"snapshot": parent_snapshot,
|
||||
"created_by_id": crawl.created_by_id,
|
||||
},
|
||||
queue_for_extraction=False,
|
||||
)
|
||||
if snapshot is None:
|
||||
return None
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
return None
|
||||
snapshot.retry_at = None
|
||||
if snapshot.status != Snapshot.StatusChoices.SEALED:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
return str(snapshot.id)
|
||||
snapshot = await Snapshot.objects.select_related("crawl").filter(id=event.snapshot_id).afirst()
|
||||
snapshot_id: str | None = None
|
||||
if snapshot is not None:
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
|
||||
await snapshot.asave(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
|
||||
await (
|
||||
Snapshot.objects.filter(
|
||||
crawl_id=snapshot.crawl_id,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
.exclude(id=snapshot.id)
|
||||
.aupdate(
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
)
|
||||
snapshot_id = str(snapshot.id)
|
||||
if snapshot_id:
|
||||
snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
|
||||
if snapshot is not None:
|
||||
await sync_to_async(snapshot.write_index_jsonl, thread_sensitive=True)()
|
||||
await sync_to_async(snapshot.write_json_details, thread_sensitive=True)()
|
||||
await sync_to_async(snapshot.write_html_details, thread_sensitive=True)()
|
||||
|
||||
def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
|
||||
return crawl.url_passes_filters(url, snapshot=parent_snapshot)
|
||||
|
||||
def _seal_snapshot(self, snapshot_id: str) -> str | None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
|
||||
if snapshot is None:
|
||||
return None
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
|
||||
snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
|
||||
if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
|
||||
self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
|
||||
return str(snapshot.id)
|
||||
|
||||
def _crawl_limit_stop_reason(self, crawl) -> str:
|
||||
config = dict(crawl.config or {})
|
||||
config["CRAWL_DIR"] = str(crawl.output_dir)
|
||||
return CrawlLimitState.from_config(config).get_stop_reason()
|
||||
|
||||
def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
return (
|
||||
Snapshot.objects.filter(
|
||||
crawl_id=crawl_id,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
.exclude(id=exclude_snapshot_id)
|
||||
.update(
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
)
|
||||
|
||||
def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
||||
if snapshot is not None:
|
||||
snapshot.ensure_crawl_symlink()
|
||||
|
||||
def _write_snapshot_details(self, snapshot_id: str) -> None:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
|
||||
if snapshot is None:
|
||||
return
|
||||
snapshot.write_index_jsonl()
|
||||
snapshot.write_json_details()
|
||||
snapshot.write_html_details()
|
||||
|
||||
@@ -3,20 +3,20 @@ from __future__ import annotations
|
||||
from abx_dl.events import TagEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
|
||||
|
||||
class TagService(BaseService):
|
||||
LISTENS_TO = [TagEvent]
|
||||
EMITS = []
|
||||
|
||||
async def on_TagEvent__Outer(self, event: TagEvent) -> None:
|
||||
await run_db_op(self._project, event)
|
||||
def __init__(self, bus):
|
||||
super().__init__(bus)
|
||||
self.bus.on(TagEvent, self.on_TagEvent__save_to_db)
|
||||
|
||||
def _project(self, event: TagEvent) -> None:
|
||||
from archivebox.core.models import Snapshot, Tag
|
||||
async def on_TagEvent__save_to_db(self, event: TagEvent) -> None:
|
||||
from archivebox.core.models import Snapshot, SnapshotTag, Tag
|
||||
|
||||
snapshot = Snapshot.objects.filter(id=event.snapshot_id).first()
|
||||
snapshot = await Snapshot.objects.filter(id=event.snapshot_id).afirst()
|
||||
if snapshot is None:
|
||||
return
|
||||
Tag.from_json({"name": event.name}, overrides={"snapshot": snapshot})
|
||||
tag, _ = await Tag.objects.aget_or_create(name=event.name)
|
||||
await SnapshotTag.objects.aget_or_create(snapshot=snapshot, tag=tag)
|
||||
|
||||
Reference in New Issue
Block a user