Files
ArchiveBox/archivebox/services/process_service.py
2026-03-25 05:36:07 -07:00

156 lines
6.4 KiB
Python

from __future__ import annotations
from datetime import datetime
from typing import ClassVar
from asgiref.sync import sync_to_async
from django.utils import timezone
from abxbus import BaseEvent
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.services.base import BaseService
def parse_event_datetime(value: str | None):
if not value:
return None
try:
dt = datetime.fromisoformat(value)
except ValueError:
return None
if timezone.is_naive(dt):
return timezone.make_aware(dt, timezone.get_current_timezone())
return dt
async def _get_or_create_process(event, event_name: str):
    """Find the ``Process`` row described by *event*, creating it if absent.

    Shared by both ``ProcessService`` handlers so the lookup/creation rules
    stay identical for started and completed events.

    Args:
        event: A ``ProcessStartedEvent`` or ``ProcessCompletedEvent``. The
            attributes read here are ``process_type``, ``hook_name``,
            ``worker_type``, ``start_ts``, ``output_dir``, ``hook_path``,
            ``hook_args``, ``pid``, ``env``, ``timeout`` and ``url``.
        event_name: Event class name used as the prefix of the error message
            raised when ``start_ts`` is unusable.

    Returns:
        A ``(process, process_type, worker_type, started_at)`` tuple: the
        matched or newly created ``Process`` plus the resolved identity
        values the caller re-applies to it.

    Raises:
        ValueError: If ``event.start_ts`` is missing or cannot be parsed by
            ``parse_event_datetime``.
    """
    # Imported inside the function, as the original handlers did
    # (presumably to avoid importing Django models at module import time -
    # TODO confirm).
    from archivebox.machine.models import NetworkInterface, Process

    # NetworkInterface.current is a sync call; run it through sync_to_async.
    iface = await sync_to_async(NetworkInterface.current, thread_sensitive=True)(refresh=True)

    # Fall back to inferring the type from the hook name when the event does
    # not carry an explicit process_type.
    process_type = event.process_type or (
        Process.TypeChoices.BINARY if event.hook_name.startswith("on_BinaryRequest") else Process.TypeChoices.HOOK
    )
    worker_type = event.worker_type or ""
    started_at = parse_event_datetime(event.start_ts)
    if started_at is None:
        raise ValueError(f"{event_name}.start_ts is required")

    cmd = [event.hook_path, *event.hook_args]
    process_query = Process.objects.filter(
        process_type=process_type,
        worker_type=worker_type,
        pwd=event.output_dir,
        cmd=cmd,
        started_at=started_at,
    )
    # Only narrow by pid when the event actually reports one.
    if event.pid:
        process_query = process_query.filter(pid=event.pid)
    # If several rows match, use the most recently modified one.
    process = await process_query.order_by("-modified_at").afirst()

    if process is None:
        process = await Process.objects.acreate(
            machine=iface.machine,
            iface=iface,
            process_type=process_type,
            worker_type=worker_type,
            pwd=event.output_dir,
            cmd=cmd,
            env=event.env,
            timeout=event.timeout,
            pid=event.pid or None,
            url=event.url or None,
            started_at=started_at,
            status=Process.StatusChoices.RUNNING,
            retry_at=None,
        )
    elif process.iface_id != iface.id or process.machine_id != iface.machine_id:
        # Existing row points at a different interface/machine: re-attach it
        # to the current one and persist just those fields.
        process.iface = iface
        process.machine = iface.machine
        await process.asave(update_fields=["iface", "machine", "modified_at"])

    return process, process_type, worker_type, started_at


async def _hydrate_and_save(process, event) -> None:
    """Call ``process.hydrate_binary_from_context`` for *event*, then save *process*."""
    await sync_to_async(process.hydrate_binary_from_context, thread_sensitive=True)(
        plugin_name=event.plugin_name,
        hook_path=event.hook_path,
    )
    await process.asave()


class ProcessService(BaseService):
    """Bus service that persists process start/completion events as ``Process`` rows."""

    # Event types this service handles.
    LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStartedEvent, ProcessCompletedEvent]
    # This service declares no emitted event types.
    EMITS: ClassVar[list[type[BaseEvent]]] = []

    def __init__(self, bus):
        """Register both save-to-db handlers on *bus*."""
        super().__init__(bus)
        self.bus.on(ProcessStartedEvent, self.on_ProcessStartedEvent__save_to_db)
        self.bus.on(ProcessCompletedEvent, self.on_ProcessCompletedEvent__save_to_db)

    async def on_ProcessStartedEvent__save_to_db(self, event: ProcessStartedEvent) -> None:
        """Record a started process: find or create its row and mark it RUNNING.

        Raises:
            ValueError: If ``event.start_ts`` is missing or unparseable.
        """
        process, process_type, worker_type, started_at = await _get_or_create_process(
            event,
            "ProcessStartedEvent",
        )

        process.pwd = event.output_dir
        process.cmd = [event.hook_path, *event.hook_args]
        process.env = event.env
        process.timeout = event.timeout
        process.pid = event.pid or None
        # Keep any previously stored url when the event has none.
        process.url = event.url or process.url
        process.process_type = process_type or process.process_type
        process.worker_type = worker_type or process.worker_type
        process.started_at = started_at
        process.status = process.StatusChoices.RUNNING
        process.retry_at = None

        await _hydrate_and_save(process, event)

    async def on_ProcessCompletedEvent__save_to_db(self, event: ProcessCompletedEvent) -> None:
        """Record a finished process: store its output/exit code and mark it EXITED.

        If no matching row exists (e.g. the started event was never saved),
        one is created first and then updated.

        Raises:
            ValueError: If ``event.start_ts`` is missing or unparseable.
        """
        process, process_type, worker_type, started_at = await _get_or_create_process(
            event,
            "ProcessCompletedEvent",
        )

        process.pwd = event.output_dir
        # Unlike the started handler, never overwrite an already-recorded cmd.
        if not process.cmd:
            process.cmd = [event.hook_path, *event.hook_args]
        process.env = event.env
        # Keep previously stored pid/url when the completion event omits them.
        process.pid = event.pid or process.pid
        process.url = event.url or process.url
        process.process_type = process_type or process.process_type
        process.worker_type = worker_type or process.worker_type
        process.started_at = started_at
        # Fall back to "now" when the event carries no usable end timestamp.
        process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
        process.stdout = event.stdout
        process.stderr = event.stderr
        process.exit_code = event.exit_code
        process.status = process.StatusChoices.EXITED
        process.retry_at = None

        await _hydrate_and_save(process, event)