mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Refactor ArchiveBox onto abx-dl bus runner
This commit is contained in:
22
archivebox/services/__init__.py
Normal file
22
archivebox/services/__init__.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
from .crawl_service import CrawlService
|
||||
from .machine_service import MachineService
|
||||
from .process_service import ProcessService
|
||||
from .runner import run_binary, run_crawl, run_install, run_pending_crawls
|
||||
from .snapshot_service import SnapshotService
|
||||
from .tag_service import TagService
|
||||
|
||||
__all__ = [
|
||||
"ArchiveResultService",
|
||||
"BinaryService",
|
||||
"CrawlService",
|
||||
"MachineService",
|
||||
"ProcessService",
|
||||
"SnapshotService",
|
||||
"TagService",
|
||||
"run_binary",
|
||||
"run_crawl",
|
||||
"run_install",
|
||||
"run_pending_crawls",
|
||||
]
|
||||
103
archivebox/services/archive_result_service.py
Normal file
103
archivebox/services/archive_result_service.py
Normal file
@@ -0,0 +1,103 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import mimetypes
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ArchiveResultEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .process_service import ProcessService, parse_event_datetime
|
||||
|
||||
|
||||
def _collect_output_metadata(plugin_dir: Path) -> tuple[dict[str, dict], int, str]:
|
||||
exclude_names = {"stdout.log", "stderr.log", "process.pid", "hook.pid", "listener.pid", "cmd.sh"}
|
||||
output_files: dict[str, dict] = {}
|
||||
mime_sizes: dict[str, int] = defaultdict(int)
|
||||
total_size = 0
|
||||
|
||||
if not plugin_dir.exists():
|
||||
return output_files, total_size, ""
|
||||
|
||||
for file_path in plugin_dir.rglob("*"):
|
||||
if not file_path.is_file():
|
||||
continue
|
||||
if ".hooks" in file_path.parts:
|
||||
continue
|
||||
if file_path.name in exclude_names:
|
||||
continue
|
||||
try:
|
||||
stat = file_path.stat()
|
||||
except OSError:
|
||||
continue
|
||||
mime_type, _ = mimetypes.guess_type(str(file_path))
|
||||
mime_type = mime_type or "application/octet-stream"
|
||||
relative_path = str(file_path.relative_to(plugin_dir))
|
||||
output_files[relative_path] = {}
|
||||
mime_sizes[mime_type] += stat.st_size
|
||||
total_size += stat.st_size
|
||||
|
||||
output_mimetypes = ",".join(
|
||||
mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True)
|
||||
)
|
||||
return output_files, total_size, output_mimetypes
|
||||
|
||||
|
||||
def _normalize_status(status: str) -> str:
|
||||
if status == "noresult":
|
||||
return "skipped"
|
||||
return status or "failed"
|
||||
|
||||
|
||||
class ArchiveResultService(BaseService):
    """Projects ArchiveResultEvents from the abx-dl bus into ArchiveResult rows."""

    # Bus wiring: consumes ArchiveResultEvent only; emits nothing.
    LISTENS_TO = [ArchiveResultEvent]
    EMITS = []

    def __init__(self, bus, *, process_service: ProcessService):
        # Kept so results can be linked to the DB Process row created by ProcessService.
        self.process_service = process_service
        super().__init__(bus)

    async def on_ArchiveResultEvent(self, event: ArchiveResultEvent) -> None:
        """Bus callback: run the ORM projection on a worker thread."""
        await sync_to_async(self._project, thread_sensitive=True)(event)

    def _project(self, event: ArchiveResultEvent) -> None:
        """Create or update the ArchiveResult row for *event* (sync, ORM-safe)."""
        from archivebox.core.models import ArchiveResult, Snapshot
        from archivebox.machine.models import Process

        snapshot = Snapshot.objects.filter(id=event.snapshot_id).first()
        if snapshot is None:
            return  # snapshot row not projected yet (or deleted): drop the event

        process = None
        # Translate the bus-level process id into the DB Process primary key.
        db_process_id = self.process_service.get_db_process_id(event.process_id)
        if db_process_id:
            process = Process.objects.filter(id=db_process_id).first()

        # One result per (snapshot, plugin, hook); re-delivery updates in place.
        result, _created = ArchiveResult.objects.get_or_create(
            snapshot=snapshot,
            plugin=event.plugin,
            hook_name=event.hook_name,
            defaults={
                "status": ArchiveResult.StatusChoices.STARTED,
                "process": process,
            },
        )

        # Summarize whatever the plugin wrote under <snapshot output dir>/<plugin>/.
        plugin_dir = Path(snapshot.output_dir) / event.plugin
        output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
        result.process = process or result.process
        result.status = _normalize_status(event.status)
        result.output_str = event.output_str
        result.output_json = event.output_json
        result.output_files = output_files
        result.output_size = output_size
        result.output_mimetypes = output_mimetypes
        # Prefer event timestamps; fall back to stored values, then "now".
        result.start_ts = parse_event_datetime(event.start_ts) or result.start_ts or timezone.now()
        result.end_ts = parse_event_datetime(event.end_ts) or timezone.now()
        result.retry_at = None  # result is final: no retry scheduled
        if event.error:
            result.notes = event.error
        result.save()
|
||||
64
archivebox/services/binary_service.py
Normal file
64
archivebox/services/binary_service.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from abx_dl.events import BinaryEvent, BinaryInstalledEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
|
||||
class BinaryService(BaseService):
    """Projects Binary* events from the abx-dl bus into Django Binary rows."""

    LISTENS_TO = [BinaryEvent, BinaryInstalledEvent]
    EMITS = []

    async def on_BinaryEvent(self, event: BinaryEvent) -> None:
        """Bus callback: project a binary-discovered event on a worker thread."""
        await sync_to_async(self._project_binary, thread_sensitive=True)(event)

    async def on_BinaryInstalledEvent(self, event: BinaryInstalledEvent) -> None:
        """Bus callback: project a binary-installed event on a worker thread."""
        await sync_to_async(self._project_installed_binary, thread_sensitive=True)(event)

    def _project_binary(self, event: BinaryEvent) -> None:
        """Record a discovered binary for the current machine (sync, ORM-safe)."""
        from archivebox.machine.models import Binary, Machine

        machine = Machine.current()
        existing = Binary.objects.filter(machine=machine, name=event.name).first()
        if existing and existing.status == Binary.StatusChoices.INSTALLED:
            # Already installed: only refresh provider metadata when it changed;
            # never downgrade the row's INSTALLED status.
            changed = False
            if event.binproviders and existing.binproviders != event.binproviders:
                existing.binproviders = event.binproviders
                changed = True
            if event.overrides and existing.overrides != event.overrides:
                existing.overrides = event.overrides
                changed = True
            if changed:
                existing.save(update_fields=["binproviders", "overrides", "modified_at"])
            return

        # Not yet installed (or unknown): upsert full metadata from the event.
        Binary.from_json(
            {
                "name": event.name,
                "abspath": event.abspath,
                "version": event.version,
                "sha256": event.sha256,
                "binproviders": event.binproviders,
                "binprovider": event.binprovider,
                "overrides": event.overrides or {},
            },
        )

    def _project_installed_binary(self, event: BinaryInstalledEvent) -> None:
        """Mark a binary as INSTALLED for the current machine (sync, ORM-safe)."""
        from archivebox.machine.models import Binary, Machine

        machine = Machine.current()
        binary, _ = Binary.objects.get_or_create(
            machine=machine,
            name=event.name,
            defaults={
                "status": Binary.StatusChoices.QUEUED,
            },
        )
        # Event fields win when present; otherwise keep the stored values.
        binary.abspath = event.abspath or binary.abspath
        binary.version = event.version or binary.version
        binary.sha256 = event.sha256 or binary.sha256
        binary.binprovider = event.binprovider or binary.binprovider
        binary.status = Binary.StatusChoices.INSTALLED
        binary.retry_at = None  # installed: nothing left to retry
        binary.save(update_fields=["abspath", "version", "sha256", "binprovider", "status", "retry_at", "modified_at"])
|
||||
45
archivebox/services/crawl_service.py
Normal file
45
archivebox/services/crawl_service.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import CrawlCleanupEvent, CrawlCompletedEvent, CrawlSetupEvent, CrawlStartEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
|
||||
class CrawlService(BaseService):
    """Mirrors crawl lifecycle events from the abx-dl bus onto the Crawl row."""

    LISTENS_TO = [CrawlSetupEvent, CrawlStartEvent, CrawlCleanupEvent, CrawlCompletedEvent]
    EMITS = []

    def __init__(self, bus, *, crawl_id: str):
        # DB primary key of the Crawl this service projects onto.
        self.crawl_id = crawl_id
        super().__init__(bus)

    async def on_CrawlSetupEvent(self, event: CrawlSetupEvent) -> None:
        await sync_to_async(self._mark_started, thread_sensitive=True)()

    async def on_CrawlStartEvent(self, event: CrawlStartEvent) -> None:
        await sync_to_async(self._mark_started, thread_sensitive=True)()

    async def on_CrawlCleanupEvent(self, event: CrawlCleanupEvent) -> None:
        # NOTE(review): cleanup also maps to STARTED (not completed) — presumably
        # because cleanup hooks still run as part of the active crawl; confirm.
        await sync_to_async(self._mark_started, thread_sensitive=True)()

    async def on_CrawlCompletedEvent(self, event: CrawlCompletedEvent) -> None:
        await sync_to_async(self._mark_completed, thread_sensitive=True)()

    def _mark_started(self) -> None:
        """Set the crawl to STARTED unless it has already been SEALED."""
        from archivebox.crawls.models import Crawl

        crawl = Crawl.objects.get(id=self.crawl_id)
        if crawl.status != Crawl.StatusChoices.SEALED:
            crawl.status = Crawl.StatusChoices.STARTED
            crawl.retry_at = None
            crawl.save(update_fields=["status", "retry_at", "modified_at"])

    def _mark_completed(self) -> None:
        """Seal the crawl: terminal state, no further retries scheduled."""
        from archivebox.crawls.models import Crawl

        crawl = Crawl.objects.get(id=self.crawl_id)
        crawl.status = Crawl.StatusChoices.SEALED
        crawl.retry_at = None
        crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
31
archivebox/services/machine_service.py
Normal file
31
archivebox/services/machine_service.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from abx_dl.events import MachineEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
|
||||
class MachineService(BaseService):
    """Projects MachineEvents from the abx-dl bus into Machine.config."""

    LISTENS_TO = [MachineEvent]
    EMITS = []

    async def on_MachineEvent(self, event: MachineEvent) -> None:
        """Bus callback: run the ORM projection on a worker thread."""
        await sync_to_async(self._project, thread_sensitive=True)(event)

    def _project(self, event: MachineEvent) -> None:
        """Merge the event's config payload into the current machine's config."""
        from archivebox.machine.models import Machine

        machine = Machine.current()
        # Copy so the model's stored dict is never mutated in place.
        config = dict(machine.config or {})

        if event.config is not None:
            # Bulk update: merge the whole mapping at once.
            config.update(event.config)
        elif event.method == "update":
            # Single-key update: key arrives in the form "config/<KEY>".
            # NOTE(review): .replace("config/", "", 1) removes the first
            # occurrence anywhere in the string, not only a leading prefix —
            # confirm keys are always prefixed, or consider removeprefix().
            key = event.key.replace("config/", "", 1).strip()
            if key:
                config[key] = event.value
        else:
            return  # nothing to project for this event shape

        machine.config = config
        machine.save(update_fields=["config", "modified_at"])
|
||||
95
archivebox/services/process_service.py
Normal file
95
archivebox/services/process_service.py
Normal file
@@ -0,0 +1,95 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
|
||||
def parse_event_datetime(value: str | None):
    """Parse an ISO-8601 event timestamp into a timezone-aware datetime.

    Returns ``None`` for empty/missing values and for strings that are not
    valid ISO-8601.  Naive datetimes are localized to the current Django
    timezone; already-aware ones are returned unchanged.
    """
    if not value:
        return None
    try:
        parsed = datetime.fromisoformat(value)
    except ValueError:
        return None
    if not timezone.is_naive(parsed):
        return parsed
    return timezone.make_aware(parsed, timezone.get_current_timezone())
|
||||
|
||||
|
||||
class ProcessService(BaseService):
    """Projects Process* events from the abx-dl bus into Django Process rows."""

    LISTENS_TO = [ProcessStartedEvent, ProcessCompletedEvent]
    EMITS = []

    def __init__(self, bus):
        # Maps bus-level process ids -> DB Process primary keys (as strings).
        self.process_ids: dict[str, str] = {}
        super().__init__(bus)

    async def on_ProcessStartedEvent(self, event: ProcessStartedEvent) -> None:
        await sync_to_async(self._project_started, thread_sensitive=True)(event)

    async def on_ProcessCompletedEvent(self, event: ProcessCompletedEvent) -> None:
        await sync_to_async(self._project_completed, thread_sensitive=True)(event)

    def get_db_process_id(self, process_id: str) -> str | None:
        """Return the DB Process id for a bus process id, if already projected."""
        return self.process_ids.get(process_id)

    def _get_or_create_process(self, event: ProcessStartedEvent | ProcessCompletedEvent) -> "Process":
        """Look up the Process row for *event*, creating it on first sight."""
        from archivebox.machine.models import Machine, Process

        db_process_id = self.process_ids.get(event.process_id)
        if db_process_id:
            process = Process.objects.filter(id=db_process_id).first()
            if process is not None:
                return process

        # First event for this process id (or the row vanished): create anew.
        # Hooks named on_Binary* are binary installs; everything else is a hook run.
        process_type = Process.TypeChoices.BINARY if event.hook_name.startswith("on_Binary") else Process.TypeChoices.HOOK
        process = Process.objects.create(
            machine=Machine.current(),
            process_type=process_type,
            pwd=event.output_dir,
            cmd=[event.hook_path, *event.hook_args],
            env=event.env,
            timeout=getattr(event, "timeout", 60),  # completed events may lack .timeout
            pid=event.pid or None,
            started_at=parse_event_datetime(getattr(event, "start_ts", "")),
            status=Process.StatusChoices.RUNNING,
            retry_at=None,
        )
        self.process_ids[event.process_id] = str(process.id)
        return process

    def _project_started(self, event: ProcessStartedEvent) -> None:
        """Record the launch details of a hook process (sync, ORM-safe)."""
        process = self._get_or_create_process(event)
        process.pwd = event.output_dir
        process.cmd = [event.hook_path, *event.hook_args]
        process.env = event.env
        process.timeout = event.timeout
        process.pid = event.pid or None
        # Prefer event timestamp; fall back to the stored value, then "now".
        process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
        process.status = process.StatusChoices.RUNNING
        process.retry_at = None
        process.save()

    def _project_completed(self, event: ProcessCompletedEvent) -> None:
        """Record exit status/output of a finished hook process (sync, ORM-safe)."""
        process = self._get_or_create_process(event)
        process.pwd = event.output_dir
        process.cmd = [event.hook_path, *event.hook_args]
        process.env = event.env
        process.pid = event.pid or process.pid
        process.started_at = parse_event_datetime(event.start_ts) or process.started_at
        process.ended_at = parse_event_datetime(event.end_ts) or timezone.now()
        process.stdout = event.stdout
        process.stderr = event.stderr
        process.exit_code = event.exit_code
        process.status = process.StatusChoices.EXITED
        process.retry_at = None
        process.save()
|
||||
454
archivebox/services/runner.py
Normal file
454
archivebox/services/runner.py
Normal file
@@ -0,0 +1,454 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import BinaryEvent
|
||||
from abx_dl.models import INSTALL_URL, Snapshot as AbxSnapshot, discover_plugins
|
||||
from abx_dl.orchestrator import create_bus, download, install_plugins as abx_install_plugins, setup_services as setup_abx_services
|
||||
|
||||
from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
from .crawl_service import CrawlService
|
||||
from .machine_service import MachineService
|
||||
from .process_service import ProcessService
|
||||
from .snapshot_service import SnapshotService
|
||||
from .tag_service import TagService
|
||||
|
||||
|
||||
def _bus_name(prefix: str, identifier: str) -> str:
|
||||
normalized = "".join(ch if ch.isalnum() else "_" for ch in identifier)
|
||||
return f"{prefix}_{normalized}"
|
||||
|
||||
|
||||
def _selected_plugins_from_config(config: dict[str, Any]) -> list[str] | None:
|
||||
raw = str(config.get("PLUGINS") or "").strip()
|
||||
if not raw:
|
||||
return None
|
||||
return [name.strip() for name in raw.split(",") if name.strip()]
|
||||
|
||||
|
||||
def _attach_bus_trace(bus) -> None:
|
||||
trace_target = (os.environ.get("ARCHIVEBOX_BUS_TRACE") or "").strip()
|
||||
if not trace_target:
|
||||
return
|
||||
if getattr(bus, "_archivebox_trace_task", None) is not None:
|
||||
return
|
||||
|
||||
trace_path = None if trace_target in {"1", "-", "stderr"} else Path(trace_target)
|
||||
stop_event = asyncio.Event()
|
||||
|
||||
async def trace_loop() -> None:
|
||||
seen_event_ids: set[str] = set()
|
||||
while not stop_event.is_set():
|
||||
for event_id, event in list(bus.event_history.items()):
|
||||
if event_id in seen_event_ids:
|
||||
continue
|
||||
seen_event_ids.add(event_id)
|
||||
payload = event.model_dump(mode="json")
|
||||
payload["bus_name"] = bus.name
|
||||
line = json.dumps(payload, ensure_ascii=False, default=str, separators=(",", ":"))
|
||||
if trace_path is None:
|
||||
print(line, file=sys.stderr, flush=True)
|
||||
else:
|
||||
trace_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with trace_path.open("a", encoding="utf-8") as handle:
|
||||
handle.write(line + "\n")
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
bus._archivebox_trace_stop = stop_event
|
||||
bus._archivebox_trace_task = asyncio.create_task(trace_loop())
|
||||
|
||||
|
||||
async def _stop_bus_trace(bus) -> None:
|
||||
stop_event = getattr(bus, "_archivebox_trace_stop", None)
|
||||
trace_task = getattr(bus, "_archivebox_trace_task", None)
|
||||
if stop_event is None or trace_task is None:
|
||||
return
|
||||
stop_event.set()
|
||||
await asyncio.gather(trace_task, return_exceptions=True)
|
||||
bus._archivebox_trace_stop = None
|
||||
bus._archivebox_trace_task = None
|
||||
|
||||
|
||||
class CrawlRunner:
    """Drives a single Crawl end-to-end on its own abx-dl bus.

    Wires every DB-projection service onto a fresh bus, runs crawl setup,
    downloads each snapshot with bounded concurrency, then runs crawl
    cleanup and finally seals the crawl.  Outlink snapshots discovered
    mid-run are scheduled via enqueue_snapshot, which is handed to
    SnapshotService as its schedule callback.
    """

    # Upper bound on snapshots downloaded in parallel (semaphore permits).
    MAX_CONCURRENT_SNAPSHOTS = 8

    def __init__(self, crawl, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None):
        self.crawl = crawl
        # One bus per crawl; 1h overall timeout for the whole crawl.
        self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
        self.plugins = discover_plugins()
        # Projection services that mirror bus events back into Django models.
        self.process_service = ProcessService(self.bus)
        self.machine_service = MachineService(self.bus)
        self.binary_service = BinaryService(self.bus)
        self.tag_service = TagService(self.bus)
        self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
        self.snapshot_service = SnapshotService(self.bus, crawl_id=str(crawl.id), schedule_snapshot=self.enqueue_snapshot)
        self.archive_result_service = ArchiveResultService(self.bus, process_service=self.process_service)
        self.selected_plugins = selected_plugins
        self.initial_snapshot_ids = snapshot_ids
        # snapshot_id -> download task; used for dedup and completion waits.
        self.snapshot_tasks: dict[str, asyncio.Task[None]] = {}
        self.snapshot_semaphore = asyncio.Semaphore(self.MAX_CONCURRENT_SNAPSHOTS)
        self.abx_services = None   # set in run() by setup_abx_services()
        self.persona = None        # set in _prepare() if the crawl resolves one
        self.base_config: dict[str, Any] = {}
        self.primary_url = ""

    async def run(self) -> None:
        """Execute the crawl; always seals the Crawl row on the way out."""
        from asgiref.sync import sync_to_async
        from archivebox.crawls.models import Crawl

        try:
            await sync_to_async(self._prepare, thread_sensitive=True)()
            _attach_bus_trace(self.bus)
            self.abx_services = setup_abx_services(
                self.bus,
                plugins=self.plugins,
                config_overrides=self.base_config,
                auto_install=True,
                emit_jsonl=False,
            )
            if self.crawl.get_system_task() == INSTALL_URL:
                # Special system crawl: runs plugin installation, no snapshots.
                await self._run_install_crawl()
            else:
                snapshot_ids = await sync_to_async(self._initial_snapshot_ids, thread_sensitive=True)()
                if snapshot_ids:
                    # Setup/cleanup hooks run against the first root snapshot.
                    root_snapshot_id = snapshot_ids[0]
                    await self._run_crawl_setup(root_snapshot_id)
                    for snapshot_id in snapshot_ids:
                        await self.enqueue_snapshot(snapshot_id)
                    await self._wait_for_snapshot_tasks()
                    await self._run_crawl_cleanup(root_snapshot_id)
            if self.abx_services is not None:
                await self.abx_services.process.wait_for_background_monitors()
        finally:
            # Best-effort teardown: trace, bus, persona, then seal the crawl.
            await _stop_bus_trace(self.bus)
            await self.bus.stop()
            await sync_to_async(self._cleanup_persona, thread_sensitive=True)()
            crawl = await sync_to_async(Crawl.objects.get, thread_sensitive=True)(id=self.crawl.id)
            if crawl.status != Crawl.StatusChoices.SEALED:
                crawl.status = Crawl.StatusChoices.SEALED
                crawl.retry_at = None
                await sync_to_async(crawl.save, thread_sensitive=True)(update_fields=["status", "retry_at", "modified_at"])

    async def enqueue_snapshot(self, snapshot_id: str) -> None:
        """Schedule a snapshot download unless one is already in flight."""
        task = self.snapshot_tasks.get(snapshot_id)
        if task is not None and not task.done():
            return
        task = asyncio.create_task(self._run_snapshot(snapshot_id))
        self.snapshot_tasks[snapshot_id] = task

    async def _wait_for_snapshot_tasks(self) -> None:
        """Wait until no snapshot task is active.

        Loops because running tasks may enqueue NEW snapshots (outlinks)
        while we are gathering the current batch.
        """
        while True:
            active = [task for task in self.snapshot_tasks.values() if not task.done()]
            if not active:
                return
            await asyncio.gather(*active)

    def _prepare(self) -> None:
        """Resolve url/persona/config for the crawl (sync, ORM-safe)."""
        from archivebox.config.configset import get_config

        self.primary_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ""
        self.persona = self.crawl.resolve_persona()
        self.base_config = get_config(crawl=self.crawl)
        if self.selected_plugins is None:
            self.selected_plugins = _selected_plugins_from_config(self.base_config)
        if self.persona:
            # Persona runtime (e.g. browser profile) may add config overrides.
            chrome_binary = str(self.base_config.get("CHROME_BINARY") or "")
            self.base_config.update(self.persona.prepare_runtime_for_crawl(self.crawl, chrome_binary=chrome_binary))

    def _cleanup_persona(self) -> None:
        """Release any persona runtime resources acquired in _prepare()."""
        if self.persona:
            self.persona.cleanup_runtime_for_crawl(self.crawl)

    def _create_root_snapshots(self) -> list[str]:
        """Create depth-0 snapshots from the crawl's URLs, returning their ids.

        Falls back to pre-existing depth-0 snapshots if none were created
        (e.g. on a retry of a crawl that already has them).
        """
        created = self.crawl.create_snapshots_from_urls()
        snapshots = created or list(self.crawl.snapshot_set.filter(depth=0).order_by("created_at"))
        return [str(snapshot.id) for snapshot in snapshots]

    def _initial_snapshot_ids(self) -> list[str]:
        """Return the snapshot ids to start from: caller-supplied or roots."""
        if self.initial_snapshot_ids:
            return [str(snapshot_id) for snapshot_id in self.initial_snapshot_ids]
        return self._create_root_snapshots()

    def _snapshot_config(self, snapshot) -> dict[str, Any]:
        """Build the per-snapshot config dict handed to the abx-dl download."""
        from archivebox.config.configset import get_config

        config = get_config(crawl=self.crawl, snapshot=snapshot)
        # base_config (incl. persona overrides) wins over snapshot-level config.
        config.update(self.base_config)
        config["CRAWL_DIR"] = str(self.crawl.output_dir)
        config["SNAP_DIR"] = str(snapshot.output_dir)
        config["SNAPSHOT_ID"] = str(snapshot.id)
        config["SNAPSHOT_DEPTH"] = snapshot.depth
        config["CRAWL_ID"] = str(self.crawl.id)
        config["SOURCE_URL"] = snapshot.url
        if snapshot.parent_snapshot_id:
            config["PARENT_SNAPSHOT_ID"] = str(snapshot.parent_snapshot_id)
        return config

    async def _run_install_crawl(self) -> None:
        """Run the special install crawl (crawl-level hooks only)."""
        install_snapshot = AbxSnapshot(
            url=self.primary_url or INSTALL_URL,
            id=str(self.crawl.id),
            crawl_id=str(self.crawl.id),
        )
        await download(
            url=self.primary_url or INSTALL_URL,
            plugins=self.plugins,
            output_dir=Path(self.crawl.output_dir),
            selected_plugins=self.selected_plugins,
            config_overrides={
                **self.base_config,
                "CRAWL_DIR": str(self.crawl.output_dir),
                "SNAP_DIR": str(self.crawl.output_dir),
                "CRAWL_ID": str(self.crawl.id),
                "SOURCE_URL": self.crawl.urls,
            },
            bus=self.bus,
            emit_jsonl=False,
            snapshot=install_snapshot,
            crawl_only=True,
        )

    async def _run_crawl_setup(self, snapshot_id: str) -> None:
        """Run the crawl's setup hooks against the root snapshot."""
        from asgiref.sync import sync_to_async

        snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
        setup_snapshot = AbxSnapshot(
            url=snapshot["url"],
            id=snapshot["id"],
            title=snapshot["title"],
            timestamp=snapshot["timestamp"],
            bookmarked_at=snapshot["bookmarked_at"],
            created_at=snapshot["created_at"],
            tags=snapshot["tags"],
            depth=snapshot["depth"],
            parent_snapshot_id=snapshot["parent_snapshot_id"],
            crawl_id=str(self.crawl.id),
        )
        await download(
            url=snapshot["url"],
            plugins=self.plugins,
            output_dir=Path(snapshot["output_dir"]),
            selected_plugins=self.selected_plugins,
            config_overrides=snapshot["config"],
            bus=self.bus,
            emit_jsonl=False,
            snapshot=setup_snapshot,
            crawl_setup_only=True,
        )

    async def _run_crawl_cleanup(self, snapshot_id: str) -> None:
        """Run the crawl's cleanup hooks against the root snapshot."""
        from asgiref.sync import sync_to_async

        snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
        cleanup_snapshot = AbxSnapshot(
            url=snapshot["url"],
            id=snapshot["id"],
            title=snapshot["title"],
            timestamp=snapshot["timestamp"],
            bookmarked_at=snapshot["bookmarked_at"],
            created_at=snapshot["created_at"],
            tags=snapshot["tags"],
            depth=snapshot["depth"],
            parent_snapshot_id=snapshot["parent_snapshot_id"],
            crawl_id=str(self.crawl.id),
        )
        await download(
            url=snapshot["url"],
            plugins=self.plugins,
            output_dir=Path(snapshot["output_dir"]),
            selected_plugins=self.selected_plugins,
            config_overrides=snapshot["config"],
            bus=self.bus,
            emit_jsonl=False,
            snapshot=cleanup_snapshot,
            crawl_cleanup_only=True,
        )

    async def _run_snapshot(self, snapshot_id: str) -> None:
        """Download one snapshot, bounded by the concurrency semaphore."""
        from asgiref.sync import sync_to_async

        async with self.snapshot_semaphore:
            snapshot = await sync_to_async(self._load_snapshot_run_data, thread_sensitive=True)(snapshot_id)
            abx_snapshot = AbxSnapshot(
                url=snapshot["url"],
                id=snapshot["id"],
                title=snapshot["title"],
                timestamp=snapshot["timestamp"],
                bookmarked_at=snapshot["bookmarked_at"],
                created_at=snapshot["created_at"],
                tags=snapshot["tags"],
                depth=snapshot["depth"],
                parent_snapshot_id=snapshot["parent_snapshot_id"],
                crawl_id=str(self.crawl.id),
            )
            await download(
                url=snapshot["url"],
                plugins=self.plugins,
                output_dir=Path(snapshot["output_dir"]),
                selected_plugins=self.selected_plugins,
                config_overrides=snapshot["config"],
                bus=self.bus,
                emit_jsonl=False,
                snapshot=abx_snapshot,
                # Setup/cleanup run once per crawl, not per snapshot.
                skip_crawl_setup=True,
                skip_crawl_cleanup=True,
            )

    def _load_snapshot_run_data(self, snapshot_id: str):
        """Load everything a download needs into a plain dict (sync, ORM-safe)."""
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
        return {
            "id": str(snapshot.id),
            "url": snapshot.url,
            "title": snapshot.title,
            "timestamp": snapshot.timestamp,
            "bookmarked_at": snapshot.bookmarked_at.isoformat() if snapshot.bookmarked_at else "",
            "created_at": snapshot.created_at.isoformat() if snapshot.created_at else "",
            "tags": snapshot.tags_str(),
            "depth": snapshot.depth,
            "parent_snapshot_id": str(snapshot.parent_snapshot_id) if snapshot.parent_snapshot_id else None,
            "output_dir": str(snapshot.output_dir),
            "config": self._snapshot_config(snapshot),
        }
|
||||
|
||||
|
||||
def run_crawl(crawl_id: str, *, snapshot_ids: list[str] | None = None, selected_plugins: list[str] | None = None) -> None:
    """Run one crawl to completion (blocking).

    Loads the Crawl row and drives a CrawlRunner inside a fresh asyncio
    event loop.  *snapshot_ids* restricts the run to specific snapshots;
    *selected_plugins* restricts which plugins execute (None = derive from
    config).  Raises Crawl.DoesNotExist for an unknown id.
    """
    from archivebox.crawls.models import Crawl

    crawl = Crawl.objects.get(id=crawl_id)
    asyncio.run(CrawlRunner(crawl, snapshot_ids=snapshot_ids, selected_plugins=selected_plugins).run())
|
||||
|
||||
|
||||
async def _run_binary(binary_id: str) -> None:
    """Emit a BinaryEvent for one Binary row on a short-lived dedicated bus.

    Spins up a bus with the full projection-service stack so the plugins'
    binary hooks run and their results are written back into the DB via the
    services; the bus is always stopped on exit.
    """
    from asgiref.sync import sync_to_async

    from archivebox.config.configset import get_config
    from archivebox.machine.models import Binary

    binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
    config = get_config()
    plugins = discover_plugins()
    bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
    setup_abx_services(
        bus,
        plugins=plugins,
        config_overrides=config,
        auto_install=True,
        emit_jsonl=False,
    )
    # Projection services mirror bus events back into Django models.
    process_service = ProcessService(bus)
    MachineService(bus)
    BinaryService(bus)
    TagService(bus)
    ArchiveResultService(bus, process_service=process_service)

    try:
        _attach_bus_trace(bus)
        await bus.emit(
            BinaryEvent(
                name=binary.name,
                plugin_name="archivebox",
                hook_name="archivebox_run",
                output_dir=str(binary.output_dir),
                binary_id=str(binary.id),
                machine_id=str(binary.machine_id),
                abspath=binary.abspath,
                version=binary.version,
                sha256=binary.sha256,
                binproviders=binary.binproviders,
                binprovider=binary.binprovider,
                overrides=binary.overrides or None,
            ),
        )
    finally:
        await _stop_bus_trace(bus)
        await bus.stop()
|
||||
|
||||
|
||||
def run_binary(binary_id: str) -> None:
    """Blocking wrapper around _run_binary: runs it in a fresh event loop."""
    asyncio.run(_run_binary(binary_id))
|
||||
|
||||
|
||||
async def _run_install(plugin_names: list[str] | None = None) -> None:
    """Install plugins (all, or only *plugin_names*) on a dedicated bus.

    Sets up the full projection-service stack so install-time events are
    written back into the DB, delegates to abx-dl's install_plugins, waits
    for background process monitors, and always stops the bus on exit.
    """
    from archivebox.config.configset import get_config

    config = get_config()
    plugins = discover_plugins()
    bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
    abx_services = setup_abx_services(
        bus,
        plugins=plugins,
        config_overrides=config,
        auto_install=True,
        emit_jsonl=False,
    )
    # Projection services mirror bus events back into Django models.
    process_service = ProcessService(bus)
    MachineService(bus)
    BinaryService(bus)
    TagService(bus)
    ArchiveResultService(bus, process_service=process_service)

    try:
        _attach_bus_trace(bus)
        await abx_install_plugins(
            plugin_names=plugin_names,
            plugins=plugins,
            config_overrides=config,
            emit_jsonl=False,
            bus=bus,
        )
        await abx_services.process.wait_for_background_monitors()
    finally:
        await _stop_bus_trace(bus)
        await bus.stop()
|
||||
|
||||
|
||||
def run_install(*, plugin_names: list[str] | None = None) -> None:
    """Blocking wrapper around _run_install: runs it in a fresh event loop."""
    asyncio.run(_run_install(plugin_names=plugin_names))
|
||||
|
||||
|
||||
def run_pending_crawls(*, daemon: bool = False, crawl_id: str | None = None) -> int:
    """Work the queue of due Binaries and Crawls, one at a time.

    In daemon mode this loops forever (enqueuing due CrawlSchedules and
    sleeping when idle); otherwise it drains the queue and returns 0.
    *crawl_id* restricts processing to one crawl and skips binary installs.
    Pending binary installs always run before pending crawls.
    """
    from archivebox.crawls.models import Crawl, CrawlSchedule
    from archivebox.machine.models import Binary

    while True:
        if daemon and crawl_id is None:
            # Fire any scheduled crawls that have come due.
            now = timezone.now()
            for schedule in CrawlSchedule.objects.filter(is_enabled=True).select_related("template", "template__created_by"):
                if schedule.is_due(now):
                    schedule.enqueue(queued_at=now)

        if crawl_id is None:
            # Binaries first: crawls may depend on installed binaries.
            binary = (
                Binary.objects.filter(retry_at__lte=timezone.now())
                .exclude(status=Binary.StatusChoices.INSTALLED)
                .order_by("retry_at", "created_at")
                .first()
            )
            if binary is not None:
                # NOTE(review): assumes run_binary advances the row's
                # retry_at/status so a failing install cannot spin — confirm.
                run_binary(str(binary.id))
                continue

        pending = Crawl.objects.filter(retry_at__lte=timezone.now()).exclude(status=Crawl.StatusChoices.SEALED)
        if crawl_id:
            pending = pending.filter(id=crawl_id)
        pending = pending.order_by("retry_at", "created_at")

        crawl = pending.first()
        if crawl is None:
            if daemon:
                time.sleep(2.0)  # idle: poll again shortly
                continue
            return 0  # queue drained in one-shot mode

        # CrawlRunner.run() seals the crawl, so it won't be picked up again.
        run_crawl(str(crawl.id))
|
||||
128
archivebox/services/snapshot_service.py
Normal file
128
archivebox/services/snapshot_service.py
Normal file
@@ -0,0 +1,128 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
|
||||
class SnapshotService(BaseService):
    """Projects abx-dl snapshot bus events onto ArchiveBox Snapshot DB rows.

    Listens for SnapshotEvent (a URL discovered at some crawl depth) and
    SnapshotCompletedEvent (all extractors finished for a snapshot), keeping
    the Snapshot's status/retry bookkeeping in sync and scheduling child
    snapshots for extraction via the injected ``schedule_snapshot`` callback.
    """

    LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
    EMITS = []

    def __init__(self, bus, *, crawl_id: str, schedule_snapshot):
        # Set our own state before BaseService.__init__, which may register
        # handlers that fire immediately.
        self.crawl_id = crawl_id
        # schedule_snapshot: async callable(snapshot_id) — presumably queues
        # the snapshot onto the extraction pipeline (TODO confirm with caller).
        self.schedule_snapshot = schedule_snapshot
        super().__init__(bus)

    async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
        """Create/activate the Snapshot row; schedule child snapshots (depth > 0)."""
        snapshot_id = await sync_to_async(self._project_snapshot, thread_sensitive=True)(event)
        if snapshot_id and event.depth > 0:
            await self.schedule_snapshot(snapshot_id)

    async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
        """Seal the Snapshot and write its on-disk index files."""
        await sync_to_async(self._seal_snapshot, thread_sensitive=True)(event.snapshot_id)

    def _project_snapshot(self, event: SnapshotEvent) -> str | None:
        """Sync a SnapshotEvent into the DB; return the Snapshot id, or None.

        Depth 0 marks the (pre-existing) root snapshot STARTED; deeper events
        create/queue a child snapshot if it passes depth and URL filters.
        Returns None when the event should be ignored (unknown parent, over
        max depth, filtered URL, or already sealed).
        """
        from archivebox.core.models import Snapshot
        from archivebox.crawls.models import Crawl

        # Raises Crawl.DoesNotExist if our crawl has vanished; the bus runner
        # surfaces that as a hard error rather than silently dropping events.
        crawl = Crawl.objects.get(id=self.crawl_id)

        if event.depth == 0:
            # Root snapshot: must already exist on this crawl.
            snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
            if snapshot is None:
                return None
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = None
            snapshot.save(update_fields=["status", "retry_at", "modified_at"])
            snapshot.ensure_crawl_symlink()
            return str(snapshot.id)

        if event.depth > crawl.max_depth:
            return None

        parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
        if parent_snapshot is None:
            return None
        if not self._url_passes_filters(crawl, parent_snapshot, event.url):
            return None

        snapshot = Snapshot.from_json(
            {
                "url": event.url,
                "depth": event.depth,
                "parent_snapshot_id": str(parent_snapshot.id),
                "crawl_id": str(crawl.id),
            },
            overrides={
                "crawl": crawl,
                "snapshot": parent_snapshot,
                "created_by_id": crawl.created_by_id,
            },
            queue_for_extraction=False,
        )
        if snapshot is None:
            return None
        if snapshot.status == Snapshot.StatusChoices.SEALED:
            # Already fully archived (e.g. deduped URL) — nothing to requeue.
            return None
        snapshot.retry_at = None
        # Not sealed (checked above), so unconditionally (re)queue it.
        snapshot.status = Snapshot.StatusChoices.QUEUED
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
        snapshot.ensure_crawl_symlink()
        return str(snapshot.id)

    def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
        """Apply URL_ALLOWLIST / URL_DENYLIST regex filters from config.

        Denylist wins over allowlist.  When an allowlist is configured, the
        URL must match at least one allowlist pattern; with no allowlist,
        everything not denied passes.  Invalid regex patterns are skipped.
        """
        from archivebox.config.configset import get_config

        config = get_config(
            user=getattr(crawl, "created_by", None),
            crawl=crawl,
            snapshot=parent_snapshot,
        )

        def to_pattern_list(value):
            # Config values may arrive as a list or a comma-separated string.
            if isinstance(value, list):
                return value
            if isinstance(value, str):
                return [pattern.strip() for pattern in value.split(",") if pattern.strip()]
            return []

        allowlist = to_pattern_list(config.get("URL_ALLOWLIST", ""))
        denylist = to_pattern_list(config.get("URL_DENYLIST", ""))

        for pattern in denylist:
            try:
                if re.search(pattern, url):
                    return False
            except re.error:
                continue  # ignore malformed user-supplied patterns

        if allowlist:
            for pattern in allowlist:
                try:
                    if re.search(pattern, url):
                        return True
                except re.error:
                    continue
            return False

        return True

    def _seal_snapshot(self, snapshot_id: str) -> None:
        """Mark the snapshot SEALED and persist its index/detail files."""
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(id=snapshot_id).first()
        if snapshot is None:
            return
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        # Preserve the first completion time if the snapshot was sealed before.
        snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
        snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
        snapshot.write_index_jsonl()
        snapshot.write_json_details()
        snapshot.write_html_details()
|
||||
21
archivebox/services/tag_service.py
Normal file
21
archivebox/services/tag_service.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from asgiref.sync import sync_to_async
|
||||
from abx_dl.events import TagEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
|
||||
class TagService(BaseService):
    """Mirrors TagEvents from the abx-dl bus into ArchiveBox Tag rows."""

    LISTENS_TO = [TagEvent]
    EMITS = []

    async def on_TagEvent(self, event: TagEvent) -> None:
        # Django ORM work must run in a thread-sensitive sync executor.
        await sync_to_async(self._project, thread_sensitive=True)(event)

    def _project(self, event: TagEvent) -> None:
        """Attach the tag named in the event to its Snapshot, if it exists."""
        from archivebox.core.models import Snapshot, Tag

        target = Snapshot.objects.filter(id=event.snapshot_id).first()
        if target is not None:
            Tag.from_json({"name": event.name}, overrides={"snapshot": target})
|
||||
Reference in New Issue
Block a user