Files
ArchiveBox/archivebox/services/snapshot_service.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

138 lines
5.3 KiB
Python

from __future__ import annotations
from asgiref.sync import sync_to_async
from django.utils import timezone
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.services.base import BaseService
from .db import run_db_op
class SnapshotService(BaseService):
    """Mirror snapshot bus events for one crawl onto ``Snapshot`` rows.

    Handles two event types:

    * ``SnapshotEvent`` -- marks the root snapshot (``depth == 0``) as started,
      or projects a discovered URL (``depth > 0``) into a queued snapshot and
      hands it to the ``schedule_snapshot`` callback.
    * ``SnapshotCompletedEvent`` -- seals the snapshot and writes its index
      and detail files.

    Database work runs through ``run_db_op`` and filesystem work through
    ``sync_to_async`` so the async handlers never call the ORM directly.
    """

    LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
    EMITS = []

    def __init__(self, bus, *, crawl_id: str, schedule_snapshot):
        """Bind the service to one crawl.

        Args:
            bus: Event bus forwarded to ``BaseService``.
            crawl_id: Primary key of the crawl whose snapshots are managed.
            schedule_snapshot: Async callable awaited with a snapshot id
                whenever a new ``depth > 0`` snapshot has been queued.
        """
        # Attributes are set before super().__init__() on purpose; presumably
        # the base class may start dispatching events during construction.
        self.crawl_id = crawl_id
        self.schedule_snapshot = schedule_snapshot
        super().__init__(bus)

    async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
        """Project ``event`` into the database and schedule child snapshots."""
        snapshot_id = await run_db_op(self._project_snapshot, event)
        if not snapshot_id:
            # The event was rejected (unknown snapshot, limits, filters, ...).
            return

        await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)

        # Only snapshots below the root (depth > 0) are passed to the callback.
        if event.depth > 0:
            await self.schedule_snapshot(snapshot_id)

    async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
        """Seal the completed snapshot, then write its on-disk details."""
        snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
        if snapshot_id:
            await sync_to_async(self._write_snapshot_details)(snapshot_id)

    def _project_snapshot(self, event: SnapshotEvent) -> str | None:
        """Apply a ``SnapshotEvent`` to the database.

        Returns:
            The affected snapshot's id as a string, or ``None`` when the event
            is ignored: the referenced snapshot/parent is not part of this
            crawl, the depth exceeds ``crawl.max_depth``, the crawl hit its
            ``"max_size"`` limit, the URL fails the crawl filters,
            ``Snapshot.from_json`` yields nothing, or the resulting snapshot
            is already sealed.

        Note:
            ``Crawl.objects.get`` raises if the crawl row does not exist; that
            exception is intentionally left to propagate.
        """
        # Imported lazily, as in the rest of this class (presumably to avoid
        # loading Django models at module import time).
        from archivebox.core.models import Snapshot
        from archivebox.crawls.models import Crawl

        crawl = Crawl.objects.get(id=self.crawl_id)

        if event.depth == 0:
            # Root snapshot: it must already exist for this crawl; mark it started.
            snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
            if snapshot is None:
                return None
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = None
            snapshot.save(update_fields=["status", "retry_at", "modified_at"])
            return str(snapshot.id)

        # Discovered URL: apply crawl limits and filters before touching the DB.
        if event.depth > crawl.max_depth:
            return None
        if self._crawl_limit_stop_reason(crawl) == "max_size":
            return None

        parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
        if parent_snapshot is None:
            return None
        if not self._url_passes_filters(crawl, parent_snapshot, event.url):
            return None

        snapshot = Snapshot.from_json(
            {
                "url": event.url,
                "depth": event.depth,
                "parent_snapshot_id": str(parent_snapshot.id),
                "crawl_id": str(crawl.id),
            },
            overrides={
                "crawl": crawl,
                "snapshot": parent_snapshot,
                "created_by_id": crawl.created_by_id,
            },
            queue_for_extraction=False,
        )
        if snapshot is None:
            return None
        # A sealed snapshot is never re-queued.
        if snapshot.status == Snapshot.StatusChoices.SEALED:
            return None

        # Past the guard above the status can no longer be SEALED, so the
        # snapshot is unconditionally (re)queued here.
        snapshot.retry_at = None
        snapshot.status = Snapshot.StatusChoices.QUEUED
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
        return str(snapshot.id)

    def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
        """Return whether ``crawl`` accepts ``url`` discovered from ``parent_snapshot``."""
        return crawl.url_passes_filters(url, snapshot=parent_snapshot)

    def _seal_snapshot(self, snapshot_id: str) -> str | None:
        """Mark a snapshot as sealed and enforce the crawl size limit.

        Returns:
            The snapshot id as a string, or ``None`` if no such snapshot exists.
        """
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
        if snapshot is None:
            return None

        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        # Keep an existing download timestamp; only stamp it the first time.
        snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
        snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])

        # Once the crawl reports "max_size", seal everything still queued so
        # no further snapshots are processed for it.
        if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
            self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
        return str(snapshot.id)

    def _crawl_limit_stop_reason(self, crawl) -> str:
        """Return the stop reason reported by ``CrawlLimitState`` for ``crawl``.

        The crawl's config is copied (never mutated) and extended with
        ``CRAWL_DIR`` pointing at the crawl's output directory.
        """
        config = dict(crawl.config or {})
        config["CRAWL_DIR"] = str(crawl.output_dir)
        return CrawlLimitState.from_config(config).get_stop_reason()

    def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
        """Seal every still-queued snapshot of ``crawl_id`` except one.

        Returns:
            The value returned by the queryset ``update()`` call (the number
            of rows matched, for a Django queryset).
        """
        from archivebox.core.models import Snapshot

        pending = Snapshot.objects.filter(
            crawl_id=crawl_id,
            status=Snapshot.StatusChoices.QUEUED,
        ).exclude(id=exclude_snapshot_id)
        # A bulk update() bypasses save(), so modified_at is set explicitly.
        return pending.update(
            status=Snapshot.StatusChoices.SEALED,
            retry_at=None,
            modified_at=timezone.now(),
        )

    def _get_snapshot_with_crawl(self, snapshot_id: str):
        """Fetch a snapshot with its crawl and the crawl's creator preloaded.

        Returns ``None`` when no snapshot with ``snapshot_id`` exists.
        """
        from archivebox.core.models import Snapshot

        return Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()

    def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
        """Call ``ensure_crawl_symlink()`` on the snapshot, if it exists."""
        snapshot = self._get_snapshot_with_crawl(snapshot_id)
        if snapshot is not None:
            snapshot.ensure_crawl_symlink()

    def _write_snapshot_details(self, snapshot_id: str) -> None:
        """Write the snapshot's JSONL index, JSON details and HTML details."""
        snapshot = self._get_snapshot_with_crawl(snapshot_id)
        if snapshot is None:
            return
        snapshot.write_index_jsonl()
        snapshot.write_json_details()
        snapshot.write_html_details()