# ArchiveBox/archivebox/services/snapshot_service.py

from __future__ import annotations

from asgiref.sync import sync_to_async
from django.utils import timezone

from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
from abx_dl.limits import CrawlLimitState
from abx_dl.services.base import BaseService

from .db import run_db_op


class SnapshotService(BaseService):
    """Mirror snapshot lifecycle events onto ``Snapshot`` rows for one crawl.

    The service reacts to two bus events:

    * ``SnapshotEvent`` -- marks the root snapshot as started, or creates and
      queues a child snapshot discovered during the crawl.
    * ``SnapshotCompletedEvent`` -- seals the snapshot and writes its index
      and detail files.

    NOTE(review): handler discovery through ``LISTENS_TO`` and the
    ``on_<Event>__Outer`` method names is implemented by ``BaseService``,
    which is not visible in this file -- confirm against that class.
    """

    LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
    EMITS = []

    def __init__(self, bus, *, crawl_id: str, schedule_snapshot):
        """Bind the service to a single crawl.

        Args:
            bus: Event bus handed through to ``BaseService.__init__``.
            crawl_id: Primary key of the crawl whose snapshots are managed.
            schedule_snapshot: Callable that is awaited with a snapshot id
                (``str``) for every child snapshot that gets queued.
        """
        # Attributes are assigned before delegating to the base class.
        # NOTE(review): presumably so that handlers registered by
        # BaseService.__init__ can already rely on them -- confirm.
        self.crawl_id = crawl_id
        self.schedule_snapshot = schedule_snapshot
        super().__init__(bus)

    async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
        """Project a snapshot event into the DB and schedule child snapshots.

        Nothing further happens when ``_project_snapshot`` rejects the event
        (returns ``None``). Otherwise the crawl symlink is ensured, and for
        non-root snapshots (``depth > 0``) ``schedule_snapshot`` is awaited.
        """
        snapshot_id = await run_db_op(self._project_snapshot, event)
        if not snapshot_id:
            return
        await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)
        # Only snapshots with depth > 0 are handed to the scheduler here;
        # depth-0 snapshots are merely marked STARTED by _project_snapshot.
        if event.depth > 0:
            await self.schedule_snapshot(snapshot_id)

    async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
        """Seal the completed snapshot and write its index/detail files."""
        snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
        if snapshot_id:
            await sync_to_async(self._write_snapshot_details)(snapshot_id)

    def _project_snapshot(self, event: SnapshotEvent) -> str | None:
        """Apply ``event`` to the database and return the affected snapshot id.

        For ``depth == 0`` the existing snapshot identified by
        ``event.snapshot_id`` (within this crawl) is marked ``STARTED``.

        For any other depth a child snapshot is created via
        ``Snapshot.from_json`` and marked ``QUEUED``, unless one of the
        following rejects it:

        * ``event.depth`` exceeds ``crawl.max_depth``;
        * the crawl limit state reports the ``"max_size"`` stop reason;
        * the parent snapshot does not exist in this crawl;
        * the URL does not pass the crawl's filters;
        * ``Snapshot.from_json`` returns ``None`` or an already sealed
          snapshot.

        Returns:
            The snapshot id as a string, or ``None`` when the event was
            rejected or the referenced snapshot could not be found.

        Raises:
            Crawl.DoesNotExist: If no crawl with ``self.crawl_id`` exists
                (raised by ``Crawl.objects.get``).
        """
        # Imported lazily inside the method, as everywhere else in this class.
        from archivebox.core.models import Snapshot
        from archivebox.crawls.models import Crawl

        crawl = Crawl.objects.get(id=self.crawl_id)

        if event.depth == 0:
            snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
            if snapshot is None:
                return None
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = None
            snapshot.save(update_fields=["status", "retry_at", "modified_at"])
            return str(snapshot.id)

        if event.depth > crawl.max_depth:
            return None
        if self._crawl_limit_stop_reason(crawl) == "max_size":
            return None

        parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
        if parent_snapshot is None:
            return None
        if not self._url_passes_filters(crawl, parent_snapshot, event.url):
            return None

        snapshot = Snapshot.from_json(
            {
                "url": event.url,
                "depth": event.depth,
                "parent_snapshot_id": str(parent_snapshot.id),
                "crawl_id": str(crawl.id),
            },
            overrides={
                "crawl": crawl,
                "snapshot": parent_snapshot,
                "created_by_id": crawl.created_by_id,
            },
            queue_for_extraction=False,
        )
        # A sealed snapshot is never re-queued: bail out before touching its
        # status. Everything that gets past this guard is therefore
        # unconditionally (re)marked as QUEUED below.
        if snapshot is None or snapshot.status == Snapshot.StatusChoices.SEALED:
            return None
        snapshot.retry_at = None
        snapshot.status = Snapshot.StatusChoices.QUEUED
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
        return str(snapshot.id)

    def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
        """Return whether ``url`` passes the crawl's filters, given its parent snapshot."""
        return crawl.url_passes_filters(url, snapshot=parent_snapshot)

    def _seal_snapshot(self, snapshot_id: str) -> str | None:
        """Mark a snapshot as ``SEALED`` and enforce the crawl size limit.

        ``downloaded_at`` is set to the current time only when it has no
        value yet. If the crawl limit state afterwards reports the
        ``"max_size"`` stop reason, every other still-queued snapshot of the
        same crawl is sealed as well.

        Returns:
            The snapshot id as a string, or ``None`` if no such snapshot exists.
        """
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.select_related("crawl").filter(id=snapshot_id).first()
        if snapshot is None:
            return None
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        # Keep an existing download timestamp; only fill it in when missing.
        snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
        snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
        if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
            self._cancel_pending_snapshots(snapshot.crawl_id, exclude_snapshot_id=snapshot.id)
        return str(snapshot.id)

    def _crawl_limit_stop_reason(self, crawl) -> str:
        """Return the stop reason ``CrawlLimitState`` reports for ``crawl``.

        The limit state is built from a copy of ``crawl.config`` with
        ``CRAWL_DIR`` set to the crawl's output directory.
        """
        # Copy so that the crawl's own config mapping is never mutated.
        config = dict(crawl.config or {})
        config["CRAWL_DIR"] = str(crawl.output_dir)
        return CrawlLimitState.from_config(config).get_stop_reason()

    def _cancel_pending_snapshots(self, crawl_id: str, *, exclude_snapshot_id) -> int:
        """Seal every ``QUEUED`` snapshot of a crawl except one.

        Args:
            crawl_id: Crawl whose queued snapshots are cancelled.
            exclude_snapshot_id: Snapshot id that is left out of the update.

        Returns:
            The value returned by ``QuerySet.update`` (the row count).
        """
        from archivebox.core.models import Snapshot

        # Single bulk UPDATE; model save() is not invoked, hence modified_at
        # is set explicitly.
        return (
            Snapshot.objects.filter(
                crawl_id=crawl_id,
                status=Snapshot.StatusChoices.QUEUED,
            )
            .exclude(id=exclude_snapshot_id)
            .update(
                status=Snapshot.StatusChoices.SEALED,
                retry_at=None,
                modified_at=timezone.now(),
            )
        )

    def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
        """Call ``ensure_crawl_symlink()`` on the snapshot, if it exists."""
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
        if snapshot is not None:
            snapshot.ensure_crawl_symlink()

    def _write_snapshot_details(self, snapshot_id: str) -> None:
        """Write the JSONL index plus JSON and HTML details of a snapshot.

        Does nothing when the snapshot no longer exists.
        """
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
        if snapshot is None:
            return
        snapshot.write_index_jsonl()
        snapshot.write_json_details()
        snapshot.write_html_details()