mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
112 lines
4.2 KiB
Python
112 lines
4.2 KiB
Python
from __future__ import annotations
|
|
|
|
from asgiref.sync import sync_to_async
|
|
from django.utils import timezone
|
|
|
|
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
|
from abx_dl.services.base import BaseService
|
|
|
|
from .db import run_db_op
|
|
|
|
|
|
class SnapshotService(BaseService):
    """Bus service that projects snapshot lifecycle events onto ``Snapshot`` rows.

    Handles two event types:

    * ``SnapshotEvent`` -- marks the root (depth 0) snapshot as started, or
      creates/queues a child snapshot discovered at a deeper level.
    * ``SnapshotCompletedEvent`` -- seals the snapshot and writes its index
      and detail files.

    The service declares no emitted events (``EMITS`` is empty).
    """

    LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
    EMITS = []

    def __init__(self, bus, *, crawl_id: str, schedule_snapshot):
        """Bind the service to one crawl and register it on ``bus``.

        Args:
            bus: Event bus forwarded unchanged to ``BaseService.__init__``.
            crawl_id: Primary key of the ``Crawl`` whose snapshots this
                service manages.
            schedule_snapshot: Async callable awaited with a snapshot id
                (``str``) for every accepted snapshot whose depth is > 0.
        """
        # State is assigned before the base initializer runs.
        # NOTE(review): presumably so handlers wired up by BaseService can
        # already rely on these attributes -- confirm against BaseService.
        self.crawl_id = crawl_id
        self.schedule_snapshot = schedule_snapshot
        super().__init__(bus)

    async def on_SnapshotEvent__Outer(self, event: SnapshotEvent) -> None:
        """Persist ``event`` and, for child snapshots, schedule them.

        Nothing further happens when ``_project_snapshot`` rejects the event
        (returns ``None``).
        """
        snapshot_id = await run_db_op(self._project_snapshot, event)
        if not snapshot_id:
            return

        await sync_to_async(self._ensure_crawl_symlink)(snapshot_id)

        # Only snapshots below the root are handed to the scheduler callback.
        if event.depth > 0:
            await self.schedule_snapshot(snapshot_id)

    async def on_SnapshotCompletedEvent__Outer(self, event: SnapshotCompletedEvent) -> None:
        """Seal the completed snapshot and write its detail files."""
        snapshot_id = await run_db_op(self._seal_snapshot, event.snapshot_id)
        if snapshot_id:
            await sync_to_async(self._write_snapshot_details)(snapshot_id)

    def _project_snapshot(self, event: SnapshotEvent) -> str | None:
        """Apply a ``SnapshotEvent`` to the database.

        For depth 0 the already-existing snapshot of this crawl is marked
        ``STARTED``. For deeper levels a child snapshot is obtained through
        ``Snapshot.from_json`` and marked ``QUEUED``.

        Returns:
            The affected snapshot's id as a string, or ``None`` when the event
            is ignored: unknown root/parent snapshot, depth beyond
            ``crawl.max_depth``, URL rejected by the crawl filters,
            ``from_json`` returning ``None``, or the snapshot already sealed.

        Raises:
            Crawl.DoesNotExist: If no crawl with ``self.crawl_id`` exists
                (raised by ``Crawl.objects.get``).
        """
        from archivebox.core.models import Snapshot
        from archivebox.crawls.models import Crawl

        crawl = Crawl.objects.get(id=self.crawl_id)

        if event.depth == 0:
            # Root snapshot: it must already exist for this crawl.
            snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
            if snapshot is None:
                return None
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = None
            # NOTE(review): "modified_at" is listed without being assigned
            # here; presumably an auto-updated timestamp -- confirm on model.
            snapshot.save(update_fields=["status", "retry_at", "modified_at"])
            return str(snapshot.id)

        if event.depth > crawl.max_depth:
            return None

        parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
        if parent_snapshot is None:
            return None
        if not self._url_passes_filters(crawl, parent_snapshot, event.url):
            return None

        snapshot = Snapshot.from_json(
            {
                "url": event.url,
                "depth": event.depth,
                "parent_snapshot_id": str(parent_snapshot.id),
                "crawl_id": str(crawl.id),
            },
            overrides={
                "crawl": crawl,
                "snapshot": parent_snapshot,
                "created_by_id": crawl.created_by_id,
            },
            queue_for_extraction=False,
        )
        if snapshot is None:
            return None
        # A sealed snapshot is left untouched and is not re-queued.
        if snapshot.status == Snapshot.StatusChoices.SEALED:
            return None

        # The status cannot be SEALED past the guard above, so it is queued
        # unconditionally.
        snapshot.status = Snapshot.StatusChoices.QUEUED
        snapshot.retry_at = None
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
        return str(snapshot.id)

    def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
        """Return whether ``crawl`` accepts ``url`` when found via ``parent_snapshot``."""
        return crawl.url_passes_filters(url, snapshot=parent_snapshot)

    def _seal_snapshot(self, snapshot_id: str) -> str | None:
        """Mark the snapshot ``SEALED`` and stamp its download time.

        An existing ``downloaded_at`` value is kept; otherwise the current
        time is recorded.

        Returns:
            The snapshot id as a string, or ``None`` if no such snapshot exists.
        """
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(id=snapshot_id).first()
        if snapshot is None:
            return None
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
        snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
        return str(snapshot.id)

    def _ensure_crawl_symlink(self, snapshot_id: str) -> None:
        """Call ``ensure_crawl_symlink()`` on the snapshot, if it exists."""
        from archivebox.core.models import Snapshot

        # Crawl and its creator are fetched in the same query.
        snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
        if snapshot is not None:
            snapshot.ensure_crawl_symlink()

    def _write_snapshot_details(self, snapshot_id: str) -> None:
        """Write the JSONL index, JSON details and HTML details for the snapshot.

        Does nothing when the snapshot no longer exists.
        """
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").first()
        if snapshot is None:
            return
        snapshot.write_index_jsonl()
        snapshot.write_json_details()
        snapshot.write_html_details()