# mirror of https://github.com/ArchiveBox/ArchiveBox.git
# synced 2026-04-06 07:47:53 +10:00
from __future__ import annotations
|
|
|
|
from asgiref.sync import sync_to_async
|
|
from django.utils import timezone
|
|
|
|
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
|
from abx_dl.limits import CrawlLimitState
|
|
from abx_dl.services.base import BaseService
|
|
|
|
|
|
class SnapshotService(BaseService):
    """Event-bus service that drives the Snapshot lifecycle for one crawl.

    Subscribes to ``SnapshotEvent`` (a URL should be archived / was discovered)
    and ``SnapshotCompletedEvent`` (archiving finished), and moves the matching
    Django ``Snapshot`` rows through their status machine
    (QUEUED -> STARTED -> SEALED), enforcing the crawl's depth/size limits and
    URL filters for newly discovered child links.
    """

    # Event types this service subscribes to on the bus.
    LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
    # This service only reacts to events; it never emits any of its own.
    EMITS = []

    def __init__(self, bus, *, crawl_id: str, schedule_snapshot):
        # Primary key of the Crawl this service instance is scoped to.
        self.crawl_id = crawl_id
        # Async callable(snapshot_id: str) used to hand a snapshot off for
        # actual archiving work once it is queued/started.
        self.schedule_snapshot = schedule_snapshot
        super().__init__(bus)
        # Explicitly register both handlers on the bus.
        # NOTE(review): LISTENS_TO is also declared above — confirm BaseService
        # does not auto-subscribe, otherwise this double-registers.
        self.bus.on(SnapshotEvent, self.on_SnapshotEvent)
        self.bus.on(SnapshotCompletedEvent, self.on_SnapshotCompletedEvent)

    async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
        """Handle a snapshot being requested or discovered.

        Two cases:
          * The Snapshot row already exists for this crawl -> mark it STARTED
            and clear its retry timer.
          * No row exists and ``event.depth > 0`` -> this is an outlink found
            by a parent snapshot; create a new QUEUED Snapshot row, but only
            if it is within the crawl's ``max_depth``, the crawl has not hit
            its "max_size" limit, and the URL passes the crawl's filters.

        In both cases the snapshot's crawl symlink is (re)created, and child
        snapshots (depth > 0) are passed to ``schedule_snapshot``.
        """
        # Imported lazily so Django models are not loaded at module import time.
        from archivebox.core.models import Snapshot
        from archivebox.crawls.models import Crawl

        crawl = await Crawl.objects.aget(id=self.crawl_id)
        snapshot_id: str | None = None
        snapshot = await Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).afirst()

        if snapshot is not None:
            # Known snapshot: transition to STARTED and clear any retry timer.
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = None
            await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
            snapshot_id = str(snapshot.id)
        elif event.depth > 0:
            # Newly discovered outlink: only materialize it while within the
            # depth limit and before the crawl's size limit has been reached.
            if event.depth <= crawl.max_depth and self._crawl_limit_stop_reason(crawl) != "max_size":
                # Locate the SnapshotEvent one level up that this event
                # descends from, so the new row can be linked to its parent.
                parent_event = await self.bus.find(
                    SnapshotEvent,
                    past=True,
                    future=False,
                    where=lambda candidate: candidate.depth == event.depth - 1 and self.bus.event_is_child_of(event, candidate),
                )
                parent_snapshot = None
                if parent_event is not None:
                    parent_snapshot = await Snapshot.objects.filter(id=parent_event.snapshot_id, crawl=crawl).afirst()
                if parent_snapshot is not None and self._url_passes_filters(crawl, parent_snapshot, event.url):
                    # Snapshot.from_json is synchronous ORM code; run it in a
                    # worker thread. Extraction queueing is deferred — this
                    # service schedules the snapshot itself below.
                    snapshot = await sync_to_async(Snapshot.from_json, thread_sensitive=True)(
                        {
                            "url": event.url,
                            "depth": event.depth,
                            "parent_snapshot_id": str(parent_snapshot.id),
                            "crawl_id": str(crawl.id),
                        },
                        overrides={
                            "crawl": crawl,
                            "snapshot": parent_snapshot,
                            "created_by_id": crawl.created_by_id,
                        },
                        queue_for_extraction=False,
                    )
                    # The SEALED guard suggests from_json can return an
                    # already-existing row — don't re-queue a sealed one.
                    # TODO(review): confirm from_json's dedup behavior.
                    if snapshot is not None and snapshot.status != Snapshot.StatusChoices.SEALED:
                        snapshot.retry_at = None
                        snapshot.status = Snapshot.StatusChoices.QUEUED
                        await snapshot.asave(update_fields=["status", "retry_at", "modified_at"])
                        snapshot_id = str(snapshot.id)

        if snapshot_id:
            # Re-fetch with related rows preloaded so the sync filesystem
            # helper does not trigger lazy ORM access off the event loop.
            snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
            if snapshot is not None:
                await sync_to_async(snapshot.ensure_crawl_symlink, thread_sensitive=True)()
        if snapshot_id and event.depth > 0:
            # Only child snapshots are scheduled here; the root snapshot
            # (depth 0) is presumably scheduled elsewhere — verify with caller.
            await self.schedule_snapshot(snapshot_id)

    async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
        """Seal a finished snapshot and write its on-disk index files.

        Marks the snapshot SEALED, stamps ``downloaded_at`` if unset, and —
        when the crawl has hit its "max_size" limit — bulk-seals all other
        still-QUEUED snapshots of the same crawl so no further work starts.
        Finally writes the snapshot's JSONL/JSON/HTML index files.
        """
        # Lazy import to avoid loading Django models at module import time.
        from archivebox.core.models import Snapshot

        snapshot = await Snapshot.objects.select_related("crawl").filter(id=event.snapshot_id).afirst()
        snapshot_id: str | None = None
        if snapshot is not None:
            snapshot.status = Snapshot.StatusChoices.SEALED
            snapshot.retry_at = None
            # Preserve an existing timestamp; only stamp the first completion.
            snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
            await snapshot.asave(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
            if snapshot.crawl_id and self._crawl_limit_stop_reason(snapshot.crawl) == "max_size":
                # Crawl hit its size cap: seal every other queued snapshot in
                # one bulk UPDATE so no further archiving is started for it.
                await (
                    Snapshot.objects.filter(
                        crawl_id=snapshot.crawl_id,
                        status=Snapshot.StatusChoices.QUEUED,
                    )
                    .exclude(id=snapshot.id)
                    .aupdate(
                        status=Snapshot.StatusChoices.SEALED,
                        retry_at=None,
                        modified_at=timezone.now(),
                    )
                )
            snapshot_id = str(snapshot.id)
        if snapshot_id:
            # Re-fetch with related rows preloaded for the sync index writers.
            snapshot = await Snapshot.objects.filter(id=snapshot_id).select_related("crawl", "crawl__created_by").afirst()
            if snapshot is not None:
                await sync_to_async(snapshot.write_index_jsonl, thread_sensitive=True)()
                await sync_to_async(snapshot.write_json_details, thread_sensitive=True)()
                await sync_to_async(snapshot.write_html_details, thread_sensitive=True)()

    def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
        """Return True if *url* passes the crawl's URL filters.

        Delegates to ``Crawl.url_passes_filters``, passing the parent snapshot
        for context (e.g. same-domain / scope checks — see Crawl model).
        """
        return crawl.url_passes_filters(url, snapshot=parent_snapshot)

    def _crawl_limit_stop_reason(self, crawl) -> str:
        """Return the crawl-limit stop reason (e.g. "max_size"), or the
        no-limit-hit value from ``CrawlLimitState.get_stop_reason``.

        Builds a config dict from the crawl's stored config plus CRAWL_DIR so
        size limits can be checked against the crawl's output directory.
        """
        # Copy so the crawl's persisted config is not mutated by the
        # CRAWL_DIR injection below.
        config = dict(crawl.config or {})
        config["CRAWL_DIR"] = str(crawl.output_dir)
        return CrawlLimitState.from_config(config).get_stop_reason()
|