# Mirror of https://github.com/ArchiveBox/ArchiveBox.git
# Synced 2026-04-06 07:47:53 +10:00
# 129 lines, 4.5 KiB, Python
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
from asgiref.sync import sync_to_async
|
|
from django.utils import timezone
|
|
|
|
from abx_dl.events import SnapshotCompletedEvent, SnapshotEvent
|
|
from abx_dl.services.base import BaseService
|
|
|
|
|
|
class SnapshotService(BaseService):
    """Project snapshot bus events onto ``Snapshot`` database rows.

    ``SnapshotEvent`` marks (depth 0) or creates/queues (depth > 0) the
    snapshot row within this service's crawl; ``SnapshotCompletedEvent``
    seals the row and calls its index writers.
    """

    LISTENS_TO = [SnapshotEvent, SnapshotCompletedEvent]
    EMITS = []

    def __init__(self, bus, *, crawl_id: str, schedule_snapshot):
        """Remember the crawl scope and scheduling callback, then init the base.

        Args:
            bus: Passed through unchanged to ``BaseService.__init__``.
            crawl_id: Primary key used to look up the ``Crawl`` for every
                projected event.
            schedule_snapshot: Callable returning an awaitable; it is awaited
                with the snapshot id (``str``) of each projected snapshot
                whose event depth is greater than 0.
        """
        # Attributes are assigned before the base initialiser runs, as in the
        # original ordering. NOTE(review): presumably so handlers are usable
        # as soon as BaseService wires them up -- confirm against BaseService.
        self.crawl_id = crawl_id
        self.schedule_snapshot = schedule_snapshot
        super().__init__(bus)

    async def on_SnapshotEvent(self, event: SnapshotEvent) -> None:
        """Project ``event`` into the database and schedule non-root snapshots.

        ``schedule_snapshot`` is only awaited when the projection produced a
        snapshot id and ``event.depth > 0``; depth-0 events are projected but
        never handed to the scheduler here.
        """
        # The ORM work is synchronous, so it is run through sync_to_async.
        snapshot_id = await sync_to_async(self._project_snapshot, thread_sensitive=True)(event)
        if snapshot_id and event.depth > 0:
            await self.schedule_snapshot(snapshot_id)

    async def on_SnapshotCompletedEvent(self, event: SnapshotCompletedEvent) -> None:
        """Seal the snapshot identified by ``event.snapshot_id``."""
        await sync_to_async(self._seal_snapshot, thread_sensitive=True)(event.snapshot_id)

    def _project_snapshot(self, event: SnapshotEvent) -> str | None:
        """Update or create the ``Snapshot`` row described by ``event``.

        * ``depth == 0``: the existing snapshot ``event.snapshot_id`` in this
          crawl is marked ``STARTED``.
        * ``depth > 0``: a snapshot for ``event.url`` is obtained through
          ``Snapshot.from_json`` and marked ``QUEUED``, provided the depth is
          within ``crawl.max_depth``, the parent snapshot exists in this
          crawl, and the URL passes the allow/deny filters.

        Returns:
            The snapshot id as a string, or ``None`` when the event is
            ignored (unknown snapshot/parent, too deep, filtered URL,
            ``from_json`` returned ``None``, or the snapshot is already
            ``SEALED``).

        Raises:
            Crawl.DoesNotExist: if no crawl matches ``self.crawl_id``
                (raised by ``Crawl.objects.get``).
        """
        # Local imports, kept function-scoped exactly as in the original.
        from archivebox.core.models import Snapshot
        from archivebox.crawls.models import Crawl

        crawl = Crawl.objects.get(id=self.crawl_id)

        if event.depth == 0:
            snapshot = Snapshot.objects.filter(id=event.snapshot_id, crawl=crawl).first()
            if snapshot is None:
                return None
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = None
            snapshot.save(update_fields=["status", "retry_at", "modified_at"])
            snapshot.ensure_crawl_symlink()
            return str(snapshot.id)

        if event.depth > crawl.max_depth:
            return None

        parent_snapshot = Snapshot.objects.filter(id=event.parent_snapshot_id, crawl=crawl).first()
        if parent_snapshot is None:
            return None
        if not self._url_passes_filters(crawl, parent_snapshot, event.url):
            return None

        snapshot = Snapshot.from_json(
            {
                "url": event.url,
                "depth": event.depth,
                "parent_snapshot_id": str(parent_snapshot.id),
                "crawl_id": str(crawl.id),
            },
            overrides={
                "crawl": crawl,
                "snapshot": parent_snapshot,
                "created_by_id": crawl.created_by_id,
            },
            queue_for_extraction=False,
        )
        # A sealed snapshot is left untouched and is not re-scheduled.
        if snapshot is None or snapshot.status == Snapshot.StatusChoices.SEALED:
            return None

        # The SEALED case returned above, so the status can be set
        # unconditionally (the original re-checked it with an always-true guard).
        snapshot.retry_at = None
        snapshot.status = Snapshot.StatusChoices.QUEUED
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])
        snapshot.ensure_crawl_symlink()
        return str(snapshot.id)

    @staticmethod
    def _to_pattern_list(value) -> list:
        """Normalise a config value into a list of regex patterns.

        Lists are returned as-is, strings are split on commas with blank
        entries dropped, and anything else yields an empty list.
        """
        if isinstance(value, list):
            return value
        if isinstance(value, str):
            return [part.strip() for part in value.split(",") if part.strip()]
        return []

    @staticmethod
    def _matches_any(patterns, url: str) -> bool:
        """Return True if any pattern in ``patterns`` is found in ``url``."""
        for pattern in patterns:
            try:
                if re.search(pattern, url):
                    return True
            except re.error:
                # An invalid regex is skipped rather than aborting the check.
                continue
        return False

    def _url_passes_filters(self, crawl, parent_snapshot, url: str) -> bool:
        """Check ``url`` against the configured ``URL_DENYLIST`` / ``URL_ALLOWLIST``.

        The denylist is consulted first: any match rejects the URL. If an
        allowlist is configured, the URL must match at least one of its
        patterns; with no allowlist, every non-denied URL passes.

        Args:
            crawl: Crawl whose config (and ``created_by`` user, if present)
                scopes the lookup.
            parent_snapshot: Snapshot passed to ``get_config`` as the
                snapshot scope.
            url: Candidate URL to test.
        """
        from archivebox.config.configset import get_config

        config = get_config(
            user=getattr(crawl, "created_by", None),
            crawl=crawl,
            snapshot=parent_snapshot,
        )
        allowlist = self._to_pattern_list(config.get("URL_ALLOWLIST", ""))
        denylist = self._to_pattern_list(config.get("URL_DENYLIST", ""))

        if self._matches_any(denylist, url):
            return False
        if allowlist:
            return self._matches_any(allowlist, url)
        return True

    def _seal_snapshot(self, snapshot_id: str) -> None:
        """Mark the snapshot ``SEALED`` and call its index writers.

        Does nothing when no snapshot with ``snapshot_id`` exists.
        ``downloaded_at`` is only set when it does not already hold a value.
        """
        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(id=snapshot_id).first()
        if snapshot is None:
            return
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.downloaded_at = snapshot.downloaded_at or timezone.now()
        snapshot.save(update_fields=["status", "retry_at", "downloaded_at", "modified_at"])
        snapshot.write_index_jsonl()
        snapshot.write_json_details()
        snapshot.write_html_details()