__package__ = 'archivebox.api'

from uuid import UUID
from typing import List
from datetime import datetime

from django.utils import timezone
from django.db.models import Q
from django.contrib.auth import get_user_model

from ninja import Router, Schema
from ninja.errors import HttpError

from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl

from .auth import API_AUTH_METHODS


router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)


class CrawlSchema(Schema):
    """Serializer for a single Crawl, including its creator's username."""

    TYPE: str = 'crawls.models.Crawl'

    id: UUID
    modified_at: datetime
    created_at: datetime
    created_by_id: str
    created_by_username: str
    status: str
    retry_at: datetime | None
    urls: str
    extractor: str
    max_depth: int
    tags_str: str
    config: dict
    # snapshots: List[SnapshotSchema]

    @staticmethod
    def resolve_created_by_id(obj):
        # UUID/int pk -> str so the field type stays stable across pk types
        return str(obj.created_by_id)

    @staticmethod
    def resolve_created_by_username(obj):
        User = get_user_model()
        return User.objects.get(id=obj.created_by_id).username

    @staticmethod
    def resolve_snapshots(obj, context):
        # Only expand the (potentially large) snapshot list when the request
        # explicitly opted in via ?with_snapshots=true (set on the request in get_crawl).
        if context['request'].with_snapshots:
            return obj.snapshot_set.all().distinct()
        return Snapshot.objects.none()


class CrawlUpdateSchema(Schema):
    """Fields a caller may PATCH on a Crawl; unset fields are left untouched."""

    status: str | None = None
    retry_at: datetime | None = None


@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
    """List all Crawls."""
    return Crawl.objects.all().distinct()


@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
    """Get a specific Crawl by id.

    Pass as_rss=true to get the crawl's snapshots as an RSS 2.0 XML feed
    instead of the JSON representation.
    """
    # Stash expansion flags on the request so CrawlSchema resolvers can see them.
    request.with_snapshots = with_snapshots
    request.with_archiveresults = with_archiveresults

    try:
        # icontains allows prefix/substring matching on abbreviated ids
        crawl = Crawl.objects.get(id__icontains=crawl_id)
    except Crawl.DoesNotExist:
        raise HttpError(404, f'Crawl not found: {crawl_id}')

    if as_rss:
        # return snapshots as XML rss feed
        urls = [
            {'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str}
            for snapshot in crawl.snapshot_set.all()
        ]
        xml = '<?xml version="1.0" encoding="UTF-8"?>\n<rss version="2.0">\n<channel>\n'
        for url in urls:
            xml += (
                '<item>'
                f'<link>{url["url"]}</link>'
                f'<title>{url["title"]}</title>'
                f'<pubDate>{url["bookmarked_at"]}</pubDate>'
                f'<category>{url["tags"]}</category>'
                '</item>\n'
            )
        xml += '</channel>\n</rss>\n'
        return xml

    return crawl


@router.patch("/crawl/{crawl_id}", response=CrawlSchema, url_name="patch_crawl")
def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
    """Update a crawl (e.g., set status=sealed to cancel queued work)."""
    try:
        crawl = Crawl.objects.get(id__icontains=crawl_id)
    except Crawl.DoesNotExist:
        raise HttpError(404, f'Crawl not found: {crawl_id}')

    # Only touch fields the caller actually sent.
    payload = data.dict(exclude_unset=True)

    if 'status' in payload:
        if payload['status'] not in Crawl.StatusChoices.values:
            raise HttpError(400, f'Invalid status: {payload["status"]}')
        crawl.status = payload['status']
        # Sealing cancels any pending retry, unless the caller set retry_at explicitly.
        if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
            crawl.retry_at = None

    if 'retry_at' in payload:
        crawl.retry_at = payload['retry_at']

    crawl.save(update_fields=['status', 'retry_at', 'modified_at'])

    # Cascade a seal to any snapshots that haven't finished yet so queued work stops.
    if payload.get('status') == Crawl.StatusChoices.SEALED:
        Snapshot.objects.filter(
            crawl=crawl,
            status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
        ).update(
            status=Snapshot.StatusChoices.SEALED,
            retry_at=None,
            modified_at=timezone.now(),
        )

    return crawl