bump versions and fix docs

This commit is contained in:
Nick Sweeting
2026-03-15 17:43:07 -07:00
parent e598614b05
commit 7d42c6c8b5
15 changed files with 245 additions and 349 deletions

View File

@@ -37,11 +37,10 @@ html_description=f'''
def register_urls(api: NinjaAPI) -> NinjaAPI:
    """Attach every versioned API router to the given NinjaAPI instance.

    Router targets are dotted-path strings that ninja resolves lazily.
    Returns the same instance so calls can be chained during app setup.
    """
    api.add_router('/auth/', 'archivebox.api.v1_auth.router')
    api.add_router('/core/', 'archivebox.api.v1_core.router')
    api.add_router('/crawls/', 'archivebox.api.v1_crawls.router')
    api.add_router('/cli/', 'archivebox.api.v1_cli.router')
    api.add_router('/workers/', 'archivebox.api.v1_workers.router')
    api.add_router('/machine/', 'archivebox.api.v1_machine.router')
    return api

View File

@@ -30,7 +30,13 @@ def get_api_token(request, auth_data: PasswordAuthSchema):
if user and user.is_superuser:
api_token = get_or_create_api_token(user)
assert api_token is not None, "Failed to create API token"
return api_token.__json__()
return {
"success": True,
"user_id": str(user.pk),
"username": user.username,
"token": api_token.token,
"expires": api_token.expires.isoformat() if api_token.expires else None,
}
return {"success": False, "errors": ["Invalid credentials"]}

View File

@@ -121,10 +121,19 @@ def cli_add(request, args: AddCommandSchema):
created_by_id=request.user.pk,
)
snapshot_ids = [str(snapshot_id) for snapshot_id in result.values_list('id', flat=True)]
result_payload = {
"crawl_id": getattr(result, "crawl_id", None),
"num_snapshots": len(snapshot_ids),
"snapshot_ids": snapshot_ids,
"queued_urls": args.urls,
}
return {
"success": True,
"errors": [],
"result": result,
"result": result_payload,
"result_format": "json",
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}

View File

@@ -9,12 +9,14 @@ from django.db.models import Q
from django.core.exceptions import ValidationError
from django.contrib.auth import get_user_model
from django.shortcuts import redirect
from django.utils import timezone
from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate, PaginationBase
from ninja.errors import HttpError
from archivebox.core.models import Snapshot, ArchiveResult, Tag
from archivebox.crawls.models import Crawl
from archivebox.api.v1_crawls import CrawlSchema
@@ -191,6 +193,27 @@ class SnapshotSchema(Schema):
class SnapshotUpdateSchema(Schema):
    """Request body for PATCH /snapshot/{id}; every field is optional so
    clients can send only the attributes they want to change."""
    status: str | None = None
    retry_at: datetime | None = None
    tags: Optional[List[str]] = None
class SnapshotCreateSchema(Schema):
    """Request body for POST /snapshots.

    Only `url` is required; when `crawl_id` is omitted a new single-URL
    Crawl is created to own the snapshot (see create_snapshot)."""
    url: str
    crawl_id: Optional[str] = None
    # depth is validated to be 0-4 in create_snapshot
    depth: int = 0
    title: Optional[str] = None
    tags: Optional[List[str]] = None
    status: Optional[str] = None
class SnapshotDeleteResponseSchema(Schema):
    """Response body for DELETE /snapshot/{id}."""
    success: bool
    snapshot_id: str
    crawl_id: str
    # total number of DB rows removed, as returned by Model.delete()
    deleted_count: int
def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]:
    """Return the given tags with surrounding whitespace stripped,
    dropping entries that are empty, None, or whitespace-only."""
    cleaned: List[str] = []
    for raw_tag in tags or []:
        if not raw_tag:
            continue
        stripped = raw_tag.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned
class SnapshotFilterSchema(FilterSchema):
@@ -230,6 +253,68 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool = True):
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
def create_snapshot(request, data: SnapshotCreateSchema):
    """Create (or fetch an existing) Snapshot for a URL.

    When no crawl_id is supplied, a new single-URL Crawl is created to own
    the snapshot.  Raises HttpError(400) on an invalid status, a blank URL,
    or an out-of-range depth.
    """
    tags = normalize_tag_list(data.tags)

    # --- request validation --------------------------------------------------
    if data.status is not None and data.status not in Snapshot.StatusChoices.values:
        raise HttpError(400, f'Invalid status: {data.status}')
    if not data.url.strip():
        raise HttpError(400, 'URL is required')
    if data.depth not in (0, 1, 2, 3, 4):
        raise HttpError(400, 'depth must be between 0 and 4')

    # --- find or create the owning crawl -------------------------------------
    if data.crawl_id:
        crawl = Crawl.objects.get(id__icontains=data.crawl_id)
        # Fall back to the crawl's own tags when the request supplied none.
        crawl_tags = normalize_tag_list(crawl.tags_str.split(','))
        tags = tags or crawl_tags
    else:
        crawl = Crawl.objects.create(
            urls=data.url,
            max_depth=data.depth,  # already validated to be 0-4 above
            tags_str=','.join(tags),
            status=Crawl.StatusChoices.QUEUED,
            retry_at=timezone.now(),
            created_by=request.user,
        )

    # --- idempotent snapshot creation ----------------------------------------
    snapshot_defaults = {
        'depth': data.depth,
        'title': data.title,
        'timestamp': str(timezone.now().timestamp()),
        'status': data.status or Snapshot.StatusChoices.QUEUED,
        'retry_at': timezone.now(),
    }
    snapshot, _ = Snapshot.objects.get_or_create(
        url=data.url,
        crawl=crawl,
        defaults=snapshot_defaults,
    )

    # If the snapshot already existed, apply any explicit overrides.
    # (data.status was validated once above, so no re-check is needed here.)
    update_fields: List[str] = []
    if data.title is not None and snapshot.title != data.title:
        snapshot.title = data.title
        update_fields.append('title')
    if data.status is not None and snapshot.status != data.status:
        snapshot.status = data.status
        update_fields.append('status')
    if update_fields:
        update_fields.append('modified_at')
        snapshot.save(update_fields=update_fields)

    if tags:
        snapshot.save_tags(tags)

    # Best-effort convenience symlink; never fail the request because of it.
    try:
        snapshot.ensure_crawl_symlink()
    except Exception:
        pass

    request.with_archiveresults = False
    return snapshot
@router.patch("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="patch_snapshot")
def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
    """Update a snapshot (e.g., set status=sealed to cancel queued work)."""
    snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
    payload = data.dict(exclude_unset=True)
    update_fields = ['modified_at']
    tags = payload.pop('tags', None)

    if 'status' in payload:
        if payload['status'] not in Snapshot.StatusChoices.values:
            raise HttpError(400, f'Invalid status: {payload["status"]}')
        snapshot.status = payload['status']
        update_fields.append('status')
        # Sealing cancels any pending retry unless an explicit retry_at was sent.
        if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
            snapshot.retry_at = None
            # Must be listed in update_fields or the None would not be persisted.
            update_fields.append('retry_at')

    if 'retry_at' in payload:
        snapshot.retry_at = payload['retry_at']
        update_fields.append('retry_at')

    # Single targeted save; the stale unconditional
    # save(update_fields=['status', 'retry_at', 'modified_at']) was removed.
    snapshot.save(update_fields=update_fields)

    if tags is not None:
        snapshot.save_tags(normalize_tag_list(tags))

    request.with_archiveresults = False
    return snapshot
@router.delete("/snapshot/{snapshot_id}", response=SnapshotDeleteResponseSchema, url_name="delete_snapshot")
def delete_snapshot(request, snapshot_id: str):
    """Delete a single snapshot and report what was removed."""
    snapshot = get_snapshot(request, snapshot_id, with_archiveresults=False)
    # Capture identifiers before delete() invalidates the instance's pk.
    response = {
        'success': True,
        'snapshot_id': str(snapshot.id),
        'crawl_id': str(snapshot.crawl_id),
    }
    total_deleted, _per_model = snapshot.delete()
    response['deleted_count'] = total_deleted
    return response
### Tag #########################################################################
class TagSchema(Schema):
    """Serialized representation of a core.models.Tag."""
    TYPE: str = 'core.models.Tag'
    # NOTE(review): duplicate 'id' annotations below — at runtime the later
    # 'int' annotation shadows 'UUID'; one of the two should be removed.
    id: UUID
    id: int
    modified_at: datetime
    created_at: datetime
    created_by_id: str

View File

@@ -1,7 +1,7 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List
from typing import List, Optional
from datetime import datetime
from django.utils import timezone
@@ -33,7 +33,6 @@ class CrawlSchema(Schema):
retry_at: datetime | None
urls: str
extractor: str
max_depth: int
tags_str: str
config: dict
@@ -59,12 +58,61 @@ class CrawlSchema(Schema):
class CrawlUpdateSchema(Schema):
    """Request body for PATCH /crawl/{id}; every field is optional."""
    status: str | None = None
    retry_at: datetime | None = None
    # 'tags' (list form) takes precedence over the legacy comma-separated
    # 'tags_str' (see normalize_tag_list).
    tags: Optional[List[str]] = None
    tags_str: str | None = None
class CrawlCreateSchema(Schema):
    """Request body for POST /crawls."""
    urls: List[str]
    # max_depth is validated to be 0-4 in create_crawl
    max_depth: int = 0
    tags: Optional[List[str]] = None
    tags_str: str = ''
    label: str = ''
    notes: str = ''
    # NOTE(review): mutable default — safe only if Schema copies defaults
    # per-instance (pydantic-style); confirm the base class does.
    config: dict = {}
class CrawlDeleteResponseSchema(Schema):
    """Response body for DELETE /crawl/{id}."""
    success: bool
    crawl_id: str
    # total number of DB rows removed, as returned by Model.delete()
    deleted_count: int
    # snapshots counted under the crawl immediately before deletion
    deleted_snapshots: int
def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]:
    """Return cleaned tag names; an explicit `tags` list wins over the
    legacy comma-separated `tags_str` form."""
    source = tags if tags is not None else tags_str.split(',')
    cleaned: List[str] = []
    for entry in source:
        if entry and entry.strip():
            cleaned.append(entry.strip())
    return cleaned
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
    """List every crawl, de-duplicated."""
    queryset = Crawl.objects.all()
    return queryset.distinct()
@router.post("/crawls", response=CrawlSchema, url_name="create_crawl")
def create_crawl(request, data: CrawlCreateSchema):
    """Create a new queued Crawl from a list of URLs and kick off its snapshots."""
    # Drop blank/whitespace-only entries before validating.
    cleaned_urls = [entry.strip() for entry in data.urls if entry and entry.strip()]
    if not cleaned_urls:
        raise HttpError(400, 'At least one URL is required')
    if data.max_depth not in (0, 1, 2, 3, 4):
        raise HttpError(400, 'max_depth must be between 0 and 4')

    tag_names = normalize_tag_list(data.tags, data.tags_str)
    new_crawl = Crawl.objects.create(
        urls='\n'.join(cleaned_urls),
        max_depth=data.max_depth,
        tags_str=','.join(tag_names),
        label=data.label,
        notes=data.notes,
        config=data.config,
        status=Crawl.StatusChoices.QUEUED,
        retry_at=timezone.now(),
        created_by=request.user,
    )
    # Queue the crawl's snapshots for its URLs before returning it.
    new_crawl.create_snapshots_from_urls()
    return new_crawl
@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
"""Get a specific Crawl by id."""
@@ -92,6 +140,13 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
"""Update a crawl (e.g., set status=sealed to cancel queued work)."""
crawl = Crawl.objects.get(id__icontains=crawl_id)
payload = data.dict(exclude_unset=True)
update_fields = ['modified_at']
tags = payload.pop('tags', None)
tags_str = payload.pop('tags_str', None)
if tags is not None or tags_str is not None:
crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or ''))
update_fields.append('tags_str')
if 'status' in payload:
if payload['status'] not in Crawl.StatusChoices.values:
@@ -99,11 +154,13 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
crawl.status = payload['status']
if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
crawl.retry_at = None
update_fields.append('status')
if 'retry_at' in payload:
crawl.retry_at = payload['retry_at']
update_fields.append('retry_at')
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl.save(update_fields=update_fields)
if payload.get('status') == Crawl.StatusChoices.SEALED:
Snapshot.objects.filter(
@@ -115,3 +172,17 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
modified_at=timezone.now(),
)
return crawl
@router.delete("/crawl/{crawl_id}", response=CrawlDeleteResponseSchema, url_name="delete_crawl")
def delete_crawl(request, crawl_id: str):
    """Delete a crawl and report how many rows and snapshots were removed."""
    crawl = Crawl.objects.get(id__icontains=crawl_id)
    # Record the id and snapshot count before delete() wipes the rows.
    summary = {
        'success': True,
        'crawl_id': str(crawl.id),
        'deleted_snapshots': crawl.snapshot_set.count(),
    }
    total_deleted, _per_model = crawl.delete()
    summary['deleted_count'] = total_deleted
    return summary

View File

@@ -1,107 +0,0 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List, Any
from datetime import datetime
from ninja import Router, Schema
router = Router(tags=['Workers and Tasks'])
class QueueItemSchema(Schema):
    """Schema for a single item in a worker's queue."""
    # 'app_label.model_name' of the underlying Django model (see resolve_TYPE)
    TYPE: str
    id: UUID
    status: str
    retry_at: datetime | None
    created_at: datetime
    modified_at: datetime
    # human-readable summary of the item (see resolve_description)
    description: str
    @staticmethod
    def resolve_TYPE(obj) -> str:
        # Derive the dotted type name from the model's _meta options.
        return f'{obj._meta.app_label}.{obj._meta.model_name}'
    @staticmethod
    def resolve_description(obj) -> str:
        # Delegates to the model instance's __str__.
        return str(obj)
class WorkerSchema(Schema):
    """Schema for a Worker type."""
    name: str
    # 'app_label.model_name' of the model this worker processes
    model: str
    max_tick_time: int
    max_concurrent_tasks: int
    running_count: int
    running_workers: List[dict[str, Any]]
    @staticmethod
    def resolve_model(obj) -> str:
        # Derive the dotted model name from the worker's model class.
        Model = obj.get_model()
        return f'{Model._meta.app_label}.{Model._meta.model_name}'
    @staticmethod
    def resolve_max_tick_time(obj) -> int:
        # Mirrors the worker class constant.
        return obj.MAX_TICK_TIME
    @staticmethod
    def resolve_max_concurrent_tasks(obj) -> int:
        # Mirrors the worker class constant.
        return obj.MAX_CONCURRENT_TASKS
    @staticmethod
    def resolve_running_count(obj) -> int:
        return obj.get_worker_count()
    @staticmethod
    def resolve_running_workers(obj) -> List[dict[str, Any]]:
        return obj.get_running_workers()
class OrchestratorSchema(Schema):
    """Schema for the Orchestrator."""
    is_running: bool
    # seconds between orchestrator polls (see Orchestrator.POLL_INTERVAL)
    poll_interval: float
    idle_timeout: int
    max_crawl_workers: int
    total_worker_count: int
    # one entry per worker type, serialized via WorkerSchema
    workers: List[WorkerSchema]
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
def get_orchestrator(request):
    """Get the orchestrator status and all worker queues."""
    from archivebox.workers.orchestrator import Orchestrator
    from archivebox.workers.worker import CrawlWorker
    orchestrator = Orchestrator()
    # Temporary worker instances (worker_id=-1) exist only to query their queues.
    worker_instances = [CrawlWorker(worker_id=-1)]
    status = {
        'is_running': orchestrator.is_running(),
        'poll_interval': orchestrator.POLL_INTERVAL,
        'idle_timeout': orchestrator.IDLE_TIMEOUT,
        'max_crawl_workers': orchestrator.MAX_CRAWL_WORKERS,
        'total_worker_count': orchestrator.get_total_worker_count(),
    }
    status['workers'] = worker_instances
    return status
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
def get_workers(request):
    """List all worker types and their current status."""
    from archivebox.workers.worker import CrawlWorker
    # One throwaway instance per worker type, used purely for queue inspection.
    worker_types = (CrawlWorker,)
    return [worker_cls(worker_id=-1) for worker_cls in worker_types]
# Progress endpoint moved to core.views.live_progress_view for simplicity