bump versions and fix docs

This commit is contained in:
Nick Sweeting
2026-03-15 17:43:07 -07:00
parent e598614b05
commit 7d42c6c8b5
15 changed files with 245 additions and 349 deletions

View File

@@ -37,11 +37,10 @@ html_description=f'''
def register_urls(api: NinjaAPI) -> NinjaAPI:
    """Attach every versioned API router to the given NinjaAPI instance.

    Router targets are dotted-path strings that ninja resolves lazily.
    Returns the same instance so calls can be chained during app setup.
    """
    api.add_router('/auth/', 'archivebox.api.v1_auth.router')
    api.add_router('/core/', 'archivebox.api.v1_core.router')
    api.add_router('/crawls/', 'archivebox.api.v1_crawls.router')
    api.add_router('/cli/', 'archivebox.api.v1_cli.router')
    api.add_router('/workers/', 'archivebox.api.v1_workers.router')
    api.add_router('/machine/', 'archivebox.api.v1_machine.router')
    return api

View File

@@ -30,7 +30,13 @@ def get_api_token(request, auth_data: PasswordAuthSchema):
if user and user.is_superuser:
api_token = get_or_create_api_token(user)
assert api_token is not None, "Failed to create API token"
return api_token.__json__()
return {
"success": True,
"user_id": str(user.pk),
"username": user.username,
"token": api_token.token,
"expires": api_token.expires.isoformat() if api_token.expires else None,
}
return {"success": False, "errors": ["Invalid credentials"]}

View File

@@ -121,10 +121,19 @@ def cli_add(request, args: AddCommandSchema):
created_by_id=request.user.pk,
)
snapshot_ids = [str(snapshot_id) for snapshot_id in result.values_list('id', flat=True)]
result_payload = {
"crawl_id": getattr(result, "crawl_id", None),
"num_snapshots": len(snapshot_ids),
"snapshot_ids": snapshot_ids,
"queued_urls": args.urls,
}
return {
"success": True,
"errors": [],
"result": result,
"result": result_payload,
"result_format": "json",
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}

View File

@@ -9,12 +9,14 @@ from django.db.models import Q
from django.core.exceptions import ValidationError
from django.contrib.auth import get_user_model
from django.shortcuts import redirect
from django.utils import timezone
from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate, PaginationBase
from ninja.errors import HttpError
from archivebox.core.models import Snapshot, ArchiveResult, Tag
from archivebox.crawls.models import Crawl
from archivebox.api.v1_crawls import CrawlSchema
@@ -191,6 +193,27 @@ class SnapshotSchema(Schema):
class SnapshotUpdateSchema(Schema):
    """Request body for PATCH /snapshot/{id}; every field is optional so
    clients can send only the attributes they want to change."""
    status: str | None = None
    retry_at: datetime | None = None
    tags: Optional[List[str]] = None
class SnapshotCreateSchema(Schema):
    """Request body for POST /snapshots.

    Only `url` is required; when `crawl_id` is omitted a new single-URL
    Crawl is created to own the snapshot (see create_snapshot)."""
    url: str
    crawl_id: Optional[str] = None
    # depth is validated to be 0-4 in create_snapshot
    depth: int = 0
    title: Optional[str] = None
    tags: Optional[List[str]] = None
    status: Optional[str] = None
class SnapshotDeleteResponseSchema(Schema):
    """Response body for DELETE /snapshot/{id}."""
    success: bool
    snapshot_id: str
    crawl_id: str
    # total number of DB rows removed, as returned by Model.delete()
    deleted_count: int
def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]:
    """Return the given tags with surrounding whitespace stripped,
    dropping entries that are empty, None, or whitespace-only."""
    cleaned: List[str] = []
    for raw_tag in tags or []:
        if not raw_tag:
            continue
        stripped = raw_tag.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned
class SnapshotFilterSchema(FilterSchema):
@@ -230,6 +253,68 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool = True):
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
def create_snapshot(request, data: SnapshotCreateSchema):
    """Create (or fetch an existing) Snapshot for a URL.

    When no crawl_id is supplied, a new single-URL Crawl is created to own
    the snapshot.  Raises HttpError(400) on an invalid status, a blank URL,
    or an out-of-range depth.
    """
    tags = normalize_tag_list(data.tags)

    # --- request validation --------------------------------------------------
    if data.status is not None and data.status not in Snapshot.StatusChoices.values:
        raise HttpError(400, f'Invalid status: {data.status}')
    if not data.url.strip():
        raise HttpError(400, 'URL is required')
    if data.depth not in (0, 1, 2, 3, 4):
        raise HttpError(400, 'depth must be between 0 and 4')

    # --- find or create the owning crawl -------------------------------------
    if data.crawl_id:
        crawl = Crawl.objects.get(id__icontains=data.crawl_id)
        # Fall back to the crawl's own tags when the request supplied none.
        crawl_tags = normalize_tag_list(crawl.tags_str.split(','))
        tags = tags or crawl_tags
    else:
        crawl = Crawl.objects.create(
            urls=data.url,
            max_depth=data.depth,  # already validated to be 0-4 above
            tags_str=','.join(tags),
            status=Crawl.StatusChoices.QUEUED,
            retry_at=timezone.now(),
            created_by=request.user,
        )

    # --- idempotent snapshot creation ----------------------------------------
    snapshot_defaults = {
        'depth': data.depth,
        'title': data.title,
        'timestamp': str(timezone.now().timestamp()),
        'status': data.status or Snapshot.StatusChoices.QUEUED,
        'retry_at': timezone.now(),
    }
    snapshot, _ = Snapshot.objects.get_or_create(
        url=data.url,
        crawl=crawl,
        defaults=snapshot_defaults,
    )

    # If the snapshot already existed, apply any explicit overrides.
    # (data.status was validated once above, so no re-check is needed here.)
    update_fields: List[str] = []
    if data.title is not None and snapshot.title != data.title:
        snapshot.title = data.title
        update_fields.append('title')
    if data.status is not None and snapshot.status != data.status:
        snapshot.status = data.status
        update_fields.append('status')
    if update_fields:
        update_fields.append('modified_at')
        snapshot.save(update_fields=update_fields)

    if tags:
        snapshot.save_tags(tags)

    # Best-effort convenience symlink; never fail the request because of it.
    try:
        snapshot.ensure_crawl_symlink()
    except Exception:
        pass

    request.with_archiveresults = False
    return snapshot
@router.patch("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="patch_snapshot")
def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
    """Update a snapshot (e.g., set status=sealed to cancel queued work)."""
    snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
    payload = data.dict(exclude_unset=True)
    update_fields = ['modified_at']
    tags = payload.pop('tags', None)

    if 'status' in payload:
        if payload['status'] not in Snapshot.StatusChoices.values:
            raise HttpError(400, f'Invalid status: {payload["status"]}')
        snapshot.status = payload['status']
        update_fields.append('status')
        # Sealing cancels any pending retry unless an explicit retry_at was sent.
        if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
            snapshot.retry_at = None
            # Must be listed in update_fields or the None would not be persisted.
            update_fields.append('retry_at')

    if 'retry_at' in payload:
        snapshot.retry_at = payload['retry_at']
        update_fields.append('retry_at')

    # Single targeted save; the stale unconditional
    # save(update_fields=['status', 'retry_at', 'modified_at']) was removed.
    snapshot.save(update_fields=update_fields)

    if tags is not None:
        snapshot.save_tags(normalize_tag_list(tags))

    request.with_archiveresults = False
    return snapshot
@router.delete("/snapshot/{snapshot_id}", response=SnapshotDeleteResponseSchema, url_name="delete_snapshot")
def delete_snapshot(request, snapshot_id: str):
    """Delete a single snapshot and report what was removed."""
    snapshot = get_snapshot(request, snapshot_id, with_archiveresults=False)
    # Capture identifiers before delete() invalidates the instance's pk.
    response = {
        'success': True,
        'snapshot_id': str(snapshot.id),
        'crawl_id': str(snapshot.crawl_id),
    }
    total_deleted, _per_model = snapshot.delete()
    response['deleted_count'] = total_deleted
    return response
### Tag #########################################################################
class TagSchema(Schema):
    """Serialized representation of a core.models.Tag."""
    TYPE: str = 'core.models.Tag'
    # NOTE(review): duplicate 'id' annotations below — at runtime the later
    # 'int' annotation shadows 'UUID'; one of the two should be removed.
    id: UUID
    id: int
    modified_at: datetime
    created_at: datetime
    created_by_id: str

View File

@@ -1,7 +1,7 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List
from typing import List, Optional
from datetime import datetime
from django.utils import timezone
@@ -33,7 +33,6 @@ class CrawlSchema(Schema):
retry_at: datetime | None
urls: str
extractor: str
max_depth: int
tags_str: str
config: dict
@@ -59,12 +58,61 @@ class CrawlSchema(Schema):
class CrawlUpdateSchema(Schema):
    """Request body for PATCH /crawl/{id}; every field is optional."""
    status: str | None = None
    retry_at: datetime | None = None
    # 'tags' (list form) takes precedence over the legacy comma-separated
    # 'tags_str' (see normalize_tag_list).
    tags: Optional[List[str]] = None
    tags_str: str | None = None
class CrawlCreateSchema(Schema):
    """Request body for POST /crawls."""
    urls: List[str]
    # max_depth is validated to be 0-4 in create_crawl
    max_depth: int = 0
    tags: Optional[List[str]] = None
    tags_str: str = ''
    label: str = ''
    notes: str = ''
    # NOTE(review): mutable default — safe only if Schema copies defaults
    # per-instance (pydantic-style); confirm the base class does.
    config: dict = {}
class CrawlDeleteResponseSchema(Schema):
    """Response body for DELETE /crawl/{id}."""
    success: bool
    crawl_id: str
    # total number of DB rows removed, as returned by Model.delete()
    deleted_count: int
    # snapshots counted under the crawl immediately before deletion
    deleted_snapshots: int
def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]:
    """Return cleaned tag names; an explicit `tags` list wins over the
    legacy comma-separated `tags_str` form."""
    source = tags if tags is not None else tags_str.split(',')
    cleaned: List[str] = []
    for entry in source:
        if entry and entry.strip():
            cleaned.append(entry.strip())
    return cleaned
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
    """List every crawl, de-duplicated."""
    queryset = Crawl.objects.all()
    return queryset.distinct()
@router.post("/crawls", response=CrawlSchema, url_name="create_crawl")
def create_crawl(request, data: CrawlCreateSchema):
    """Create a new queued Crawl from a list of URLs and kick off its snapshots."""
    # Drop blank/whitespace-only entries before validating.
    cleaned_urls = [entry.strip() for entry in data.urls if entry and entry.strip()]
    if not cleaned_urls:
        raise HttpError(400, 'At least one URL is required')
    if data.max_depth not in (0, 1, 2, 3, 4):
        raise HttpError(400, 'max_depth must be between 0 and 4')

    tag_names = normalize_tag_list(data.tags, data.tags_str)
    new_crawl = Crawl.objects.create(
        urls='\n'.join(cleaned_urls),
        max_depth=data.max_depth,
        tags_str=','.join(tag_names),
        label=data.label,
        notes=data.notes,
        config=data.config,
        status=Crawl.StatusChoices.QUEUED,
        retry_at=timezone.now(),
        created_by=request.user,
    )
    # Queue the crawl's snapshots for its URLs before returning it.
    new_crawl.create_snapshots_from_urls()
    return new_crawl
@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
"""Get a specific Crawl by id."""
@@ -92,6 +140,13 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
"""Update a crawl (e.g., set status=sealed to cancel queued work)."""
crawl = Crawl.objects.get(id__icontains=crawl_id)
payload = data.dict(exclude_unset=True)
update_fields = ['modified_at']
tags = payload.pop('tags', None)
tags_str = payload.pop('tags_str', None)
if tags is not None or tags_str is not None:
crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or ''))
update_fields.append('tags_str')
if 'status' in payload:
if payload['status'] not in Crawl.StatusChoices.values:
@@ -99,11 +154,13 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
crawl.status = payload['status']
if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
crawl.retry_at = None
update_fields.append('status')
if 'retry_at' in payload:
crawl.retry_at = payload['retry_at']
update_fields.append('retry_at')
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl.save(update_fields=update_fields)
if payload.get('status') == Crawl.StatusChoices.SEALED:
Snapshot.objects.filter(
@@ -115,3 +172,17 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
modified_at=timezone.now(),
)
return crawl
@router.delete("/crawl/{crawl_id}", response=CrawlDeleteResponseSchema, url_name="delete_crawl")
def delete_crawl(request, crawl_id: str):
    """Delete a crawl and report how many rows and snapshots were removed."""
    crawl = Crawl.objects.get(id__icontains=crawl_id)
    # Record the id and snapshot count before delete() wipes the rows.
    summary = {
        'success': True,
        'crawl_id': str(crawl.id),
        'deleted_snapshots': crawl.snapshot_set.count(),
    }
    total_deleted, _per_model = crawl.delete()
    summary['deleted_count'] = total_deleted
    return summary

View File

@@ -1,107 +0,0 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List, Any
from datetime import datetime
from ninja import Router, Schema
router = Router(tags=['Workers and Tasks'])
class QueueItemSchema(Schema):
    """Schema for a single item in a worker's queue."""
    # 'app_label.model_name' of the underlying Django model (see resolve_TYPE)
    TYPE: str
    id: UUID
    status: str
    retry_at: datetime | None
    created_at: datetime
    modified_at: datetime
    # human-readable summary of the item (see resolve_description)
    description: str
    @staticmethod
    def resolve_TYPE(obj) -> str:
        # Derive the dotted type name from the model's _meta options.
        return f'{obj._meta.app_label}.{obj._meta.model_name}'
    @staticmethod
    def resolve_description(obj) -> str:
        # Delegates to the model instance's __str__.
        return str(obj)
class WorkerSchema(Schema):
    """Schema for a Worker type."""
    name: str
    # 'app_label.model_name' of the model this worker processes
    model: str
    max_tick_time: int
    max_concurrent_tasks: int
    running_count: int
    running_workers: List[dict[str, Any]]
    @staticmethod
    def resolve_model(obj) -> str:
        # Derive the dotted model name from the worker's model class.
        Model = obj.get_model()
        return f'{Model._meta.app_label}.{Model._meta.model_name}'
    @staticmethod
    def resolve_max_tick_time(obj) -> int:
        # Mirrors the worker class constant.
        return obj.MAX_TICK_TIME
    @staticmethod
    def resolve_max_concurrent_tasks(obj) -> int:
        # Mirrors the worker class constant.
        return obj.MAX_CONCURRENT_TASKS
    @staticmethod
    def resolve_running_count(obj) -> int:
        return obj.get_worker_count()
    @staticmethod
    def resolve_running_workers(obj) -> List[dict[str, Any]]:
        return obj.get_running_workers()
class OrchestratorSchema(Schema):
    """Schema for the Orchestrator."""
    is_running: bool
    # seconds between orchestrator polls (see Orchestrator.POLL_INTERVAL)
    poll_interval: float
    idle_timeout: int
    max_crawl_workers: int
    total_worker_count: int
    # one entry per worker type, serialized via WorkerSchema
    workers: List[WorkerSchema]
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
def get_orchestrator(request):
    """Get the orchestrator status and all worker queues."""
    from archivebox.workers.orchestrator import Orchestrator
    from archivebox.workers.worker import CrawlWorker
    orchestrator = Orchestrator()
    # Temporary worker instances (worker_id=-1) exist only to query their queues.
    worker_instances = [CrawlWorker(worker_id=-1)]
    status = {
        'is_running': orchestrator.is_running(),
        'poll_interval': orchestrator.POLL_INTERVAL,
        'idle_timeout': orchestrator.IDLE_TIMEOUT,
        'max_crawl_workers': orchestrator.MAX_CRAWL_WORKERS,
        'total_worker_count': orchestrator.get_total_worker_count(),
    }
    status['workers'] = worker_instances
    return status
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
def get_workers(request):
    """List all worker types and their current status."""
    from archivebox.workers.worker import CrawlWorker
    # One throwaway instance per worker type, used purely for queue inspection.
    worker_types = (CrawlWorker,)
    return [worker_cls(worker_id=-1) for worker_cls in worker_types]
# Progress endpoint moved to core.views.live_progress_view for simplicity