mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
bump versions and fix docs
This commit is contained in:
@@ -37,11 +37,10 @@ html_description=f'''
|
||||
|
||||
|
||||
def register_urls(api: NinjaAPI) -> NinjaAPI:
    """Mount each v1 sub-router on ``api`` and return that same ``api`` object.

    Routers are passed to ``add_router`` as dotted-path strings, in the order
    listed below.
    """
    # (URL prefix, dotted path to the router object), in registration order.
    mounts = (
        ('/auth/', 'archivebox.api.v1_auth.router'),
        ('/core/', 'archivebox.api.v1_core.router'),
        ('/crawls/', 'archivebox.api.v1_crawls.router'),
        ('/cli/', 'archivebox.api.v1_cli.router'),
        ('/workers/', 'archivebox.api.v1_workers.router'),
        ('/machine/', 'archivebox.api.v1_machine.router'),
    )
    for prefix, router_path in mounts:
        api.add_router(prefix, router_path)
    return api
|
||||
|
||||
|
||||
@@ -30,7 +30,13 @@ def get_api_token(request, auth_data: PasswordAuthSchema):
|
||||
if user and user.is_superuser:
|
||||
api_token = get_or_create_api_token(user)
|
||||
assert api_token is not None, "Failed to create API token"
|
||||
return api_token.__json__()
|
||||
return {
|
||||
"success": True,
|
||||
"user_id": str(user.pk),
|
||||
"username": user.username,
|
||||
"token": api_token.token,
|
||||
"expires": api_token.expires.isoformat() if api_token.expires else None,
|
||||
}
|
||||
|
||||
return {"success": False, "errors": ["Invalid credentials"]}
|
||||
|
||||
|
||||
@@ -121,10 +121,19 @@ def cli_add(request, args: AddCommandSchema):
|
||||
created_by_id=request.user.pk,
|
||||
)
|
||||
|
||||
snapshot_ids = [str(snapshot_id) for snapshot_id in result.values_list('id', flat=True)]
|
||||
result_payload = {
|
||||
"crawl_id": getattr(result, "crawl_id", None),
|
||||
"num_snapshots": len(snapshot_ids),
|
||||
"snapshot_ids": snapshot_ids,
|
||||
"queued_urls": args.urls,
|
||||
}
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"errors": [],
|
||||
"result": result,
|
||||
"result": result_payload,
|
||||
"result_format": "json",
|
||||
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
|
||||
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
|
||||
}
|
||||
|
||||
@@ -9,12 +9,14 @@ from django.db.models import Q
|
||||
from django.core.exceptions import ValidationError
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.shortcuts import redirect
|
||||
from django.utils import timezone
|
||||
|
||||
from ninja import Router, Schema, FilterSchema, Field, Query
|
||||
from ninja.pagination import paginate, PaginationBase
|
||||
from ninja.errors import HttpError
|
||||
|
||||
from archivebox.core.models import Snapshot, ArchiveResult, Tag
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.api.v1_crawls import CrawlSchema
|
||||
|
||||
|
||||
@@ -191,6 +193,27 @@ class SnapshotSchema(Schema):
|
||||
class SnapshotUpdateSchema(Schema):
    # Body of a snapshot PATCH request. Every field is optional and defaults
    # to None so a client can send only the fields it wants to change.
    status: Optional[str] = None
    retry_at: Optional[datetime] = None
    tags: Optional[List[str]] = None
|
||||
|
||||
|
||||
class SnapshotCreateSchema(Schema):
    # Body of a snapshot creation request. Only ``url`` is required.
    url: str
    crawl_id: str | None = None   # attach to this crawl when given
    depth: int = 0
    title: str | None = None
    tags: List[str] | None = None
    status: str | None = None
|
||||
|
||||
|
||||
class SnapshotDeleteResponseSchema(Schema):
    # Response body returned after a snapshot has been deleted.
    success: bool
    snapshot_id: str      # id of the snapshot that was removed
    crawl_id: str         # id of the crawl the snapshot belonged to
    deleted_count: int    # first element of the tuple returned by .delete()
|
||||
|
||||
|
||||
def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]:
    """Return ``tags`` with surrounding whitespace trimmed and blanks dropped.

    ``None`` or an empty list yields ``[]``. Order and duplicates are preserved.
    """
    cleaned: List[str] = []
    for raw in tags or []:
        if not raw:
            continue
        name = raw.strip()
        if name:
            cleaned.append(name)
    return cleaned
|
||||
|
||||
|
||||
class SnapshotFilterSchema(FilterSchema):
|
||||
@@ -230,6 +253,68 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool = True):
|
||||
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
|
||||
|
||||
|
||||
@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
def create_snapshot(request, data: SnapshotCreateSchema):
    """Create a Snapshot for ``data.url``, or return the one that already exists.

    If ``data.crawl_id`` is given, the snapshot is attached to the matching
    Crawl and inherits that crawl's tags when the request supplies none.
    Otherwise a new queued Crawl is created for this single URL.

    An already-existing (url, crawl) snapshot is reused; its title and status
    are updated when the request provides different values.

    Raises:
        HttpError(400): unknown status, blank URL, or depth outside 0-4.
    """
    import logging  # local import: only used by the best-effort symlink step

    tags = normalize_tag_list(data.tags)
    if data.status is not None and data.status not in Snapshot.StatusChoices.values:
        raise HttpError(400, f'Invalid status: {data.status}')
    # Use the stripped URL everywhere, not just for the blank check, so that
    # " https://x " and "https://x" resolve to the same snapshot.
    url = data.url.strip()
    if not url:
        raise HttpError(400, 'URL is required')
    if data.depth not in (0, 1, 2, 3, 4):
        raise HttpError(400, 'depth must be between 0 and 4')

    # One instant for every timestamp written by this request.
    now = timezone.now()

    if data.crawl_id:
        crawl = Crawl.objects.get(id__icontains=data.crawl_id)
        crawl_tags = normalize_tag_list(crawl.tags_str.split(','))
        tags = tags or crawl_tags
    else:
        crawl = Crawl.objects.create(
            urls=url,
            max_depth=data.depth,  # already validated to be within 0-4
            tags_str=','.join(tags),
            status=Crawl.StatusChoices.QUEUED,
            retry_at=now,
            created_by=request.user,
        )

    snapshot, _ = Snapshot.objects.get_or_create(
        url=url,
        crawl=crawl,
        defaults={
            'depth': data.depth,
            'title': data.title,
            'timestamp': str(now.timestamp()),
            'status': data.status or Snapshot.StatusChoices.QUEUED,
            'retry_at': now,
        },
    )

    # Only relevant when the snapshot already existed: bring title/status in
    # line with the request. ``data.status`` was validated above.
    update_fields: List[str] = []
    if data.title is not None and snapshot.title != data.title:
        snapshot.title = data.title
        update_fields.append('title')
    if data.status is not None and snapshot.status != data.status:
        snapshot.status = data.status
        update_fields.append('status')
    if update_fields:
        update_fields.append('modified_at')
        snapshot.save(update_fields=update_fields)

    if tags:
        snapshot.save_tags(tags)

    try:
        snapshot.ensure_crawl_symlink()
    except Exception:
        # Best-effort step: never fail the request over it, but leave a trace.
        logging.getLogger(__name__).warning(
            'ensure_crawl_symlink() failed for snapshot %s', snapshot.id, exc_info=True,
        )

    request.with_archiveresults = False
    return snapshot
|
||||
|
||||
|
||||
@router.patch("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="patch_snapshot")
|
||||
def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
|
||||
"""Update a snapshot (e.g., set status=sealed to cancel queued work)."""
|
||||
@@ -239,6 +324,8 @@ def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
|
||||
snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
|
||||
|
||||
payload = data.dict(exclude_unset=True)
|
||||
update_fields = ['modified_at']
|
||||
tags = payload.pop('tags', None)
|
||||
|
||||
if 'status' in payload:
|
||||
if payload['status'] not in Snapshot.StatusChoices.values:
|
||||
@@ -246,20 +333,39 @@ def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
|
||||
snapshot.status = payload['status']
|
||||
if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
|
||||
snapshot.retry_at = None
|
||||
update_fields.append('status')
|
||||
|
||||
if 'retry_at' in payload:
|
||||
snapshot.retry_at = payload['retry_at']
|
||||
update_fields.append('retry_at')
|
||||
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
if tags is not None:
|
||||
snapshot.save_tags(normalize_tag_list(tags))
|
||||
|
||||
snapshot.save(update_fields=update_fields)
|
||||
request.with_archiveresults = False
|
||||
return snapshot
|
||||
|
||||
|
||||
@router.delete("/snapshot/{snapshot_id}", response=SnapshotDeleteResponseSchema, url_name="delete_snapshot")
def delete_snapshot(request, snapshot_id: str):
    """Delete the snapshot found via ``get_snapshot`` and report what was removed."""
    target = get_snapshot(request, snapshot_id, with_archiveresults=False)

    # Read the identifiers before the row is deleted.
    summary = {
        'success': True,
        'snapshot_id': str(target.id),
        'crawl_id': str(target.crawl_id),
    }

    removed_total, _per_model = target.delete()
    summary['deleted_count'] = removed_total
    return summary
|
||||
|
||||
|
||||
### Tag #########################################################################
|
||||
|
||||
class TagSchema(Schema):
|
||||
TYPE: str = 'core.models.Tag'
|
||||
id: UUID
|
||||
id: int
|
||||
modified_at: datetime
|
||||
created_at: datetime
|
||||
created_by_id: str
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
__package__ = 'archivebox.api'
|
||||
|
||||
from uuid import UUID
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from django.utils import timezone
|
||||
|
||||
@@ -33,7 +33,6 @@ class CrawlSchema(Schema):
|
||||
retry_at: datetime | None
|
||||
|
||||
urls: str
|
||||
extractor: str
|
||||
max_depth: int
|
||||
tags_str: str
|
||||
config: dict
|
||||
@@ -59,12 +58,61 @@ class CrawlSchema(Schema):
|
||||
class CrawlUpdateSchema(Schema):
    # Body of a crawl PATCH request. Every field is optional and defaults to
    # None so a client can send only the fields it wants to change.
    status: Optional[str] = None
    retry_at: Optional[datetime] = None
    tags: Optional[List[str]] = None    # tags as a list ...
    tags_str: Optional[str] = None      # ... or as one comma-separated string
|
||||
|
||||
|
||||
class CrawlCreateSchema(Schema):
    # Body of a crawl creation request. Only ``urls`` is required.
    urls: List[str]
    max_depth: int = 0
    tags: List[str] | None = None   # tags as a list ...
    tags_str: str = ''              # ... or as one comma-separated string
    label: str = ''
    notes: str = ''
    config: dict = {}
|
||||
|
||||
|
||||
class CrawlDeleteResponseSchema(Schema):
    # Response body returned after a crawl has been deleted.
    success: bool
    crawl_id: str             # id of the crawl that was removed
    deleted_count: int        # first element of the tuple returned by .delete()
    deleted_snapshots: int    # snapshot count taken just before deletion
|
||||
|
||||
|
||||
def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]:
    """Return a cleaned tag list taken from ``tags`` or, failing that, ``tags_str``.

    ``tags`` takes precedence whenever it is not ``None`` (even if empty);
    otherwise ``tags_str`` is split on commas. Entries are stripped of
    surrounding whitespace and blank entries are dropped.
    """
    candidates = tags if tags is not None else tags_str.split(',')
    cleaned: List[str] = []
    for raw in candidates:
        name = raw.strip() if raw else ''
        if name:
            cleaned.append(name)
    return cleaned
|
||||
|
||||
|
||||
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
    """Return all Crawl rows with ``distinct()`` applied to the queryset."""
    every_crawl = Crawl.objects.all()
    return every_crawl.distinct()
|
||||
|
||||
|
||||
@router.post("/crawls", response=CrawlSchema, url_name="create_crawl")
def create_crawl(request, data: CrawlCreateSchema):
    """Create a queued Crawl from the submitted URLs and create its snapshots.

    Raises:
        HttpError(400): no non-blank URL was given, or ``max_depth`` is not
            one of 0, 1, 2, 3, 4.
    """
    cleaned_urls = []
    for candidate in data.urls:
        if candidate and candidate.strip():
            cleaned_urls.append(candidate.strip())

    if not cleaned_urls:
        raise HttpError(400, 'At least one URL is required')
    if data.max_depth not in range(5):
        raise HttpError(400, 'max_depth must be between 0 and 4')

    tag_names = normalize_tag_list(data.tags, data.tags_str)

    # URLs are stored one per line, tags as a single comma-joined string.
    crawl_fields = {
        'urls': '\n'.join(cleaned_urls),
        'max_depth': data.max_depth,
        'tags_str': ','.join(tag_names),
        'label': data.label,
        'notes': data.notes,
        'config': data.config,
        'status': Crawl.StatusChoices.QUEUED,
        'retry_at': timezone.now(),
        'created_by': request.user,
    }
    new_crawl = Crawl.objects.create(**crawl_fields)
    new_crawl.create_snapshots_from_urls()
    return new_crawl
|
||||
|
||||
@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
|
||||
def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
|
||||
"""Get a specific Crawl by id."""
|
||||
@@ -92,6 +140,13 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
|
||||
"""Update a crawl (e.g., set status=sealed to cancel queued work)."""
|
||||
crawl = Crawl.objects.get(id__icontains=crawl_id)
|
||||
payload = data.dict(exclude_unset=True)
|
||||
update_fields = ['modified_at']
|
||||
|
||||
tags = payload.pop('tags', None)
|
||||
tags_str = payload.pop('tags_str', None)
|
||||
if tags is not None or tags_str is not None:
|
||||
crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or ''))
|
||||
update_fields.append('tags_str')
|
||||
|
||||
if 'status' in payload:
|
||||
if payload['status'] not in Crawl.StatusChoices.values:
|
||||
@@ -99,11 +154,13 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
|
||||
crawl.status = payload['status']
|
||||
if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
|
||||
crawl.retry_at = None
|
||||
update_fields.append('status')
|
||||
|
||||
if 'retry_at' in payload:
|
||||
crawl.retry_at = payload['retry_at']
|
||||
update_fields.append('retry_at')
|
||||
|
||||
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
crawl.save(update_fields=update_fields)
|
||||
|
||||
if payload.get('status') == Crawl.StatusChoices.SEALED:
|
||||
Snapshot.objects.filter(
|
||||
@@ -115,3 +172,17 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
return crawl
|
||||
|
||||
|
||||
@router.delete("/crawl/{crawl_id}", response=CrawlDeleteResponseSchema, url_name="delete_crawl")
def delete_crawl(request, crawl_id: str):
    """Delete the crawl whose id contains ``crawl_id`` and report what was removed."""
    target = Crawl.objects.get(id__icontains=crawl_id)

    # Both values must be read before the row is deleted.
    summary = {
        'success': True,
        'crawl_id': str(target.id),
    }
    snapshots_before = target.snapshot_set.count()

    removed_total, _per_model = target.delete()
    summary['deleted_count'] = removed_total
    summary['deleted_snapshots'] = snapshots_before
    return summary
|
||||
|
||||
@@ -1,107 +0,0 @@
|
||||
__package__ = 'archivebox.api'
|
||||
|
||||
from uuid import UUID
|
||||
from typing import List, Any
|
||||
from datetime import datetime
|
||||
|
||||
from ninja import Router, Schema
|
||||
|
||||
|
||||
router = Router(tags=['Workers and Tasks'])
|
||||
|
||||
|
||||
class QueueItemSchema(Schema):
    """Schema for a single item in a worker's queue."""
    TYPE: str
    id: UUID
    status: str
    retry_at: datetime | None
    created_at: datetime
    modified_at: datetime
    description: str

    @staticmethod
    def resolve_TYPE(obj) -> str:
        # "<app_label>.<model_name>", read from the object's model metadata.
        meta = obj._meta
        return f'{meta.app_label}.{meta.model_name}'

    @staticmethod
    def resolve_description(obj) -> str:
        # The description is simply the object's str() form.
        text = str(obj)
        return text
|
||||
|
||||
|
||||
class WorkerSchema(Schema):
    """Schema for a Worker type."""
    name: str
    model: str
    max_tick_time: int
    max_concurrent_tasks: int
    running_count: int
    running_workers: List[dict[str, Any]]

    @staticmethod
    def resolve_model(obj) -> str:
        # "<app_label>.<model_name>" of the model returned by obj.get_model().
        meta = obj.get_model()._meta
        return f'{meta.app_label}.{meta.model_name}'

    @staticmethod
    def resolve_max_tick_time(obj) -> int:
        tick_limit = obj.MAX_TICK_TIME
        return tick_limit

    @staticmethod
    def resolve_max_concurrent_tasks(obj) -> int:
        task_limit = obj.MAX_CONCURRENT_TASKS
        return task_limit

    @staticmethod
    def resolve_running_count(obj) -> int:
        active = obj.get_worker_count()
        return active

    @staticmethod
    def resolve_running_workers(obj) -> List[dict[str, Any]]:
        active_workers = obj.get_running_workers()
        return active_workers
|
||||
|
||||
|
||||
class OrchestratorSchema(Schema):
    """Schema for the Orchestrator."""
    # Liveness flag.
    is_running: bool

    # Tuning values.
    poll_interval: float
    idle_timeout: int
    max_crawl_workers: int

    # Worker totals and the per-type worker entries.
    total_worker_count: int
    workers: List[WorkerSchema]
|
||||
|
||||
|
||||
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
def get_orchestrator(request):
    """Get the orchestrator status and all worker queues."""
    from archivebox.workers.orchestrator import Orchestrator
    from archivebox.workers.worker import CrawlWorker

    orch = Orchestrator()

    # Throwaway worker instance (worker_id=-1) used only to query its queue.
    probe_workers = [CrawlWorker(worker_id=-1)]

    status = {}
    status['is_running'] = orch.is_running()
    status['poll_interval'] = orch.POLL_INTERVAL
    status['idle_timeout'] = orch.IDLE_TIMEOUT
    status['max_crawl_workers'] = orch.MAX_CRAWL_WORKERS
    status['total_worker_count'] = orch.get_total_worker_count()
    status['workers'] = probe_workers
    return status
|
||||
|
||||
|
||||
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
def get_workers(request):
    """List all worker types and their current status."""
    from archivebox.workers.worker import CrawlWorker

    # Throwaway worker instance (worker_id=-1) used only to query its queue.
    probe = CrawlWorker(worker_id=-1)
    return [probe]
|
||||
|
||||
|
||||
# Progress endpoint moved to core.views.live_progress_view for simplicity
|
||||
Reference in New Issue
Block a user