bump versions and fix docs

This commit is contained in:
Nick Sweeting
2026-03-15 17:43:07 -07:00
parent e598614b05
commit 7d42c6c8b5
15 changed files with 245 additions and 349 deletions

View File

@@ -37,11 +37,10 @@ html_description=f'''
def register_urls(api: NinjaAPI) -> NinjaAPI:
    """Attach every versioned API router to the given NinjaAPI instance and return it."""
    # api.add_router('/auth/', 'archivebox.api.v1_auth.router')
    routers = (
        ('/auth/', 'archivebox.api.v1_auth.router'),
        ('/core/', 'archivebox.api.v1_core.router'),
        ('/crawls/', 'archivebox.api.v1_crawls.router'),
        ('/cli/', 'archivebox.api.v1_cli.router'),
        ('/workers/', 'archivebox.api.v1_workers.router'),
        ('/machine/', 'archivebox.api.v1_machine.router'),
    )
    for prefix, dotted_router_path in routers:
        api.add_router(prefix, dotted_router_path)
    return api

View File

@@ -30,7 +30,13 @@ def get_api_token(request, auth_data: PasswordAuthSchema):
if user and user.is_superuser:
api_token = get_or_create_api_token(user)
assert api_token is not None, "Failed to create API token"
return api_token.__json__()
return {
"success": True,
"user_id": str(user.pk),
"username": user.username,
"token": api_token.token,
"expires": api_token.expires.isoformat() if api_token.expires else None,
}
return {"success": False, "errors": ["Invalid credentials"]}

View File

@@ -121,10 +121,19 @@ def cli_add(request, args: AddCommandSchema):
created_by_id=request.user.pk,
)
snapshot_ids = [str(snapshot_id) for snapshot_id in result.values_list('id', flat=True)]
result_payload = {
"crawl_id": getattr(result, "crawl_id", None),
"num_snapshots": len(snapshot_ids),
"snapshot_ids": snapshot_ids,
"queued_urls": args.urls,
}
return {
"success": True,
"errors": [],
"result": result,
"result": result_payload,
"result_format": "json",
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}

View File

@@ -9,12 +9,14 @@ from django.db.models import Q
from django.core.exceptions import ValidationError
from django.contrib.auth import get_user_model
from django.shortcuts import redirect
from django.utils import timezone
from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate, PaginationBase
from ninja.errors import HttpError
from archivebox.core.models import Snapshot, ArchiveResult, Tag
from archivebox.crawls.models import Crawl
from archivebox.api.v1_crawls import CrawlSchema
@@ -191,6 +193,27 @@ class SnapshotSchema(Schema):
class SnapshotUpdateSchema(Schema):
    """Payload for PATCH /snapshot/{id}; only the fields the caller sets are applied."""
    status: str | None = None       # must be one of Snapshot.StatusChoices.values
    retry_at: datetime | None = None
    tags: Optional[List[str]] = None  # when provided, replaces the snapshot's tags
class SnapshotCreateSchema(Schema):
    """Payload for POST /snapshots."""
    url: str                          # required; must be non-blank
    crawl_id: Optional[str] = None    # attach to an existing crawl; otherwise one is created
    depth: int = 0                    # endpoint validates range 0..4
    title: Optional[str] = None
    tags: Optional[List[str]] = None
    status: Optional[str] = None      # must be one of Snapshot.StatusChoices.values
class SnapshotDeleteResponseSchema(Schema):
    """Response body for DELETE /snapshot/{id}."""
    success: bool
    snapshot_id: str
    crawl_id: str
    deleted_count: int  # total row count reported by Django's Model.delete()
def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]:
    """Strip whitespace from each tag and drop empty/falsy entries; None yields []."""
    cleaned: List[str] = []
    for raw in tags or []:
        if not raw:
            continue
        stripped = raw.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned
class SnapshotFilterSchema(FilterSchema):
@@ -230,6 +253,68 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool = True):
return Snapshot.objects.get(Q(id__icontains=snapshot_id))
@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
def create_snapshot(request, data: SnapshotCreateSchema):
tags = normalize_tag_list(data.tags)
if data.status is not None and data.status not in Snapshot.StatusChoices.values:
raise HttpError(400, f'Invalid status: {data.status}')
if not data.url.strip():
raise HttpError(400, 'URL is required')
if data.depth not in (0, 1, 2, 3, 4):
raise HttpError(400, 'depth must be between 0 and 4')
if data.crawl_id:
crawl = Crawl.objects.get(id__icontains=data.crawl_id)
crawl_tags = normalize_tag_list(crawl.tags_str.split(','))
tags = tags or crawl_tags
else:
crawl = Crawl.objects.create(
urls=data.url,
max_depth=max(data.depth, 0),
tags_str=','.join(tags),
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
created_by=request.user,
)
snapshot_defaults = {
'depth': data.depth,
'title': data.title,
'timestamp': str(timezone.now().timestamp()),
'status': data.status or Snapshot.StatusChoices.QUEUED,
'retry_at': timezone.now(),
}
snapshot, _ = Snapshot.objects.get_or_create(
url=data.url,
crawl=crawl,
defaults=snapshot_defaults,
)
update_fields: List[str] = []
if data.title is not None and snapshot.title != data.title:
snapshot.title = data.title
update_fields.append('title')
if data.status is not None and snapshot.status != data.status:
if data.status not in Snapshot.StatusChoices.values:
raise HttpError(400, f'Invalid status: {data.status}')
snapshot.status = data.status
update_fields.append('status')
if update_fields:
update_fields.append('modified_at')
snapshot.save(update_fields=update_fields)
if tags:
snapshot.save_tags(tags)
try:
snapshot.ensure_crawl_symlink()
except Exception:
pass
request.with_archiveresults = False
return snapshot
@router.patch("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="patch_snapshot")
def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
"""Update a snapshot (e.g., set status=sealed to cancel queued work)."""
@@ -239,6 +324,8 @@ def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
payload = data.dict(exclude_unset=True)
update_fields = ['modified_at']
tags = payload.pop('tags', None)
if 'status' in payload:
if payload['status'] not in Snapshot.StatusChoices.values:
@@ -246,20 +333,39 @@ def patch_snapshot(request, snapshot_id: str, data: SnapshotUpdateSchema):
snapshot.status = payload['status']
if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
snapshot.retry_at = None
update_fields.append('status')
if 'retry_at' in payload:
snapshot.retry_at = payload['retry_at']
update_fields.append('retry_at')
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
if tags is not None:
snapshot.save_tags(normalize_tag_list(tags))
snapshot.save(update_fields=update_fields)
request.with_archiveresults = False
return snapshot
@router.delete("/snapshot/{snapshot_id}", response=SnapshotDeleteResponseSchema, url_name="delete_snapshot")
def delete_snapshot(request, snapshot_id: str):
snapshot = get_snapshot(request, snapshot_id, with_archiveresults=False)
snapshot_id_str = str(snapshot.id)
crawl_id_str = str(snapshot.crawl_id)
deleted_count, _ = snapshot.delete()
return {
'success': True,
'snapshot_id': snapshot_id_str,
'crawl_id': crawl_id_str,
'deleted_count': deleted_count,
}
### Tag #########################################################################
class TagSchema(Schema):
TYPE: str = 'core.models.Tag'
id: UUID
id: int
modified_at: datetime
created_at: datetime
created_by_id: str

View File

@@ -1,7 +1,7 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List
from typing import List, Optional
from datetime import datetime
from django.utils import timezone
@@ -33,7 +33,6 @@ class CrawlSchema(Schema):
retry_at: datetime | None
urls: str
extractor: str
max_depth: int
tags_str: str
config: dict
@@ -59,12 +58,61 @@ class CrawlSchema(Schema):
class CrawlUpdateSchema(Schema):
    """Payload for PATCH /crawl/{id}; only the fields the caller sets are applied."""
    status: str | None = None        # must be one of Crawl.StatusChoices.values
    retry_at: datetime | None = None
    tags: Optional[List[str]] = None  # takes precedence over tags_str when both are sent
    tags_str: str | None = None       # legacy comma-separated fallback
class CrawlCreateSchema(Schema):
    """Payload for POST /crawls."""
    urls: List[str]                    # one or more URLs to archive (blank entries dropped)
    max_depth: int = 0                 # endpoint validates range 0..4
    tags: Optional[List[str]] = None   # takes precedence over tags_str when set
    tags_str: str = ''                 # legacy comma-separated fallback
    label: str = ''
    notes: str = ''
    config: dict = {}                  # NOTE: pydantic copies field defaults per instance, so the mutable {} is safe here
class CrawlDeleteResponseSchema(Schema):
    """Response body for DELETE /crawl/{id}."""
    success: bool
    crawl_id: str
    deleted_count: int      # total row count reported by Django's Model.delete() (cascades included)
    deleted_snapshots: int  # snapshot count measured just before deletion
def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]:
    """Return stripped, non-empty tag names.

    An explicit ``tags`` list (even an empty one) takes precedence over the
    comma-separated ``tags_str`` fallback.
    """
    if tags is None:
        candidates = tags_str.split(',')
    else:
        candidates = [entry for entry in tags if entry]
    result: List[str] = []
    for candidate in candidates:
        name = candidate.strip()
        if name:
            result.append(name)
    return result
@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
def get_crawls(request):
return Crawl.objects.all().distinct()
@router.post("/crawls", response=CrawlSchema, url_name="create_crawl")
def create_crawl(request, data: CrawlCreateSchema):
urls = [url.strip() for url in data.urls if url and url.strip()]
if not urls:
raise HttpError(400, 'At least one URL is required')
if data.max_depth not in (0, 1, 2, 3, 4):
raise HttpError(400, 'max_depth must be between 0 and 4')
tags = normalize_tag_list(data.tags, data.tags_str)
crawl = Crawl.objects.create(
urls='\n'.join(urls),
max_depth=data.max_depth,
tags_str=','.join(tags),
label=data.label,
notes=data.notes,
config=data.config,
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
created_by=request.user,
)
crawl.create_snapshots_from_urls()
return crawl
@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
def get_crawl(request, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
"""Get a specific Crawl by id."""
@@ -92,6 +140,13 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
"""Update a crawl (e.g., set status=sealed to cancel queued work)."""
crawl = Crawl.objects.get(id__icontains=crawl_id)
payload = data.dict(exclude_unset=True)
update_fields = ['modified_at']
tags = payload.pop('tags', None)
tags_str = payload.pop('tags_str', None)
if tags is not None or tags_str is not None:
crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or ''))
update_fields.append('tags_str')
if 'status' in payload:
if payload['status'] not in Crawl.StatusChoices.values:
@@ -99,11 +154,13 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
crawl.status = payload['status']
if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
crawl.retry_at = None
update_fields.append('status')
if 'retry_at' in payload:
crawl.retry_at = payload['retry_at']
update_fields.append('retry_at')
crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
crawl.save(update_fields=update_fields)
if payload.get('status') == Crawl.StatusChoices.SEALED:
Snapshot.objects.filter(
@@ -115,3 +172,17 @@ def patch_crawl(request, crawl_id: str, data: CrawlUpdateSchema):
modified_at=timezone.now(),
)
return crawl
@router.delete("/crawl/{crawl_id}", response=CrawlDeleteResponseSchema, url_name="delete_crawl")
def delete_crawl(request, crawl_id: str):
crawl = Crawl.objects.get(id__icontains=crawl_id)
crawl_id_str = str(crawl.id)
snapshot_count = crawl.snapshot_set.count()
deleted_count, _ = crawl.delete()
return {
'success': True,
'crawl_id': crawl_id_str,
'deleted_count': deleted_count,
'deleted_snapshots': snapshot_count,
}

View File

@@ -1,107 +0,0 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List, Any
from datetime import datetime
from ninja import Router, Schema
router = Router(tags=['Workers and Tasks'])
class QueueItemSchema(Schema):
    """Schema for a single item in a worker's queue."""
    TYPE: str                  # dotted 'app_label.model_name' of the queued object
    id: UUID
    status: str
    retry_at: datetime | None
    created_at: datetime
    modified_at: datetime
    description: str           # human-readable summary of the queued object

    @staticmethod
    def resolve_TYPE(obj) -> str:
        # Derived from Django model metadata, e.g. 'core.snapshot'.
        return f'{obj._meta.app_label}.{obj._meta.model_name}'

    @staticmethod
    def resolve_description(obj) -> str:
        # Delegates to the model's __str__.
        return str(obj)
class WorkerSchema(Schema):
    """Schema for a Worker type."""
    name: str
    model: str                            # dotted 'app_label.model_name' the worker processes
    max_tick_time: int
    max_concurrent_tasks: int
    running_count: int
    running_workers: List[dict[str, Any]]

    @staticmethod
    def resolve_model(obj) -> str:
        # Resolve the dotted model path from the worker's model class metadata.
        Model = obj.get_model()
        return f'{Model._meta.app_label}.{Model._meta.model_name}'

    @staticmethod
    def resolve_max_tick_time(obj) -> int:
        # Class-level constant on the worker type.
        return obj.MAX_TICK_TIME

    @staticmethod
    def resolve_max_concurrent_tasks(obj) -> int:
        # Class-level constant on the worker type.
        return obj.MAX_CONCURRENT_TASKS

    @staticmethod
    def resolve_running_count(obj) -> int:
        return obj.get_worker_count()

    @staticmethod
    def resolve_running_workers(obj) -> List[dict[str, Any]]:
        return obj.get_running_workers()
class OrchestratorSchema(Schema):
    """Schema for the Orchestrator."""
    is_running: bool
    poll_interval: float        # orchestrator POLL_INTERVAL (presumably seconds — confirm)
    idle_timeout: int           # orchestrator IDLE_TIMEOUT constant
    max_crawl_workers: int
    total_worker_count: int
    workers: List[WorkerSchema]
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
def get_orchestrator(request):
"""Get the orchestrator status and all worker queues."""
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import CrawlWorker
orchestrator = Orchestrator()
# Create temporary worker instances to query their queues
workers = [
CrawlWorker(worker_id=-1),
]
return {
'is_running': orchestrator.is_running(),
'poll_interval': orchestrator.POLL_INTERVAL,
'idle_timeout': orchestrator.IDLE_TIMEOUT,
'max_crawl_workers': orchestrator.MAX_CRAWL_WORKERS,
'total_worker_count': orchestrator.get_total_worker_count(),
'workers': workers,
}
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
def get_workers(request):
"""List all worker types and their current status."""
from archivebox.workers.worker import CrawlWorker
# Create temporary instances to query their queues
return [
CrawlWorker(worker_id=-1),
]
# Progress endpoint moved to core.views.live_progress_view for simplicity

View File

@@ -186,7 +186,9 @@ def add(urls: str | list[str],
pass
# 6. Return the list of Snapshots in this crawl
return crawl.snapshot_set.all()
snapshots = crawl.snapshot_set.all()
snapshots.crawl_id = str(crawl.id)
return snapshots
@click.command()

View File

@@ -10,7 +10,6 @@ from archivebox.misc.serve_static import serve_static
from archivebox.core.admin_site import archivebox_admin
from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, WebAddView, HealthCheckView, live_progress_view
from archivebox.workers.views import JobsDashboardView
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
@@ -42,8 +41,6 @@ urlpatterns = [
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'),
path("jobs/", JobsDashboardView.as_view(), name='jobs_dashboard'),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),

View File

@@ -284,7 +284,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
depth = 0
title = None
timestamp = None
tags = ''
tags = self.tags_str
if not url:
continue

View File

@@ -312,6 +312,40 @@ class TestUrlRouting:
"""
)
def test_api_auth_token_endpoint_available_on_admin_and_api_hosts(self) -> None:
    # The token endpoint must be reachable via BOTH vhosts: the admin host and
    # the dedicated API host. Each POST must return 200 with a non-empty token.
    self._run(
        """
        ensure_admin_user()
        client = Client()
        admin_host = get_admin_host()
        api_host = get_api_host()
        payload = '{"username": "testadmin", "password": "testpassword"}'
        resp = client.post(
            "/api/v1/auth/get_api_token",
            data=payload,
            content_type="application/json",
            HTTP_HOST=admin_host,
        )
        assert resp.status_code == 200
        data = resp.json()
        assert data.get("token")
        resp = client.post(
            "/api/v1/auth/get_api_token",
            data=payload,
            content_type="application/json",
            HTTP_HOST=api_host,
        )
        assert resp.status_code == 200
        data = resp.json()
        assert data.get("token")
        print("OK")
        """
    )
def test_api_post_with_token_on_admin_and_api_hosts(self) -> None:
self._run(
"""

View File

@@ -1,202 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Job Dashboard</title>
<style>
body {
font-family: Arial, sans-serif;
line-height: 1.6;
color: #333;
width: 100%;
margin: 0 auto;
padding: 20px;
}
@keyframes pulse {
0% { opacity: 1; }
48% { opacity: 0.2; }
52% { opacity: 1; }
100% { opacity: 1; }
}
h1 {
text-align: center;
}
h1 a {
animation: pulse 1s;
}
.dashboard {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 20px;
}
.card {
border: 1px solid #ddd;
border-radius: 8px;
padding: 15px;
background-color: #f9f9f9;
}
.card h2 {
margin-top: 0;
border-bottom: 2px solid #ddd;
padding-bottom: 10px;
font-family: monospace;
}
.scroll-area {
/*height: 800px;
overflow-y: scroll; */
height: auto;
border: 1px solid #ddd;
padding: 10px;
background-color: #fff;
}
.job-item {
border: 1px solid #eee;
border-radius: 4px;
padding: 10px;
margin-bottom: 10px;
}
.job-item:last-child {
margin-bottom: 0;
}
.badge {
display: inline-block;
padding: 3px 7px;
border-radius: 3px;
font-size: 12px;
font-weight: bold;
}
.badge-started {
background-color: #4CAF50;
color: white;
}
.badge-queued {
background-color: #2196F3;
color: white;
}
.badge-failed {
background-color: #f44336;
color: white;
}
.badge-succeeded {
background-color: #666;
color: white;
}
.badge-sealed {
background-color: #666;
color: white;
}
.date {
font-size: 16px;
color: #666;
float: right;
}
</style>
</head>
<body>
<h1>Job Dashboard <small><a href="?refresh=true" id="current-time">♻️ {{now}}</a></small></h1>
<div id="dashboard" class="dashboard"></div>
<script>
// Render a timestamp as a human-readable offset relative to now.
// Fixes: past times previously rendered with a double negative ("-5s ago");
// also Math.round takes a single argument — the stray second arg was ignored.
function formatDate(dateString) {
    const now = Date.now()
    const date = new Date(dateString)
    // return new Date(dateString).toLocaleString();
    // return date.toISOString().split('T').at(-1).replace('Z', '');
    const seconds_diff = Math.round((date - now) / 1000)
    if (seconds_diff < 0) {
        // Negate so past times read "5s ago" instead of "-5s ago".
        return `${-seconds_diff}s ago`;
    } else {
        return `${seconds_diff}s in the future`;
    }
}
// Build one .job-item card for a queue entry.
// NOTE(review): the {{api_token|...}} placeholders look like Django template
// syntax substituted server-side before this script reaches the browser —
// confirm against the view that renders this template.
function createJobElement(job) {
    const jobElement = document.createElement('div');
    jobElement.className = 'job-item';
    // Link each job id to its generic API detail endpoint; badge class is
    // derived from the job's status string (see .badge-* CSS rules).
    jobElement.innerHTML = `
        <p><a href="/api/v1/core/any/${job.id}?api_key={{api_token|default:'NONE PROVIDED BY VIEW'}}"><code>${job.id}</code></a></p>
        <p>
            <span class="badge badge-${job.status}">${job.status}</span>
            <span class="date">♻️ ${formatDate(job.retry_at)}</span>
        </p>
        <p style="font-size: 12px; color: #666;">${job.description}</p>
    `;
    return jobElement;
}
// Rebuild the entire dashboard DOM from the API payload.
// `data` appears to be a list of actors, each carrying a `model` name plus
// job arrays keyed future/pending/stalled/active/past — TODO confirm against
// the /api/v1/workers/actors response shape.
function updateDashboard(data) {
    // Refresh the clock shown in the page header.
    const currentTime = document.getElementById('current-time');
    window.now = new Date();
    currentTime.innerHTML = `♻️ ${window.now.toISOString().split('T').at(-1).replace('Z', '')}`;
    // Full re-render: wipe the dashboard and rebuild one card per actor.
    const dashboard = document.getElementById('dashboard');
    dashboard.innerHTML = '';
    data.forEach(actor => {
        const card = document.createElement('div');
        card.className = 'card';
        // Five queue sections per actor, color-coded by lifecycle stage.
        card.innerHTML = `
            <h2>${actor.model}</h2>
            <hr/>
            Future
            <div class="scroll-area" style="background-color: white;" id="future-${actor.model}"></div>
            <hr/>
            Pending
            <div class="scroll-area" style="background-color: lightblue;" id="pending-${actor.model}"></div>
            <hr/>
            Stalled
            <div class="scroll-area" style="background-color: lightcoral;" id="stalled-${actor.model}"></div>
            <hr/>
            Active
            <div class="scroll-area" style="background-color: lightgreen;" id="active-${actor.model}"></div>
            <hr/>
            Past
            <div class="scroll-area" style="background-color: lightgrey;" id="past-${actor.model}"></div>
        `;
        dashboard.appendChild(card);
        // Populate each section; containers must be looked up only after the
        // card has been appended to the document.
        const futureContainer = document.getElementById(`future-${actor.model}`);
        actor.future.forEach(job => {
            futureContainer.appendChild(createJobElement(job));
        });
        const pendingContainer = document.getElementById(`pending-${actor.model}`);
        actor.pending.forEach(job => {
            pendingContainer.appendChild(createJobElement(job));
        });
        const stalledContainer = document.getElementById(`stalled-${actor.model}`);
        actor.stalled.forEach(job => {
            stalledContainer.appendChild(createJobElement(job));
        });
        const activeContainer = document.getElementById(`active-${actor.model}`);
        actor.active.forEach(job => {
            activeContainer.appendChild(createJobElement(job));
        });
        const pastContainer = document.getElementById(`past-${actor.model}`);
        actor.past.forEach(job => {
            pastContainer.appendChild(createJobElement(job));
        });
    });
}
// Poll the workers API and re-render the dashboard with the response.
function fetchData() {
    fetch('/api/v1/workers/actors', {
        headers: {
            'Authorization': `Bearer {{api_token|default:'NONE PROVIDED BY VIEW'}}`
        }
    })
    .then(response => response.json())
    .then(data => updateDashboard(data))
    .catch(error => console.error('Error fetching data:', error));
}
// Kick off an immediate render, then refresh every 750ms.
fetchData();
setInterval(fetchData, 750);
</script>
</body>
</html>

View File

@@ -1,20 +0,0 @@
from django.views.generic import TemplateView
from django.contrib.auth.mixins import UserPassesTestMixin
from django.utils import timezone
from archivebox.api.auth import get_or_create_api_token
class JobsDashboardView(UserPassesTestMixin, TemplateView):
    """Superuser-only dashboard page; embeds an API token for the page's JS poller."""
    template_name = "jobs_dashboard.html"

    def test_func(self):
        # Gate the view to authenticated superusers only.
        user = self.request.user
        return user and user.is_superuser

    def get_context_data(self, **kwargs):
        token_obj = get_or_create_api_token(self.request.user)
        context = super().get_context_data(**kwargs)
        if token_obj:
            context['api_token'] = token_obj.token
        else:
            context['api_token'] = 'UNABLE TO GENERATE API TOKEN'
        context['now'] = timezone.now().strftime("%H:%M:%S")
        return context