This commit is contained in:
Nick Sweeting
2026-03-23 03:58:32 -07:00
parent 268856bcfb
commit b749b26c5d
286 changed files with 21704 additions and 13480 deletions

View File

@@ -1 +1 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from django.contrib import admin
from django.http import HttpRequest
@@ -11,57 +11,81 @@ from archivebox.api.models import APIToken
# Admin options for API tokens: which columns the changelist shows, what can be
# searched and filtered, and how the edit form is grouped into sections.
# NOTE(review): every option below appears twice, once single-quoted and once
# double-quoted, with identical values. They look like the before/after sides
# of a quote-style change rather than two distinct settings - confirm.
class APITokenAdmin(BaseModelAdmin):
list_display = ('created_at', 'id', 'created_by', 'token_redacted', 'expires')
sort_fields = ('id', 'created_at', 'created_by', 'expires')
readonly_fields = ('created_at', 'modified_at')
search_fields = ('id', 'created_by__username', 'token')
# The changelist shows `token_redacted`, not the raw `token` value.
list_display = ("created_at", "id", "created_by", "token_redacted", "expires")
# NOTE(review): `sort_fields` is not a stock Django ModelAdmin option;
# presumably it is consumed by BaseModelAdmin - confirm.
sort_fields = ("id", "created_at", "created_by", "expires")
# Both timestamps are display-only in the edit form.
readonly_fields = ("created_at", "modified_at")
# The raw `token` column is searchable, and so is the owner's username via
# the `created_by` relation.
search_fields = ("id", "created_by__username", "token")
# Edit form layout: three sections (Token, Owner, Timestamps), each tagged
# with the "card" CSS class.
fieldsets = (
('Token', {
'fields': ('token', 'expires'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
(
"Token",
{
"fields": ("token", "expires"),
"classes": ("card",),
},
),
(
"Owner",
{
"fields": ("created_by",),
"classes": ("card",),
},
),
(
"Timestamps",
{
"fields": ("created_at", "modified_at"),
"classes": ("card",),
},
),
)
list_filter = ('created_by',)
ordering = ['-created_at']
# Sidebar filter by owner.
list_filter = ("created_by",)
# Descending `created_at`, i.e. most recently created tokens first.
ordering = ["-created_at"]
# Changelist page size.
list_per_page = 100
class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
list_display = ('created_at', 'created_by', 'id', *WebhookAdmin.list_display)
sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)
list_display = ("created_at", "created_by", "id", *WebhookAdmin.list_display)
sort_fields = ("created_at", "created_by", "id", "referenced_model", "endpoint", "last_success", "last_error")
readonly_fields = ("created_at", "modified_at", *WebhookAdmin.readonly_fields)
fieldsets = (
('Webhook', {
'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
'classes': ('card', 'wide'),
}),
('Authentication', {
'fields': ('auth_token',),
'classes': ('card',),
}),
('Status', {
'fields': ('enabled', 'last_success', 'last_error'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
(
"Webhook",
{
"fields": ("name", "signal", "referenced_model", "endpoint"),
"classes": ("card", "wide"),
},
),
(
"Authentication",
{
"fields": ("auth_token",),
"classes": ("card",),
},
),
(
"Status",
{
"fields": ("enabled", "last_success", "last_error"),
"classes": ("card",),
},
),
(
"Owner",
{
"fields": ("created_by",),
"classes": ("card",),
},
),
(
"Timestamps",
{
"fields": ("created_at", "modified_at"),
"classes": ("card",),
},
),
)
def lookup_allowed(self, lookup: str, value: str, request: HttpRequest | None = None) -> bool:

View File

@@ -1,13 +1,14 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from django.apps import AppConfig
# Django AppConfig for the API app: dotted module path `archivebox.api`,
# short app label `api`.
# NOTE(review): `name` and `label` each appear single- and double-quoted with
# the same value; these look like the before/after sides of a quote-style change.
class APIConfig(AppConfig):
name = 'archivebox.api'
label = 'api'
name = "archivebox.api"
label = "api"
# Register this app's admin classes on `admin_site` by delegating to
# `archivebox.api.admin.register_admin`.
def register_admin(admin_site):
# Imported at call time rather than at module import time. Inside this body
# the imported name shadows this function, so the call below reaches the
# admin module's implementation and is not a recursive call.
from archivebox.api.admin import register_admin
register_admin(admin_site)

View File

@@ -1,6 +1,5 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from typing import Optional
from datetime import timedelta
from django.utils import timezone
@@ -14,7 +13,7 @@ from ninja.errors import HttpError
def get_or_create_api_token(user: User | None):
from archivebox.api.models import APIToken
if user and user.is_superuser:
api_tokens = APIToken.objects.filter(created_by_id=user.pk, expires__gt=timezone.now())
if api_tokens.exists():
@@ -34,18 +33,18 @@ def get_or_create_api_token(user: User | None):
def auth_using_token(token: str | None, request: HttpRequest | None = None) -> User | None:
"""Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
from archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time
from archivebox.api.models import APIToken # lazy import model to avoid loading it at urls.py import time
user: User | None = None
submitted_empty_form = str(token).strip() in ('string', '', 'None', 'null')
submitted_empty_form = str(token).strip() in ("string", "", "None", "null")
if not submitted_empty_form:
try:
api_token = APIToken.objects.get(token=token)
if api_token.is_valid() and isinstance(api_token.created_by, User):
user = api_token.created_by
if request is not None:
setattr(request, '_api_token', api_token)
setattr(request, "_api_token", api_token)
except APIToken.DoesNotExist:
pass
@@ -55,8 +54,8 @@ def auth_using_token(token: str | None, request: HttpRequest | None = None) -> U
def auth_using_password(username: str | None, password: str | None, request: HttpRequest | None = None) -> User | None:
"""Given a username and password, check if they are valid and return the corresponding user"""
user: User | None = None
submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
submitted_empty_form = (username, password) in (("string", "string"), ("", ""), (None, None))
if not submitted_empty_form:
authenticated_user = authenticate(
username=username,
@@ -73,34 +72,40 @@ def auth_using_password(username: str | None, password: str | None, request: Htt
# Shared post-authentication gate used by the auth classes in this module.
#   user:        result of a credential check (may be None)
#   request:     the request being authenticated; mutated on success
#   auth_method: label recorded on the request (callers pass their class name)
# For a truthy user with a primary key it attaches the user to the request,
# records the auth method, and raises HttpError(403) when the user is not a
# superuser. Ends by handing `user` back as the auth result.
# NOTE(review): the setattr and raise lines appear twice (single- and
# double-quoted); they look like the before/after sides of a quote-style change.
def _require_superuser(user: User | None, request: HttpRequest, auth_method: str) -> User | None:
if user and user.pk:
request.user = user
setattr(request, '_api_auth_method', auth_method)
# Stored as a private request attribute so later code can report which
# mechanism authenticated the call.
setattr(request, "_api_auth_method", auth_method)
if not user.is_superuser:
raise HttpError(403, 'Valid credentials but User does not have permission (make sure user.is_superuser=True)')
# The credentials were valid, so this is a 403 (forbidden).
raise HttpError(403, "Valid credentials but User does not have permission (make sure user.is_superuser=True)")
return user
### Django-Ninja-Provided Auth Methods
class HeaderTokenAuth(APIKeyHeader):
"""Allow authenticating by passing X-ArchiveBox-API-Key=xyz as a request header"""
# Name of the request header the API key is read from.
param_name = "X-ArchiveBox-API-Key"
# NOTE(review): the two `def authenticate` lines differ only in how the
# optional `key` type is spelled (Optional[str] vs str | None); they look like
# the before/after sides of a typing-syntax change.
def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
# Resolve the key to a user, then apply the superuser gate, recording this
# class's name as the auth method.
return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)
class BearerTokenAuth(HttpBearer):
"""Allow authenticating by passing `Authorization: Bearer xyz` as a request header"""
def authenticate(self, request: HttpRequest, token: str) -> User | None:
# Resolve the bearer token to a user, then apply the superuser gate,
# recording this class's name as the auth method.
return _require_superuser(auth_using_token(token=token, request=request), request, self.__class__.__name__)
class QueryParamTokenAuth(APIKeyQuery):
"""Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
# Name of the query-string parameter the API key is read from.
param_name = "api_key"
# NOTE(review): the two `def authenticate` lines differ only in how the
# optional `key` type is spelled (Optional[str] vs str | None); they look like
# the before/after sides of a typing-syntax change.
def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
# Resolve the key to a user, then apply the superuser gate, recording this
# class's name as the auth method.
return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)
class UsernameAndPasswordAuth(HttpBasicAuth):
"""Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""
@@ -111,25 +116,28 @@ class UsernameAndPasswordAuth(HttpBasicAuth):
self.__class__.__name__,
)
# Plain callable (no django-ninja base class) that accepts whatever user is
# already attached to the request.
# NOTE(review): the getattr, setattr and raise lines appear twice (single- and
# double-quoted); they look like the before/after sides of a quote-style change.
class DjangoSessionAuth:
"""Allow authenticating with existing Django session cookies (same-origin only)."""
# Calling the instance is the same as calling authenticate().
def __call__(self, request: HttpRequest) -> User | None:
return self.authenticate(request)
# Returns request.user when it is an authenticated `User` instance, raises
# HttpError(403) when that user is not a superuser, and returns None otherwise.
# Extra keyword arguments are accepted and ignored.
def authenticate(self, request: HttpRequest, **kwargs) -> User | None:
user = getattr(request, 'user', None)
# Tolerates a request object that has no `user` attribute at all.
user = getattr(request, "user", None)
# The isinstance check rejects any user object that is not a `User`.
if isinstance(user, User) and user.is_authenticated:
setattr(request, '_api_auth_method', self.__class__.__name__)
# Record this class's name as the auth method on the request.
setattr(request, "_api_auth_method", self.__class__.__name__)
if not user.is_superuser:
raise HttpError(403, 'Valid session but User does not have permission (make sure user.is_superuser=True)')
# The session was valid, so this is a 403 (forbidden).
raise HttpError(403, "Valid session but User does not have permission (make sure user.is_superuser=True)")
return user
return None
### Enabled Auth Methods
# Auth handler instances enabled for the API: header key, bearer token and
# query-string key. Session/cookie auth is deliberately left commented out
# (see the inline note on that line).
# NOTE(review): `QueryParamTokenAuth()` is listed twice with identical text;
# this looks like the before/after sides of a whitespace-only change rather than
# an intentional double registration - confirm.
API_AUTH_METHODS = [
HeaderTokenAuth(),
BearerTokenAuth(),
QueryParamTokenAuth(),
QueryParamTokenAuth(),
# django_auth_superuser, # django admin cookie auth, not secure to use with csrf=False
]

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from django.http import HttpResponse
@@ -10,8 +10,8 @@ class ApiCorsMiddleware:
self.get_response = get_response
def __call__(self, request):
if request.path.startswith('/api/'):
if request.method == 'OPTIONS' and request.META.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD'):
if request.path.startswith("/api/"):
if request.method == "OPTIONS" and request.META.get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"):
response = HttpResponse(status=204)
return self._add_cors_headers(request, response)
@@ -21,14 +21,12 @@ class ApiCorsMiddleware:
return self.get_response(request)
# Add permissive CORS headers to `response` and return it.
# Requests that carry no Origin header get the response back untouched.
# NOTE(review): the header assignments appear twice (single-quoted, with
# Allow-Headers split over three lines, then double-quoted on one line). The
# values are identical, so they look like the before/after sides of a
# formatting change.
def _add_cors_headers(self, request, response):
origin = request.META.get('HTTP_ORIGIN')
# The Origin header as exposed through request.META.
origin = request.META.get("HTTP_ORIGIN")
if not origin:
return response
response['Access-Control-Allow-Origin'] = '*'
response['Access-Control-Allow-Methods'] = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
response['Access-Control-Allow-Headers'] = (
'Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken'
)
response['Access-Control-Max-Age'] = '600'
# Wildcard: the request's origin value is only tested for presence and is
# never echoed back.
response["Access-Control-Allow-Origin"] = "*"
response["Access-Control-Allow-Methods"] = "GET, POST, PUT, PATCH, DELETE, OPTIONS"
# Covers the two token-carrying headers (Authorization, X-ArchiveBox-API-Key)
# plus Content-Type and the CSRF token header.
response["Access-Control-Allow-Headers"] = "Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken"
# Lets browsers cache the preflight result for 600 seconds.
response["Access-Control-Max-Age"] = "600"
return response

View File

@@ -13,11 +13,10 @@ import signal_webhooks.utils
class Migration(migrations.Migration):
initial = True
dependencies = [
('auth', '0012_alter_user_first_name_max_length'),
("auth", "0012_alter_user_first_name_max_length"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
@@ -75,55 +74,165 @@ class Migration(migrations.Migration):
reverse_sql="""
DROP TABLE IF EXISTS api_outboundwebhook;
DROP TABLE IF EXISTS api_apitoken;
"""
""",
),
],
state_operations=[
migrations.CreateModel(
name='APIToken',
name="APIToken",
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('token', models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
('expires', models.DateTimeField(blank=True, null=True)),
('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
("modified_at", models.DateTimeField(auto_now=True)),
("token", models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
("expires", models.DateTimeField(blank=True, null=True)),
(
"created_by",
models.ForeignKey(
default=get_or_create_system_user_pk,
on_delete=django.db.models.deletion.CASCADE,
to=settings.AUTH_USER_MODEL,
),
),
],
options={
'verbose_name': 'API Key',
'verbose_name_plural': 'API Keys',
'app_label': 'api',
"verbose_name": "API Key",
"verbose_name_plural": "API Keys",
"app_label": "api",
},
),
migrations.CreateModel(
name='OutboundWebhook',
name="OutboundWebhook",
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('name', models.CharField(db_index=True, help_text='Webhook name.', max_length=255, unique=True, verbose_name='name')),
('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='Signal the webhook fires to.', max_length=255, verbose_name='signal')),
('ref', models.CharField(db_index=True, help_text='Dot import notation to the model the webhook is for.', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
('endpoint', models.URLField(help_text='Target endpoint for this webhook.', max_length=2047, verbose_name='endpoint')),
('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
('created', models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created')),
('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
("modified_at", models.DateTimeField(auto_now=True)),
(
"name",
models.CharField(db_index=True, help_text="Webhook name.", max_length=255, unique=True, verbose_name="name"),
),
(
"signal",
models.CharField(
choices=[
("CREATE", "Create"),
("UPDATE", "Update"),
("DELETE", "Delete"),
("M2M", "M2M changed"),
("CREATE_OR_UPDATE", "Create or Update"),
("CREATE_OR_DELETE", "Create or Delete"),
("CREATE_OR_M2M", "Create or M2M changed"),
("UPDATE_OR_DELETE", "Update or Delete"),
("UPDATE_OR_M2M", "Update or M2M changed"),
("DELETE_OR_M2M", "Delete or M2M changed"),
("CREATE_UPDATE_OR_DELETE", "Create, Update or Delete"),
("CREATE_UPDATE_OR_M2M", "Create, Update or M2M changed"),
("CREATE_DELETE_OR_M2M", "Create, Delete or M2M changed"),
("UPDATE_DELETE_OR_M2M", "Update, Delete or M2M changed"),
("CREATE_UPDATE_DELETE_OR_M2M", "Create, Update or Delete, or M2M changed"),
],
help_text="Signal the webhook fires to.",
max_length=255,
verbose_name="signal",
),
),
(
"ref",
models.CharField(
db_index=True,
help_text="Dot import notation to the model the webhook is for.",
max_length=1023,
validators=[signal_webhooks.utils.model_from_reference],
verbose_name="referenced model",
),
),
(
"endpoint",
models.URLField(help_text="Target endpoint for this webhook.", max_length=2047, verbose_name="endpoint"),
),
(
"headers",
models.JSONField(
blank=True,
default=dict,
help_text="Headers to send with the webhook request.",
validators=[signal_webhooks.utils.is_dict],
verbose_name="headers",
),
),
(
"auth_token",
signal_webhooks.fields.TokenField(
blank=True,
default="",
help_text="Authentication token to use in an Authorization header.",
max_length=8000,
validators=[signal_webhooks.utils.decode_cipher_key],
verbose_name="authentication token",
),
),
("enabled", models.BooleanField(default=True, help_text="Is this webhook enabled?", verbose_name="enabled")),
(
"keep_last_response",
models.BooleanField(
default=False,
help_text="Should the webhook keep a log of the latest response it got?",
verbose_name="keep last response",
),
),
(
"created",
models.DateTimeField(auto_now_add=True, help_text="When the webhook was created.", verbose_name="created"),
),
(
"updated",
models.DateTimeField(auto_now=True, help_text="When the webhook was last updated.", verbose_name="updated"),
),
(
"last_response",
models.CharField(
blank=True,
default="",
help_text="Latest response to this webhook.",
max_length=8000,
verbose_name="last response",
),
),
(
"last_success",
models.DateTimeField(
default=None,
help_text="When the webhook last succeeded.",
null=True,
verbose_name="last success",
),
),
(
"last_failure",
models.DateTimeField(
default=None,
help_text="When the webhook last failed.",
null=True,
verbose_name="last failure",
),
),
(
"created_by",
models.ForeignKey(
default=get_or_create_system_user_pk,
on_delete=django.db.models.deletion.CASCADE,
to=settings.AUTH_USER_MODEL,
),
),
],
options={
'verbose_name': 'API Outbound Webhook',
'app_label': 'api',
"verbose_name": "API Outbound Webhook",
"app_label": "api",
},
),
migrations.AddConstraint(
model_name='outboundwebhook',
constraint=models.UniqueConstraint(fields=['ref', 'endpoint'], name='prevent_duplicate_hooks_api_outboundwebhook'),
model_name="outboundwebhook",
constraint=models.UniqueConstraint(fields=["ref", "endpoint"], name="prevent_duplicate_hooks_api_outboundwebhook"),
),
],
),

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
import secrets
from archivebox.uuid_compat import uuid7
@@ -25,7 +25,7 @@ class APIToken(models.Model):
expires = models.DateTimeField(null=True, blank=True)
class Meta(TypedModelMeta):
app_label = 'api'
app_label = "api"
verbose_name = "API Key"
verbose_name_plural = "API Keys"
@@ -34,7 +34,7 @@ class APIToken(models.Model):
# Display-safe form of the token: twelve asterisks followed by the token's last
# four characters.
# NOTE(review): the two return lines differ only in quote style; they look like
# the before/after sides of a quote-style change.
@property
def token_redacted(self):
return f'************{self.token[-4:]}'
return f"************{self.token[-4:]}"
# True when the token has no expiry set, or when its expiry is at or after
# `for_date`. A falsy `for_date` (including the default None) means
# "now" via timezone.now().
def is_valid(self, for_date=None):
return not self.expires or self.expires >= (for_date or timezone.now())
@@ -47,8 +47,8 @@ class OutboundWebhook(WebhookBase):
modified_at = models.DateTimeField(auto_now=True)
class Meta(WebhookBase.Meta):
app_label = 'api'
verbose_name = 'API Outbound Webhook'
app_label = "api"
verbose_name = "API Outbound Webhook"
# Human-readable label: "[<id>] <ref> -> <endpoint>".
# NOTE(review): the two return lines differ only in quote style; they look like
# the before/after sides of a quote-style change.
def __str__(self) -> str:
return f'[{self.id}] {self.ref} -> {self.endpoint}'
return f"[{self.id}] {self.ref} -> {self.endpoint}"

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from django.urls import path
from django.views.generic.base import RedirectView
@@ -6,12 +6,10 @@ from django.views.generic.base import RedirectView
from .v1_api import urls as v1_api_urls
urlpatterns = [
path("", RedirectView.as_view(url='/api/v1/docs')),
path("v1/", RedirectView.as_view(url='/api/v1/docs')),
path("v1/", v1_api_urls),
path("v1", RedirectView.as_view(url='/api/v1/docs')),
path("", RedirectView.as_view(url="/api/v1/docs")),
path("v1/", RedirectView.as_view(url="/api/v1/docs")),
path("v1/", v1_api_urls),
path("v1", RedirectView.as_view(url="/api/v1/docs")),
# ... v2 can be added here ...
# path("v2/", v2_api_urls),
# path("v2", RedirectView.as_view(url='/api/v2/docs')),

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from io import StringIO
@@ -20,9 +20,9 @@ from archivebox.api.auth import API_AUTH_METHODS
from archivebox.api.models import APIToken
COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
COMMIT_HASH = get_COMMIT_HASH() or "unknown"
html_description=f'''
html_description = f"""
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
<br/>
<i><b>WARNING: This API is still in an early development stage and may change!</b></i>
@@ -35,47 +35,47 @@ html_description=f'''
<li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
</ul>
<small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
'''
"""
# Mount the v1 sub-routers on `api`, one URL prefix per feature area
# (/auth/, /core/, /crawls/, /cli/, /machine/), and return the same instance.
# Routers are given as dotted-path strings rather than imported objects.
# NOTE(review): the five add_router calls appear twice (single- then
# double-quoted) with identical arguments; they look like the before/after sides
# of a quote-style change, not ten separate registrations - confirm.
def register_urls(api: NinjaAPI) -> NinjaAPI:
api.add_router('/auth/', 'archivebox.api.v1_auth.router')
api.add_router('/core/', 'archivebox.api.v1_core.router')
api.add_router('/crawls/', 'archivebox.api.v1_crawls.router')
api.add_router('/cli/', 'archivebox.api.v1_cli.router')
api.add_router('/machine/', 'archivebox.api.v1_machine.router')
api.add_router("/auth/", "archivebox.api.v1_auth.router")
api.add_router("/core/", "archivebox.api.v1_core.router")
api.add_router("/crawls/", "archivebox.api.v1_crawls.router")
api.add_router("/cli/", "archivebox.api.v1_cli.router")
api.add_router("/machine/", "archivebox.api.v1_machine.router")
return api
class NinjaAPIWithIOCapture(NinjaAPI):
class NinjaAPIWithIOCapture(NinjaAPI):
def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
stdout, stderr = StringIO(), StringIO()
with redirect_stderr(stderr):
with redirect_stdout(stdout):
setattr(request, 'stdout', stdout)
setattr(request, 'stderr', stderr)
setattr(request, "stdout", stdout)
setattr(request, "stderr", stderr)
response = super().create_temporal_response(request)
# Diable caching of API responses entirely
response['Cache-Control'] = 'no-store'
# Disable caching of API responses entirely
response["Cache-Control"] = "no-store"
# Add debug stdout and stderr headers to response
response['X-ArchiveBox-Stdout'] = stdout.getvalue().replace('\n', '\\n')[:200]
response['X-ArchiveBox-Stderr'] = stderr.getvalue().replace('\n', '\\n')[:200]
response["X-ArchiveBox-Stdout"] = stdout.getvalue().replace("\n", "\\n")[:200]
response["X-ArchiveBox-Stderr"] = stderr.getvalue().replace("\n", "\\n")[:200]
# response['X-ArchiveBox-View'] = self.get_openapi_operation_id(request) or 'Unknown'
# Add Auth Headers to response
api_token_attr = getattr(request, '_api_token', None)
api_token_attr = getattr(request, "_api_token", None)
api_token = api_token_attr if isinstance(api_token_attr, APIToken) else None
token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else 'Never'
token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else "Never"
response['X-ArchiveBox-Auth-Method'] = str(getattr(request, '_api_auth_method', 'None'))
response['X-ArchiveBox-Auth-Expires'] = token_expiry
response['X-ArchiveBox-Auth-Token-Id'] = str(api_token.id) if api_token else 'None'
response['X-ArchiveBox-Auth-User-Id'] = str(request.user.pk) if getattr(request.user, 'pk', None) else 'None'
response['X-ArchiveBox-Auth-User-Username'] = request.user.username if isinstance(request.user, User) else 'None'
response["X-ArchiveBox-Auth-Method"] = str(getattr(request, "_api_auth_method", "None"))
response["X-ArchiveBox-Auth-Expires"] = token_expiry
response["X-ArchiveBox-Auth-Token-Id"] = str(api_token.id) if api_token else "None"
response["X-ArchiveBox-Auth-User-Id"] = str(request.user.pk) if getattr(request.user, "pk", None) else "None"
response["X-ArchiveBox-Auth-User-Username"] = request.user.username if isinstance(request.user, User) else "None"
# import ipdb; ipdb.set_trace()
# print('RESPONDING NOW', response)
@@ -84,7 +84,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):
api = NinjaAPIWithIOCapture(
title='ArchiveBox API',
title="ArchiveBox API",
description=html_description,
version=VERSION,
auth=API_AUTH_METHODS,
@@ -103,15 +103,15 @@ def generic_exception_handler(request, err):
if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
status = 404
print(''.join(format_exception(err)))
print("".join(format_exception(err)))
return api.create_response(
request,
{
"succeeded": False,
"message": f'{err.__class__.__name__}: {err}',
"message": f"{err.__class__.__name__}: {err}",
"errors": [
''.join(format_exception(err)),
"".join(format_exception(err)),
# or send simpler parent-only traceback:
# *([str(err.__context__)] if getattr(err, '__context__', None) else []),
],
@@ -120,7 +120,6 @@ def generic_exception_handler(request, err):
)
# import orjson
# from ninja.renderers import BaseRenderer
# class ORJSONRenderer(BaseRenderer):

View File

@@ -1,6 +1,5 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from typing import Optional
from django.http import HttpRequest
from ninja import Router, Schema
@@ -8,16 +7,21 @@ from ninja import Router, Schema
from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token
router = Router(tags=['Authentication'], auth=None)
router = Router(tags=["Authentication"], auth=None)
# NOTE(review): the Optional[str] and `str | None` pairs declare the same two
# fields; they look like the before/after sides of a typing-syntax change.
class PasswordAuthSchema(Schema):
"""Schema for a /get_api_token request"""
username: Optional[str] = None
password: Optional[str] = None
# Both fields are optional and default to None, so a request body without
# credentials still validates.
username: str | None = None
password: str | None = None
@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)') # auth=None because they are not authed yet
@router.post(
"/get_api_token",
auth=None,
summary="Generate an API token for a given username & password (or currently logged-in user)",
) # auth=None because they are not authed yet
def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
user = auth_using_password(
username=auth_data.username,
@@ -35,17 +39,21 @@ def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
"token": api_token.token,
"expires": api_token.expires.isoformat() if api_token.expires else None,
}
return {"success": False, "errors": ["Invalid credentials"]}
return {"success": False, "errors": ["Invalid credentials"]}
class TokenAuthSchema(Schema):
"""Schema for a /check_api_token request"""
# Required field (no default): the API token string to validate.
token: str
@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired') # auth=None because they are not authed yet
@router.post(
"/check_api_token",
auth=None,
summary="Validate an API token to make sure its valid and non-expired",
) # auth=None because they are not authed yet
def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
user = auth_using_token(
token=token_data.token,
@@ -53,5 +61,5 @@ def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
)
if user:
return {"success": True, "user_id": str(user.pk)}
return {"success": False, "user_id": None}

View File

@@ -1,8 +1,8 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
import json
from io import StringIO
from typing import List, Dict, Any, Optional
from typing import Any
from enum import Enum
from django.http import HttpRequest
@@ -16,44 +16,47 @@ from archivebox.config.common import ARCHIVING_CONFIG
# from .auth import API_AUTH_METHODS
# router for API that exposes archivebox cli subcommands as REST endpoints
router = Router(tags=['ArchiveBox CLI Sub-Commands'])
router = Router(tags=["ArchiveBox CLI Sub-Commands"])
# Schemas
JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
JSONType = list[Any] | dict[str, Any] | bool | int | str | None
# Response envelope shared by the CLI endpoints; it is passed as `response=` to
# the /add, /update and /schedule routes.
# NOTE(review): `errors` and `result_format` each appear twice (List vs list,
# single vs double quotes); they look like the before/after sides of a
# formatting change.
class CLICommandResponseSchema(Schema):
success: bool
errors: List[str]
errors: list[str]
# Command output; any JSON-compatible value as defined by JSONType.
result: JSONType
result_format: str = 'str'
# Defaults to "str"; handlers that return structured data set it explicitly.
result_format: str = "str"
# Captured output streams, returned as strings.
stdout: str
stderr: str
# Allowed values for the `filter_type` field of the command schemas. This is a
# str-based Enum, so each member compares equal to its plain string value.
# NOTE(review): every member appears twice (single- and double-quoted) with the
# same value; they look like the before/after sides of a quote-style change.
class FilterTypeChoices(str, Enum):
exact = 'exact'
substring = 'substring'
regex = 'regex'
domain = 'domain'
tag = 'tag'
timestamp = 'timestamp'
exact = "exact"
substring = "substring"
regex = "regex"
domain = "domain"
tag = "tag"
timestamp = "timestamp"
# Allowed values for the `status` field of ListCommandSchema. This is a
# str-based Enum, so each member compares equal to its plain string value.
# NOTE(review): every member appears twice (single- and double-quoted) with the
# same value; they look like the before/after sides of a quote-style change.
class StatusChoices(str, Enum):
indexed = 'indexed'
archived = 'archived'
unarchived = 'unarchived'
present = 'present'
valid = 'valid'
invalid = 'invalid'
duplicate = 'duplicate'
orphaned = 'orphaned'
corrupted = 'corrupted'
unrecognized = 'unrecognized'
indexed = "indexed"
archived = "archived"
unarchived = "unarchived"
present = "present"
valid = "valid"
invalid = "invalid"
duplicate = "duplicate"
orphaned = "orphaned"
corrupted = "corrupted"
unrecognized = "unrecognized"
class AddCommandSchema(Schema):
urls: List[str]
urls: list[str]
tag: str = ""
depth: int = 0
parser: str = "auto"
@@ -62,53 +65,54 @@ class AddCommandSchema(Schema):
overwrite: bool = False
index_only: bool = False
# Request body accepted by the /update endpoint (the `args` parameter of
# cli_update).
# NOTE(review): the first five fields appear twice (Optional[...] vs `X | None`,
# single vs double quotes) with identical defaults; they look like the
# before/after sides of a typing-syntax change.
class UpdateCommandSchema(Schema):
resume: Optional[str] = None
after: Optional[float] = 0
before: Optional[float] = 999999999999999
filter_type: Optional[str] = FilterTypeChoices.substring
filter_patterns: Optional[List[str]] = ['https://example.com']
resume: str | None = None
# `after` / `before` default to 0 and a very large number, i.e. an effectively
# unbounded range. NOTE(review): presumably timestamps - confirm units.
after: float | None = 0
before: float | None = 999999999999999
filter_type: str | None = FilterTypeChoices.substring
# NOTE(review): the default pattern is a concrete example URL, presumably a
# placeholder for the generated API docs - confirm it is not meant to be sent
# as-is.
filter_patterns: list[str] | None = ["https://example.com"]
batch_size: int = 100
continuous: bool = False
# Request body accepted by the /schedule endpoint (the `args` parameter of
# cli_schedule).
# NOTE(review): `import_path`, `every` and `tag` appear twice (Optional[...] vs
# `X | None`, single vs double quotes) with identical defaults; they look like
# the before/after sides of a formatting change.
class ScheduleCommandSchema(Schema):
import_path: Optional[str] = None
import_path: str | None = None
add: bool = False
show: bool = False
foreground: bool = False
run_all: bool = False
quiet: bool = False
every: Optional[str] = None
tag: str = ''
every: str | None = None
tag: str = ""
depth: int = 0
overwrite: bool = False
# This default is computed once, when the class body is evaluated, as the
# inverse of ARCHIVING_CONFIG.ONLY_NEW. Config changes made later in the same
# process do not alter it.
update: bool = not ARCHIVING_CONFIG.ONLY_NEW
clear: bool = False
# Argument schema for the list command. The endpoint that consumes it is not
# visible in this excerpt.
# NOTE(review): several fields appear twice (Optional[...] vs `X | None`, single
# vs double quotes) with identical defaults; they look like the before/after
# sides of a formatting change.
class ListCommandSchema(Schema):
filter_patterns: Optional[List[str]] = ['https://example.com']
# NOTE(review): the default pattern is a concrete example URL, presumably a
# placeholder for the generated API docs - confirm.
filter_patterns: list[str] | None = ["https://example.com"]
filter_type: str = FilterTypeChoices.substring
# Restricted to the StatusChoices members; defaults to `indexed`.
status: StatusChoices = StatusChoices.indexed
after: Optional[float] = 0
before: Optional[float] = 999999999999999
sort: str = 'bookmarked_at'
# `after` / `before` default to 0 and a very large number, i.e. an effectively
# unbounded range. NOTE(review): presumably timestamps - confirm units.
after: float | None = 0
before: float | None = 999999999999999
sort: str = "bookmarked_at"
# Output format switches: JSON is on by default, HTML is off, and `as_csv`
# carries a comma-separated column list.
as_json: bool = True
as_html: bool = False
as_csv: str | None = 'timestamp,url'
as_csv: str | None = "timestamp,url"
with_headers: bool = False
# Argument schema for the remove command. The endpoint that consumes it is not
# visible in this excerpt.
# NOTE(review): `after`, `before` and `filter_patterns` appear twice
# (Optional[...] vs `X | None`, single vs double quotes) with identical
# defaults; they look like the before/after sides of a formatting change.
class RemoveCommandSchema(Schema):
# `delete` is on by default.
delete: bool = True
after: Optional[float] = 0
before: Optional[float] = 999999999999999
# `after` / `before` default to 0 and a very large number, i.e. an effectively
# unbounded range. NOTE(review): presumably timestamps - confirm units.
after: float | None = 0
before: float | None = 999999999999999
# Unlike the update/list schemas, matching defaults to `exact` here.
filter_type: str = FilterTypeChoices.exact
filter_patterns: Optional[List[str]] = ['https://example.com']
# NOTE(review): the default pattern is a concrete example URL, presumably a
# placeholder for the generated API docs - confirm.
filter_patterns: list[str] | None = ["https://example.com"]
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]")
def cli_add(request: HttpRequest, args: AddCommandSchema):
from archivebox.cli.archivebox_add import add
@@ -125,30 +129,30 @@ def cli_add(request: HttpRequest, args: AddCommandSchema):
created_by_id=request.user.pk,
)
snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list('id', flat=True)]
snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list("id", flat=True)]
result_payload = {
"crawl_id": str(crawl.id),
"num_snapshots": len(snapshot_ids),
"snapshot_ids": snapshot_ids,
"queued_urls": args.urls,
}
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result_payload,
"result_format": "json",
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}
@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]")
def cli_update(request: HttpRequest, args: UpdateCommandSchema):
from archivebox.cli.archivebox_update import update
result = update(
filter_patterns=args.filter_patterns or [],
filter_type=args.filter_type or FilterTypeChoices.substring,
@@ -158,21 +162,21 @@ def cli_update(request: HttpRequest, args: UpdateCommandSchema):
batch_size=args.batch_size,
continuous=args.continuous,
)
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result,
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}
@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]")
def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
from archivebox.cli.archivebox_schedule import schedule
result = schedule(
import_path=args.import_path,
add=args.add,
@@ -188,23 +192,22 @@ def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
update=args.update,
)
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result,
"result_format": "json",
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}
@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]')
@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]")
def cli_search(request: HttpRequest, args: ListCommandSchema):
from archivebox.cli.archivebox_search import search
result = search(
filter_patterns=args.filter_patterns,
filter_type=args.filter_type,
@@ -218,7 +221,7 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
with_headers=args.with_headers,
)
result_format = 'txt'
result_format = "txt"
if args.as_json:
result_format = "json"
result = json.loads(result)
@@ -227,20 +230,19 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
elif args.as_csv:
result_format = "csv"
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result,
"result_format": result_format,
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]")
def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
from archivebox.cli.archivebox_remove import remove
from archivebox.cli.archivebox_search import get_snapshots
@@ -253,10 +255,10 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
after=args.after,
before=args.before,
)
removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list('id', flat=True)]
removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list("id", flat=True)]
remove(
yes=True, # no way to interactively ask for confirmation via API, so we force yes
yes=True, # no way to interactively ask for confirmation via API, so we force yes
delete=args.delete,
snapshots=snapshots_to_remove,
before=args.before,
@@ -270,14 +272,13 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
"removed_snapshot_ids": removed_snapshot_ids,
"remaining_snapshots": Snapshot.objects.count(),
}
stdout = getattr(request, 'stdout', None)
stderr = getattr(request, 'stderr', None)
stdout = getattr(request, "stdout", None)
stderr = getattr(request, "stderr", None)
return {
"success": True,
"errors": [],
"result": result,
"result_format": "json",
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
"stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
"stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
}

View File

@@ -1,11 +1,13 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
import math
from collections import defaultdict
from uuid import UUID
from typing import List, Optional, Union, Any, Annotated
from typing import Union, Any, Annotated
from datetime import datetime
from django.db.models import Model, Q
from django.db.models import Model, Q, Sum
from django.db.models.functions import Coalesce
from django.conf import settings
from django.http import HttpRequest, HttpResponse
from django.core.exceptions import ValidationError
@@ -39,7 +41,7 @@ from archivebox.crawls.models import Crawl
from archivebox.api.v1_crawls import CrawlSchema
router = Router(tags=['Core Models'])
router = Router(tags=["Core Models"])
class CustomPagination(PaginationBase):
@@ -49,13 +51,14 @@ class CustomPagination(PaginationBase):
page: int = 0
class Output(PaginationBase.Output):
count: int
total_items: int
total_pages: int
page: int
limit: int
offset: int
num_items: int
items: List[Any]
items: list[Any]
def paginate_queryset(self, queryset, pagination: Input, request: HttpRequest, **params):
limit = min(pagination.limit, 500)
@@ -65,27 +68,29 @@ class CustomPagination(PaginationBase):
current_page = math.ceil(offset / (limit + 1))
items = queryset[offset : offset + limit]
return {
'total_items': total,
'total_pages': total_pages,
'page': current_page,
'limit': limit,
'offset': offset,
'num_items': len(items),
'items': items,
"count": total,
"total_items": total,
"total_pages": total_pages,
"page": current_page,
"limit": limit,
"offset": offset,
"num_items": len(items),
"items": items,
}
### ArchiveResult #########################################################################
class MinimalArchiveResultSchema(Schema):
TYPE: str = 'core.models.ArchiveResult'
TYPE: str = "core.models.ArchiveResult"
id: UUID
created_at: datetime | None
modified_at: datetime | None
created_by_id: str
created_by_username: str
status: str
retry_at: datetime | None
retry_at: datetime | None = None
plugin: str
hook_name: str
process_id: UUID | None
@@ -93,8 +98,8 @@ class MinimalArchiveResultSchema(Schema):
cmd: list[str] | None
pwd: str | None
output_str: str
output_json: dict | None
output_files: dict | None
output_json: dict[str, Any] | None
output_files: dict[str, dict[str, Any]] | None
output_size: int
output_mimetypes: str
start_ts: datetime | None
@@ -108,13 +113,34 @@ class MinimalArchiveResultSchema(Schema):
def resolve_created_by_username(obj) -> str:
    """Return the username of the user referenced by ``obj.created_by``."""
    creator = obj.created_by
    return creator.username
@staticmethod
def resolve_output_files(obj):
return obj.output_file_map()
@staticmethod
def resolve_output_mimetypes(obj) -> str:
mime_sizes: dict[str, int] = defaultdict(int)
for metadata in obj.output_file_map().values():
if not isinstance(metadata, dict):
continue
mimetype = str(metadata.get("mimetype") or "").strip()
try:
size = max(int(metadata.get("size") or 0), 0)
except (TypeError, ValueError):
size = 0
if mimetype and size:
mime_sizes[mimetype] += size
if mime_sizes:
return ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
return obj.output_mimetypes or ""
class ArchiveResultSchema(MinimalArchiveResultSchema):
TYPE: str = 'core.models.ArchiveResult'
TYPE: str = "core.models.ArchiveResult"
snapshot_id: UUID
snapshot_timestamp: str
snapshot_url: str
snapshot_tags: List[str]
snapshot_tags: list[str]
@staticmethod
def resolve_snapshot_timestamp(obj):
@@ -134,25 +160,39 @@ class ArchiveResultSchema(MinimalArchiveResultSchema):
class ArchiveResultFilterSchema(FilterSchema):
    # Query filters accepted by GET /archiveresults (see get_archiveresults).
    # Each field maps onto the ORM lookup(s) named in its FilterLookup; a field
    # left as None is not supplied by the client.
    id: Annotated[str | None, FilterLookup(["id__startswith", "snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
    search: Annotated[
        str | None,
        FilterLookup(
            [
                "snapshot__url__icontains",
                "snapshot__title__icontains",
                "snapshot__tags__name__icontains",
                "plugin",
                "output_str__icontains",
                "id__startswith",
                "snapshot__id__startswith",
                "snapshot__timestamp__startswith",
            ],
        ),
    ] = None
    snapshot_id: Annotated[str | None, FilterLookup(["snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
    snapshot_url: Annotated[str | None, FilterLookup("snapshot__url__icontains")] = None
    snapshot_tag: Annotated[str | None, FilterLookup("snapshot__tags__name__icontains")] = None
    status: Annotated[str | None, FilterLookup("status")] = None
    output_str: Annotated[str | None, FilterLookup("output_str__icontains")] = None
    plugin: Annotated[str | None, FilterLookup("plugin__icontains")] = None
    hook_name: Annotated[str | None, FilterLookup("hook_name__icontains")] = None
    process_id: Annotated[str | None, FilterLookup("process__id__startswith")] = None
    cmd: Annotated[str | None, FilterLookup("cmd__0__icontains")] = None
    pwd: Annotated[str | None, FilterLookup("pwd__icontains")] = None
    cmd_version: Annotated[str | None, FilterLookup("cmd_version")] = None
    created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
    created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
    created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
@router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
@router.get("/archiveresults", response=list[ArchiveResultSchema], url_name="get_archiveresult")
@paginate(CustomPagination)
def get_archiveresults(request: HttpRequest, filters: Query[ArchiveResultFilterSchema]):
"""List all ArchiveResult entries matching these filters."""
@@ -167,8 +207,9 @@ def get_archiveresult(request: HttpRequest, archiveresult_id: str):
### Snapshot #########################################################################
class SnapshotSchema(Schema):
TYPE: str = 'core.models.Snapshot'
TYPE: str = "core.models.Snapshot"
id: UUID
created_by_id: str
created_by_username: str
@@ -177,14 +218,16 @@ class SnapshotSchema(Schema):
status: str
retry_at: datetime | None
bookmarked_at: datetime
downloaded_at: Optional[datetime]
downloaded_at: datetime | None
url: str
tags: List[str]
title: Optional[str]
tags: list[str]
title: str | None
timestamp: str
archive_path: str
archive_size: int
output_size: int
num_archiveresults: int
archiveresults: List[MinimalArchiveResultSchema]
archiveresults: list[MinimalArchiveResultSchema]
@staticmethod
def resolve_created_by_id(obj):
@@ -198,13 +241,21 @@ class SnapshotSchema(Schema):
def resolve_tags(obj):
    """Return the names of every tag in ``obj.tags.all()``, sorted ascending."""
    names = [tag.name for tag in obj.tags.all()]
    names.sort()
    return names
@staticmethod
def resolve_archive_size(obj):
return int(getattr(obj, "output_size_sum", obj.archive_size) or 0)
@staticmethod
def resolve_output_size(obj):
    """Return the same value as ``archive_size`` by delegating to its resolver."""
    return SnapshotSchema.resolve_archive_size(obj)
@staticmethod
def resolve_num_archiveresults(obj, context):
return obj.archiveresult_set.all().distinct().count()
@staticmethod
def resolve_archiveresults(obj, context):
if bool(getattr(context['request'], 'with_archiveresults', False)):
if bool(getattr(context["request"], "with_archiveresults", False)):
return obj.archiveresult_set.all().distinct()
return ArchiveResult.objects.none()
@@ -212,16 +263,16 @@ class SnapshotSchema(Schema):
class SnapshotUpdateSchema(Schema):
    # Partial-update payload: every field defaults to None, and patch_snapshot
    # reads it with exclude_unset=True so only fields the client sent are applied.
    status: str | None = None
    retry_at: datetime | None = None
    tags: list[str] | None = None
class SnapshotCreateSchema(Schema):
    # Request body of POST /snapshots (see create_snapshot).
    url: str
    # Optional reference to an existing Crawl; create_snapshot creates a new
    # Crawl when this is empty.
    crawl_id: str | None = None
    # create_snapshot rejects values outside 0..4.
    depth: int = 0
    title: str | None = None
    tags: list[str] | None = None
    # Validated against Snapshot.StatusChoices.values by create_snapshot.
    status: str | None = None
class SnapshotDeleteResponseSchema(Schema):
@@ -231,77 +282,82 @@ class SnapshotDeleteResponseSchema(Schema):
deleted_count: int
def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]:
def normalize_tag_list(tags: list[str] | None = None) -> list[str]:
return [tag.strip() for tag in (tags or []) if tag and tag.strip()]
class SnapshotFilterSchema(FilterSchema):
    # Query filters accepted by GET /snapshots (see get_snapshots).
    # Each field maps onto the ORM lookup(s) named in its FilterLookup; a field
    # left as None is not supplied by the client.
    id: Annotated[str | None, FilterLookup(["id__icontains", "timestamp__startswith"])] = None
    created_by_id: Annotated[str | None, FilterLookup("crawl__created_by_id")] = None
    created_by_username: Annotated[str | None, FilterLookup("crawl__created_by__username__icontains")] = None
    created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
    created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
    created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
    modified_at: Annotated[datetime | None, FilterLookup("modified_at")] = None
    modified_at__gte: Annotated[datetime | None, FilterLookup("modified_at__gte")] = None
    modified_at__lt: Annotated[datetime | None, FilterLookup("modified_at__lt")] = None
    search: Annotated[
        str | None,
        FilterLookup(["url__icontains", "title__icontains", "tags__name__icontains", "id__icontains", "timestamp__startswith"]),
    ] = None
    url: Annotated[str | None, FilterLookup("url")] = None
    tag: Annotated[str | None, FilterLookup("tags__name")] = None
    title: Annotated[str | None, FilterLookup("title__icontains")] = None
    timestamp: Annotated[str | None, FilterLookup("timestamp__startswith")] = None
    bookmarked_at__gte: Annotated[datetime | None, FilterLookup("bookmarked_at__gte")] = None
    bookmarked_at__lt: Annotated[datetime | None, FilterLookup("bookmarked_at__lt")] = None
@router.get("/snapshots", response=list[SnapshotSchema], url_name="get_snapshots")
@paginate(CustomPagination)
def get_snapshots(request: HttpRequest, filters: Query[SnapshotFilterSchema], with_archiveresults: bool = False):
    """List all Snapshot entries matching these filters."""
    from django.db.models import OuterRef, Subquery

    # SnapshotSchema.resolve_archiveresults reads this flag back off the request.
    setattr(request, "with_archiveresults", with_archiveresults)

    # Total each snapshot's ArchiveResult sizes in a correlated subquery rather
    # than with Sum() over a join. The filters below can join the tags relation
    # (`tag`, `search`), and a joined Sum() would then count every ArchiveResult
    # row once per matching tag row, inflating output_size_sum.
    size_per_snapshot = (
        ArchiveResult.objects.filter(snapshot=OuterRef("pk"))
        .order_by()  # drop any default ordering so it cannot leak into the GROUP BY
        .values("snapshot")
        .annotate(total=Sum("output_size"))
        .values("total")
    )
    queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Subquery(size_per_snapshot), 0))
    return filters.filter(queryset).distinct()
@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
def get_snapshot(request: HttpRequest, snapshot_id: str, with_archiveresults: bool = True):
    """Get a specific Snapshot by id."""
    # SnapshotSchema.resolve_archiveresults reads this flag back off the request.
    setattr(request, "with_archiveresults", with_archiveresults)

    # Annotate the summed ArchiveResult size so the schema's size resolvers can
    # read it from output_size_sum.
    queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))

    prefix_match = Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id)
    try:
        return queryset.get(prefix_match)
    except Snapshot.DoesNotExist:
        # Fall back to a substring match on the id. Only DoesNotExist is handled
        # here; MultipleObjectsReturned (ambiguous prefix) and a second
        # DoesNotExist propagate to the caller.
        return queryset.get(Q(id__icontains=snapshot_id))
@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
tags = normalize_tag_list(data.tags)
if data.status is not None and data.status not in Snapshot.StatusChoices.values:
raise HttpError(400, f'Invalid status: {data.status}')
raise HttpError(400, f"Invalid status: {data.status}")
if not data.url.strip():
raise HttpError(400, 'URL is required')
raise HttpError(400, "URL is required")
if data.depth not in (0, 1, 2, 3, 4):
raise HttpError(400, 'depth must be between 0 and 4')
raise HttpError(400, "depth must be between 0 and 4")
if data.crawl_id:
crawl = Crawl.objects.get(id__icontains=data.crawl_id)
crawl_tags = normalize_tag_list(crawl.tags_str.split(','))
crawl_tags = normalize_tag_list(crawl.tags_str.split(","))
tags = tags or crawl_tags
else:
crawl = Crawl.objects.create(
urls=data.url,
max_depth=max(data.depth, 0),
tags_str=','.join(tags),
tags_str=",".join(tags),
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
created_by=request.user if isinstance(request.user, User) else None,
)
snapshot_defaults = {
'depth': data.depth,
'title': data.title,
'timestamp': str(timezone.now().timestamp()),
'status': data.status or Snapshot.StatusChoices.QUEUED,
'retry_at': timezone.now(),
"depth": data.depth,
"title": data.title,
"timestamp": str(timezone.now().timestamp()),
"status": data.status or Snapshot.StatusChoices.QUEUED,
"retry_at": timezone.now(),
}
snapshot, _ = Snapshot.objects.get_or_create(
url=data.url,
@@ -309,17 +365,17 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
defaults=snapshot_defaults,
)
update_fields: List[str] = []
update_fields: list[str] = []
if data.title is not None and snapshot.title != data.title:
snapshot.title = data.title
update_fields.append('title')
update_fields.append("title")
if data.status is not None and snapshot.status != data.status:
if data.status not in Snapshot.StatusChoices.values:
raise HttpError(400, f'Invalid status: {data.status}')
raise HttpError(400, f"Invalid status: {data.status}")
snapshot.status = data.status
update_fields.append('status')
update_fields.append("status")
if update_fields:
update_fields.append('modified_at')
update_fields.append("modified_at")
snapshot.save(update_fields=update_fields)
if tags:
@@ -330,7 +386,7 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
except Exception:
pass
setattr(request, 'with_archiveresults', False)
setattr(request, "with_archiveresults", False)
return snapshot
@@ -343,26 +399,26 @@ def patch_snapshot(request: HttpRequest, snapshot_id: str, data: SnapshotUpdateS
snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))
payload = data.dict(exclude_unset=True)
update_fields = ['modified_at']
tags = payload.pop('tags', None)
update_fields = ["modified_at"]
tags = payload.pop("tags", None)
if 'status' in payload:
if payload['status'] not in Snapshot.StatusChoices.values:
raise HttpError(400, f'Invalid status: {payload["status"]}')
snapshot.status = payload['status']
if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
if "status" in payload:
if payload["status"] not in Snapshot.StatusChoices.values:
raise HttpError(400, f"Invalid status: {payload['status']}")
snapshot.status = payload["status"]
if snapshot.status == Snapshot.StatusChoices.SEALED and "retry_at" not in payload:
snapshot.retry_at = None
update_fields.append('status')
update_fields.append("status")
if 'retry_at' in payload:
snapshot.retry_at = payload['retry_at']
update_fields.append('retry_at')
if "retry_at" in payload:
snapshot.retry_at = payload["retry_at"]
update_fields.append("retry_at")
if tags is not None:
snapshot.save_tags(normalize_tag_list(tags))
snapshot.save(update_fields=update_fields)
setattr(request, 'with_archiveresults', False)
setattr(request, "with_archiveresults", False)
return snapshot
@@ -373,17 +429,18 @@ def delete_snapshot(request: HttpRequest, snapshot_id: str):
crawl_id_str = str(snapshot.crawl.pk)
deleted_count, _ = snapshot.delete()
return {
'success': True,
'snapshot_id': snapshot_id_str,
'crawl_id': crawl_id_str,
'deleted_count': deleted_count,
"success": True,
"snapshot_id": snapshot_id_str,
"crawl_id": crawl_id_str,
"deleted_count": deleted_count,
}
### Tag #########################################################################
class TagSchema(Schema):
TYPE: str = 'core.models.Tag'
TYPE: str = "core.models.Tag"
id: int
modified_at: datetime
created_at: datetime
@@ -392,7 +449,7 @@ class TagSchema(Schema):
name: str
slug: str
num_snapshots: int
snapshots: List[SnapshotSchema]
snapshots: list[SnapshotSchema]
@staticmethod
def resolve_created_by_id(obj):
@@ -402,7 +459,7 @@ class TagSchema(Schema):
def resolve_created_by_username(obj):
    """Return the username of the user whose id is ``obj.created_by_id``.

    Falls back to ``str(user)`` when the user object has no string ``username``.
    The lookup raises the user model's ``DoesNotExist`` if no such user exists.
    """
    user = get_user_model().objects.get(id=obj.created_by_id)
    username = getattr(user, "username", None)
    if isinstance(username, str):
        return username
    return str(user)
@staticmethod
@@ -411,58 +468,67 @@ class TagSchema(Schema):
@staticmethod
def resolve_snapshots(obj, context):
if bool(getattr(context['request'], 'with_snapshots', False)):
if bool(getattr(context["request"], "with_snapshots", False)):
return obj.snapshot_set.all().distinct()
return Snapshot.objects.none()
@router.get("/tags", response=list[TagSchema], url_name="get_tags")
@paginate(CustomPagination)
def get_tags(request: HttpRequest):
    # List view: switch off the nested snapshots/archiveresults that the schema
    # resolvers would otherwise expand for every tag on the page.
    setattr(request, "with_snapshots", False)
    setattr(request, "with_archiveresults", False)
    return get_matching_tags()
@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True):
    # Flags read back by the TagSchema / SnapshotSchema resolvers.
    setattr(request, "with_snapshots", with_snapshots)
    setattr(request, "with_archiveresults", False)
    try:
        return get_tag_by_ref(tag_id)
    except (Tag.DoesNotExist, ValidationError) as err:
        # Chain the cause, as the other tag endpoints do, so the original lookup
        # failure stays visible in tracebacks.
        raise HttpError(404, "Tag not found") from err
@router.get(
    "/any/{id}",
    response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema],
    url_name="get_any",
    summary="Get any object by its ID",
)
def get_any(request: HttpRequest, id: str):
    """Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
    setattr(request, "with_snapshots", False)
    setattr(request, "with_archiveresults", False)

    def _redirect_to(obj: Model):
        # Send the client to the object's own endpoint, keeping the original
        # query string. A missing QUERY_STRING is treated as empty so it cannot
        # raise KeyError and be swallowed below as a false "not found".
        query_string = request.META.get("QUERY_STRING", "")
        return redirect(f"/api/v1/{obj._meta.app_label}/{obj._meta.model_name}/{obj.pk}?{query_string}")

    # Try each lookup in turn. A failed lookup of one type is expected, so any
    # error just moves on to the next candidate (deliberate best-effort).
    for getter in [get_snapshot, get_archiveresult, get_tag]:
        try:
            response = getter(request, id)
            if isinstance(response, Model):
                return _redirect_to(response)
        except Exception:
            pass

    try:
        from archivebox.api.v1_crawls import get_crawl

        response = get_crawl(request, id)
        if isinstance(response, Model):
            return _redirect_to(response)
    except Exception:
        pass

    raise HttpError(404, "Object with given ID not found")
### Tag Editor API Endpoints #########################################################################
class TagAutocompleteSchema(Schema):
    # One dict per matching tag. tags_autocomplete builds entries with the keys
    # id / name / slug / num_snapshots (presumably the endpoint using this schema).
    tags: list[dict]
class TagCreateSchema(Schema):
@@ -483,7 +549,7 @@ class TagSearchSnapshotSchema(Schema):
favicon_url: str
admin_url: str
archive_url: str
downloaded_at: Optional[str] = None
downloaded_at: str | None = None
class TagSearchCardSchema(Schema):
@@ -497,11 +563,11 @@ class TagSearchCardSchema(Schema):
export_jsonl_url: str
rename_url: str
delete_url: str
snapshots: List[TagSearchSnapshotSchema]
snapshots: list[TagSearchSnapshotSchema]
class TagSearchResponseSchema(Schema):
tags: List[TagSearchCardSchema]
tags: list[TagSearchCardSchema]
sort: str
created_by: str
year: str
@@ -527,8 +593,8 @@ class TagDeleteResponseSchema(Schema):
class TagSnapshotRequestSchema(Schema):
    # Matched against Snapshot id / timestamp prefixes by tags_add_to_snapshot.
    snapshot_id: str
    # The tag may be identified by name or by id; both are optional here.
    # NOTE(review): which one takes precedence is decided by the endpoint -- confirm there.
    tag_name: str | None = None
    tag_id: int | None = None
class TagSnapshotResponseSchema(Schema):
@@ -541,10 +607,10 @@ class TagSnapshotResponseSchema(Schema):
def search_tags(
request: HttpRequest,
q: str = "",
sort: str = 'created_desc',
created_by: str = '',
year: str = '',
has_snapshots: str = 'all',
sort: str = "created_desc",
created_by: str = "",
year: str = "",
has_snapshots: str = "all",
):
"""Return detailed tag cards for admin/live-search UIs."""
normalized_sort = normalize_tag_sort(sort)
@@ -552,7 +618,7 @@ def search_tags(
normalized_year = normalize_created_year_filter(year)
normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots)
return {
'tags': build_tag_cards(
"tags": build_tag_cards(
query=q,
request=request,
sort=normalized_sort,
@@ -560,28 +626,28 @@ def search_tags(
year=normalized_year,
has_snapshots=normalized_has_snapshots,
),
'sort': normalized_sort,
'created_by': normalized_created_by,
'year': normalized_year,
'has_snapshots': normalized_has_snapshots,
"sort": normalized_sort,
"created_by": normalized_created_by,
"year": normalized_year,
"has_snapshots": normalized_has_snapshots,
}
def _public_tag_listing_enabled() -> bool:
    """Return whether public tag listing is enabled.

    ``settings.PUBLIC_SNAPSHOTS_LIST`` wins when it is set to anything other
    than ``None``; otherwise ``settings.PUBLIC_INDEX`` is used, defaulting to
    ``SERVER_CONFIG.PUBLIC_INDEX`` when the setting is absent.
    """
    explicit = getattr(settings, "PUBLIC_SNAPSHOTS_LIST", None)
    if explicit is None:
        return bool(getattr(settings, "PUBLIC_INDEX", SERVER_CONFIG.PUBLIC_INDEX))
    return bool(explicit)
def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
user = getattr(request, 'user', None)
if getattr(user, 'is_authenticated', False):
user = getattr(request, "user", None)
if getattr(user, "is_authenticated", False):
return True
token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key')
auth_header = request.headers.get('Authorization', '')
if not token and auth_header.lower().startswith('bearer '):
token = request.GET.get("api_key") or request.headers.get("X-ArchiveBox-API-Key")
auth_header = request.headers.get("Authorization", "")
if not token and auth_header.lower().startswith("bearer "):
token = auth_header.split(None, 1)[1].strip()
if token and auth_using_token(token=token, request=request):
@@ -594,12 +660,12 @@ def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
def tags_autocomplete(request: HttpRequest, q: str = ""):
    """Return tags matching the query for autocomplete."""
    if not _request_has_tag_autocomplete_access(request):
        raise HttpError(401, "Authentication required")

    # An empty query returns up to 50 tags; a non-empty query returns up to 20.
    limit = 20 if q else 50
    tags = get_matching_tags(q)[:limit]

    return {
        "tags": [
            {
                "id": tag.pk,
                "name": tag.name,
                "slug": tag.slug,
                # Falls back to 0 when the tag object has no num_snapshots attribute.
                "num_snapshots": getattr(tag, "num_snapshots", 0),
            }
            for tag in tags
        ],
    }
@@ -615,10 +681,10 @@ def tags_create(request: HttpRequest, data: TagCreateSchema):
raise HttpError(400, str(err)) from err
return {
'success': True,
'tag_id': tag.pk,
'tag_name': tag.name,
'created': created,
"success": True,
"tag_id": tag.pk,
"tag_name": tag.name,
"created": created,
}
@@ -627,15 +693,15 @@ def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema):
try:
tag = rename_tag_record(get_tag_by_ref(tag_id), data.name)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
raise HttpError(404, "Tag not found") from err
except ValueError as err:
raise HttpError(400, str(err)) from err
return {
'success': True,
'tag_id': tag.pk,
'tag_name': tag.name,
'slug': tag.slug,
"success": True,
"tag_id": tag.pk,
"tag_name": tag.name,
"slug": tag.slug,
}
@@ -644,13 +710,13 @@ def delete_tag(request: HttpRequest, tag_id: int):
try:
tag = get_tag_by_ref(tag_id)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
raise HttpError(404, "Tag not found") from err
deleted_count, _ = delete_tag_record(tag)
return {
'success': True,
'tag_id': int(tag_id),
'deleted_count': deleted_count,
"success": True,
"tag_id": int(tag_id),
"deleted_count": deleted_count,
}
@@ -659,10 +725,10 @@ def tag_urls_export(request: HttpRequest, tag_id: int):
try:
tag = get_tag_by_ref(tag_id)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
raise HttpError(404, "Tag not found") from err
response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8')
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
response = HttpResponse(export_tag_urls(tag), content_type="text/plain; charset=utf-8")
response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
return response
@@ -671,10 +737,10 @@ def tag_snapshots_export(request: HttpRequest, tag_id: int):
try:
tag = get_tag_by_ref(tag_id)
except Tag.DoesNotExist as err:
raise HttpError(404, 'Tag not found') from err
raise HttpError(404, "Tag not found") from err
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8')
response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type="application/x-ndjson; charset=utf-8")
response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
return response
@@ -684,16 +750,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
# Get the snapshot
try:
snapshot = Snapshot.objects.get(
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
)
except Snapshot.DoesNotExist:
raise HttpError(404, 'Snapshot not found')
raise HttpError(404, "Snapshot not found")
except Snapshot.MultipleObjectsReturned:
snapshot = Snapshot.objects.filter(
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
).first()
if snapshot is None:
raise HttpError(404, 'Snapshot not found')
raise HttpError(404, "Snapshot not found")
# Get or create the tag
if data.tag_name:
@@ -708,17 +774,17 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
try:
tag = get_tag_by_ref(data.tag_id)
except Tag.DoesNotExist:
raise HttpError(404, 'Tag not found')
raise HttpError(404, "Tag not found")
else:
raise HttpError(400, 'Either tag_name or tag_id is required')
raise HttpError(400, "Either tag_name or tag_id is required")
# Add the tag to the snapshot
snapshot.tags.add(tag.pk)
return {
'success': True,
'tag_id': tag.pk,
'tag_name': tag.name,
"success": True,
"tag_id": tag.pk,
"tag_name": tag.name,
}
@@ -728,36 +794,36 @@ def tags_remove_from_snapshot(request: HttpRequest, data: TagSnapshotRequestSche
# Get the snapshot
try:
snapshot = Snapshot.objects.get(
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
)
except Snapshot.DoesNotExist:
raise HttpError(404, 'Snapshot not found')
raise HttpError(404, "Snapshot not found")
except Snapshot.MultipleObjectsReturned:
snapshot = Snapshot.objects.filter(
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
).first()
if snapshot is None:
raise HttpError(404, 'Snapshot not found')
raise HttpError(404, "Snapshot not found")
# Get the tag
if data.tag_id:
try:
tag = Tag.objects.get(pk=data.tag_id)
except Tag.DoesNotExist:
raise HttpError(404, 'Tag not found')
raise HttpError(404, "Tag not found")
elif data.tag_name:
try:
tag = Tag.objects.get(name__iexact=data.tag_name.strip())
except Tag.DoesNotExist:
raise HttpError(404, 'Tag not found')
raise HttpError(404, "Tag not found")
else:
raise HttpError(400, 'Either tag_name or tag_id is required')
raise HttpError(400, "Either tag_name or tag_id is required")
# Remove the tag from the snapshot
snapshot.tags.remove(tag.pk)
return {
'success': True,
'tag_id': tag.pk,
'tag_name': tag.name,
"success": True,
"tag_id": tag.pk,
"tag_name": tag.name,
}

View File

@@ -1,7 +1,6 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from uuid import UUID
from typing import List, Optional
from datetime import datetime
from django.http import HttpRequest
from django.utils import timezone
@@ -17,11 +16,11 @@ from archivebox.crawls.models import Crawl
from .auth import API_AUTH_METHODS
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
router = Router(tags=["Crawl Models"], auth=API_AUTH_METHODS)
class CrawlSchema(Schema):
TYPE: str = 'crawls.models.Crawl'
TYPE: str = "crawls.models.Crawl"
id: UUID
@@ -35,6 +34,8 @@ class CrawlSchema(Schema):
urls: str
max_depth: int
max_urls: int
max_size: int
tags_str: str
config: dict
@@ -48,12 +49,12 @@ class CrawlSchema(Schema):
def resolve_created_by_username(obj):
user_model = get_user_model()
user = user_model.objects.get(id=obj.created_by_id)
username = getattr(user, 'username', None)
username = getattr(user, "username", None)
return username if isinstance(username, str) else str(user)
@staticmethod
def resolve_snapshots(obj, context):
if bool(getattr(context['request'], 'with_snapshots', False)):
if bool(getattr(context["request"], "with_snapshots", False)):
return obj.snapshot_set.all().distinct()
return Snapshot.objects.none()
@@ -61,17 +62,19 @@ class CrawlSchema(Schema):
class CrawlUpdateSchema(Schema):
    """Partial-update payload for a Crawl; every field is optional."""

    status: str | None = None
    retry_at: datetime | None = None
    # `tags` and `tags_str` are two spellings of the same input: a list of
    # names or a single comma-separated string.
    tags: list[str] | None = None
    tags_str: str | None = None
class CrawlCreateSchema(Schema):
    """Request payload for creating a Crawl."""

    urls: list[str]
    max_depth: int = 0
    max_urls: int = 0
    max_size: int = 0
    # `tags` and `tags_str` are two spellings of the same input: a list of
    # names or a single comma-separated string.
    tags: list[str] | None = None
    tags_str: str = ""
    label: str = ""
    notes: str = ""
    # NOTE(review): mutable default; this is only safe if Schema copies
    # defaults per instance (pydantic models do) - confirm.
    config: dict = {}
@@ -82,13 +85,13 @@ class CrawlDeleteResponseSchema(Schema):
deleted_snapshots: int
def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]:
def normalize_tag_list(tags: list[str] | None = None, tags_str: str = "") -> list[str]:
if tags is not None:
return [tag.strip() for tag in tags if tag and tag.strip()]
return [tag.strip() for tag in tags_str.split(',') if tag.strip()]
return [tag.strip() for tag in tags_str.split(",") if tag.strip()]
@router.get("/crawls", response=list[CrawlSchema], url_name="get_crawls")
def get_crawls(request: HttpRequest):
    """List all crawls."""
    return Crawl.objects.all().distinct()
@@ -97,15 +100,21 @@ def get_crawls(request: HttpRequest):
def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
urls = [url.strip() for url in data.urls if url and url.strip()]
if not urls:
raise HttpError(400, 'At least one URL is required')
raise HttpError(400, "At least one URL is required")
if data.max_depth not in (0, 1, 2, 3, 4):
raise HttpError(400, 'max_depth must be between 0 and 4')
raise HttpError(400, "max_depth must be between 0 and 4")
if data.max_urls < 0:
raise HttpError(400, "max_urls must be >= 0")
if data.max_size < 0:
raise HttpError(400, "max_size must be >= 0")
tags = normalize_tag_list(data.tags, data.tags_str)
crawl = Crawl.objects.create(
urls='\n'.join(urls),
urls="\n".join(urls),
max_depth=data.max_depth,
tags_str=','.join(tags),
max_urls=data.max_urls,
max_size=data.max_size,
tags_str=",".join(tags),
label=data.label,
notes=data.notes,
config=data.config,
@@ -116,25 +125,26 @@ def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
crawl.create_snapshots_from_urls()
return crawl
@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool = False, with_snapshots: bool = False, with_archiveresults: bool = False):
    """Get a specific Crawl by id.

    Args:
        request: Incoming request; the two ``with_*`` flags are stored on it
            as attributes of the same name.
        crawl_id: Matched against ``Crawl.id`` with a case-insensitive
            substring lookup (``id__icontains``).
        as_rss: When true, return the crawl's snapshots as an XML string
            instead of the Crawl itself.
        with_snapshots: Stored as ``request.with_snapshots``.
        with_archiveresults: Stored as ``request.with_archiveresults``.

    Returns:
        The matching Crawl, or an XML string when ``as_rss`` is true.
    """
    # Imported here so the block stays self-contained.
    from xml.sax.saxutils import escape

    setattr(request, "with_snapshots", with_snapshots)
    setattr(request, "with_archiveresults", with_archiveresults)
    crawl = Crawl.objects.get(id__icontains=crawl_id)

    if crawl and as_rss:
        # Escape every interpolated value: URLs and titles routinely contain
        # "&" or "<", which would otherwise make the document malformed XML.
        items = [
            "<item>"
            f"<url>{escape(str(snapshot.url))}</url>"
            f"<title>{escape(str(snapshot.title))}</title>"
            f"<bookmarked_at>{escape(str(snapshot.bookmarked_at))}</bookmarked_at>"
            f"<tags>{escape(str(snapshot.tags_str))}</tags>"
            "</item>"
            for snapshot in crawl.snapshot_set.all()
        ]
        return '<rss version="2.0"><channel>' + "".join(items) + "</channel></rss>"

    return crawl
@@ -143,29 +153,29 @@ def patch_crawl(request: HttpRequest, crawl_id: str, data: CrawlUpdateSchema):
"""Update a crawl (e.g., set status=sealed to cancel queued work)."""
crawl = Crawl.objects.get(id__icontains=crawl_id)
payload = data.dict(exclude_unset=True)
update_fields = ['modified_at']
update_fields = ["modified_at"]
tags = payload.pop('tags', None)
tags_str = payload.pop('tags_str', None)
tags = payload.pop("tags", None)
tags_str = payload.pop("tags_str", None)
if tags is not None or tags_str is not None:
crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or ''))
update_fields.append('tags_str')
crawl.tags_str = ",".join(normalize_tag_list(tags, tags_str or ""))
update_fields.append("tags_str")
if 'status' in payload:
if payload['status'] not in Crawl.StatusChoices.values:
raise HttpError(400, f'Invalid status: {payload["status"]}')
crawl.status = payload['status']
if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
if "status" in payload:
if payload["status"] not in Crawl.StatusChoices.values:
raise HttpError(400, f"Invalid status: {payload['status']}")
crawl.status = payload["status"]
if crawl.status == Crawl.StatusChoices.SEALED and "retry_at" not in payload:
crawl.retry_at = None
update_fields.append('status')
update_fields.append("status")
if 'retry_at' in payload:
crawl.retry_at = payload['retry_at']
update_fields.append('retry_at')
if "retry_at" in payload:
crawl.retry_at = payload["retry_at"]
update_fields.append("retry_at")
crawl.save(update_fields=update_fields)
if payload.get('status') == Crawl.StatusChoices.SEALED:
if payload.get("status") == Crawl.StatusChoices.SEALED:
Snapshot.objects.filter(
crawl=crawl,
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
@@ -184,8 +194,8 @@ def delete_crawl(request: HttpRequest, crawl_id: str):
snapshot_count = crawl.snapshot_set.count()
deleted_count, _ = crawl.delete()
return {
'success': True,
'crawl_id': crawl_id_str,
'deleted_count': deleted_count,
'deleted_snapshots': snapshot_count,
"success": True,
"crawl_id": crawl_id_str,
"deleted_count": deleted_count,
"deleted_snapshots": snapshot_count,
}

View File

@@ -1,7 +1,7 @@
__package__ = 'archivebox.api'
__package__ = "archivebox.api"
from uuid import UUID
from typing import Annotated, List, Optional
from typing import Annotated
from datetime import datetime
from django.http import HttpRequest
@@ -12,16 +12,18 @@ from ninja.pagination import paginate
from archivebox.api.v1_core import CustomPagination
router = Router(tags=['Machine and Dependencies'])
router = Router(tags=["Machine and Dependencies"])
# ============================================================================
# Machine Schemas
# ============================================================================
class MachineSchema(Schema):
"""Schema for Machine model."""
TYPE: str = 'machine.Machine'
TYPE: str = "machine.Machine"
id: UUID
created_at: datetime
modified_at: datetime
@@ -43,22 +45,24 @@ class MachineSchema(Schema):
class MachineFilterSchema(FilterSchema):
    """Optional query filters for the machines listing.

    Every field defaults to ``None`` and names its ORM lookup in its
    ``FilterLookup`` annotation.
    """

    id: Annotated[str | None, FilterLookup("id__startswith")] = None
    hostname: Annotated[str | None, FilterLookup("hostname__icontains")] = None
    os_platform: Annotated[str | None, FilterLookup("os_platform__icontains")] = None
    os_arch: Annotated[str | None, FilterLookup("os_arch")] = None
    hw_in_docker: Annotated[bool | None, FilterLookup("hw_in_docker")] = None
    hw_in_vm: Annotated[bool | None, FilterLookup("hw_in_vm")] = None
    bin_providers: Annotated[str | None, FilterLookup("bin_providers__icontains")] = None
# ============================================================================
# Binary Schemas
# ============================================================================
class BinarySchema(Schema):
"""Schema for Binary model."""
TYPE: str = 'machine.Binary'
TYPE: str = "machine.Binary"
id: UUID
created_at: datetime
modified_at: datetime
@@ -85,23 +89,25 @@ class BinarySchema(Schema):
class BinaryFilterSchema(FilterSchema):
    """Optional query filters for the binaries listing.

    Every field defaults to ``None`` and names its ORM lookup in its
    ``FilterLookup`` annotation.
    """

    id: Annotated[str | None, FilterLookup("id__startswith")] = None
    name: Annotated[str | None, FilterLookup("name__icontains")] = None
    binprovider: Annotated[str | None, FilterLookup("binprovider")] = None
    status: Annotated[str | None, FilterLookup("status")] = None
    machine_id: Annotated[str | None, FilterLookup("machine_id__startswith")] = None
    version: Annotated[str | None, FilterLookup("version__icontains")] = None
# ============================================================================
# Machine Endpoints
# ============================================================================
@router.get("/machines", response=list[MachineSchema], url_name="get_machines")
@paginate(CustomPagination)
def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
    """List all machines.

    Applies the supplied query filters to the full Machine queryset and
    de-duplicates the result; pagination is added by the decorator.
    """
    # Imported at call time rather than at module import
    # (presumably to avoid an import cycle - confirm).
    from archivebox.machine.models import Machine

    return filters.filter(Machine.objects.all()).distinct()
@@ -109,6 +115,7 @@ def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
def get_current_machine(request: HttpRequest):
    """Get the current machine, as returned by ``Machine.current()``."""
    # Imported at call time rather than at module import
    # (presumably to avoid an import cycle - confirm).
    from archivebox.machine.models import Machine

    return Machine.current()
@@ -117,6 +124,7 @@ def get_machine(request: HttpRequest, machine_id: str):
"""Get a specific machine by ID."""
from archivebox.machine.models import Machine
from django.db.models import Q
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
@@ -127,23 +135,27 @@ def get_machine(request: HttpRequest, machine_id: str):
# Binary Endpoints
# ============================================================================
@router.get("/binaries", response=list[BinarySchema], url_name="get_binaries")
@paginate(CustomPagination)
def get_binaries(request: HttpRequest, filters: Query[BinaryFilterSchema]):
    """List all binaries.

    Applies the supplied query filters to the Binary queryset and
    de-duplicates the result; pagination is added by the decorator.
    """
    # Imported at call time rather than at module import
    # (presumably to avoid an import cycle - confirm).
    from archivebox.machine.models import Binary

    # select_related joins each binary's machine in the same query.
    return filters.filter(Binary.objects.all().select_related("machine")).distinct()
@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
def get_binary(request: HttpRequest, binary_id: str):
    """Get a specific binary by ID.

    ``binary_id`` is matched as a prefix of the stored id (``id__startswith``),
    so an abbreviated id is accepted.
    """
    # Imported at call time rather than at module import
    # (presumably to avoid an import cycle - confirm).
    from archivebox.machine.models import Binary

    return Binary.objects.select_related("machine").get(id__startswith=binary_id)
@router.get("/binary/by-name/{name}", response=list[BinarySchema], url_name="get_binaries_by_name")
def get_binaries_by_name(request: HttpRequest, name: str):
    """Get all binaries with the given name.

    The name comparison is case-insensitive (``name__iexact``). The queryset
    is evaluated into a list before returning.
    """
    # Imported at call time rather than at module import
    # (presumably to avoid an import cycle - confirm).
    from archivebox.machine.models import Binary

    return list(Binary.objects.filter(name__iexact=name).select_related("machine"))