wip

2026-04-06 07:47:53 +10:00 · 2026-03-23 03:58:32 -07:00
parent 268856bcfb
commit b749b26c5d
286 changed files with 21704 additions and 13480 deletions
--- a/archivebox/api/__init__.py
+++ b/archivebox/api/__init__.py
@@ -1 +1 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"
--- a/archivebox/api/admin.py
+++ b/archivebox/api/admin.py
@@ -1,4 +1,4 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

 from django.contrib import admin
 from django.http import HttpRequest
@@ -11,57 +11,81 @@ from archivebox.api.models import APIToken


 class APITokenAdmin(BaseModelAdmin):
-    list_display = ('created_at', 'id', 'created_by', 'token_redacted', 'expires')
-    sort_fields = ('id', 'created_at', 'created_by', 'expires')
-    readonly_fields = ('created_at', 'modified_at')
-    search_fields = ('id', 'created_by__username', 'token')
+    list_display = ("created_at", "id", "created_by", "token_redacted", "expires")
+    sort_fields = ("id", "created_at", "created_by", "expires")
+    readonly_fields = ("created_at", "modified_at")
+    search_fields = ("id", "created_by__username", "token")

    fieldsets = (
-        ('Token', {
-            'fields': ('token', 'expires'),
-            'classes': ('card',),
-        }),
-        ('Owner', {
-            'fields': ('created_by',),
-            'classes': ('card',),
-        }),
-        ('Timestamps', {
-            'fields': ('created_at', 'modified_at'),
-            'classes': ('card',),
-        }),
+        (
+            "Token",
+            {
+                "fields": ("token", "expires"),
+                "classes": ("card",),
+            },
+        ),
+        (
+            "Owner",
+            {
+                "fields": ("created_by",),
+                "classes": ("card",),
+            },
+        ),
+        (
+            "Timestamps",
+            {
+                "fields": ("created_at", "modified_at"),
+                "classes": ("card",),
+            },
+        ),
    )

-    list_filter = ('created_by',)
-    ordering = ['-created_at']
+    list_filter = ("created_by",)
+    ordering = ["-created_at"]
    list_per_page = 100


 class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
-    list_display = ('created_at', 'created_by', 'id', *WebhookAdmin.list_display)
-    sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
-    readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)
+    list_display = ("created_at", "created_by", "id", *WebhookAdmin.list_display)
+    sort_fields = ("created_at", "created_by", "id", "referenced_model", "endpoint", "last_success", "last_error")
+    readonly_fields = ("created_at", "modified_at", *WebhookAdmin.readonly_fields)

    fieldsets = (
-        ('Webhook', {
-            'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
-            'classes': ('card', 'wide'),
-        }),
-        ('Authentication', {
-            'fields': ('auth_token',),
-            'classes': ('card',),
-        }),
-        ('Status', {
-            'fields': ('enabled', 'last_success', 'last_error'),
-            'classes': ('card',),
-        }),
-        ('Owner', {
-            'fields': ('created_by',),
-            'classes': ('card',),
-        }),
-        ('Timestamps', {
-            'fields': ('created_at', 'modified_at'),
-            'classes': ('card',),
-        }),
+        (
+            "Webhook",
+            {
+                "fields": ("name", "signal", "referenced_model", "endpoint"),
+                "classes": ("card", "wide"),
+            },
+        ),
+        (
+            "Authentication",
+            {
+                "fields": ("auth_token",),
+                "classes": ("card",),
+            },
+        ),
+        (
+            "Status",
+            {
+                "fields": ("enabled", "last_success", "last_error"),
+                "classes": ("card",),
+            },
+        ),
+        (
+            "Owner",
+            {
+                "fields": ("created_by",),
+                "classes": ("card",),
+            },
+        ),
+        (
+            "Timestamps",
+            {
+                "fields": ("created_at", "modified_at"),
+                "classes": ("card",),
+            },
+        ),
    )

    def lookup_allowed(self, lookup: str, value: str, request: HttpRequest | None = None) -> bool:
--- a/archivebox/api/apps.py
+++ b/archivebox/api/apps.py
@@ -1,13 +1,14 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

 from django.apps import AppConfig


 class APIConfig(AppConfig):
-    name = 'archivebox.api'
-    label = 'api'
+    name = "archivebox.api"
+    label = "api"


 def register_admin(admin_site):
    from archivebox.api.admin import register_admin
+
    register_admin(admin_site)
--- a/archivebox/api/auth.py
+++ b/archivebox/api/auth.py
@@ -1,6 +1,5 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

-from typing import Optional
 from datetime import timedelta

 from django.utils import timezone
@@ -14,7 +13,7 @@ from ninja.errors import HttpError

 def get_or_create_api_token(user: User | None):
    from archivebox.api.models import APIToken
-    
+
    if user and user.is_superuser:
        api_tokens = APIToken.objects.filter(created_by_id=user.pk, expires__gt=timezone.now())
        if api_tokens.exists():
@@ -34,18 +33,18 @@ def get_or_create_api_token(user: User | None):

 def auth_using_token(token: str | None, request: HttpRequest | None = None) -> User | None:
    """Given an API token string, check if a corresponding non-expired APIToken exists, and return its user"""
-    from archivebox.api.models import APIToken        # lazy import model to avoid loading it at urls.py import time
-    
+    from archivebox.api.models import APIToken  # lazy import model to avoid loading it at urls.py import time
+
    user: User | None = None

-    submitted_empty_form = str(token).strip() in ('string', '', 'None', 'null')
+    submitted_empty_form = str(token).strip() in ("string", "", "None", "null")
    if not submitted_empty_form:
        try:
            api_token = APIToken.objects.get(token=token)
            if api_token.is_valid() and isinstance(api_token.created_by, User):
                user = api_token.created_by
                if request is not None:
-                    setattr(request, '_api_token', api_token)
+                    setattr(request, "_api_token", api_token)
        except APIToken.DoesNotExist:
            pass

@@ -55,8 +54,8 @@ def auth_using_token(token: str | None, request: HttpRequest | None = None) -> U
 def auth_using_password(username: str | None, password: str | None, request: HttpRequest | None = None) -> User | None:
    """Given a username and password, check if they are valid and return the corresponding user"""
    user: User | None = None
-    
-    submitted_empty_form = (username, password) in (('string', 'string'), ('', ''), (None, None))
+
+    submitted_empty_form = (username, password) in (("string", "string"), ("", ""), (None, None))
    if not submitted_empty_form:
        authenticated_user = authenticate(
            username=username,
@@ -73,34 +72,40 @@ def auth_using_password(username: str | None, password: str | None, request: Htt
 def _require_superuser(user: User | None, request: HttpRequest, auth_method: str) -> User | None:
    if user and user.pk:
        request.user = user
-        setattr(request, '_api_auth_method', auth_method)
+        setattr(request, "_api_auth_method", auth_method)
        if not user.is_superuser:
-            raise HttpError(403, 'Valid credentials but User does not have permission (make sure user.is_superuser=True)')
+            raise HttpError(403, "Valid credentials but User does not have permission (make sure user.is_superuser=True)")
    return user


 ### Django-Ninja-Provided Auth Methods

+
 class HeaderTokenAuth(APIKeyHeader):
     """Allow authenticating by passing X-ArchiveBox-API-Key: xyz as a request header"""
+
    param_name = "X-ArchiveBox-API-Key"

-    def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
+    def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
        return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)

+
 class BearerTokenAuth(HttpBearer):
     """Allow authenticating by passing Authorization: Bearer xyz as a request header"""

    def authenticate(self, request: HttpRequest, token: str) -> User | None:
        return _require_superuser(auth_using_token(token=token, request=request), request, self.__class__.__name__)

+
 class QueryParamTokenAuth(APIKeyQuery):
    """Allow authenticating by passing api_key=xyz as a GET/POST query parameter"""
+
    param_name = "api_key"

-    def authenticate(self, request: HttpRequest, key: Optional[str]) -> User | None:
+    def authenticate(self, request: HttpRequest, key: str | None) -> User | None:
        return _require_superuser(auth_using_token(token=key, request=request), request, self.__class__.__name__)

+
 class UsernameAndPasswordAuth(HttpBasicAuth):
    """Allow authenticating by passing username & password via HTTP Basic Authentication (not recommended)"""

@@ -111,25 +116,28 @@ class UsernameAndPasswordAuth(HttpBasicAuth):
            self.__class__.__name__,
        )

+
 class DjangoSessionAuth:
    """Allow authenticating with existing Django session cookies (same-origin only)."""
+
    def __call__(self, request: HttpRequest) -> User | None:
        return self.authenticate(request)

    def authenticate(self, request: HttpRequest, **kwargs) -> User | None:
-        user = getattr(request, 'user', None)
+        user = getattr(request, "user", None)
        if isinstance(user, User) and user.is_authenticated:
-            setattr(request, '_api_auth_method', self.__class__.__name__)
+            setattr(request, "_api_auth_method", self.__class__.__name__)
            if not user.is_superuser:
-                raise HttpError(403, 'Valid session but User does not have permission (make sure user.is_superuser=True)')
+                raise HttpError(403, "Valid session but User does not have permission (make sure user.is_superuser=True)")
            return user
        return None

+
 ### Enabled Auth Methods

 API_AUTH_METHODS = [
    HeaderTokenAuth(),
    BearerTokenAuth(),
-    QueryParamTokenAuth(), 
+    QueryParamTokenAuth(),
    # django_auth_superuser,       # django admin cookie auth, not secure to use with csrf=False
 ]
--- a/archivebox/api/middleware.py
+++ b/archivebox/api/middleware.py
@@ -1,4 +1,4 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

 from django.http import HttpResponse

@@ -10,8 +10,8 @@ class ApiCorsMiddleware:
        self.get_response = get_response

    def __call__(self, request):
-        if request.path.startswith('/api/'):
-            if request.method == 'OPTIONS' and request.META.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD'):
+        if request.path.startswith("/api/"):
+            if request.method == "OPTIONS" and request.META.get("HTTP_ACCESS_CONTROL_REQUEST_METHOD"):
                response = HttpResponse(status=204)
                return self._add_cors_headers(request, response)

@@ -21,14 +21,12 @@ class ApiCorsMiddleware:
        return self.get_response(request)

    def _add_cors_headers(self, request, response):
-        origin = request.META.get('HTTP_ORIGIN')
+        origin = request.META.get("HTTP_ORIGIN")
        if not origin:
            return response

-        response['Access-Control-Allow-Origin'] = '*'
-        response['Access-Control-Allow-Methods'] = 'GET, POST, PUT, PATCH, DELETE, OPTIONS'
-        response['Access-Control-Allow-Headers'] = (
-            'Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken'
-        )
-        response['Access-Control-Max-Age'] = '600'
+        response["Access-Control-Allow-Origin"] = "*"
+        response["Access-Control-Allow-Methods"] = "GET, POST, PUT, PATCH, DELETE, OPTIONS"
+        response["Access-Control-Allow-Headers"] = "Authorization, X-ArchiveBox-API-Key, Content-Type, X-CSRFToken"
+        response["Access-Control-Max-Age"] = "600"
        return response
--- a/archivebox/api/migrations/0001_initial.py
+++ b/archivebox/api/migrations/0001_initial.py
@@ -13,11 +13,10 @@ import signal_webhooks.utils


 class Migration(migrations.Migration):
-
    initial = True

    dependencies = [
-        ('auth', '0012_alter_user_first_name_max_length'),
+        ("auth", "0012_alter_user_first_name_max_length"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

@@ -75,55 +74,165 @@ class Migration(migrations.Migration):
                    reverse_sql="""
                DROP TABLE IF EXISTS api_outboundwebhook;
                DROP TABLE IF EXISTS api_apitoken;
-                    """
+                    """,
                ),
            ],
            state_operations=[
                migrations.CreateModel(
-                    name='APIToken',
+                    name="APIToken",
                    fields=[
-                        ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
-                        ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
-                        ('modified_at', models.DateTimeField(auto_now=True)),
-                        ('token', models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
-                        ('expires', models.DateTimeField(blank=True, null=True)),
-                        ('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+                        ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
+                        ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
+                        ("modified_at", models.DateTimeField(auto_now=True)),
+                        ("token", models.CharField(default=archivebox.api.models.generate_secret_token, max_length=32, unique=True)),
+                        ("expires", models.DateTimeField(blank=True, null=True)),
+                        (
+                            "created_by",
+                            models.ForeignKey(
+                                default=get_or_create_system_user_pk,
+                                on_delete=django.db.models.deletion.CASCADE,
+                                to=settings.AUTH_USER_MODEL,
+                            ),
+                        ),
                    ],
                    options={
-                        'verbose_name': 'API Key',
-                        'verbose_name_plural': 'API Keys',
-                        'app_label': 'api',
+                        "verbose_name": "API Key",
+                        "verbose_name_plural": "API Keys",
+                        "app_label": "api",
                    },
                ),
                migrations.CreateModel(
-                    name='OutboundWebhook',
+                    name="OutboundWebhook",
                    fields=[
-                        ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
-                        ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
-                        ('modified_at', models.DateTimeField(auto_now=True)),
-                        ('name', models.CharField(db_index=True, help_text='Webhook name.', max_length=255, unique=True, verbose_name='name')),
-                        ('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='Signal the webhook fires to.', max_length=255, verbose_name='signal')),
-                        ('ref', models.CharField(db_index=True, help_text='Dot import notation to the model the webhook is for.', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
-                        ('endpoint', models.URLField(help_text='Target endpoint for this webhook.', max_length=2047, verbose_name='endpoint')),
-                        ('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
-                        ('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
-                        ('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
-                        ('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
-                        ('created', models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created')),
-                        ('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
-                        ('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
-                        ('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
-                        ('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
-                        ('created_by', models.ForeignKey(default=get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+                        ("id", models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
+                        ("created_at", models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
+                        ("modified_at", models.DateTimeField(auto_now=True)),
+                        (
+                            "name",
+                            models.CharField(db_index=True, help_text="Webhook name.", max_length=255, unique=True, verbose_name="name"),
+                        ),
+                        (
+                            "signal",
+                            models.CharField(
+                                choices=[
+                                    ("CREATE", "Create"),
+                                    ("UPDATE", "Update"),
+                                    ("DELETE", "Delete"),
+                                    ("M2M", "M2M changed"),
+                                    ("CREATE_OR_UPDATE", "Create or Update"),
+                                    ("CREATE_OR_DELETE", "Create or Delete"),
+                                    ("CREATE_OR_M2M", "Create or M2M changed"),
+                                    ("UPDATE_OR_DELETE", "Update or Delete"),
+                                    ("UPDATE_OR_M2M", "Update or M2M changed"),
+                                    ("DELETE_OR_M2M", "Delete or M2M changed"),
+                                    ("CREATE_UPDATE_OR_DELETE", "Create, Update or Delete"),
+                                    ("CREATE_UPDATE_OR_M2M", "Create, Update or M2M changed"),
+                                    ("CREATE_DELETE_OR_M2M", "Create, Delete or M2M changed"),
+                                    ("UPDATE_DELETE_OR_M2M", "Update, Delete or M2M changed"),
+                                    ("CREATE_UPDATE_DELETE_OR_M2M", "Create, Update or Delete, or M2M changed"),
+                                ],
+                                help_text="Signal the webhook fires to.",
+                                max_length=255,
+                                verbose_name="signal",
+                            ),
+                        ),
+                        (
+                            "ref",
+                            models.CharField(
+                                db_index=True,
+                                help_text="Dot import notation to the model the webhook is for.",
+                                max_length=1023,
+                                validators=[signal_webhooks.utils.model_from_reference],
+                                verbose_name="referenced model",
+                            ),
+                        ),
+                        (
+                            "endpoint",
+                            models.URLField(help_text="Target endpoint for this webhook.", max_length=2047, verbose_name="endpoint"),
+                        ),
+                        (
+                            "headers",
+                            models.JSONField(
+                                blank=True,
+                                default=dict,
+                                help_text="Headers to send with the webhook request.",
+                                validators=[signal_webhooks.utils.is_dict],
+                                verbose_name="headers",
+                            ),
+                        ),
+                        (
+                            "auth_token",
+                            signal_webhooks.fields.TokenField(
+                                blank=True,
+                                default="",
+                                help_text="Authentication token to use in an Authorization header.",
+                                max_length=8000,
+                                validators=[signal_webhooks.utils.decode_cipher_key],
+                                verbose_name="authentication token",
+                            ),
+                        ),
+                        ("enabled", models.BooleanField(default=True, help_text="Is this webhook enabled?", verbose_name="enabled")),
+                        (
+                            "keep_last_response",
+                            models.BooleanField(
+                                default=False,
+                                help_text="Should the webhook keep a log of the latest response it got?",
+                                verbose_name="keep last response",
+                            ),
+                        ),
+                        (
+                            "created",
+                            models.DateTimeField(auto_now_add=True, help_text="When the webhook was created.", verbose_name="created"),
+                        ),
+                        (
+                            "updated",
+                            models.DateTimeField(auto_now=True, help_text="When the webhook was last updated.", verbose_name="updated"),
+                        ),
+                        (
+                            "last_response",
+                            models.CharField(
+                                blank=True,
+                                default="",
+                                help_text="Latest response to this webhook.",
+                                max_length=8000,
+                                verbose_name="last response",
+                            ),
+                        ),
+                        (
+                            "last_success",
+                            models.DateTimeField(
+                                default=None,
+                                help_text="When the webhook last succeeded.",
+                                null=True,
+                                verbose_name="last success",
+                            ),
+                        ),
+                        (
+                            "last_failure",
+                            models.DateTimeField(
+                                default=None,
+                                help_text="When the webhook last failed.",
+                                null=True,
+                                verbose_name="last failure",
+                            ),
+                        ),
+                        (
+                            "created_by",
+                            models.ForeignKey(
+                                default=get_or_create_system_user_pk,
+                                on_delete=django.db.models.deletion.CASCADE,
+                                to=settings.AUTH_USER_MODEL,
+                            ),
+                        ),
                    ],
                    options={
-                        'verbose_name': 'API Outbound Webhook',
-                        'app_label': 'api',
+                        "verbose_name": "API Outbound Webhook",
+                        "app_label": "api",
                    },
                ),
                migrations.AddConstraint(
-                    model_name='outboundwebhook',
-                    constraint=models.UniqueConstraint(fields=['ref', 'endpoint'], name='prevent_duplicate_hooks_api_outboundwebhook'),
+                    model_name="outboundwebhook",
+                    constraint=models.UniqueConstraint(fields=["ref", "endpoint"], name="prevent_duplicate_hooks_api_outboundwebhook"),
                ),
            ],
        ),
--- a/archivebox/api/models.py
+++ b/archivebox/api/models.py
@@ -1,4 +1,4 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

 import secrets
 from archivebox.uuid_compat import uuid7
@@ -25,7 +25,7 @@ class APIToken(models.Model):
    expires = models.DateTimeField(null=True, blank=True)

    class Meta(TypedModelMeta):
-        app_label = 'api'
+        app_label = "api"
        verbose_name = "API Key"
        verbose_name_plural = "API Keys"

@@ -34,7 +34,7 @@ class APIToken(models.Model):

    @property
    def token_redacted(self):
-        return f'************{self.token[-4:]}'
+        return f"************{self.token[-4:]}"

    def is_valid(self, for_date=None):
        return not self.expires or self.expires >= (for_date or timezone.now())
@@ -47,8 +47,8 @@ class OutboundWebhook(WebhookBase):
    modified_at = models.DateTimeField(auto_now=True)

    class Meta(WebhookBase.Meta):
-        app_label = 'api'
-        verbose_name = 'API Outbound Webhook'
+        app_label = "api"
+        verbose_name = "API Outbound Webhook"

    def __str__(self) -> str:
-        return f'[{self.id}] {self.ref} -> {self.endpoint}'
+        return f"[{self.id}] {self.ref} -> {self.endpoint}"
--- a/archivebox/api/urls.py
+++ b/archivebox/api/urls.py
@@ -1,4 +1,4 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

 from django.urls import path
 from django.views.generic.base import RedirectView
@@ -6,12 +6,10 @@ from django.views.generic.base import RedirectView
 from .v1_api import urls as v1_api_urls

 urlpatterns = [
-    path("",                 RedirectView.as_view(url='/api/v1/docs')),
-
-    path("v1/",              RedirectView.as_view(url='/api/v1/docs')),
-    path("v1/",              v1_api_urls),
-    path("v1",               RedirectView.as_view(url='/api/v1/docs')),
-
+    path("", RedirectView.as_view(url="/api/v1/docs")),
+    path("v1/", RedirectView.as_view(url="/api/v1/docs")),
+    path("v1/", v1_api_urls),
+    path("v1", RedirectView.as_view(url="/api/v1/docs")),
    # ... v2 can be added here ...
    # path("v2/",              v2_api_urls),
    # path("v2",               RedirectView.as_view(url='/api/v2/docs')),
--- a/archivebox/api/v1_api.py
+++ b/archivebox/api/v1_api.py
@@ -1,4 +1,4 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"


 from io import StringIO
@@ -20,9 +20,9 @@ from archivebox.api.auth import API_AUTH_METHODS
 from archivebox.api.models import APIToken


-COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
+COMMIT_HASH = get_COMMIT_HASH() or "unknown"

-html_description=f'''
+html_description = f"""
 <h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
 <br/>
 <i><b>WARNING: This API is still in an early development stage and may change!</b></i>
@@ -35,47 +35,47 @@ html_description=f'''
 <li>📜 See the API source code: <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/api"><code>archivebox/api/</code></a></li>
 </ul>
 <small>Served by ArchiveBox v{VERSION} (<a href="https://github.com/ArchiveBox/ArchiveBox/commit/{COMMIT_HASH}"><code>{COMMIT_HASH[:8]}</code></a>), API powered by <a href="https://django-ninja.dev/"><code>django-ninja</code></a>.</small>
-'''
+"""


 def register_urls(api: NinjaAPI) -> NinjaAPI:
-    api.add_router('/auth/',     'archivebox.api.v1_auth.router')
-    api.add_router('/core/',     'archivebox.api.v1_core.router')
-    api.add_router('/crawls/',   'archivebox.api.v1_crawls.router')
-    api.add_router('/cli/',      'archivebox.api.v1_cli.router')
-    api.add_router('/machine/',  'archivebox.api.v1_machine.router')
+    api.add_router("/auth/", "archivebox.api.v1_auth.router")
+    api.add_router("/core/", "archivebox.api.v1_core.router")
+    api.add_router("/crawls/", "archivebox.api.v1_crawls.router")
+    api.add_router("/cli/", "archivebox.api.v1_cli.router")
+    api.add_router("/machine/", "archivebox.api.v1_machine.router")
    return api


-class NinjaAPIWithIOCapture(NinjaAPI):    
+class NinjaAPIWithIOCapture(NinjaAPI):
    def create_temporal_response(self, request: HttpRequest) -> HttpResponse:
        stdout, stderr = StringIO(), StringIO()

        with redirect_stderr(stderr):
            with redirect_stdout(stdout):
-                setattr(request, 'stdout', stdout)
-                setattr(request, 'stderr', stderr)
+                setattr(request, "stdout", stdout)
+                setattr(request, "stderr", stderr)

                response = super().create_temporal_response(request)

-        # Diable caching of API responses entirely
-        response['Cache-Control'] = 'no-store'
+        # Disable caching of API responses entirely
+        response["Cache-Control"] = "no-store"

        # Add debug stdout and stderr headers to response
-        response['X-ArchiveBox-Stdout'] = stdout.getvalue().replace('\n', '\\n')[:200]
-        response['X-ArchiveBox-Stderr'] = stderr.getvalue().replace('\n', '\\n')[:200]
+        response["X-ArchiveBox-Stdout"] = stdout.getvalue().replace("\n", "\\n")[:200]
+        response["X-ArchiveBox-Stderr"] = stderr.getvalue().replace("\n", "\\n")[:200]
        # response['X-ArchiveBox-View'] = self.get_openapi_operation_id(request) or 'Unknown'

        # Add Auth Headers to response
-        api_token_attr = getattr(request, '_api_token', None)
+        api_token_attr = getattr(request, "_api_token", None)
        api_token = api_token_attr if isinstance(api_token_attr, APIToken) else None
-        token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else 'Never'
+        token_expiry = api_token.expires.isoformat() if api_token and api_token.expires else "Never"

-        response['X-ArchiveBox-Auth-Method'] = str(getattr(request, '_api_auth_method', 'None'))
-        response['X-ArchiveBox-Auth-Expires'] = token_expiry
-        response['X-ArchiveBox-Auth-Token-Id'] = str(api_token.id) if api_token else 'None'
-        response['X-ArchiveBox-Auth-User-Id'] = str(request.user.pk) if getattr(request.user, 'pk', None) else 'None'
-        response['X-ArchiveBox-Auth-User-Username'] = request.user.username if isinstance(request.user, User) else 'None'
+        response["X-ArchiveBox-Auth-Method"] = str(getattr(request, "_api_auth_method", "None"))
+        response["X-ArchiveBox-Auth-Expires"] = token_expiry
+        response["X-ArchiveBox-Auth-Token-Id"] = str(api_token.id) if api_token else "None"
+        response["X-ArchiveBox-Auth-User-Id"] = str(request.user.pk) if getattr(request.user, "pk", None) else "None"
+        response["X-ArchiveBox-Auth-User-Username"] = request.user.username if isinstance(request.user, User) else "None"

        # import ipdb; ipdb.set_trace()
        # print('RESPONDING NOW', response)
@@ -84,7 +84,7 @@ class NinjaAPIWithIOCapture(NinjaAPI):


 api = NinjaAPIWithIOCapture(
-    title='ArchiveBox API',
+    title="ArchiveBox API",
    description=html_description,
    version=VERSION,
    auth=API_AUTH_METHODS,
@@ -103,15 +103,15 @@ def generic_exception_handler(request, err):
    if isinstance(err, (ObjectDoesNotExist, EmptyResultSet, PermissionDenied)):
        status = 404

-    print(''.join(format_exception(err)))
+    print("".join(format_exception(err)))

    return api.create_response(
        request,
        {
            "succeeded": False,
-            "message": f'{err.__class__.__name__}: {err}',
+            "message": f"{err.__class__.__name__}: {err}",
            "errors": [
-                ''.join(format_exception(err)),
+                "".join(format_exception(err)),
                # or send simpler parent-only traceback:
                # *([str(err.__context__)] if getattr(err, '__context__', None) else []),
            ],
@@ -120,7 +120,6 @@ def generic_exception_handler(request, err):
    )


-
 # import orjson
 # from ninja.renderers import BaseRenderer
 # class ORJSONRenderer(BaseRenderer):
--- a/archivebox/api/v1_auth.py
+++ b/archivebox/api/v1_auth.py
@@ -1,6 +1,5 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

-from typing import Optional
 from django.http import HttpRequest

 from ninja import Router, Schema
@@ -8,16 +7,21 @@ from ninja import Router, Schema
 from archivebox.api.auth import auth_using_token, auth_using_password, get_or_create_api_token


-router = Router(tags=['Authentication'], auth=None)
+router = Router(tags=["Authentication"], auth=None)


 class PasswordAuthSchema(Schema):
    """Schema for a /get_api_token request"""
-    username: Optional[str] = None
-    password: Optional[str] = None
+
+    username: str | None = None
+    password: str | None = None


-@router.post("/get_api_token", auth=None, summary='Generate an API token for a given username & password (or currently logged-in user)')             # auth=None because they are not authed yet
+@router.post(
+    "/get_api_token",
+    auth=None,
+    summary="Generate an API token for a given username & password (or currently logged-in user)",
+)  # auth=None because they are not authed yet
 def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
    user = auth_using_password(
        username=auth_data.username,
@@ -35,17 +39,21 @@ def get_api_token(request: HttpRequest, auth_data: PasswordAuthSchema):
            "token": api_token.token,
            "expires": api_token.expires.isoformat() if api_token.expires else None,
        }
-    
-    return {"success": False, "errors": ["Invalid credentials"]}

+    return {"success": False, "errors": ["Invalid credentials"]}


 class TokenAuthSchema(Schema):
    """Schema for a /check_api_token request"""
+
    token: str


-@router.post("/check_api_token", auth=None, summary='Validate an API token to make sure its valid and non-expired')        # auth=None because they are not authed yet
+@router.post(
+    "/check_api_token",
+    auth=None,
+    summary="Validate an API token to make sure its valid and non-expired",
+)  # auth=None because they are not authed yet
 def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
    user = auth_using_token(
        token=token_data.token,
@@ -53,5 +61,5 @@ def check_api_token(request: HttpRequest, token_data: TokenAuthSchema):
    )
    if user:
        return {"success": True, "user_id": str(user.pk)}
-    
+
    return {"success": False, "user_id": None}
--- a/archivebox/api/v1_cli.py
+++ b/archivebox/api/v1_cli.py
@@ -1,8 +1,8 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

 import json
 from io import StringIO
-from typing import List, Dict, Any, Optional
+from typing import Any
 from enum import Enum

 from django.http import HttpRequest
@@ -16,44 +16,47 @@ from archivebox.config.common import ARCHIVING_CONFIG
 # from .auth import API_AUTH_METHODS

 # router for API that exposes archivebox cli subcommands as REST endpoints
-router = Router(tags=['ArchiveBox CLI Sub-Commands'])
+router = Router(tags=["ArchiveBox CLI Sub-Commands"])


 # Schemas

-JSONType = List[Any] | Dict[str, Any] | bool | int | str | None
+JSONType = list[Any] | dict[str, Any] | bool | int | str | None
+

 class CLICommandResponseSchema(Schema):
    success: bool
-    errors: List[str]
+    errors: list[str]
    result: JSONType
-    result_format: str = 'str'
+    result_format: str = "str"
    stdout: str
    stderr: str

+
 class FilterTypeChoices(str, Enum):
-    exact = 'exact'
-    substring = 'substring'
-    regex = 'regex'
-    domain = 'domain'
-    tag = 'tag'
-    timestamp = 'timestamp'
+    exact = "exact"
+    substring = "substring"
+    regex = "regex"
+    domain = "domain"
+    tag = "tag"
+    timestamp = "timestamp"
+

 class StatusChoices(str, Enum):
-    indexed = 'indexed'
-    archived = 'archived'
-    unarchived = 'unarchived'
-    present = 'present'
-    valid = 'valid'
-    invalid = 'invalid'
-    duplicate = 'duplicate'
-    orphaned = 'orphaned'
-    corrupted = 'corrupted'
-    unrecognized = 'unrecognized'
+    indexed = "indexed"
+    archived = "archived"
+    unarchived = "unarchived"
+    present = "present"
+    valid = "valid"
+    invalid = "invalid"
+    duplicate = "duplicate"
+    orphaned = "orphaned"
+    corrupted = "corrupted"
+    unrecognized = "unrecognized"


 class AddCommandSchema(Schema):
-    urls: List[str]
+    urls: list[str]
    tag: str = ""
    depth: int = 0
    parser: str = "auto"
@@ -62,53 +65,54 @@ class AddCommandSchema(Schema):
    overwrite: bool = False
    index_only: bool = False

+
 class UpdateCommandSchema(Schema):
-    resume: Optional[str] = None
-    after: Optional[float] = 0
-    before: Optional[float] = 999999999999999
-    filter_type: Optional[str] = FilterTypeChoices.substring
-    filter_patterns: Optional[List[str]] = ['https://example.com']
+    resume: str | None = None
+    after: float | None = 0
+    before: float | None = 999999999999999
+    filter_type: str | None = FilterTypeChoices.substring
+    filter_patterns: list[str] | None = ["https://example.com"]
    batch_size: int = 100
    continuous: bool = False

+
 class ScheduleCommandSchema(Schema):
-    import_path: Optional[str] = None
+    import_path: str | None = None
    add: bool = False
    show: bool = False
    foreground: bool = False
    run_all: bool = False
    quiet: bool = False
-    every: Optional[str] = None
-    tag: str = ''
+    every: str | None = None
+    tag: str = ""
    depth: int = 0
    overwrite: bool = False
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
    clear: bool = False

+
 class ListCommandSchema(Schema):
-    filter_patterns: Optional[List[str]] = ['https://example.com']
+    filter_patterns: list[str] | None = ["https://example.com"]
    filter_type: str = FilterTypeChoices.substring
    status: StatusChoices = StatusChoices.indexed
-    after: Optional[float] = 0
-    before: Optional[float] = 999999999999999
-    sort: str = 'bookmarked_at'
+    after: float | None = 0
+    before: float | None = 999999999999999
+    sort: str = "bookmarked_at"
    as_json: bool = True
    as_html: bool = False
-    as_csv: str | None = 'timestamp,url'
+    as_csv: str | None = "timestamp,url"
    with_headers: bool = False

+
 class RemoveCommandSchema(Schema):
    delete: bool = True
-    after: Optional[float] = 0
-    before: Optional[float] = 999999999999999
+    after: float | None = 0
+    before: float | None = 999999999999999
    filter_type: str = FilterTypeChoices.exact
-    filter_patterns: Optional[List[str]] = ['https://example.com']
+    filter_patterns: list[str] | None = ["https://example.com"]


-
-
-
-@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
+@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]")
 def cli_add(request: HttpRequest, args: AddCommandSchema):
    from archivebox.cli.archivebox_add import add

@@ -125,30 +129,30 @@ def cli_add(request: HttpRequest, args: AddCommandSchema):
        created_by_id=request.user.pk,
    )

-    snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list('id', flat=True)]
+    snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list("id", flat=True)]
    result_payload = {
        "crawl_id": str(crawl.id),
        "num_snapshots": len(snapshot_ids),
        "snapshot_ids": snapshot_ids,
        "queued_urls": args.urls,
    }
-    stdout = getattr(request, 'stdout', None)
-    stderr = getattr(request, 'stderr', None)
+    stdout = getattr(request, "stdout", None)
+    stderr = getattr(request, "stderr", None)

    return {
        "success": True,
        "errors": [],
        "result": result_payload,
        "result_format": "json",
-        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
-        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
+        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
+        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
    }


-@router.post("/update", response=CLICommandResponseSchema, summary='archivebox update [args] [filter_patterns]')
+@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]")
 def cli_update(request: HttpRequest, args: UpdateCommandSchema):
    from archivebox.cli.archivebox_update import update
-    
+
    result = update(
        filter_patterns=args.filter_patterns or [],
        filter_type=args.filter_type or FilterTypeChoices.substring,
@@ -158,21 +162,21 @@ def cli_update(request: HttpRequest, args: UpdateCommandSchema):
        batch_size=args.batch_size,
        continuous=args.continuous,
    )
-    stdout = getattr(request, 'stdout', None)
-    stderr = getattr(request, 'stderr', None)
+    stdout = getattr(request, "stdout", None)
+    stderr = getattr(request, "stderr", None)
    return {
        "success": True,
        "errors": [],
        "result": result,
-        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
-        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
+        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
+        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
    }


-@router.post("/schedule", response=CLICommandResponseSchema, summary='archivebox schedule [args] [import_path]')
+@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]")
 def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
    from archivebox.cli.archivebox_schedule import schedule
-    
+
    result = schedule(
        import_path=args.import_path,
        add=args.add,
@@ -188,23 +192,22 @@ def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
        update=args.update,
    )

-    stdout = getattr(request, 'stdout', None)
-    stderr = getattr(request, 'stderr', None)
+    stdout = getattr(request, "stdout", None)
+    stderr = getattr(request, "stderr", None)
    return {
        "success": True,
        "errors": [],
        "result": result,
        "result_format": "json",
-        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
-        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
+        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
+        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
    }


-
-@router.post("/search", response=CLICommandResponseSchema, summary='archivebox search [args] [filter_patterns]')
+@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]")
 def cli_search(request: HttpRequest, args: ListCommandSchema):
    from archivebox.cli.archivebox_search import search
-    
+
    result = search(
        filter_patterns=args.filter_patterns,
        filter_type=args.filter_type,
@@ -218,7 +221,7 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
        with_headers=args.with_headers,
    )

-    result_format = 'txt'
+    result_format = "txt"
    if args.as_json:
        result_format = "json"
        result = json.loads(result)
@@ -227,20 +230,19 @@ def cli_search(request: HttpRequest, args: ListCommandSchema):
    elif args.as_csv:
        result_format = "csv"

-    stdout = getattr(request, 'stdout', None)
-    stderr = getattr(request, 'stderr', None)
+    stdout = getattr(request, "stdout", None)
+    stderr = getattr(request, "stderr", None)
    return {
        "success": True,
        "errors": [],
        "result": result,
        "result_format": result_format,
-        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
-        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
+        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
+        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
    }
-    


-@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
+@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]")
 def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
    from archivebox.cli.archivebox_remove import remove
    from archivebox.cli.archivebox_search import get_snapshots
@@ -253,10 +255,10 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
        after=args.after,
        before=args.before,
    )
-    removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list('id', flat=True)]
-    
+    removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list("id", flat=True)]
+
    remove(
-        yes=True,            # no way to interactively ask for confirmation via API, so we force yes
+        yes=True,  # no way to interactively ask for confirmation via API, so we force yes
        delete=args.delete,
        snapshots=snapshots_to_remove,
        before=args.before,
@@ -270,14 +272,13 @@ def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
        "removed_snapshot_ids": removed_snapshot_ids,
        "remaining_snapshots": Snapshot.objects.count(),
    }
-    stdout = getattr(request, 'stdout', None)
-    stderr = getattr(request, 'stderr', None)
+    stdout = getattr(request, "stdout", None)
+    stderr = getattr(request, "stderr", None)
    return {
        "success": True,
        "errors": [],
        "result": result,
        "result_format": "json",
-        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else '',
-        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else '',
+        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
+        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
    }
-    
--- a/archivebox/api/v1_core.py
+++ b/archivebox/api/v1_core.py
@@ -1,11 +1,13 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

 import math
+from collections import defaultdict
 from uuid import UUID
-from typing import List, Optional, Union, Any, Annotated
+from typing import Union, Any, Annotated
 from datetime import datetime

-from django.db.models import Model, Q
+from django.db.models import Model, Q, Sum
+from django.db.models.functions import Coalesce
 from django.conf import settings
 from django.http import HttpRequest, HttpResponse
 from django.core.exceptions import ValidationError
@@ -39,7 +41,7 @@ from archivebox.crawls.models import Crawl
 from archivebox.api.v1_crawls import CrawlSchema


-router = Router(tags=['Core Models'])
+router = Router(tags=["Core Models"])


 class CustomPagination(PaginationBase):
@@ -49,13 +51,14 @@ class CustomPagination(PaginationBase):
        page: int = 0

    class Output(PaginationBase.Output):
+        count: int
        total_items: int
        total_pages: int
        page: int
        limit: int
        offset: int
        num_items: int
-        items: List[Any]
+        items: list[Any]

    def paginate_queryset(self, queryset, pagination: Input, request: HttpRequest, **params):
        limit = min(pagination.limit, 500)
@@ -65,27 +68,29 @@ class CustomPagination(PaginationBase):
        current_page = math.ceil(offset / (limit + 1))
        items = queryset[offset : offset + limit]
        return {
-            'total_items': total,
-            'total_pages': total_pages,
-            'page': current_page,
-            'limit': limit,
-            'offset': offset,
-            'num_items': len(items),
-            'items': items,
+            "count": total,
+            "total_items": total,
+            "total_pages": total_pages,
+            "page": current_page,
+            "limit": limit,
+            "offset": offset,
+            "num_items": len(items),
+            "items": items,
        }


 ### ArchiveResult #########################################################################

+
 class MinimalArchiveResultSchema(Schema):
-    TYPE: str = 'core.models.ArchiveResult'
+    TYPE: str = "core.models.ArchiveResult"
    id: UUID
    created_at: datetime | None
    modified_at: datetime | None
    created_by_id: str
    created_by_username: str
    status: str
-    retry_at: datetime | None
+    retry_at: datetime | None = None
    plugin: str
    hook_name: str
    process_id: UUID | None
@@ -93,8 +98,8 @@ class MinimalArchiveResultSchema(Schema):
    cmd: list[str] | None
    pwd: str | None
    output_str: str
-    output_json: dict | None
-    output_files: dict | None
+    output_json: dict[str, Any] | None
+    output_files: dict[str, dict[str, Any]] | None
    output_size: int
    output_mimetypes: str
    start_ts: datetime | None
@@ -108,13 +113,34 @@ class MinimalArchiveResultSchema(Schema):
    def resolve_created_by_username(obj) -> str:
        return obj.created_by.username

+    @staticmethod
+    def resolve_output_files(obj):
+        return obj.output_file_map()
+
+    @staticmethod
+    def resolve_output_mimetypes(obj) -> str:
+        mime_sizes: dict[str, int] = defaultdict(int)
+        for metadata in obj.output_file_map().values():
+            if not isinstance(metadata, dict):
+                continue
+            mimetype = str(metadata.get("mimetype") or "").strip()
+            try:
+                size = max(int(metadata.get("size") or 0), 0)
+            except (TypeError, ValueError):
+                size = 0
+            if mimetype and size:
+                mime_sizes[mimetype] += size
+        if mime_sizes:
+            return ",".join(mime for mime, _size in sorted(mime_sizes.items(), key=lambda item: item[1], reverse=True))
+        return obj.output_mimetypes or ""
+

 class ArchiveResultSchema(MinimalArchiveResultSchema):
-    TYPE: str = 'core.models.ArchiveResult'
+    TYPE: str = "core.models.ArchiveResult"
    snapshot_id: UUID
    snapshot_timestamp: str
    snapshot_url: str
-    snapshot_tags: List[str]
+    snapshot_tags: list[str]

    @staticmethod
    def resolve_snapshot_timestamp(obj):
@@ -134,25 +160,39 @@ class ArchiveResultSchema(MinimalArchiveResultSchema):


 class ArchiveResultFilterSchema(FilterSchema):
-    id: Annotated[Optional[str], FilterLookup(['id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
-    search: Annotated[Optional[str], FilterLookup(['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'plugin', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
-    snapshot_id: Annotated[Optional[str], FilterLookup(['snapshot__id__startswith', 'snapshot__timestamp__startswith'])] = None
-    snapshot_url: Annotated[Optional[str], FilterLookup('snapshot__url__icontains')] = None
-    snapshot_tag: Annotated[Optional[str], FilterLookup('snapshot__tags__name__icontains')] = None
-    status: Annotated[Optional[str], FilterLookup('status')] = None
-    output_str: Annotated[Optional[str], FilterLookup('output_str__icontains')] = None
-    plugin: Annotated[Optional[str], FilterLookup('plugin__icontains')] = None
-    hook_name: Annotated[Optional[str], FilterLookup('hook_name__icontains')] = None
-    process_id: Annotated[Optional[str], FilterLookup('process__id__startswith')] = None
-    cmd: Annotated[Optional[str], FilterLookup('cmd__0__icontains')] = None
-    pwd: Annotated[Optional[str], FilterLookup('pwd__icontains')] = None
-    cmd_version: Annotated[Optional[str], FilterLookup('cmd_version')] = None
-    created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None
-    created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None
-    created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None
+    id: Annotated[str | None, FilterLookup(["id__startswith", "snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
+    search: Annotated[
+        str | None,
+        FilterLookup(
+            [
+                "snapshot__url__icontains",
+                "snapshot__title__icontains",
+                "snapshot__tags__name__icontains",
+                "plugin",
+                "output_str__icontains",
+                "id__startswith",
+                "snapshot__id__startswith",
+                "snapshot__timestamp__startswith",
+            ],
+        ),
+    ] = None
+    snapshot_id: Annotated[str | None, FilterLookup(["snapshot__id__startswith", "snapshot__timestamp__startswith"])] = None
+    snapshot_url: Annotated[str | None, FilterLookup("snapshot__url__icontains")] = None
+    snapshot_tag: Annotated[str | None, FilterLookup("snapshot__tags__name__icontains")] = None
+    status: Annotated[str | None, FilterLookup("status")] = None
+    output_str: Annotated[str | None, FilterLookup("output_str__icontains")] = None
+    plugin: Annotated[str | None, FilterLookup("plugin__icontains")] = None
+    hook_name: Annotated[str | None, FilterLookup("hook_name__icontains")] = None
+    process_id: Annotated[str | None, FilterLookup("process__id__startswith")] = None
+    cmd: Annotated[str | None, FilterLookup("cmd__0__icontains")] = None
+    pwd: Annotated[str | None, FilterLookup("pwd__icontains")] = None
+    cmd_version: Annotated[str | None, FilterLookup("cmd_version")] = None
+    created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
+    created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
+    created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None


-@router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult")
+@router.get("/archiveresults", response=list[ArchiveResultSchema], url_name="get_archiveresult")
@paginate(CustomPagination)
 def get_archiveresults(request: HttpRequest, filters: Query[ArchiveResultFilterSchema]):
    """List all ArchiveResult entries matching these filters."""
@@ -167,8 +207,9 @@ def get_archiveresult(request: HttpRequest, archiveresult_id: str):

 ### Snapshot #########################################################################

+
 class SnapshotSchema(Schema):
-    TYPE: str = 'core.models.Snapshot'
+    TYPE: str = "core.models.Snapshot"
    id: UUID
    created_by_id: str
    created_by_username: str
@@ -177,14 +218,16 @@ class SnapshotSchema(Schema):
    status: str
    retry_at: datetime | None
    bookmarked_at: datetime
-    downloaded_at: Optional[datetime]
+    downloaded_at: datetime | None
    url: str
-    tags: List[str]
-    title: Optional[str]
+    tags: list[str]
+    title: str | None
    timestamp: str
    archive_path: str
+    archive_size: int
+    output_size: int
    num_archiveresults: int
-    archiveresults: List[MinimalArchiveResultSchema]
+    archiveresults: list[MinimalArchiveResultSchema]

    @staticmethod
    def resolve_created_by_id(obj):
@@ -198,13 +241,21 @@ class SnapshotSchema(Schema):
    def resolve_tags(obj):
        return sorted(tag.name for tag in obj.tags.all())

+    @staticmethod
+    def resolve_archive_size(obj):
+        return int(getattr(obj, "output_size_sum", obj.archive_size) or 0)
+
+    @staticmethod
+    def resolve_output_size(obj):
+        return SnapshotSchema.resolve_archive_size(obj)
+
    @staticmethod
    def resolve_num_archiveresults(obj, context):
        return obj.archiveresult_set.all().distinct().count()

    @staticmethod
    def resolve_archiveresults(obj, context):
-        if bool(getattr(context['request'], 'with_archiveresults', False)):
+        if bool(getattr(context["request"], "with_archiveresults", False)):
            return obj.archiveresult_set.all().distinct()
        return ArchiveResult.objects.none()

@@ -212,16 +263,16 @@ class SnapshotSchema(Schema):
 class SnapshotUpdateSchema(Schema):
    status: str | None = None
    retry_at: datetime | None = None
-    tags: Optional[List[str]] = None
+    tags: list[str] | None = None


 class SnapshotCreateSchema(Schema):
    url: str
-    crawl_id: Optional[str] = None
+    crawl_id: str | None = None
    depth: int = 0
-    title: Optional[str] = None
-    tags: Optional[List[str]] = None
-    status: Optional[str] = None
+    title: str | None = None
+    tags: list[str] | None = None
+    status: str | None = None


 class SnapshotDeleteResponseSchema(Schema):
@@ -231,77 +282,82 @@ class SnapshotDeleteResponseSchema(Schema):
    deleted_count: int


-def normalize_tag_list(tags: Optional[List[str]] = None) -> List[str]:
+def normalize_tag_list(tags: list[str] | None = None) -> list[str]:
    return [tag.strip() for tag in (tags or []) if tag and tag.strip()]


 class SnapshotFilterSchema(FilterSchema):
-    id: Annotated[Optional[str], FilterLookup(['id__icontains', 'timestamp__startswith'])] = None
-    created_by_id: Annotated[Optional[str], FilterLookup('crawl__created_by_id')] = None
-    created_by_username: Annotated[Optional[str], FilterLookup('crawl__created_by__username__icontains')] = None
-    created_at__gte: Annotated[Optional[datetime], FilterLookup('created_at__gte')] = None
-    created_at__lt: Annotated[Optional[datetime], FilterLookup('created_at__lt')] = None
-    created_at: Annotated[Optional[datetime], FilterLookup('created_at')] = None
-    modified_at: Annotated[Optional[datetime], FilterLookup('modified_at')] = None
-    modified_at__gte: Annotated[Optional[datetime], FilterLookup('modified_at__gte')] = None
-    modified_at__lt: Annotated[Optional[datetime], FilterLookup('modified_at__lt')] = None
-    search: Annotated[Optional[str], FilterLookup(['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'timestamp__startswith'])] = None
-    url: Annotated[Optional[str], FilterLookup('url')] = None
-    tag: Annotated[Optional[str], FilterLookup('tags__name')] = None
-    title: Annotated[Optional[str], FilterLookup('title__icontains')] = None
-    timestamp: Annotated[Optional[str], FilterLookup('timestamp__startswith')] = None
-    bookmarked_at__gte: Annotated[Optional[datetime], FilterLookup('bookmarked_at__gte')] = None
-    bookmarked_at__lt: Annotated[Optional[datetime], FilterLookup('bookmarked_at__lt')] = None
+    id: Annotated[str | None, FilterLookup(["id__icontains", "timestamp__startswith"])] = None
+    created_by_id: Annotated[str | None, FilterLookup("crawl__created_by_id")] = None
+    created_by_username: Annotated[str | None, FilterLookup("crawl__created_by__username__icontains")] = None
+    created_at__gte: Annotated[datetime | None, FilterLookup("created_at__gte")] = None
+    created_at__lt: Annotated[datetime | None, FilterLookup("created_at__lt")] = None
+    created_at: Annotated[datetime | None, FilterLookup("created_at")] = None
+    modified_at: Annotated[datetime | None, FilterLookup("modified_at")] = None
+    modified_at__gte: Annotated[datetime | None, FilterLookup("modified_at__gte")] = None
+    modified_at__lt: Annotated[datetime | None, FilterLookup("modified_at__lt")] = None
+    search: Annotated[
+        str | None,
+        FilterLookup(["url__icontains", "title__icontains", "tags__name__icontains", "id__icontains", "timestamp__startswith"]),
+    ] = None
+    url: Annotated[str | None, FilterLookup("url")] = None
+    tag: Annotated[str | None, FilterLookup("tags__name")] = None
+    title: Annotated[str | None, FilterLookup("title__icontains")] = None
+    timestamp: Annotated[str | None, FilterLookup("timestamp__startswith")] = None
+    bookmarked_at__gte: Annotated[datetime | None, FilterLookup("bookmarked_at__gte")] = None
+    bookmarked_at__lt: Annotated[datetime | None, FilterLookup("bookmarked_at__lt")] = None


-@router.get("/snapshots", response=List[SnapshotSchema], url_name="get_snapshots")
+@router.get("/snapshots", response=list[SnapshotSchema], url_name="get_snapshots")
@paginate(CustomPagination)
 def get_snapshots(request: HttpRequest, filters: Query[SnapshotFilterSchema], with_archiveresults: bool = False):
    """List all Snapshot entries matching these filters."""
-    setattr(request, 'with_archiveresults', with_archiveresults)
-    return filters.filter(Snapshot.objects.all()).distinct()
+    setattr(request, "with_archiveresults", with_archiveresults)
+    queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))
+    return filters.filter(queryset).distinct()


@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot")
 def get_snapshot(request: HttpRequest, snapshot_id: str, with_archiveresults: bool = True):
    """Get a specific Snapshot by id."""
-    setattr(request, 'with_archiveresults', with_archiveresults)
+    setattr(request, "with_archiveresults", with_archiveresults)
+    queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0))
    try:
-        return Snapshot.objects.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
+        return queryset.get(Q(id__startswith=snapshot_id) | Q(timestamp__startswith=snapshot_id))
    except Snapshot.DoesNotExist:
-        return Snapshot.objects.get(Q(id__icontains=snapshot_id))
+        return queryset.get(Q(id__icontains=snapshot_id))


@router.post("/snapshots", response=SnapshotSchema, url_name="create_snapshot")
 def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
    tags = normalize_tag_list(data.tags)
    if data.status is not None and data.status not in Snapshot.StatusChoices.values:
-        raise HttpError(400, f'Invalid status: {data.status}')
+        raise HttpError(400, f"Invalid status: {data.status}")
    if not data.url.strip():
-        raise HttpError(400, 'URL is required')
+        raise HttpError(400, "URL is required")
    if data.depth not in (0, 1, 2, 3, 4):
-        raise HttpError(400, 'depth must be between 0 and 4')
+        raise HttpError(400, "depth must be between 0 and 4")

    if data.crawl_id:
        crawl = Crawl.objects.get(id__icontains=data.crawl_id)
-        crawl_tags = normalize_tag_list(crawl.tags_str.split(','))
+        crawl_tags = normalize_tag_list(crawl.tags_str.split(","))
        tags = tags or crawl_tags
    else:
        crawl = Crawl.objects.create(
            urls=data.url,
            max_depth=max(data.depth, 0),
-            tags_str=','.join(tags),
+            tags_str=",".join(tags),
            status=Crawl.StatusChoices.QUEUED,
            retry_at=timezone.now(),
            created_by=request.user if isinstance(request.user, User) else None,
        )

    snapshot_defaults = {
-        'depth': data.depth,
-        'title': data.title,
-        'timestamp': str(timezone.now().timestamp()),
-        'status': data.status or Snapshot.StatusChoices.QUEUED,
-        'retry_at': timezone.now(),
+        "depth": data.depth,
+        "title": data.title,
+        "timestamp": str(timezone.now().timestamp()),
+        "status": data.status or Snapshot.StatusChoices.QUEUED,
+        "retry_at": timezone.now(),
    }
    snapshot, _ = Snapshot.objects.get_or_create(
        url=data.url,
@@ -309,17 +365,17 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
        defaults=snapshot_defaults,
    )

-    update_fields: List[str] = []
+    update_fields: list[str] = []
    if data.title is not None and snapshot.title != data.title:
        snapshot.title = data.title
-        update_fields.append('title')
+        update_fields.append("title")
    if data.status is not None and snapshot.status != data.status:
        if data.status not in Snapshot.StatusChoices.values:
-            raise HttpError(400, f'Invalid status: {data.status}')
+            raise HttpError(400, f"Invalid status: {data.status}")
        snapshot.status = data.status
-        update_fields.append('status')
+        update_fields.append("status")
    if update_fields:
-        update_fields.append('modified_at')
+        update_fields.append("modified_at")
        snapshot.save(update_fields=update_fields)

    if tags:
@@ -330,7 +386,7 @@ def create_snapshot(request: HttpRequest, data: SnapshotCreateSchema):
    except Exception:
        pass

-    setattr(request, 'with_archiveresults', False)
+    setattr(request, "with_archiveresults", False)
    return snapshot


@@ -343,26 +399,26 @@ def patch_snapshot(request: HttpRequest, snapshot_id: str, data: SnapshotUpdateS
        snapshot = Snapshot.objects.get(Q(id__icontains=snapshot_id))

    payload = data.dict(exclude_unset=True)
-    update_fields = ['modified_at']
-    tags = payload.pop('tags', None)
+    update_fields = ["modified_at"]
+    tags = payload.pop("tags", None)

-    if 'status' in payload:
-        if payload['status'] not in Snapshot.StatusChoices.values:
-            raise HttpError(400, f'Invalid status: {payload["status"]}')
-        snapshot.status = payload['status']
-        if snapshot.status == Snapshot.StatusChoices.SEALED and 'retry_at' not in payload:
+    if "status" in payload:
+        if payload["status"] not in Snapshot.StatusChoices.values:
+            raise HttpError(400, f"Invalid status: {payload['status']}")
+        snapshot.status = payload["status"]
+        if snapshot.status == Snapshot.StatusChoices.SEALED and "retry_at" not in payload:
            snapshot.retry_at = None
-        update_fields.append('status')
+        update_fields.append("status")

-    if 'retry_at' in payload:
-        snapshot.retry_at = payload['retry_at']
-        update_fields.append('retry_at')
+    if "retry_at" in payload:
+        snapshot.retry_at = payload["retry_at"]
+        update_fields.append("retry_at")

    if tags is not None:
        snapshot.save_tags(normalize_tag_list(tags))

    snapshot.save(update_fields=update_fields)
-    setattr(request, 'with_archiveresults', False)
+    setattr(request, "with_archiveresults", False)
    return snapshot


@@ -373,17 +429,18 @@ def delete_snapshot(request: HttpRequest, snapshot_id: str):
    crawl_id_str = str(snapshot.crawl.pk)
    deleted_count, _ = snapshot.delete()
    return {
-        'success': True,
-        'snapshot_id': snapshot_id_str,
-        'crawl_id': crawl_id_str,
-        'deleted_count': deleted_count,
+        "success": True,
+        "snapshot_id": snapshot_id_str,
+        "crawl_id": crawl_id_str,
+        "deleted_count": deleted_count,
    }


 ### Tag #########################################################################

+
 class TagSchema(Schema):
-    TYPE: str = 'core.models.Tag'
+    TYPE: str = "core.models.Tag"
    id: int
    modified_at: datetime
    created_at: datetime
@@ -392,7 +449,7 @@ class TagSchema(Schema):
    name: str
    slug: str
    num_snapshots: int
-    snapshots: List[SnapshotSchema]
+    snapshots: list[SnapshotSchema]

    @staticmethod
    def resolve_created_by_id(obj):
@@ -402,7 +459,7 @@ class TagSchema(Schema):
    def resolve_created_by_username(obj):
        user_model = get_user_model()
        user = user_model.objects.get(id=obj.created_by_id)
-        username = getattr(user, 'username', None)
+        username = getattr(user, "username", None)
        return username if isinstance(username, str) else str(user)

    @staticmethod
@@ -411,58 +468,67 @@ class TagSchema(Schema):

    @staticmethod
    def resolve_snapshots(obj, context):
-        if bool(getattr(context['request'], 'with_snapshots', False)):
+        if bool(getattr(context["request"], "with_snapshots", False)):
            return obj.snapshot_set.all().distinct()
        return Snapshot.objects.none()


-@router.get("/tags", response=List[TagSchema], url_name="get_tags")
+@router.get("/tags", response=list[TagSchema], url_name="get_tags")
@paginate(CustomPagination)
 def get_tags(request: HttpRequest):
-    setattr(request, 'with_snapshots', False)
-    setattr(request, 'with_archiveresults', False)
+    setattr(request, "with_snapshots", False)
+    setattr(request, "with_archiveresults", False)
    return get_matching_tags()


@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
 def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True):
-    setattr(request, 'with_snapshots', with_snapshots)
-    setattr(request, 'with_archiveresults', False)
+    setattr(request, "with_snapshots", with_snapshots)
+    setattr(request, "with_archiveresults", False)
    try:
        return get_tag_by_ref(tag_id)
    except (Tag.DoesNotExist, ValidationError):
-        raise HttpError(404, 'Tag not found')
+        raise HttpError(404, "Tag not found")


-@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
+@router.get(
+    "/any/{id}",
+    response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema],
+    url_name="get_any",
+    summary="Get any object by its ID",
+)
 def get_any(request: HttpRequest, id: str):
    """Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
-    setattr(request, 'with_snapshots', False)
-    setattr(request, 'with_archiveresults', False)
+    setattr(request, "with_snapshots", False)
+    setattr(request, "with_archiveresults", False)

    for getter in [get_snapshot, get_archiveresult, get_tag]:
        try:
            response = getter(request, id)
            if isinstance(response, Model):
-                return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
+                return redirect(
+                    f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}",
+                )
        except Exception:
            pass

    try:
        from archivebox.api.v1_crawls import get_crawl
+
        response = get_crawl(request, id)
        if isinstance(response, Model):
            return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
    except Exception:
        pass

-    raise HttpError(404, 'Object with given ID not found')
+    raise HttpError(404, "Object with given ID not found")


 ### Tag Editor API Endpoints #########################################################################

+
 class TagAutocompleteSchema(Schema):
-    tags: List[dict]
+    tags: list[dict]


 class TagCreateSchema(Schema):
@@ -483,7 +549,7 @@ class TagSearchSnapshotSchema(Schema):
    favicon_url: str
    admin_url: str
    archive_url: str
-    downloaded_at: Optional[str] = None
+    downloaded_at: str | None = None


 class TagSearchCardSchema(Schema):
@@ -497,11 +563,11 @@ class TagSearchCardSchema(Schema):
    export_jsonl_url: str
    rename_url: str
    delete_url: str
-    snapshots: List[TagSearchSnapshotSchema]
+    snapshots: list[TagSearchSnapshotSchema]


 class TagSearchResponseSchema(Schema):
-    tags: List[TagSearchCardSchema]
+    tags: list[TagSearchCardSchema]
    sort: str
    created_by: str
    year: str
@@ -527,8 +593,8 @@ class TagDeleteResponseSchema(Schema):

 class TagSnapshotRequestSchema(Schema):
    snapshot_id: str
-    tag_name: Optional[str] = None
-    tag_id: Optional[int] = None
+    tag_name: str | None = None
+    tag_id: int | None = None


 class TagSnapshotResponseSchema(Schema):
@@ -541,10 +607,10 @@ class TagSnapshotResponseSchema(Schema):
 def search_tags(
    request: HttpRequest,
    q: str = "",
-    sort: str = 'created_desc',
-    created_by: str = '',
-    year: str = '',
-    has_snapshots: str = 'all',
+    sort: str = "created_desc",
+    created_by: str = "",
+    year: str = "",
+    has_snapshots: str = "all",
 ):
    """Return detailed tag cards for admin/live-search UIs."""
    normalized_sort = normalize_tag_sort(sort)
@@ -552,7 +618,7 @@ def search_tags(
    normalized_year = normalize_created_year_filter(year)
    normalized_has_snapshots = normalize_has_snapshots_filter(has_snapshots)
    return {
-        'tags': build_tag_cards(
+        "tags": build_tag_cards(
            query=q,
            request=request,
            sort=normalized_sort,
@@ -560,28 +626,28 @@ def search_tags(
            year=normalized_year,
            has_snapshots=normalized_has_snapshots,
        ),
-        'sort': normalized_sort,
-        'created_by': normalized_created_by,
-        'year': normalized_year,
-        'has_snapshots': normalized_has_snapshots,
+        "sort": normalized_sort,
+        "created_by": normalized_created_by,
+        "year": normalized_year,
+        "has_snapshots": normalized_has_snapshots,
    }


 def _public_tag_listing_enabled() -> bool:
-    explicit = getattr(settings, 'PUBLIC_SNAPSHOTS_LIST', None)
+    explicit = getattr(settings, "PUBLIC_SNAPSHOTS_LIST", None)
    if explicit is not None:
        return bool(explicit)
-    return bool(getattr(settings, 'PUBLIC_INDEX', SERVER_CONFIG.PUBLIC_INDEX))
+    return bool(getattr(settings, "PUBLIC_INDEX", SERVER_CONFIG.PUBLIC_INDEX))


 def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
-    user = getattr(request, 'user', None)
-    if getattr(user, 'is_authenticated', False):
+    user = getattr(request, "user", None)
+    if getattr(user, "is_authenticated", False):
        return True

-    token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key')
-    auth_header = request.headers.get('Authorization', '')
-    if not token and auth_header.lower().startswith('bearer '):
+    token = request.GET.get("api_key") or request.headers.get("X-ArchiveBox-API-Key")
+    auth_header = request.headers.get("Authorization", "")
+    if not token and auth_header.lower().startswith("bearer "):
        token = auth_header.split(None, 1)[1].strip()

    if token and auth_using_token(token=token, request=request):
@@ -594,12 +660,12 @@ def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
 def tags_autocomplete(request: HttpRequest, q: str = ""):
    """Return tags matching the query for autocomplete."""
    if not _request_has_tag_autocomplete_access(request):
-        raise HttpError(401, 'Authentication required')
+        raise HttpError(401, "Authentication required")

-    tags = get_matching_tags(q)[:50 if not q else 20]
+    tags = get_matching_tags(q)[: 50 if not q else 20]

    return {
-        'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug, 'num_snapshots': getattr(tag, 'num_snapshots', 0)} for tag in tags]
+        "tags": [{"id": tag.pk, "name": tag.name, "slug": tag.slug, "num_snapshots": getattr(tag, "num_snapshots", 0)} for tag in tags],
    }


@@ -615,10 +681,10 @@ def tags_create(request: HttpRequest, data: TagCreateSchema):
        raise HttpError(400, str(err)) from err

    return {
-        'success': True,
-        'tag_id': tag.pk,
-        'tag_name': tag.name,
-        'created': created,
+        "success": True,
+        "tag_id": tag.pk,
+        "tag_name": tag.name,
+        "created": created,
    }


@@ -627,15 +693,15 @@ def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema):
    try:
        tag = rename_tag_record(get_tag_by_ref(tag_id), data.name)
    except Tag.DoesNotExist as err:
-        raise HttpError(404, 'Tag not found') from err
+        raise HttpError(404, "Tag not found") from err
    except ValueError as err:
        raise HttpError(400, str(err)) from err

    return {
-        'success': True,
-        'tag_id': tag.pk,
-        'tag_name': tag.name,
-        'slug': tag.slug,
+        "success": True,
+        "tag_id": tag.pk,
+        "tag_name": tag.name,
+        "slug": tag.slug,
    }


@@ -644,13 +710,13 @@ def delete_tag(request: HttpRequest, tag_id: int):
    try:
        tag = get_tag_by_ref(tag_id)
    except Tag.DoesNotExist as err:
-        raise HttpError(404, 'Tag not found') from err
+        raise HttpError(404, "Tag not found") from err

    deleted_count, _ = delete_tag_record(tag)
    return {
-        'success': True,
-        'tag_id': int(tag_id),
-        'deleted_count': deleted_count,
+        "success": True,
+        "tag_id": int(tag_id),
+        "deleted_count": deleted_count,
    }


@@ -659,10 +725,10 @@ def tag_urls_export(request: HttpRequest, tag_id: int):
    try:
        tag = get_tag_by_ref(tag_id)
    except Tag.DoesNotExist as err:
-        raise HttpError(404, 'Tag not found') from err
+        raise HttpError(404, "Tag not found") from err

-    response = HttpResponse(export_tag_urls(tag), content_type='text/plain; charset=utf-8')
-    response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
+    response = HttpResponse(export_tag_urls(tag), content_type="text/plain; charset=utf-8")
+    response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-urls.txt"'
    return response


@@ -671,10 +737,10 @@ def tag_snapshots_export(request: HttpRequest, tag_id: int):
    try:
        tag = get_tag_by_ref(tag_id)
    except Tag.DoesNotExist as err:
-        raise HttpError(404, 'Tag not found') from err
+        raise HttpError(404, "Tag not found") from err

-    response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type='application/x-ndjson; charset=utf-8')
-    response['Content-Disposition'] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
+    response = HttpResponse(export_tag_snapshots_jsonl(tag), content_type="application/x-ndjson; charset=utf-8")
+    response["Content-Disposition"] = f'attachment; filename="tag-{tag.slug}-snapshots.jsonl"'
    return response


@@ -684,16 +750,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
    # Get the snapshot
    try:
        snapshot = Snapshot.objects.get(
-            Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
+            Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
        )
    except Snapshot.DoesNotExist:
-        raise HttpError(404, 'Snapshot not found')
+        raise HttpError(404, "Snapshot not found")
    except Snapshot.MultipleObjectsReturned:
        snapshot = Snapshot.objects.filter(
-            Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
+            Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
        ).first()
        if snapshot is None:
-            raise HttpError(404, 'Snapshot not found')
+            raise HttpError(404, "Snapshot not found")

    # Get or create the tag
    if data.tag_name:
@@ -708,17 +774,17 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
        try:
            tag = get_tag_by_ref(data.tag_id)
        except Tag.DoesNotExist:
-            raise HttpError(404, 'Tag not found')
+            raise HttpError(404, "Tag not found")
    else:
-        raise HttpError(400, 'Either tag_name or tag_id is required')
+        raise HttpError(400, "Either tag_name or tag_id is required")

    # Add the tag to the snapshot
    snapshot.tags.add(tag.pk)

    return {
-        'success': True,
-        'tag_id': tag.pk,
-        'tag_name': tag.name,
+        "success": True,
+        "tag_id": tag.pk,
+        "tag_name": tag.name,
    }


@@ -728,36 +794,36 @@ def tags_remove_from_snapshot(request: HttpRequest, data: TagSnapshotRequestSche
    # Get the snapshot
    try:
        snapshot = Snapshot.objects.get(
-            Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
+            Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
        )
    except Snapshot.DoesNotExist:
-        raise HttpError(404, 'Snapshot not found')
+        raise HttpError(404, "Snapshot not found")
    except Snapshot.MultipleObjectsReturned:
        snapshot = Snapshot.objects.filter(
-            Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id)
+            Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id),
        ).first()
        if snapshot is None:
-            raise HttpError(404, 'Snapshot not found')
+            raise HttpError(404, "Snapshot not found")

    # Get the tag
    if data.tag_id:
        try:
            tag = Tag.objects.get(pk=data.tag_id)
        except Tag.DoesNotExist:
-            raise HttpError(404, 'Tag not found')
+            raise HttpError(404, "Tag not found")
    elif data.tag_name:
        try:
            tag = Tag.objects.get(name__iexact=data.tag_name.strip())
        except Tag.DoesNotExist:
-            raise HttpError(404, 'Tag not found')
+            raise HttpError(404, "Tag not found")
    else:
-        raise HttpError(400, 'Either tag_name or tag_id is required')
+        raise HttpError(400, "Either tag_name or tag_id is required")

    # Remove the tag from the snapshot
    snapshot.tags.remove(tag.pk)

    return {
-        'success': True,
-        'tag_id': tag.pk,
-        'tag_name': tag.name,
+        "success": True,
+        "tag_id": tag.pk,
+        "tag_name": tag.name,
    }
--- a/archivebox/api/v1_crawls.py
+++ b/archivebox/api/v1_crawls.py
@@ -1,7 +1,6 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

 from uuid import UUID
-from typing import List, Optional
 from datetime import datetime
 from django.http import HttpRequest
 from django.utils import timezone
@@ -17,11 +16,11 @@ from archivebox.crawls.models import Crawl

 from .auth import API_AUTH_METHODS

-router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
+router = Router(tags=["Crawl Models"], auth=API_AUTH_METHODS)


 class CrawlSchema(Schema):
-    TYPE: str = 'crawls.models.Crawl'
+    TYPE: str = "crawls.models.Crawl"

    id: UUID

@@ -35,6 +34,8 @@ class CrawlSchema(Schema):

    urls: str
    max_depth: int
+    max_urls: int
+    max_size: int
    tags_str: str
    config: dict

@@ -48,12 +49,12 @@ class CrawlSchema(Schema):
    def resolve_created_by_username(obj):
        user_model = get_user_model()
        user = user_model.objects.get(id=obj.created_by_id)
-        username = getattr(user, 'username', None)
+        username = getattr(user, "username", None)
        return username if isinstance(username, str) else str(user)

    @staticmethod
    def resolve_snapshots(obj, context):
-        if bool(getattr(context['request'], 'with_snapshots', False)):
+        if bool(getattr(context["request"], "with_snapshots", False)):
            return obj.snapshot_set.all().distinct()
        return Snapshot.objects.none()

@@ -61,17 +62,19 @@ class CrawlSchema(Schema):
 class CrawlUpdateSchema(Schema):
    status: str | None = None
    retry_at: datetime | None = None
-    tags: Optional[List[str]] = None
+    tags: list[str] | None = None
    tags_str: str | None = None


 class CrawlCreateSchema(Schema):
-    urls: List[str]
+    urls: list[str]
    max_depth: int = 0
-    tags: Optional[List[str]] = None
-    tags_str: str = ''
-    label: str = ''
-    notes: str = ''
+    max_urls: int = 0
+    max_size: int = 0
+    tags: list[str] | None = None
+    tags_str: str = ""
+    label: str = ""
+    notes: str = ""
    config: dict = {}


@@ -82,13 +85,13 @@ class CrawlDeleteResponseSchema(Schema):
    deleted_snapshots: int


-def normalize_tag_list(tags: Optional[List[str]] = None, tags_str: str = '') -> List[str]:
+def normalize_tag_list(tags: list[str] | None = None, tags_str: str = "") -> list[str]:
    if tags is not None:
        return [tag.strip() for tag in tags if tag and tag.strip()]
-    return [tag.strip() for tag in tags_str.split(',') if tag.strip()]
+    return [tag.strip() for tag in tags_str.split(",") if tag.strip()]


-@router.get("/crawls", response=List[CrawlSchema], url_name="get_crawls")
+@router.get("/crawls", response=list[CrawlSchema], url_name="get_crawls")
 def get_crawls(request: HttpRequest):
    return Crawl.objects.all().distinct()

@@ -97,15 +100,21 @@ def get_crawls(request: HttpRequest):
 def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
    urls = [url.strip() for url in data.urls if url and url.strip()]
    if not urls:
-        raise HttpError(400, 'At least one URL is required')
+        raise HttpError(400, "At least one URL is required")
    if data.max_depth not in (0, 1, 2, 3, 4):
-        raise HttpError(400, 'max_depth must be between 0 and 4')
+        raise HttpError(400, "max_depth must be between 0 and 4")
+    if data.max_urls < 0:
+        raise HttpError(400, "max_urls must be >= 0")
+    if data.max_size < 0:
+        raise HttpError(400, "max_size must be >= 0")

    tags = normalize_tag_list(data.tags, data.tags_str)
    crawl = Crawl.objects.create(
-        urls='\n'.join(urls),
+        urls="\n".join(urls),
        max_depth=data.max_depth,
-        tags_str=','.join(tags),
+        max_urls=data.max_urls,
+        max_size=data.max_size,
+        tags_str=",".join(tags),
        label=data.label,
        notes=data.notes,
        config=data.config,
@@ -116,25 +125,26 @@ def create_crawl(request: HttpRequest, data: CrawlCreateSchema):
    crawl.create_snapshots_from_urls()
    return crawl

+
@router.get("/crawl/{crawl_id}", response=CrawlSchema | str, url_name="get_crawl")
-def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool=False, with_snapshots: bool=False, with_archiveresults: bool=False):
+def get_crawl(request: HttpRequest, crawl_id: str, as_rss: bool = False, with_snapshots: bool = False, with_archiveresults: bool = False):
    """Get a specific Crawl by id."""
-    setattr(request, 'with_snapshots', with_snapshots)
-    setattr(request, 'with_archiveresults', with_archiveresults)
+    setattr(request, "with_snapshots", with_snapshots)
+    setattr(request, "with_archiveresults", with_archiveresults)
    crawl = Crawl.objects.get(id__icontains=crawl_id)
-    
+
    if crawl and as_rss:
        # return snapshots as XML rss feed
        urls = [
-            {'url': snapshot.url, 'title': snapshot.title, 'bookmarked_at': snapshot.bookmarked_at, 'tags': snapshot.tags_str}
+            {"url": snapshot.url, "title": snapshot.title, "bookmarked_at": snapshot.bookmarked_at, "tags": snapshot.tags_str}
            for snapshot in crawl.snapshot_set.all()
        ]
        xml = '<rss version="2.0"><channel>'
        for url in urls:
-            xml += f'<item><url>{url["url"]}</url><title>{url["title"]}</title><bookmarked_at>{url["bookmarked_at"]}</bookmarked_at><tags>{url["tags"]}</tags></item>'
-        xml += '</channel></rss>'
+            xml += f"<item><url>{url['url']}</url><title>{url['title']}</title><bookmarked_at>{url['bookmarked_at']}</bookmarked_at><tags>{url['tags']}</tags></item>"
+        xml += "</channel></rss>"
        return xml
-    
+
    return crawl


@@ -143,29 +153,29 @@ def patch_crawl(request: HttpRequest, crawl_id: str, data: CrawlUpdateSchema):
    """Update a crawl (e.g., set status=sealed to cancel queued work)."""
    crawl = Crawl.objects.get(id__icontains=crawl_id)
    payload = data.dict(exclude_unset=True)
-    update_fields = ['modified_at']
+    update_fields = ["modified_at"]

-    tags = payload.pop('tags', None)
-    tags_str = payload.pop('tags_str', None)
+    tags = payload.pop("tags", None)
+    tags_str = payload.pop("tags_str", None)
    if tags is not None or tags_str is not None:
-        crawl.tags_str = ','.join(normalize_tag_list(tags, tags_str or ''))
-        update_fields.append('tags_str')
+        crawl.tags_str = ",".join(normalize_tag_list(tags, tags_str or ""))
+        update_fields.append("tags_str")

-    if 'status' in payload:
-        if payload['status'] not in Crawl.StatusChoices.values:
-            raise HttpError(400, f'Invalid status: {payload["status"]}')
-        crawl.status = payload['status']
-        if crawl.status == Crawl.StatusChoices.SEALED and 'retry_at' not in payload:
+    if "status" in payload:
+        if payload["status"] not in Crawl.StatusChoices.values:
+            raise HttpError(400, f"Invalid status: {payload['status']}")
+        crawl.status = payload["status"]
+        if crawl.status == Crawl.StatusChoices.SEALED and "retry_at" not in payload:
            crawl.retry_at = None
-        update_fields.append('status')
+        update_fields.append("status")

-    if 'retry_at' in payload:
-        crawl.retry_at = payload['retry_at']
-        update_fields.append('retry_at')
+    if "retry_at" in payload:
+        crawl.retry_at = payload["retry_at"]
+        update_fields.append("retry_at")

    crawl.save(update_fields=update_fields)

-    if payload.get('status') == Crawl.StatusChoices.SEALED:
+    if payload.get("status") == Crawl.StatusChoices.SEALED:
        Snapshot.objects.filter(
            crawl=crawl,
            status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
@@ -184,8 +194,8 @@ def delete_crawl(request: HttpRequest, crawl_id: str):
    snapshot_count = crawl.snapshot_set.count()
    deleted_count, _ = crawl.delete()
    return {
-        'success': True,
-        'crawl_id': crawl_id_str,
-        'deleted_count': deleted_count,
-        'deleted_snapshots': snapshot_count,
+        "success": True,
+        "crawl_id": crawl_id_str,
+        "deleted_count": deleted_count,
+        "deleted_snapshots": snapshot_count,
    }
--- a/archivebox/api/v1_machine.py
+++ b/archivebox/api/v1_machine.py
@@ -1,7 +1,7 @@
-__package__ = 'archivebox.api'
+__package__ = "archivebox.api"

 from uuid import UUID
-from typing import Annotated, List, Optional
+from typing import Annotated
 from datetime import datetime

 from django.http import HttpRequest
@@ -12,16 +12,18 @@ from ninja.pagination import paginate
 from archivebox.api.v1_core import CustomPagination


-router = Router(tags=['Machine and Dependencies'])
+router = Router(tags=["Machine and Dependencies"])


 # ============================================================================
 # Machine Schemas
 # ============================================================================

+
 class MachineSchema(Schema):
    """Schema for Machine model."""
-    TYPE: str = 'machine.Machine'
+
+    TYPE: str = "machine.Machine"
    id: UUID
    created_at: datetime
    modified_at: datetime
@@ -43,22 +45,24 @@ class MachineSchema(Schema):


 class MachineFilterSchema(FilterSchema):
-    id: Annotated[Optional[str], FilterLookup('id__startswith')] = None
-    hostname: Annotated[Optional[str], FilterLookup('hostname__icontains')] = None
-    os_platform: Annotated[Optional[str], FilterLookup('os_platform__icontains')] = None
-    os_arch: Annotated[Optional[str], FilterLookup('os_arch')] = None
-    hw_in_docker: Annotated[Optional[bool], FilterLookup('hw_in_docker')] = None
-    hw_in_vm: Annotated[Optional[bool], FilterLookup('hw_in_vm')] = None
-    bin_providers: Annotated[Optional[str], FilterLookup('bin_providers__icontains')] = None
+    id: Annotated[str | None, FilterLookup("id__startswith")] = None
+    hostname: Annotated[str | None, FilterLookup("hostname__icontains")] = None
+    os_platform: Annotated[str | None, FilterLookup("os_platform__icontains")] = None
+    os_arch: Annotated[str | None, FilterLookup("os_arch")] = None
+    hw_in_docker: Annotated[bool | None, FilterLookup("hw_in_docker")] = None
+    hw_in_vm: Annotated[bool | None, FilterLookup("hw_in_vm")] = None
+    bin_providers: Annotated[str | None, FilterLookup("bin_providers__icontains")] = None


 # ============================================================================
 # Binary Schemas
 # ============================================================================

+
 class BinarySchema(Schema):
    """Schema for Binary model."""
-    TYPE: str = 'machine.Binary'
+
+    TYPE: str = "machine.Binary"
    id: UUID
    created_at: datetime
    modified_at: datetime
@@ -85,23 +89,25 @@ class BinarySchema(Schema):


 class BinaryFilterSchema(FilterSchema):
-    id: Annotated[Optional[str], FilterLookup('id__startswith')] = None
-    name: Annotated[Optional[str], FilterLookup('name__icontains')] = None
-    binprovider: Annotated[Optional[str], FilterLookup('binprovider')] = None
-    status: Annotated[Optional[str], FilterLookup('status')] = None
-    machine_id: Annotated[Optional[str], FilterLookup('machine_id__startswith')] = None
-    version: Annotated[Optional[str], FilterLookup('version__icontains')] = None
+    id: Annotated[str | None, FilterLookup("id__startswith")] = None
+    name: Annotated[str | None, FilterLookup("name__icontains")] = None
+    binprovider: Annotated[str | None, FilterLookup("binprovider")] = None
+    status: Annotated[str | None, FilterLookup("status")] = None
+    machine_id: Annotated[str | None, FilterLookup("machine_id__startswith")] = None
+    version: Annotated[str | None, FilterLookup("version__icontains")] = None


 # ============================================================================
 # Machine Endpoints
 # ============================================================================

-@router.get("/machines", response=List[MachineSchema], url_name="get_machines")
+
+@router.get("/machines", response=list[MachineSchema], url_name="get_machines")
@paginate(CustomPagination)
 def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
    """List all machines."""
    from archivebox.machine.models import Machine
+
    return filters.filter(Machine.objects.all()).distinct()


@@ -109,6 +115,7 @@ def get_machines(request: HttpRequest, filters: Query[MachineFilterSchema]):
 def get_current_machine(request: HttpRequest):
    """Get the current machine."""
    from archivebox.machine.models import Machine
+
    return Machine.current()


@@ -117,6 +124,7 @@ def get_machine(request: HttpRequest, machine_id: str):
    """Get a specific machine by ID."""
    from archivebox.machine.models import Machine
    from django.db.models import Q
+
    return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))


@@ -127,23 +135,27 @@ def get_machine(request: HttpRequest, machine_id: str):
 # Binary Endpoints
 # ============================================================================

-@router.get("/binaries", response=List[BinarySchema], url_name="get_binaries")
+
+@router.get("/binaries", response=list[BinarySchema], url_name="get_binaries")
@paginate(CustomPagination)
 def get_binaries(request: HttpRequest, filters: Query[BinaryFilterSchema]):
    """List all binaries."""
    from archivebox.machine.models import Binary
-    return filters.filter(Binary.objects.all().select_related('machine')).distinct()
+
+    return filters.filter(Binary.objects.all().select_related("machine")).distinct()


@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
 def get_binary(request: HttpRequest, binary_id: str):
    """Get a specific binary by ID."""
    from archivebox.machine.models import Binary
-    return Binary.objects.select_related('machine').get(id__startswith=binary_id)
+
+    return Binary.objects.select_related("machine").get(id__startswith=binary_id)


-@router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
+@router.get("/binary/by-name/{name}", response=list[BinarySchema], url_name="get_binaries_by_name")
 def get_binaries_by_name(request: HttpRequest, name: str):
    """Get all binaries with the given name."""
    from archivebox.machine.models import Binary
-    return list(Binary.objects.filter(name__iexact=name).select_related('machine'))
+
+    return list(Binary.objects.filter(name__iexact=name).select_related("machine"))