WIP: checkpoint working tree before rebasing onto dev

2026-04-06 07:47:53 +10:00 · 2026-03-22 20:23:45 -07:00
parent a6548df8d0
commit f400a2cd67
87 changed files with 12607 additions and 1808 deletions
--- a/archivebox/api/urls.py
+++ b/archivebox/api/urls.py
@@ -6,8 +6,9 @@ from django.views.generic.base import RedirectView
 from .v1_api import urls as v1_api_urls

 urlpatterns = [
-    path("",                 RedirectView.as_view(url='/api/v1')),
+    path("",                 RedirectView.as_view(url='/api/v1/docs')),

+    path("v1/",              RedirectView.as_view(url='/api/v1/docs')),
    path("v1/",              v1_api_urls),
    path("v1",               RedirectView.as_view(url='/api/v1/docs')),

--- a/archivebox/api/v1_core.py
+++ b/archivebox/api/v1_core.py
@@ -6,7 +6,8 @@ from typing import List, Optional, Union, Any, Annotated
 from datetime import datetime

 from django.db.models import Model, Q
-from django.http import HttpRequest
+from django.conf import settings
+from django.http import HttpRequest, HttpResponse
 from django.core.exceptions import ValidationError
 from django.contrib.auth import get_user_model
 from django.contrib.auth.models import User
@@ -18,6 +19,22 @@ from ninja.pagination import paginate, PaginationBase
 from ninja.errors import HttpError

 from archivebox.core.models import Snapshot, ArchiveResult, Tag
+from archivebox.api.auth import auth_using_token
+from archivebox.config.common import SERVER_CONFIG
+from archivebox.core.tag_utils import (
+    build_tag_cards,
+    delete_tag as delete_tag_record,
+    export_tag_snapshots_jsonl,
+    export_tag_urls,
+    get_matching_tags,
+    get_or_create_tag,
+    get_tag_by_ref,
+    normalize_created_by_filter,
+    normalize_created_year_filter,
+    normalize_has_snapshots_filter,
+    normalize_tag_sort,
+    rename_tag as rename_tag_record,
+)
 from archivebox.crawls.models import Crawl
 from archivebox.api.v1_crawls import CrawlSchema

@@ -404,7 +421,7 @@ class TagSchema(Schema):
 def get_tags(request: HttpRequest):
    setattr(request, 'with_snapshots', False)
    setattr(request, 'with_archiveresults', False)
-    return Tag.objects.all().distinct()
+    return get_matching_tags()


@router.get("/tag/{tag_id}", response=TagSchema, url_name="get_tag")
@@ -412,9 +429,9 @@ def get_tag(request: HttpRequest, tag_id: str, with_snapshots: bool = True):
    setattr(request, 'with_snapshots', with_snapshots)
    setattr(request, 'with_archiveresults', False)
    try:
-        return Tag.objects.get(id__icontains=tag_id)
+        return get_tag_by_ref(tag_id)
    except (Tag.DoesNotExist, ValidationError):
-        return Tag.objects.get(slug__icontains=tag_id)
+        raise HttpError(404, 'Tag not found')


@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
@@ -459,6 +476,55 @@ class TagCreateResponseSchema(Schema):
    created: bool


+class TagSearchSnapshotSchema(Schema):  # one item of TagSearchCardSchema.snapshots
+    id: str
+    title: str
+    url: str
+    favicon_url: str
+    admin_url: str
+    archive_url: str
+    downloaded_at: Optional[str] = None  # the only optional field; defaults to None
+
+
+class TagSearchCardSchema(Schema):  # one item of TagSearchResponseSchema.tags
+    id: int
+    name: str
+    slug: str
+    num_snapshots: int
+    filter_url: str  # NOTE(review): the *_url fields look like per-tag UI action links; confirm against build_tag_cards
+    edit_url: str
+    export_urls_url: str
+    export_jsonl_url: str
+    rename_url: str
+    delete_url: str
+    snapshots: List[TagSearchSnapshotSchema]  # nested snapshot entries for this tag
+
+
+class TagSearchResponseSchema(Schema):  # response body of the /tags/search/ endpoint
+    tags: List[TagSearchCardSchema]  # filled from build_tag_cards(...)
+    sort: str  # result of normalize_tag_sort on the `sort` query parameter
+    created_by: str  # result of normalize_created_by_filter on `created_by`
+    year: str  # result of normalize_created_year_filter on `year`
+    has_snapshots: str  # result of normalize_has_snapshots_filter on `has_snapshots`
+
+
+class TagUpdateSchema(Schema):  # request body of the tag rename endpoint
+    name: str  # handed to rename_tag_record as data.name
+
+
+class TagUpdateResponseSchema(Schema):  # response body of the tag rename endpoint
+    success: bool  # rename_tag always sends True; failures are raised as HttpError instead
+    tag_id: int  # pk of the tag returned by rename_tag_record
+    tag_name: str  # name of the tag returned by rename_tag_record
+    slug: str  # slug of the tag returned by rename_tag_record
+
+
+class TagDeleteResponseSchema(Schema):  # response body of the tag delete endpoint
+    success: bool  # delete_tag always sends True; a missing tag is raised as HttpError 404
+    tag_id: int  # the id from the request path, echoed back
+    deleted_count: int  # first element of the pair returned by delete_tag_record
+
+
 class TagSnapshotRequestSchema(Schema):
    snapshot_id: str
    tag_name: Optional[str] = None
@@ -471,41 +537,82 @@ class TagSnapshotResponseSchema(Schema):
    tag_name: str


-@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete")
+@router.get("/tags/search/", response=TagSearchResponseSchema, url_name="search_tags")
+def search_tags(
+    request: HttpRequest,
+    q: str = "",
+    sort: str = 'created_desc',
+    created_by: str = '',
+    year: str = '',
+    has_snapshots: str = 'all',
+):
+    """Return detailed tag cards for admin/live-search UIs."""
+    # Normalize every raw filter once; the same values drive the lookup
+    # and are echoed back to the caller below.
+    filters = {
+        'sort': normalize_tag_sort(sort),
+        'created_by': normalize_created_by_filter(created_by),
+        'year': normalize_created_year_filter(year),
+        'has_snapshots': normalize_has_snapshots_filter(has_snapshots),
+    }
+
+    cards = build_tag_cards(
+        query=q,
+        request=request,
+        **filters,
+    )
+    # 'tags' comes first, then the filters in the order built above.
+    payload = {'tags': cards}
+    payload.update(filters)
+    return payload
+
+
+def _public_tag_listing_enabled() -> bool:
+    # settings.PUBLIC_SNAPSHOTS_LIST wins when set (not None); otherwise fall back to PUBLIC_INDEX.
+    flag = getattr(settings, 'PUBLIC_SNAPSHOTS_LIST', None)
+    flag = getattr(settings, 'PUBLIC_INDEX', SERVER_CONFIG.PUBLIC_INDEX) if flag is None else flag
+    return bool(flag)
+
+
+def _request_has_tag_autocomplete_access(request: HttpRequest) -> bool:
+    # Grants access to a logged-in user, a valid API token, or anyone when public tag listing is on.
+    user = getattr(request, 'user', None)
+    if getattr(user, 'is_authenticated', False):
+        return True
+    # Token lookup order: ?api_key= query param, X-ArchiveBox-API-Key header, then Bearer auth.
+    token = request.GET.get('api_key') or request.headers.get('X-ArchiveBox-API-Key')
+    if not token:
+        # partition() tolerates an empty "Bearer " credential; split(None, 1)[1] raised IndexError.
+        scheme, _, credentials = request.headers.get('Authorization', '').partition(' ')
+        token = credentials.strip() if scheme.lower() == 'bearer' else token
+    if token and auth_using_token(token=token, request=request):
+        return True
+    return _public_tag_listing_enabled()
+
+
+@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete", auth=None)
 def tags_autocomplete(request: HttpRequest, q: str = ""):
    """Return tags matching the query for autocomplete."""
-    if not q:
-        # Return all tags if no query (limited to 50)
-        tags = Tag.objects.all().order_by('name')[:50]
-    else:
-        tags = Tag.objects.filter(name__icontains=q).order_by('name')[:20]
+    if not _request_has_tag_autocomplete_access(request):
+        raise HttpError(401, 'Authentication required')
+
+    tags = get_matching_tags(q)[:50 if not q else 20]

    return {
-        'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug} for tag in tags]
+        'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug, 'num_snapshots': getattr(tag, 'num_snapshots', 0)} for tag in tags]
    }


@router.post("/tags/create/", response=TagCreateResponseSchema, url_name="tags_create")
 def tags_create(request: HttpRequest, data: TagCreateSchema):
    """Create a new tag or return existing one."""
-    name = data.name.strip()
-    if not name:
-        raise HttpError(400, 'Tag name is required')
-
-    tag, created = Tag.objects.get_or_create(
-        name__iexact=name,
-        defaults={
-            'name': name,
-            'created_by': request.user if request.user.is_authenticated else None,
-        }
-    )
-
-    # If found by case-insensitive match, use that tag
-    if not created:
-        existing_tag = Tag.objects.filter(name__iexact=name).first()
-        if existing_tag is None:
-            raise HttpError(500, 'Failed to load existing tag after get_or_create')
-        tag = existing_tag
+    try:
+        tag, created = get_or_create_tag(
+            data.name,
+            created_by=request.user if request.user.is_authenticated else None,
+        )
+    except ValueError as err:
+        raise HttpError(400, str(err)) from err

    return {
        'success': True,
@@ -515,6 +622,62 @@ def tags_create(request: HttpRequest, data: TagCreateSchema):
    }


+@router.post("/tag/{tag_id}/rename", response=TagUpdateResponseSchema, url_name="rename_tag")
+def rename_tag(request: HttpRequest, tag_id: int, data: TagUpdateSchema):
+    # Lookup and rename share one try: a missing tag maps to 404, a ValueError from either step to 400.
+    try:
+        renamed = rename_tag_record(get_tag_by_ref(tag_id), data.name)
+    except Tag.DoesNotExist as missing:
+        raise HttpError(404, 'Tag not found') from missing
+    except ValueError as invalid:
+        raise HttpError(400, str(invalid)) from invalid
+    return dict(
+        success=True,
+        tag_id=renamed.pk,
+        tag_name=renamed.name,
+        slug=renamed.slug,
+    )
+
+
+@router.delete("/tag/{tag_id}", response=TagDeleteResponseSchema, url_name="delete_tag")
+def delete_tag(request: HttpRequest, tag_id: int):
+    try:
+        doomed = get_tag_by_ref(tag_id)
+    except Tag.DoesNotExist as missing:
+        raise HttpError(404, 'Tag not found') from missing
+    # delete_tag_record returns a pair; only its first element is reported to the caller.
+    removed_total, _ = delete_tag_record(doomed)
+    return dict(
+        success=True,
+        tag_id=int(tag_id),
+        deleted_count=removed_total,
+    )
+
+
+@router.get("/tag/{tag_id}/urls.txt", url_name="tag_urls_export")
+def tag_urls_export(request: HttpRequest, tag_id: int):
+    try:
+        matched_tag = get_tag_by_ref(tag_id)
+    except Tag.DoesNotExist as missing:
+        raise HttpError(404, 'Tag not found') from missing
+    # Serve the export as a plain-text attachment whose filename embeds the tag's slug.
+    download = HttpResponse(export_tag_urls(matched_tag), content_type='text/plain; charset=utf-8')
+    download['Content-Disposition'] = f'attachment; filename="tag-{matched_tag.slug}-urls.txt"'
+    return download
+
+
+@router.get("/tag/{tag_id}/snapshots.jsonl", url_name="tag_snapshots_export")
+def tag_snapshots_export(request: HttpRequest, tag_id: int):
+    try:
+        matched_tag = get_tag_by_ref(tag_id)
+    except Tag.DoesNotExist as missing:
+        raise HttpError(404, 'Tag not found') from missing
+    # Serve the export as an NDJSON attachment whose filename embeds the tag's slug.
+    download = HttpResponse(export_tag_snapshots_jsonl(matched_tag), content_type='application/x-ndjson; charset=utf-8')
+    download['Content-Disposition'] = f'attachment; filename="tag-{matched_tag.slug}-snapshots.jsonl"'
+    return download
+
+
@router.post("/tags/add-to-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_add_to_snapshot")
 def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):
    """Add a tag to a snapshot. Creates the tag if it doesn't exist."""
@@ -534,24 +697,16 @@ def tags_add_to_snapshot(request: HttpRequest, data: TagSnapshotRequestSchema):

    # Get or create the tag
    if data.tag_name:
-        name = data.tag_name.strip()
-        if not name:
-            raise HttpError(400, 'Tag name is required')
-
-        tag, _ = Tag.objects.get_or_create(
-            name__iexact=name,
-            defaults={
-                'name': name,
-                'created_by': request.user if request.user.is_authenticated else None,
-            }
-        )
-        # If found by case-insensitive match, use that tag
-        existing_tag = Tag.objects.filter(name__iexact=name).first()
-        if existing_tag is not None:
-            tag = existing_tag
+        try:
+            tag, _ = get_or_create_tag(
+                data.tag_name,
+                created_by=request.user if request.user.is_authenticated else None,
+            )
+        except ValueError as err:
+            raise HttpError(400, str(err)) from err
    elif data.tag_id:
        try:
-            tag = Tag.objects.get(pk=data.tag_id)
+            tag = get_tag_by_ref(data.tag_id)
        except Tag.DoesNotExist:
            raise HttpError(404, 'Tag not found')
    else: