Avoid filesystem lookups in snapshot admin list

This commit is contained in:
Nick Sweeting
2026-03-15 17:18:53 -07:00
parent 21a0a27091
commit e598614b05
5 changed files with 80 additions and 11 deletions

View File

@@ -241,6 +241,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
qs = ( qs = (
super() super()
.get_queryset(request) .get_queryset(request)
.select_related('crawl__created_by')
.defer('config', 'notes') .defer('config', 'notes')
.prefetch_related('tags') .prefetch_related('tags')
.prefetch_related(Prefetch('archiveresult_set', queryset=prefetch_qs)) .prefetch_related(Prefetch('archiveresult_set', queryset=prefetch_qs))
@@ -403,7 +404,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
show_title = bool(title_raw) and title_normalized != 'pending...' and title_normalized != url_normalized show_title = bool(title_raw) and title_normalized != 'pending...' and title_normalized != url_normalized
css_class = 'fetched' if show_title else 'pending' css_class = 'fetched' if show_title else 'pending'
detail_url = build_web_url(f'/{obj.archive_path}/index.html') detail_url = build_web_url(f'/{obj.archive_path_from_db}/index.html')
title_html = '' title_html = ''
if show_title: if show_title:
title_html = format_html( title_html = format_html(
@@ -489,7 +490,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
) )
def files(self, obj): def files(self, obj):
# return '-' # return '-'
return obj.icons() return obj.icons(path=obj.archive_path_from_db)
@admin.display( @admin.display(
@@ -595,7 +596,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'{}</a>' '{}</a>'
'<div style="font-size: 10px; color: #94a3b8; margin-top: 2px;">' '<div style="font-size: 10px; color: #94a3b8; margin-top: 2px;">'
'{}/{} hooks</div>', '{}/{} hooks</div>',
build_web_url(f'/{obj.archive_path}'), build_web_url(f'/{obj.archive_path_from_db}'),
size_txt, size_txt,
stats['succeeded'], stats['succeeded'],
stats['total'], stats['total'],
@@ -603,7 +604,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
return format_html( return format_html(
'<a href="{}" title="View all files">{}</a>', '<a href="{}" title="View all files">{}</a>',
build_web_url(f'/{obj.archive_path}'), build_web_url(f'/{obj.archive_path_from_db}'),
size_txt, size_txt,
) )

View File

@@ -1280,7 +1280,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
cache_key = f'{self.pk}-tags' cache_key = f'{self.pk}-tags'
return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str() return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
def icons(self) -> str: def icons(self, path: Optional[str] = None) -> str:
"""Generate HTML icons showing which extractor plugins have succeeded for this snapshot""" """Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
from django.utils.html import format_html, mark_safe from django.utils.html import format_html, mark_safe
@@ -1296,7 +1296,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str='')) Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
)} )}
path = self.archive_path archive_path = path or self.archive_path
output = "" output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a>' output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a>'
@@ -1316,7 +1316,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
embed_path = result.embed_path() if result else f'{plugin}/' embed_path = result.embed_path() if result else f'{plugin}/'
output += format_html( output += format_html(
output_template, output_template,
path, archive_path,
embed_path, embed_path,
str(bool(existing)), str(bool(existing)),
plugin, plugin,
@@ -1435,6 +1435,34 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def legacy_archive_path(self) -> str: def legacy_archive_path(self) -> str:
return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}' return f'{CONSTANTS.ARCHIVE_DIR_NAME}/{self.timestamp}'
@cached_property
def archive_path_from_db(self) -> str:
    """Best-effort public URL path derived from DB fields only.

    Returns:
        ``legacy_archive_path`` when ``fs_version`` is anything other than
        ``'0.9.0'`` / ``'1.0.0'``, or when no usable date can be derived.
        Otherwise ``'{username}/{YYYYMMDD}/{domain}/{id}'``.
    """
    # Only the 0.9.0 / 1.0.0 layouts use the per-user/date/domain structure;
    # 0.7.0, 0.8.0 and any unrecognized version fall back to the legacy path.
    if self.fs_version not in ('0.9.0', '1.0.0'):
        return self.legacy_archive_path

    username = 'web'
    crawl = getattr(self, 'crawl', None)
    # Check created_by_id first so a crawl without an owner keeps the default.
    if crawl and getattr(crawl, 'created_by_id', None):
        username = crawl.created_by.username
    # The 'system' user is mapped onto the same 'web' segment as the default.
    if username == 'system':
        username = 'web'

    date_base = self.created_at or self.bookmarked_at
    if date_base:
        date_str = date_base.strftime('%Y%m%d')
    else:
        try:
            # NOTE(review): fromtimestamp() without tz uses the server's local
            # timezone -- confirm this matches how the on-disk date is chosen.
            date_str = datetime.fromtimestamp(float(self.timestamp)).strftime('%Y%m%d')
        except (TypeError, ValueError, OverflowError, OSError):
            # OverflowError covers infinite / out-of-range timestamps, which
            # fromtimestamp() reports separately from ValueError and OSError.
            return self.legacy_archive_path

    domain = self.extract_domain_from_url(self.url)
    return f'{username}/{date_str}/{domain}/{self.id}'
@cached_property @cached_property
def url_path(self) -> str: def url_path(self) -> str:
"""URL path matching the current snapshot output_dir layout.""" """URL path matching the current snapshot output_dir layout."""

View File

@@ -11,6 +11,8 @@ from typing import List, Dict, Any, Optional, Tuple
import pytest import pytest
from archivebox.uuid_compat import uuid7
# ============================================================================= # =============================================================================
# CLI Helpers (defined before fixtures that use them) # CLI Helpers (defined before fixtures that use them)
@@ -399,8 +401,7 @@ def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str])
def create_test_url(domain: str = 'example.com', path: Optional[str] = None) -> str:
    """Generate a unique test URL.

    Args:
        domain: Host portion of the URL.
        path: Path segment to use; when empty or None a random one is generated.

    Returns:
        ``https://{domain}/{path}``.
    """
    # Use the trailing hex digits: the leading digits of a UUIDv7 encode the
    # millisecond timestamp and stay constant for about a minute, which would
    # make URLs generated in the same test run collide.
    path = path or uuid7().hex[-8:]
    return f'https://{domain}/{path}'

View File

@@ -16,7 +16,8 @@ import subprocess
from pathlib import Path from pathlib import Path
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
from uuid import uuid4
from archivebox.uuid_compat import uuid7
# ============================================================================= # =============================================================================
@@ -495,7 +496,7 @@ INSERT INTO django_content_type (app_label, model) VALUES
def generate_uuid() -> str:
    """Return a freshly generated UUID as a dash-free hex string for SQLite."""
    new_id = uuid7()
    return new_id.hex
def generate_timestamp() -> str: def generate_timestamp() -> str:

View File

@@ -135,6 +135,44 @@ class TestAdminSnapshotListView:
assert response.status_code == 200 assert response.status_code == 200
assert b'example.com' in response.content assert b'example.com' in response.content
def test_list_view_avoids_legacy_title_fallbacks(self, client, admin_user, snapshot, monkeypatch):
    """A snapshot with an empty title must render without reading history-based fallbacks."""
    from archivebox.core.models import Snapshot

    def _fail_on_latest_title(self):
        raise AssertionError('admin changelist should not access Snapshot.latest_title')

    def _fail_on_history(self):
        raise AssertionError('admin changelist should not access Snapshot.history')

    # Blank the stored title so the changelist has to take its title-less path.
    Snapshot.objects.filter(pk=snapshot.pk).update(title='')

    # Replace each fallback attribute with a property that fails loudly on read.
    tripwires = {
        'latest_title': _fail_on_latest_title,
        'history': _fail_on_history,
    }
    for attr_name, getter in tripwires.items():
        monkeypatch.setattr(Snapshot, attr_name, property(getter), raising=False)

    client.login(username='testadmin', password='testpassword')
    changelist_url = reverse('admin:core_snapshot_changelist')
    response = client.get(changelist_url, HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    assert b'example.com' in response.content
def test_list_view_avoids_output_dir_lookups(self, client, admin_user, snapshot, monkeypatch):
    """The changelist must build its links without reading Snapshot.output_dir."""
    from archivebox.core.models import Snapshot

    def _fail_on_output_dir(self):
        raise AssertionError('admin changelist should not access Snapshot.output_dir')

    # Any read of output_dir during rendering now raises instead of resolving a path.
    tripwire = property(_fail_on_output_dir)
    monkeypatch.setattr(Snapshot, 'output_dir', tripwire, raising=False)

    client.login(username='testadmin', password='testpassword')
    changelist_url = reverse('admin:core_snapshot_changelist')
    response = client.get(changelist_url, HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    assert b'example.com' in response.content
def test_grid_view_renders(self, client, admin_user): def test_grid_view_renders(self, client, admin_user):
"""Test that the grid view renders successfully.""" """Test that the grid view renders successfully."""
client.login(username='testadmin', password='testpassword') client.login(username='testadmin', password='testpassword')