Files
ArchiveBox/archivebox/core/admin_snapshots.py
Claude 68061656e3 Refactor archive file access to use DB instead of filesystem
This prepares the codebase for S3 storage support by eliminating
filesystem scanning for archive metadata. All file listings and
size calculations now use the existing output_files and output_size
fields on ArchiveResult.

Changes:
- Snapshot.archive_size: now uses Sum(ArchiveResult.output_size) from DB
- Snapshot.canonical_outputs(): uses output_files dict instead of rglob
- Admin size displays: removed os.access checks, use DB directly
- views.py render_live_index: uses output_files/output_size from DB
- archivebox_status: uses DB aggregation instead of get_dir_size
- logging_util: uses snapshot.archive_size instead of get_dir_size

No new models or DB fields required - leverages existing output_files
and output_size fields that are already populated during archiving.
2026-01-01 14:14:38 +00:00

675 lines
27 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

__package__ = 'archivebox.core'
from django.contrib import admin, messages
from django.urls import path
from django.utils.html import format_html, mark_safe
from django.utils import timezone
from django import forms
from django.template import Template, RequestContext
from django.contrib.admin.helpers import ActionForm
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.util import htmldecode, urldecode
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.misc.logging_util import printable_filesize
from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from archivebox.core.models import Tag, Snapshot
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
# Extra template context that SnapshotAdmin.changelist_view merges into every changelist render (currently empty).
GLOBAL_CONTEXT = {}
class SnapshotActionForm(ActionForm):
    """Changelist action form extended with a free-text ``tags`` field."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The field is attached per instance instead of being declared on the
        # class to avoid database access during app initialization.
        tags_field = forms.CharField(
            label='Edit tags',
            required=False,
            widget=TagEditorWidget(),
        )
        self.fields['tags'] = tags_field

    def clean_tags(self):
        """Turn the comma-separated ``tags`` value into a list of Tag objects.

        Returns an empty list when the field is blank. Names are matched
        case-insensitively; a Tag is created for any name with no match.
        """
        raw_value = self.cleaned_data.get('tags', '')
        if not raw_value:
            return []

        resolved = []
        for chunk in raw_value.split(','):
            name = chunk.strip()
            if not name:
                continue
            candidate, _ = Tag.objects.get_or_create(
                name__iexact=name,
                defaults={'name': name},
            )
            # Prefer whatever row the case-insensitive lookup finds.
            existing = Tag.objects.filter(name__iexact=name).first()
            resolved.append(existing or candidate)
        return resolved
# TODO: allow selecting actions for specific extractor plugins? is this useful?
# plugin = forms.ChoiceField(
# choices=ArchiveResult.PLUGIN_CHOICES,
# required=False,
# widget=forms.MultileChoiceField(attrs={'class': "form-control"})
# )
class SnapshotAdminForm(forms.ModelForm):
    """Custom form for Snapshot admin with tag editor widget."""

    tags_editor = forms.CharField(
        label='Tags',
        required=False,
        widget=TagEditorWidget(),
        help_text='Type tag names and press Enter or Space to add. Click × to remove.',
    )

    class Meta:
        model = Snapshot
        fields = '__all__'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialize tags_editor with the current tags of an already-saved instance
        if self.instance and self.instance.pk:
            self.initial['tags_editor'] = ','.join(
                sorted(tag.name for tag in self.instance.tags.all())
            )

    def save(self, commit=True):
        """Save the snapshot and sync its tags from the ``tags_editor`` field.

        With ``commit=True`` the instance, its many-to-many data and the tags
        are written immediately. With ``commit=False`` (the mode the Django
        admin uses) nothing is written here; the tag sync is chained onto the
        deferred ``save_m2m()`` so it runs once the caller has saved the
        instance. Previously the ``commit=False`` path dropped tag edits.
        """
        instance = super().save(commit=False)

        if commit:
            instance.save()
            self._save_m2m()
            self._apply_tags_editor(instance)
        else:
            # super().save(commit=False) has just installed self.save_m2m;
            # wrap it so the tag edits are applied at the same time.
            deferred_save_m2m = self.save_m2m

            def save_m2m():
                deferred_save_m2m()
                self._apply_tags_editor(instance)

            self.save_m2m = save_m2m

        return instance

    def _apply_tags_editor(self, instance):
        """Replace ``instance.tags`` with the tags named in ``tags_editor``.

        A blank value clears all tags. Names are matched case-insensitively
        and missing tags are created.
        """
        tags_str = self.cleaned_data.get('tags_editor', '')
        if not tags_str:
            instance.tags.clear()
            return

        tag_names = [name.strip() for name in tags_str.split(',') if name.strip()]
        tags = []
        for name in tag_names:
            tag, _ = Tag.objects.get_or_create(
                name__iexact=name,
                defaults={'name': name}
            )
            # Use the existing tag if found by case-insensitive match
            tag = Tag.objects.filter(name__iexact=name).first() or tag
            tags.append(tag)
        instance.tags.set(tags)
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
    # --- forms ---
    form = SnapshotAdminForm
    action_form = SnapshotActionForm

    # --- changelist columns, sorting, searching and filtering ---
    list_display = (
        'created_at',
        'title_str',
        'status_with_progress',
        'files',
        'size_with_stats',
        'url_str',
    )
    sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
    search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
    list_filter = (
        'created_at',
        'downloaded_at',
        'archiveresult__status',
        'crawl__created_by',
        'tags__name',
    )
    ordering = ['-created_at']

    # --- detail page layout ---
    readonly_fields = (
        'admin_actions',
        'status_info',
        'imported_timestamp',
        'created_at',
        'modified_at',
        'downloaded_at',
        'output_dir',
        'archiveresults_list',
    )
    fieldsets = (
        ('URL', {'fields': ('url', 'title'), 'classes': ('card', 'wide')}),
        ('Tags', {'fields': ('tags_editor',), 'classes': ('card',)}),
        ('Status', {'fields': ('status', 'retry_at', 'status_info'), 'classes': ('card',)}),
        ('Timestamps', {
            'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'),
            'classes': ('card',),
        }),
        ('Relations', {'fields': ('crawl',), 'classes': ('card',)}),
        ('Config', {'fields': ('config',), 'classes': ('card',)}),
        ('Files', {'fields': ('output_dir',), 'classes': ('card',)}),
        ('Actions', {'fields': ('admin_actions',), 'classes': ('card', 'wide')}),
        ('Archive Results', {'fields': ('archiveresults_list',), 'classes': ('card', 'wide')}),
    )
    inlines = []  # Removed TagInline, using TagEditorWidget instead
    save_on_top = True

    # --- bulk actions ---
    actions = [
        'add_tags',
        'remove_tags',
        'update_titles',
        'update_snapshots',
        'resnapshot_snapshot',
        'overwrite_snapshots',
        'delete_snapshots',
    ]

    # --- pagination ---
    # Page size comes from config, clamped to the range 5..5000.
    list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
    paginator = AccelleratedPaginator
    show_full_result_count = False
def changelist_view(self, request, extra_context=None):
    """Render the changelist, retrying with only GLOBAL_CONTEXT on failure.

    The request is stashed on ``self.request``. If the first render raises,
    the error is reported through the messages framework and the view is
    rendered a second time without the caller's extra context.
    """
    self.request = request
    caller_context = extra_context if extra_context else {}
    try:
        response = super().changelist_view(request, caller_context | GLOBAL_CONTEXT)
    except Exception as e:
        self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}')
        response = super().changelist_view(request, GLOBAL_CONTEXT)
    return response
def get_urls(self):
    """Return the admin URL patterns with the ``grid/`` route placed first."""
    inherited_routes = super().get_urls()
    grid_route = path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
    return [grid_route, *inherited_routes]
# def get_queryset(self, request):
# # tags_qs = SnapshotTag.objects.all().select_related('tag')
# # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
@admin.display(description="Imported Timestamp")
def imported_timestamp(self, obj):
    """Render the snapshot's bookmarked date followed by its raw timestamp.

    The snippet is rendered with a plain ``Context``. It only needs the two
    values passed in, so it no longer relies on ``self.request``, which in
    this file is only assigned by ``changelist_view`` and could therefore be
    missing or belong to a different request when this field is rendered.
    """
    # Local import keeps this block self-contained; the module-level import
    # only brings in Template and RequestContext.
    from django.template import Context

    template = Template("""{{bookmarked_date}} (<code>{{timestamp}}</code>)""")
    context = Context({
        'bookmarked_date': obj.bookmarked_at,
        'timestamp': obj.timestamp,
    })
    # The template engine autoescapes both values, so the result is safe to mark.
    return mark_safe(template.render(context))
# pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S')
# return f'{pretty_time} ({obj.timestamp})'
# TODO: figure out a different way to do this; you can't nest forms, so this doesn't work
# def action(self, obj):
# # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
# # action: update_snapshots
# # select_across: 0
# # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
# return format_html(
# '''
# <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
# <input type="hidden" name="csrfmiddlewaretoken" value="{}">
# <input type="hidden" name="_selected_action" value="{}">
# <button name="update_snapshots">Check</button>
# <button name="update_titles">Pull title + favicon</button>
# <button name="update_snapshots">Update</button>
# <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
# <button name="delete_snapshots">Permanently delete</button>
# </form>
# ''',
# csrf.get_token(self.request),
# obj.pk,
# )
def admin_actions(self, obj):
    """Render the row of shortcut buttons shown on the snapshot detail page.

    The first two buttons link to the snapshot's archive pages (by timestamp),
    the third to the original URL, and the remaining four to the changelist
    filtered down to this snapshot's primary key.
    """
    buttons_markup = '''
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/archive/{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📄 Summary Page
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/archive/{}/index.html#all"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📁 Result Files
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="{}"
target="_blank"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
🔗 Original URL
</a>
<span style="border-left: 1px solid #e2e8f0; height: 24px; margin: 0 4px;"></span>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Get missing extractors"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
⬇️ Get Missing
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Create a fresh new snapshot of this URL"
onmouseover="this.style.background='#dbeafe';"
onmouseout="this.style.background='#eff6ff';">
🆕 Archive Again
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Re-run all extractors (overwrite existing)"
onmouseover="this.style.background='#fef3c7';"
onmouseout="this.style.background='#fffbeb';">
🔄 Redo All
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; color: #991b1b; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Permanently delete this snapshot"
onmouseover="this.style.background='#fee2e2';"
onmouseout="this.style.background='#fef2f2';">
☠️ Delete
</a>
</div>
<p style="margin-top: 12px; font-size: 12px; color: #64748b;">
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
</p>
'''
    # Placeholder order: two archive links, the original URL, then four
    # changelist links filtered by primary key.
    archive_links = [obj.timestamp, obj.timestamp]
    changelist_links = [obj.pk, obj.pk, obj.pk, obj.pk]
    return format_html(buttons_markup, *archive_links, obj.url, *changelist_links)
def status_info(self, obj):
    """Render a one-line summary: archived flag, output count, size, favicon, extension.

    The archived flag previously rendered an empty string in both the
    archived and the not-archived case, so the "Archived:" label carried no
    information. It now shows a distinct marker for each state.
    """
    # NOTE(review): the original marker glyphs were not recoverable from the
    # source; confirm that the check/cross pair matches the intended UI.
    archived_marker = '✅' if obj.is_archived else '❌'
    return format_html(
        '''
Archived: {} ({} files {}) &nbsp; &nbsp;
Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
Extension: {} &nbsp; &nbsp;
''',
        archived_marker,
        obj.num_outputs,
        self.size(obj) or '0kb',
        f'/archive/{obj.timestamp}/favicon.ico',
        obj.extension or '-',
    )
@admin.display(description='Archive Results')
def archiveresults_list(self, obj):
    """Render all of this snapshot's archive results with the shared list renderer."""
    results = obj.archiveresult_set.all()
    return render_archiveresults_list(results)
@admin.display(
    description='Title',
    ordering='title',
)
def title_str(self, obj):
    """Render the title column: favicon link, title (or URL) link and inline tag editor."""
    snapshot_pk = str(obj.pk)
    field_id = f'tags_{obj.pk}'

    # Inline tag editor widget for this row
    editor = InlineTagEditorWidget(snapshot_id=snapshot_pk)
    tags_html = editor.render(
        name=field_id,
        value=obj.tags.all(),
        attrs={'id': field_id},
        snapshot_id=snapshot_pk,
    )

    # Fall back to the URL (styled as pending) while no title is available
    if obj.title:
        css_class = 'fetched'
    else:
        css_class = 'pending'
    display_text = obj.title or obj.url
    shown_text = urldecode(htmldecode(display_text))[:128]

    link_markup = (
        '<a href="/{}">'
        '<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
        '</a>'
        '<a href="/{}/index.html">'
        '<b class="status-{}">{}</b>'
        '</a>'
    )
    title_html = format_html(
        link_markup,
        obj.archive_path,
        obj.archive_path,
        obj.archive_path,
        css_class,
        shown_text,
    )
    editor_html = mark_safe(f' <span class="tags-inline-editor">{tags_html}</span>')
    return title_html + editor_html
@admin.display(
    description='Files Saved',
    # ordering='archiveresult_count',
)
def files(self, obj):
    """Return whatever ``obj.icons()`` produces for the "Files Saved" column."""
    icons_html = obj.icons()
    return icons_html
@admin.display(
    # ordering='archiveresult_count'
)
def size(self, obj):
    """Render ``obj.archive_size`` as a link to the snapshot's archive path.

    A falsy size renders a faded "..." placeholder; sizes above 50 MiB are
    shown in bold.
    """
    total_bytes = obj.archive_size
    if not total_bytes:
        size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
    else:
        size_txt = printable_filesize(total_bytes)
        if total_bytes > 50 * 1024 * 1024:
            size_txt = mark_safe(f'<b>{size_txt}</b>')

    link_markup = '<a href="/{}" title="View all files">{}</a>'
    return format_html(link_markup, obj.archive_path, size_txt)
@admin.display(
    description='Status',
    ordering='status',
)
def status_with_progress(self, obj):
    """Show status with progress bar for in-progress snapshots.

    A ``started`` snapshot with at least one hook gets a progress bar built
    from ``obj.get_progress_stats()``; every other case gets a coloured badge
    with the upper-cased status.

    The three counters under the bar used to be rendered back to back with no
    separator (3, 0 and 2 came out as "302"); they are now labelled.
    """
    stats = obj.get_progress_stats()

    # For started snapshots, show progress bar
    if obj.status == 'started' and stats['total'] > 0:
        total = stats['total']  # > 0 here, so the divisions below are safe
        succeeded = stats['succeeded']
        failed = stats['failed']
        running = stats['running']

        # Gradient stops: green up to the succeeded share, red up to the
        # succeeded+failed share, blue for the remainder.
        succeeded_stop = int(succeeded / total * 100)
        finished_stop = int((succeeded + failed) / total * 100)

        return format_html(
            '''<div style="min-width: 120px;">
<div style="display: flex; align-items: center; gap: 6px; margin-bottom: 4px;">
<span class="snapshot-progress-spinner"></span>
<span style="font-size: 11px; color: #64748b;">{}/{} hooks</span>
</div>
<div style="background: #e2e8f0; border-radius: 4px; height: 6px; overflow: hidden;">
<div style="background: linear-gradient(90deg, #10b981 0%, #10b981 {}%, #ef4444 {}%, #ef4444 {}%, #3b82f6 {}%, #3b82f6 100%);
width: {}%; height: 100%; transition: width 0.3s;"></div>
</div>
<div style="font-size: 10px; color: #94a3b8; margin-top: 2px;">
{} ok &middot; {} failed &middot; {} running
</div>
</div>''',
            succeeded + failed + stats['skipped'],
            total,
            succeeded_stop,
            succeeded_stop,
            finished_stop,
            finished_stop,
            stats['percent'],
            succeeded,
            failed,
            running,
        )

    # Status badge colors as (foreground, background)
    status_colors = {
        'queued': ('#f59e0b', '#fef3c7'),     # amber
        'started': ('#3b82f6', '#dbeafe'),    # blue
        'sealed': ('#10b981', '#d1fae5'),     # green
        'succeeded': ('#10b981', '#d1fae5'),  # green
        'failed': ('#ef4444', '#fee2e2'),     # red
        'backoff': ('#f59e0b', '#fef3c7'),    # amber
        'skipped': ('#6b7280', '#f3f4f6'),    # gray
    }
    # Unknown statuses fall back to the gray pair
    fg_color, bg_color = status_colors.get(obj.status, ('#6b7280', '#f3f4f6'))

    # For other statuses, show simple badge
    return format_html(
        '<span style="display: inline-block; padding: 2px 8px; border-radius: 12px; '
        'font-size: 11px; font-weight: 500; background: {}; color: {};">{}</span>',
        bg_color,
        fg_color,
        obj.status.upper(),
    )
@admin.display(
    description='Size',
)
def size_with_stats(self, obj):
    """Render the size column from ``obj.get_progress_stats()``.

    Shows the aggregated ``output_size`` (bold above 50 MiB, a faded "..."
    when zero or missing) as a link to the archive path, followed by a
    "succeeded/total hooks" line when the snapshot has any hooks.
    """
    stats = obj.get_progress_stats()
    size_bytes = stats['output_size'] or 0

    if not size_bytes:
        size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
    else:
        size_txt = printable_filesize(size_bytes)
        if size_bytes > 50 * 1024 * 1024:  # 50MB
            size_txt = mark_safe(f'<b>{size_txt}</b>')

    has_hooks = stats['total'] > 0
    if has_hooks:
        # Size link plus hook statistics
        markup_with_hooks = (
            '<a href="/{}" title="View all files" style="white-space: nowrap;">'
            '{}</a>'
            '<div style="font-size: 10px; color: #94a3b8; margin-top: 2px;">'
            '{}/{} hooks</div>'
        )
        return format_html(
            markup_with_hooks,
            obj.archive_path,
            size_txt,
            stats['succeeded'],
            stats['total'],
        )

    plain_markup = '<a href="/{}" title="View all files">{}</a>'
    return format_html(plain_markup, obj.archive_path, size_txt)
@admin.display(
    description='Original URL',
    ordering='url',
)
def url_str(self, obj):
    """Render the original URL as a link whose visible text is cut to 128 characters."""
    link_markup = '<a href="{}"><code style="user-select: all;">{}</code></a>'
    visible_text = obj.url[:128]
    return format_html(link_markup, obj.url, visible_text)
def grid_view(self, request, extra_context=None):
    """Render the changelist with the grid template.

    The admin's template and page-size attributes are temporarily overridden
    and the regular ``changelist_view`` is called. The original values are
    restored in a ``finally`` block, so an exception raised while rendering
    can no longer leave the list view stuck with the grid settings.
    """
    # Save before monkey patching to restore for changelist list view
    saved_settings = (
        self.change_list_template,
        self.list_per_page,
        self.list_max_show_all,
    )

    # Monkey patch here plus core_tags.py
    self.change_list_template = 'private_index_grid.html'
    self.list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
    self.list_max_show_all = self.list_per_page

    try:
        # Call monkey patched view
        return self.changelist_view(request, extra_context=extra_context)
    finally:
        # Restore values even if the view raised
        (
            self.change_list_template,
            self.list_per_page,
            self.list_max_show_all,
        ) = saved_settings
# for debugging, uncomment this to print all requests:
# def changelist_view(self, request, extra_context=None):
# print('[*] Got request', request.method, request.POST)
# return super().changelist_view(request, extra_context=None)
@admin.action(
    description=" Get Title"
)
def update_titles(self, request, queryset):
    """Queue the selected snapshots for a title and favicon refresh.

    The number in the success message is the value returned by
    ``bg_archive_snapshots``. The unused ``queryset.count()`` call, which
    cost an extra COUNT query, has been removed.
    """
    # Queue snapshots for archiving via the state machine system
    queued = bg_archive_snapshots(
        queryset,
        kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR},
    )
    messages.success(
        request,
        f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
    )
@admin.action(
    description="⬇️ Get Missing"
)
def update_snapshots(self, request, queryset):
    """Queue the selected snapshots for archiving without overwriting existing output.

    The number in the success message is the value returned by
    ``bg_archive_snapshots``. The unused ``queryset.count()`` call, which
    cost an extra COUNT query, has been removed.
    """
    queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
    messages.success(
        request,
        f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
    )
@admin.action(
    description="🆕 Archive Again"
)
def resnapshot_snapshot(self, request, queryset):
    """Submit a fresh add job for each selected snapshot.

    Each new URL is the original with its fragment replaced by the current
    ISO timestamp; the snapshot's tags are passed along with it.
    """
    for snapshot in queryset:
        timestamp = timezone.now().isoformat('T', 'seconds')
        url_without_fragment, _, _ = snapshot.url.partition('#')
        job = {
            'urls': f'{url_without_fragment}#{timestamp}',
            'tag': snapshot.tags_str(),
        }
        bg_add(job)

    messages.success(
        request,
        f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
    )
@admin.action(
    description="🔄 Redo"
)
def overwrite_snapshots(self, request, queryset):
    """Queue the selected snapshots for a full re-archive that overwrites existing output.

    The number in the success message is the value returned by
    ``bg_archive_snapshots``. The unused ``queryset.count()`` call, which
    cost an extra COUNT query, has been removed.
    """
    queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
    messages.success(
        request,
        f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
    )
@admin.action(
    description="☠️ Delete"
)
def delete_snapshots(self, request, queryset):
    """Delete snapshots in a single transaction to avoid SQLite concurrency issues.

    The reported snapshot total is taken from the list of IDs that is
    actually deleted, instead of from a separate ``count()`` query issued
    beforehand, so the number always matches the set being removed.
    """
    from django.db import transaction

    # Get list of IDs to delete first (outside transaction)
    ids_to_delete = list(queryset.values_list('pk', flat=True))
    total = len(ids_to_delete)

    # Delete everything in a single atomic transaction
    with transaction.atomic():
        deleted_count, _ = Snapshot.objects.filter(pk__in=ids_to_delete).delete()

    messages.success(
        request,
        mark_safe(f"Successfully deleted {total} Snapshots ({deleted_count} total objects including related records). Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
    )
@admin.action(
    description="+"
)
def add_tags(self, request, queryset):
    """Attach the tags named in the POSTed ``tags`` field to every selected snapshot.

    Tag names are comma-separated, matched case-insensitively and created
    when missing. Input with no usable names (empty, or only commas and
    whitespace) now produces the "No tags specified." warning instead of a
    success message for zero tags, and names that resolve to the same Tag
    are counted once.
    """
    from archivebox.core.models import SnapshotTag

    # Get tags from the form - now comma-separated string
    tags_str = request.POST.get('tags', '')
    tag_names = [name.strip() for name in tags_str.split(',') if name.strip()]
    if not tag_names:
        messages.warning(request, "No tags specified.")
        return

    # Get/create Tag objects, keyed by pk so case variants collapse to one entry
    tags_by_pk = {}
    for name in tag_names:
        tag, _ = Tag.objects.get_or_create(
            name__iexact=name,
            defaults={'name': name}
        )
        # Use the existing tag if found by case-insensitive match
        tag = Tag.objects.filter(name__iexact=name).first() or tag
        tags_by_pk.setdefault(tag.pk, tag)
    tags = list(tags_by_pk.values())

    # Get snapshot IDs efficiently (works with select_across for all pages)
    snapshot_ids = list(queryset.values_list('id', flat=True))
    num_snapshots = len(snapshot_ids)

    print('[+] Adding tags', [t.name for t in tags], 'to', num_snapshots, 'Snapshots')

    # Bulk create M2M relationships (1 query per tag, not per snapshot)
    for tag in tags:
        SnapshotTag.objects.bulk_create(
            [SnapshotTag(snapshot_id=sid, tag=tag) for sid in snapshot_ids],
            ignore_conflicts=True  # Skip if relationship already exists
        )

    messages.success(
        request,
        f"Added {len(tags)} tag(s) to {num_snapshots} Snapshot(s).",
    )
# NOTE(review): the description was an empty string, which gives the action a
# blank label; "–" mirrors the "+" used by add_tags. Confirm the intended glyph.
@admin.action(
    description="–"
)
def remove_tags(self, request, queryset):
    """Detach the tags named in the POSTed ``tags`` field from every selected snapshot.

    Tag names are comma-separated and matched case-insensitively; names with
    no matching Tag are ignored. Names that resolve to the same Tag are
    counted once in the success message.
    """
    from archivebox.core.models import SnapshotTag

    # Get tags from the form - now comma-separated string
    tags_str = request.POST.get('tags', '')
    if not tags_str:
        messages.warning(request, "No tags specified.")
        return

    # Find matching Tag objects (case-insensitive), keyed by pk to drop duplicates
    tag_names = [name.strip() for name in tags_str.split(',') if name.strip()]
    tags_by_pk = {}
    for name in tag_names:
        tag = Tag.objects.filter(name__iexact=name).first()
        if tag:
            tags_by_pk.setdefault(tag.pk, tag)
    tags = list(tags_by_pk.values())

    if not tags:
        messages.warning(request, "No matching tags found.")
        return

    # Get snapshot IDs efficiently (works with select_across for all pages)
    snapshot_ids = list(queryset.values_list('id', flat=True))
    num_snapshots = len(snapshot_ids)
    tag_ids = list(tags_by_pk)

    print('[-] Removing tags', [t.name for t in tags], 'from', num_snapshots, 'Snapshots')

    # Bulk delete M2M relationships (1 query total, not per snapshot)
    deleted_count, _ = SnapshotTag.objects.filter(
        snapshot_id__in=snapshot_ids,
        tag_id__in=tag_ids
    ).delete()

    messages.success(
        request,
        f"Removed {len(tags)} tag(s) from {num_snapshots} Snapshot(s) ({deleted_count} associations deleted).",
    )