This commit is contained in:
Nick Sweeting
2026-01-21 03:19:56 -08:00
parent f3f55d3395
commit ec4b27056e
113 changed files with 6929 additions and 2396 deletions

View File

@@ -14,6 +14,7 @@ from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.models import ArchiveResult, Snapshot
@@ -57,7 +58,11 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
# Build output link - use embed_path() which checks output_files first
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_link = f'/{result.snapshot.archive_path}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/{result.snapshot.archive_path}/'
snapshot_id = str(getattr(result, 'snapshot_id', ''))
if embed_path and result.status == 'succeeded':
output_link = build_snapshot_url(snapshot_id, embed_path)
else:
output_link = build_snapshot_url(snapshot_id, '')
# Get version - try cmd_version field
version = result.cmd_version if result.cmd_version else '-'
@@ -252,7 +257,7 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
@@ -300,10 +305,11 @@ class ArchiveResultAdmin(BaseModelAdmin):
description='Snapshot Info'
)
def snapshot_info(self, result):
snapshot_id = str(result.snapshot_id)
return format_html(
'<a href="/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
result.snapshot.archive_path,
str(result.snapshot.id)[:8],
'<a href="{}"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
build_snapshot_url(snapshot_id, "index.html"),
snapshot_id[:8],
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
result.snapshot.url[:128],
)
@@ -335,10 +341,10 @@ class ArchiveResultAdmin(BaseModelAdmin):
# Determine output link path - use embed_path() which checks output_files
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
snapshot_id = str(result.snapshot_id)
return format_html(
'<a href="/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.archive_path,
output_path,
'<a href="{}" class="output-link">↗️</a><pre>{}</pre>',
build_snapshot_url(snapshot_id, output_path),
result.output_str,
)
@@ -348,7 +354,11 @@ class ArchiveResultAdmin(BaseModelAdmin):
'<pre style="display: inline-block">{}</pre><br/>',
result.output_str,
)
output_html += format_html('<a href="/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.archive_path))
snapshot_id = str(result.snapshot_id)
output_html += format_html(
'<a href="{}#all">See result files ...</a><br/><pre><code>',
build_snapshot_url(snapshot_id, "index.html"),
)
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
path_from_embed = (snapshot_dir / (embed_path or ''))
output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))

View File

@@ -8,6 +8,8 @@ from django.contrib import admin, messages
from django.urls import path
from django.utils.html import format_html, mark_safe
from django.utils import timezone
from django.db.models import Q, Sum, Count, Prefetch
from django.db.models.functions import Coalesce
from django import forms
from django.template import Template, RequestContext
from django.contrib.admin.helpers import ActionForm
@@ -18,11 +20,12 @@ from archivebox.misc.util import htmldecode, urldecode
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.misc.logging_util import printable_filesize
from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.core.host_utils import build_snapshot_url, build_web_url
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from archivebox.core.models import Tag, Snapshot
from archivebox.core.models import Tag, Snapshot, ArchiveResult
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget
@@ -36,7 +39,7 @@ class SnapshotActionForm(ActionForm):
super().__init__(*args, **kwargs)
# Define tags field in __init__ to avoid database access during app initialization
self.fields['tags'] = forms.CharField(
label='Edit tags',
label='',
required=False,
widget=TagEditorWidget(),
)
@@ -67,6 +70,19 @@ class SnapshotActionForm(ActionForm):
# )
class TagNameListFilter(admin.SimpleListFilter):
    """Changelist sidebar filter narrowing snapshots to a single tag (by tag pk)."""

    title = 'By tag name'
    parameter_name = 'tag'

    def lookups(self, request, model_admin):
        # One (pk, name) choice per tag, alphabetical by name.
        return [(str(tag.pk), tag.name) for tag in Tag.objects.order_by('name')]

    def queryset(self, request, queryset):
        selected = self.value()
        if not selected:
            return queryset
        return queryset.filter(tags__id=selected)
class SnapshotAdminForm(forms.ModelForm):
"""Custom form for Snapshot admin with tag editor widget."""
tags_editor = forms.CharField(
@@ -117,11 +133,11 @@ class SnapshotAdminForm(forms.ModelForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
form = SnapshotAdminForm
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'health_display', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
list_display = ('created_at', 'preview_icon', 'title_str', 'tags_inline', 'status_with_progress', 'files', 'size_with_stats')
sort_fields = ('title_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', TagNameListFilter)
fieldsets = (
('URL', {
@@ -163,7 +179,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
actions = ['add_tags', 'remove_tags', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [] # Removed TagInline, using TagEditorWidget instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
@@ -182,6 +198,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}')
return super().changelist_view(request, GLOBAL_CONTEXT)
def get_actions(self, request):
    """Return admin actions, relabelling the built-in delete_selected to "Delete"."""
    available = super().get_actions(request)
    builtin_delete = available.get('delete_selected')
    if builtin_delete is not None:
        callback, action_name, _old_label = builtin_delete
        available['delete_selected'] = (callback, action_name, 'Delete')
    return available
def get_urls(self):
urls = super().get_urls()
@@ -196,6 +219,52 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
def get_queryset(self, request):
    """Build the changelist queryset with prefetched succeeded ArchiveResults
    and sort-specific annotations.

    Annotations are only added when the corresponding column is part of the
    active ordering (detected via _get_ordering_fields), so the common
    unsorted page avoids the extra aggregate joins.
    """
    self.request = request
    ordering_fields = self._get_ordering_fields(request)
    needs_size_sort = 'size_with_stats' in ordering_fields
    needs_files_sort = 'files' in ordering_fields
    needs_tags_sort = 'tags_inline' in ordering_fields
    # Prefetch only succeeded results, restricted to the fields the list
    # columns read, so per-row stats can be computed without extra queries.
    prefetch_qs = ArchiveResult.objects.filter(
        Q(status='succeeded')
    ).only(
        'id',
        'snapshot_id',
        'plugin',
        'status',
        'output_size',
        'output_files',
        'output_str',
    )
    qs = (
        super()
        .get_queryset(request)
        # config/notes are not shown in the list view; skip loading them.
        .defer('config', 'notes')
        .prefetch_related('tags')
        .prefetch_related(Prefetch('archiveresult_set', queryset=prefetch_qs))
    )
    if needs_size_sort:
        # Total output size of succeeded results; Coalesce keeps NULL -> 0 so
        # snapshots with no results sort consistently.
        qs = qs.annotate(
            output_size_sum=Coalesce(Sum(
                'archiveresult__output_size',
                filter=Q(archiveresult__status='succeeded'),
            ), 0),
        )
    if needs_files_sort:
        qs = qs.annotate(
            ar_succeeded_count=Count(
                'archiveresult',
                filter=Q(archiveresult__status='succeeded'),
            ),
        )
    if needs_tags_sort:
        qs = qs.annotate(tag_count=Count('tags', distinct=True))
    return qs
@admin.display(description="Imported Timestamp")
def imported_timestamp(self, obj):
@@ -233,17 +302,19 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# )
def admin_actions(self, obj):
summary_url = build_web_url(f'/{obj.archive_path}')
results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
return format_html(
'''
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/{}"
href="{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📄 Summary Page
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/{}/index.html#all"
href="{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📁 Result Files
@@ -263,7 +334,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
title="Get missing extractors"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
⬇️ Get Missing
⬇️ Finish
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
@@ -291,8 +362,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
</p>
''',
obj.archive_path,
obj.archive_path,
summary_url,
results_url,
obj.url,
obj.pk,
obj.pk,
@@ -301,6 +372,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
def status_info(self, obj):
favicon_url = build_snapshot_url(str(obj.id), 'favicon.ico')
return format_html(
'''
Archived: {} ({} files {}) &nbsp; &nbsp;
@@ -310,7 +382,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'' if obj.is_archived else '',
obj.num_outputs,
self.size(obj) or '0kb',
f'/{obj.archive_path}/favicon.ico',
favicon_url,
obj.extension or '-',
)
@@ -323,7 +395,37 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
ordering='title',
)
def title_str(self, obj):
    """Render the snapshot title (when fetched) above a small clickable URL line."""
    title_text = (obj.title or '').strip()
    url_text = (obj.url or '').strip()
    # Only show a title heading when one was actually fetched and it adds
    # information beyond the URL itself ("pending..." is a placeholder).
    has_real_title = (
        bool(title_text)
        and title_text.lower() != 'pending...'
        and title_text.lower() != url_text.lower()
    )
    state_class = 'fetched' if has_real_title else 'pending'
    detail_url = build_web_url(f'/{obj.archive_path}/index.html')
    heading = ''
    if has_real_title:
        heading = format_html(
            '<a href="{}">'
            '<b class="status-{}">{}</b>'
            '</a>',
            detail_url,
            state_class,
            urldecode(htmldecode(title_text))[:128],
        )
    return format_html(
        '{}'
        '<div style="font-size: 11px; color: #64748b; margin-top: 2px;">'
        '<a href="{}"><code style="user-select: all;">{}</code></a>'
        '</div>',
        heading,
        url_text or obj.url,
        (url_text or obj.url)[:128],
    )
@admin.display(description='Tags', ordering='tag_count')
def tags_inline(self, obj):
widget = InlineTagEditorWidget(snapshot_id=str(obj.pk))
tags_html = widget.render(
name=f'tags_{obj.pk}',
@@ -331,28 +433,58 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
attrs={'id': f'tags_{obj.pk}'},
snapshot_id=str(obj.pk),
)
return mark_safe(f'<span class="tags-inline-editor">{tags_html}</span>')
# Show title if available, otherwise show URL
display_text = obj.title or obj.url
css_class = 'fetched' if obj.title else 'pending'
@admin.display(description='Preview', empty_value='')
def preview_icon(self, obj):
results = self._get_prefetched_results(obj)
has_screenshot = False
has_favicon = False
if results is not None:
has_screenshot = any(r.plugin == 'screenshot' for r in results)
has_favicon = any(r.plugin == 'favicon' for r in results)
if not has_screenshot and not has_favicon:
return None
if has_screenshot:
img_url = build_snapshot_url(str(obj.id), 'screenshot/screenshot.png')
fallbacks = [
build_snapshot_url(str(obj.id), 'screenshot.png'),
build_snapshot_url(str(obj.id), 'favicon/favicon.ico'),
build_snapshot_url(str(obj.id), 'favicon.ico'),
]
img_alt = 'Screenshot'
preview_class = 'screenshot'
else:
img_url = build_snapshot_url(str(obj.id), 'favicon/favicon.ico')
fallbacks = [
build_snapshot_url(str(obj.id), 'favicon.ico'),
]
img_alt = 'Favicon'
preview_class = 'favicon'
fallback_list = ','.join(fallbacks)
onerror_js = (
"this.dataset.fallbacks && this.dataset.fallbacks.length ? "
"(this.src=this.dataset.fallbacks.split(',').shift(), "
"this.dataset.fallbacks=this.dataset.fallbacks.split(',').slice(1).join(',')) : "
"this.remove()"
)
return format_html(
'<a href="/{}">'
'<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
'</a>'
'<a href="/{}/index.html">'
'<b class="status-{}">{}</b>'
'</a>',
obj.archive_path,
obj.archive_path,
obj.archive_path,
css_class,
urldecode(htmldecode(display_text))[:128]
) + mark_safe(f' <span class="tags-inline-editor">{tags_html}</span>')
'<img src="{}" alt="{}" class="snapshot-preview {}" decoding="async" loading="lazy" '
'onerror="{}" data-fallbacks="{}">',
img_url,
img_alt,
preview_class,
onerror_js,
fallback_list,
)
@admin.display(
description='Files Saved',
# ordering='archiveresult_count',
ordering='ar_succeeded_count',
)
def files(self, obj):
# return '-'
@@ -371,8 +503,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
else:
size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
return format_html(
'<a href="/{}" title="View all files">{}</a>',
obj.archive_path,
'<a href="{}" title="View all files">{}</a>',
build_web_url(f'/{obj.archive_path}'),
size_txt,
)
@@ -382,7 +514,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
def status_with_progress(self, obj):
"""Show status with progress bar for in-progress snapshots."""
stats = obj.get_progress_stats()
stats = self._get_progress_stats(obj)
# Status badge colors
status_colors = {
@@ -440,16 +572,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
@admin.display(
description='Size',
ordering='output_size_sum',
)
def size_with_stats(self, obj):
"""Show archive size with output size from archive results."""
stats = obj.get_progress_stats()
# Use output_size from archive results if available, fallback to disk size
stats = self._get_progress_stats(obj)
output_size = stats['output_size']
archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
size_bytes = output_size or archive_size or 0
size_bytes = output_size or 0
if size_bytes:
size_txt = printable_filesize(size_bytes)
@@ -461,22 +590,76 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# Show hook statistics
if stats['total'] > 0:
return format_html(
'<a href="/{}" title="View all files" style="white-space: nowrap;">'
'<a href="{}" title="View all files" style="white-space: nowrap;">'
'{}</a>'
'<div style="font-size: 10px; color: #94a3b8; margin-top: 2px;">'
'{}/{} hooks</div>',
obj.archive_path,
build_web_url(f'/{obj.archive_path}'),
size_txt,
stats['succeeded'],
stats['total'],
)
return format_html(
'<a href="/{}" title="View all files">{}</a>',
obj.archive_path,
'<a href="{}" title="View all files">{}</a>',
build_web_url(f'/{obj.archive_path}'),
size_txt,
)
def _get_progress_stats(self, obj):
    """Compute per-snapshot hook progress counters from prefetched results.

    Falls back to the model's own query-based get_progress_stats() when no
    prefetch cache is available on *obj*.
    """
    results = self._get_prefetched_results(obj)
    if results is None:
        return obj.get_progress_stats()

    # Tally result rows by status in one pass.
    by_status = {}
    for result in results:
        by_status[result.status] = by_status.get(result.status, 0) + 1

    total = len(results)
    succeeded = by_status.get('succeeded', 0)
    failed = by_status.get('failed', 0)
    running = by_status.get('started', 0)
    skipped = by_status.get('skipped', 0)
    pending = max(total - succeeded - failed - running - skipped, 0)
    completed = succeeded + failed + skipped
    percent = int((completed / total * 100) if total > 0 else 0)
    # Anything past the queued/started lifecycle states counts as sealed.
    is_sealed = obj.status not in (obj.StatusChoices.QUEUED, obj.StatusChoices.STARTED)

    if hasattr(obj, 'output_size_sum'):
        # Annotated by get_queryset when sorting by size.
        output_size = obj.output_size_sum or 0
    else:
        output_size = sum(r.output_size or 0 for r in results if r.status == 'succeeded')

    return {
        'total': total,
        'succeeded': succeeded,
        'failed': failed,
        'running': running,
        'pending': pending,
        'skipped': skipped,
        'percent': percent,
        'output_size': output_size or 0,
        'is_sealed': is_sealed,
    }
def _get_prefetched_results(self, obj):
    """Return *obj*'s prefetched archiveresult_set, or None when the prefetch
    cache is absent (callers fall back to a fresh query in that case)."""
    missing = object()
    cache = getattr(obj, '_prefetched_objects_cache', missing)
    if cache is not missing and 'archiveresult_set' in cache:
        return obj.archiveresult_set.all()
    return None
def _get_ordering_fields(self, request):
    """Map the changelist ``?o=`` ordering parameter back to the names of the
    list_display columns being sorted on (empty set when no ordering)."""
    raw = request.GET.get('o')
    if not raw:
        return set()
    selected = set()
    for token in raw.split('.'):
        if not token:
            continue
        try:
            # Tokens are 1-based column numbers; negative means descending.
            column = abs(int(token)) - 1
        except ValueError:
            continue
        if 0 <= column < len(self.list_display):
            selected.add(self.list_display[column])
    return selected
@admin.display(
description='Original URL',
ordering='url',
@@ -524,20 +707,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# return super().changelist_view(request, extra_context=None)
@admin.action(
description=" Get Title"
)
def update_titles(self, request, queryset):
count = queryset.count()
# Queue snapshots for archiving via the state machine system
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
)
@admin.action(
description="⬇️ Get Missing"
description=" Finish"
)
def update_snapshots(self, request, queryset):
count = queryset.count()
@@ -551,7 +721,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
@admin.action(
description="🆕 Archive Again"
description="⬇️ Fresh"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
@@ -579,7 +749,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
@admin.action(
description=" Delete"
description="🗑 Delete"
)
def delete_snapshots(self, request, queryset):
"""Delete snapshots in a single transaction to avoid SQLite concurrency issues."""

View File

@@ -1,6 +1,9 @@
__package__ = 'archivebox.core'
from django.apps import AppConfig
import os
_ORCHESTRATOR_BOOTSTRAPPED = False
class CoreConfig(AppConfig):
@@ -10,6 +13,7 @@ class CoreConfig(AppConfig):
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
import sys
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
from archivebox.core.admin_site import register_admin_site
register_admin_site()
@@ -18,3 +22,45 @@ class CoreConfig(AppConfig):
# Skip during makemigrations to avoid premature state machine access
if 'makemigrations' not in sys.argv:
from archivebox.core import models # noqa: F401
pidfile = os.environ.get('ARCHIVEBOX_RUNSERVER_PIDFILE')
if pidfile:
should_write_pid = True
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
should_write_pid = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
if should_write_pid:
try:
with open(pidfile, 'w') as handle:
handle.write(str(os.getpid()))
except Exception:
pass
def _should_manage_orchestrator() -> bool:
if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER') == '1':
return False
if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_PROCESS') == '1':
return False
if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1':
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
return True
argv = ' '.join(sys.argv).lower()
if 'orchestrator' in argv:
return False
return 'daphne' in argv and '--reload' in sys.argv
if _should_manage_orchestrator():
global _ORCHESTRATOR_BOOTSTRAPPED
if _ORCHESTRATOR_BOOTSTRAPPED:
return
_ORCHESTRATOR_BOOTSTRAPPED = True
from archivebox.machine.models import Process, Machine
from archivebox.workers.orchestrator import Orchestrator
Process.cleanup_stale_running()
machine = Machine.current()
if not Orchestrator.is_running():
Orchestrator(exit_on_idle=False).start()

View File

@@ -0,0 +1,189 @@
from __future__ import annotations
from __future__ import annotations
import re
from urllib.parse import urlparse
from archivebox.config.common import SERVER_CONFIG
_SNAPSHOT_ID_RE = re.compile(r"^[0-9a-fA-F-]{8,36}$")
def split_host_port(host: str) -> tuple[str, str | None]:
parsed = urlparse(f"//{host}")
hostname = (parsed.hostname or host or "").lower()
port = str(parsed.port) if parsed.port else None
return hostname, port
def _normalize_base_url(value: str | None) -> str:
if not value:
return ""
base = value.strip()
if not base:
return ""
if "://" not in base:
base = f"http://{base}"
parsed = urlparse(base)
if not parsed.netloc:
return ""
return f"{parsed.scheme}://{parsed.netloc}"
def normalize_base_url(value: str | None) -> str:
return _normalize_base_url(value)
def get_listen_host() -> str:
    """Return the configured LISTEN_HOST, stripped ("" when unset)."""
    configured = SERVER_CONFIG.LISTEN_HOST or ""
    return configured.strip()


def get_listen_parts() -> tuple[str, str | None]:
    """Return the listen host split into (hostname, port-or-None)."""
    return split_host_port(get_listen_host())


def _build_listen_host(subdomain: str | None) -> str:
    """Build ``sub.host[:port]`` from the listen host; "" when no host is set."""
    hostname, port = get_listen_parts()
    if not hostname:
        return ""
    labeled = hostname if not subdomain else f"{subdomain}.{hostname}"
    return f"{labeled}:{port}" if port else labeled
def get_admin_host() -> str:
    """Host serving the admin UI: ADMIN_BASE_URL override, else the "admin." subdomain."""
    configured = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
    if not configured:
        return _build_listen_host("admin")
    return urlparse(configured).netloc.lower()


def get_web_host() -> str:
    """Host serving archived content: ARCHIVE_BASE_URL override, else the "web." subdomain."""
    configured = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
    if not configured:
        return _build_listen_host("web")
    return urlparse(configured).netloc.lower()


def get_api_host() -> str:
    """Host serving the API (the "api." subdomain of the listen host)."""
    return _build_listen_host("api")


def get_public_host() -> str:
    """Host serving the public index (the "public." subdomain of the listen host)."""
    return _build_listen_host("public")


def get_snapshot_host(snapshot_id: str) -> str:
    """Per-snapshot host: the snapshot id used as a subdomain of the listen host."""
    return _build_listen_host(snapshot_id)


def get_original_host(domain: str) -> str:
    """Host proxying an original domain: the domain used as a subdomain."""
    return _build_listen_host(domain)
def is_snapshot_subdomain(subdomain: str) -> bool:
    """True when *subdomain* looks like a snapshot id (8-36 hex/dash characters)."""
    return _SNAPSHOT_ID_RE.match(subdomain or "") is not None
def get_listen_subdomain(request_host: str) -> str:
    """Return the subdomain of *request_host* relative to the listen host.

    Returns "" when no listen host is configured, when both sides declare
    ports that disagree, when the request host IS the listen host, or when
    the request host is unrelated to the listen host.
    """
    req_host, req_port = split_host_port(request_host)
    base_host, base_port = get_listen_parts()
    if not base_host:
        return ""
    # A port mismatch means this traffic is not addressed to our listen host.
    if base_port and req_port and base_port != req_port:
        return ""
    if req_host == base_host:
        return ""
    tail = f".{base_host}"
    return req_host[: -len(tail)] if req_host.endswith(tail) else ""
def host_matches(request_host: str, target_host: str) -> bool:
    """True when both strings name the same host.

    Ports only disqualify the match when both sides specify one; a missing
    port on either side is treated as a wildcard.
    """
    if not (request_host and target_host):
        return False
    req_name, req_port = split_host_port(request_host)
    tgt_name, tgt_port = split_host_port(target_host)
    if req_name != tgt_name:
        return False
    return not (tgt_port and req_port and tgt_port != req_port)
def _scheme_from_request(request=None) -> str:
if request:
return request.scheme
return "http"
def _build_base_url_for_host(host: str, request=None) -> str:
if not host:
return ""
scheme = _scheme_from_request(request)
return f"{scheme}://{host}"
def get_admin_base_url(request=None) -> str:
    """Base URL for the admin UI (an ADMIN_BASE_URL override wins)."""
    configured = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
    return configured or _build_base_url_for_host(get_admin_host(), request=request)


def get_web_base_url(request=None) -> str:
    """Base URL for archived content (an ARCHIVE_BASE_URL override wins)."""
    configured = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
    return configured or _build_base_url_for_host(get_web_host(), request=request)


def get_api_base_url(request=None) -> str:
    """Base URL for the API host."""
    return _build_base_url_for_host(get_api_host(), request=request)


# Backwards-compat aliases (archive == web)
def get_archive_base_url(request=None) -> str:
    """Alias for :func:`get_web_base_url` kept for backwards compatibility."""
    return get_web_base_url(request=request)


def get_snapshot_base_url(snapshot_id: str, request=None) -> str:
    """Base URL for a single snapshot's subdomain host."""
    return _build_base_url_for_host(get_snapshot_host(snapshot_id), request=request)


def get_original_base_url(domain: str, request=None) -> str:
    """Base URL for an original-domain proxy host."""
    return _build_base_url_for_host(get_original_host(domain), request=request)
def build_admin_url(path: str = "", request=None) -> str:
    """Absolute URL for *path* on the admin host."""
    return _build_url(get_admin_base_url(request), path)


def build_web_url(path: str = "", request=None) -> str:
    """Absolute URL for *path* on the web (archived content) host."""
    return _build_url(get_web_base_url(request), path)


def build_api_url(path: str = "", request=None) -> str:
    """Absolute URL for *path* on the API host."""
    return _build_url(get_api_base_url(request), path)


def build_archive_url(path: str = "", request=None) -> str:
    """Backwards-compat alias: absolute URL on the archive (== web) host."""
    return _build_url(get_archive_base_url(request), path)


def build_snapshot_url(snapshot_id: str, path: str = "", request=None) -> str:
    """Absolute URL for *path* on the per-snapshot subdomain host."""
    return _build_url(get_snapshot_base_url(snapshot_id, request=request), path)


def build_original_url(domain: str, path: str = "", request=None) -> str:
    """Absolute URL for *path* on an original-domain proxy host."""
    return _build_url(get_original_base_url(domain, request=request), path)
def _build_url(base_url: str, path: str) -> str:
if not base_url:
if not path:
return ""
return path if path.startswith("/") else f"/{path}"
if not path:
return base_url
return f"{base_url}{path if path.startswith('/') else f'/{path}'}"

View File

@@ -2,11 +2,33 @@ __package__ = 'archivebox.core'
import ipaddress
import re
from pathlib import Path
from django.utils import timezone
from django.contrib.auth.middleware import RemoteUserMiddleware
from django.contrib.auth.models import AnonymousUser
from django.core.exceptions import ImproperlyConfigured
from django.shortcuts import redirect
from django.contrib.staticfiles import finders
from django.utils.http import http_date
from django.http import HttpResponseNotModified
from archivebox.config.common import SERVER_CONFIG
from archivebox.config import VERSION
from archivebox.config.version import get_COMMIT_HASH
from archivebox.core.host_utils import (
build_admin_url,
build_api_url,
build_web_url,
get_api_host,
get_admin_host,
get_listen_host,
get_listen_subdomain,
get_public_host,
get_web_host,
host_matches,
is_snapshot_subdomain,
)
from archivebox.core.views import SnapshotHostView, OriginalDomainHostView
def detect_timezone(request, activate: bool=True):
@@ -30,17 +52,112 @@ def TimezoneMiddleware(get_response):
def CacheControlMiddleware(get_response):
snapshot_path_re = re.compile(r"^/[^/]+/\\d{8}/[^/]+/[0-9a-fA-F-]{8,36}/")
static_cache_key = (get_COMMIT_HASH() or VERSION or "dev").strip()
def middleware(request):
response = get_response(request)
if request.path.startswith('/static/'):
rel_path = request.path[len('/static/'):]
static_path = finders.find(rel_path)
if static_path:
try:
mtime = Path(static_path).stat().st_mtime
except OSError:
mtime = None
etag = f'"{static_cache_key}:{int(mtime) if mtime else 0}"'
inm = request.META.get("HTTP_IF_NONE_MATCH")
if inm:
inm_list = [item.strip() for item in inm.split(",")]
if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]:
not_modified = HttpResponseNotModified()
not_modified.headers["ETag"] = etag
not_modified.headers["Cache-Control"] = "public, max-age=31536000, immutable"
if mtime:
not_modified.headers["Last-Modified"] = http_date(mtime)
return not_modified
response.headers["ETag"] = etag
response.headers["Cache-Control"] = "public, max-age=31536000, immutable"
if mtime and not response.headers.get("Last-Modified"):
response.headers["Last-Modified"] = http_date(mtime)
return response
if '/archive/' in request.path or '/static/' in request.path or snapshot_path_re.match(request.path):
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
# print('Set Cache-Control header to', response['Cache-Control'])
if not response.get('Cache-Control'):
policy = 'public' if SERVER_CONFIG.PUBLIC_SNAPSHOTS else 'private'
response['Cache-Control'] = f'{policy}, max-age=60, stale-while-revalidate=300'
# print('Set Cache-Control header to', response['Cache-Control'])
return response
return middleware
def HostRoutingMiddleware(get_response):
    """Route each request according to which configured host/subdomain it hit.

    Order matters: admin host passes straight through (authenticated), then
    api/web/public hosts are served anonymously, then snapshot-id or
    original-domain subdomains are dispatched to their dedicated views, and
    finally bare listen-host traffic is redirected to the web host.
    """
    def middleware(request):
        request_host = (request.get_host() or "").lower()
        admin_host = get_admin_host()
        web_host = get_web_host()
        api_host = get_api_host()
        public_host = get_public_host()
        listen_host = get_listen_host()
        subdomain = get_listen_subdomain(request_host)
        # Admin host: no rewriting, normal (authenticated) request handling.
        if host_matches(request_host, admin_host):
            return get_response(request)
        if host_matches(request_host, api_host):
            # Force anonymous access on the API host.
            request.user = AnonymousUser()
            request._cached_user = request.user
            if request.path.startswith("/admin"):
                # Admin paths belong on the admin host; redirect there.
                target = build_admin_url(request.path, request=request)
                if request.META.get("QUERY_STRING"):
                    target = f"{target}?{request.META['QUERY_STRING']}"
                return redirect(target)
            if not request.path.startswith("/api/"):
                # Re-root non-API paths under /api/ on this host.
                target_path = f"/api{request.path if request.path.startswith('/') else f'/{request.path}'}"
                if request.META.get("QUERY_STRING"):
                    target_path = f"{target_path}?{request.META['QUERY_STRING']}"
                return redirect(target_path)
            return get_response(request)
        if host_matches(request_host, web_host):
            # Archived-content host: anonymous, with /admin bounced to the admin host.
            request.user = AnonymousUser()
            request._cached_user = request.user
            if request.path.startswith("/admin"):
                target = build_admin_url(request.path, request=request)
                if request.META.get("QUERY_STRING"):
                    target = f"{target}?{request.META['QUERY_STRING']}"
                return redirect(target)
            return get_response(request)
        if host_matches(request_host, public_host):
            request.user = AnonymousUser()
            request._cached_user = request.user
            return get_response(request)
        if subdomain:
            # Snapshot-id subdomains get the snapshot view; anything else is
            # treated as an original-domain proxy host.
            if is_snapshot_subdomain(subdomain):
                view = SnapshotHostView.as_view()
                return view(request, snapshot_id=subdomain, path=request.path.lstrip("/"))
            view = OriginalDomainHostView.as_view()
            return view(request, domain=subdomain, path=request.path.lstrip("/"))
        if host_matches(request_host, listen_host):
            # Bare listen host: send visitors to the web host.
            target = build_web_url(request.path, request=request)
            if request.META.get("QUERY_STRING"):
                target = f"{target}?{request.META['QUERY_STRING']}"
            return redirect(target)
        if admin_host or web_host:
            # Unrecognised host while subdomain routing is configured:
            # redirect to the web host when one can be built.
            target = build_web_url(request.path, request=request)
            if target:
                if request.META.get("QUERY_STRING"):
                    target = f"{target}?{request.META['QUERY_STRING']}"
                return redirect(target)
        return get_response(request)
    return middleware
class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
header = 'HTTP_{normalized}'.format(normalized=SERVER_CONFIG.REVERSE_PROXY_USER_HEADER.replace('-', '_').upper())

View File

@@ -0,0 +1,17 @@
# Generated by Codex on 2026-01-21
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add a composite (snapshot, status) index on ArchiveResult.

    Speeds up queries that filter a snapshot's results by status; the index
    name matches the entry declared in ArchiveResult.Meta.indexes.
    """
    dependencies = [
        ('core', '0030_alter_archiveresult_id'),
    ]
    operations = [
        migrations.AddIndex(
            model_name='archiveresult',
            # Name must stay in sync with the model's Meta.indexes declaration.
            index=models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'),
        ),
    ]

View File

@@ -1297,7 +1297,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
path = self.archive_path
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a>'
# Get all plugins from hooks system (sorted by numeric prefix)
all_plugins = [get_plugin_name(e) for e in get_plugins()]
@@ -1322,7 +1322,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
icon
)
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
return format_html('<span class="files-icons" style="font-size: 1em; opacity: 0.8; display: inline-grid; grid-auto-flow: column; grid-auto-columns: auto; grid-template-rows: repeat(4, auto); gap: 0 0; justify-content: start; align-content: start;">{}</span>', mark_safe(output))
cache_result = cache.get(cache_key)
if cache_result:
@@ -1789,7 +1789,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)['total_size'] or 0
# Check if sealed
is_sealed = self.status in (self.StatusChoices.SEALED, self.StatusChoices.FAILED, self.StatusChoices.BACKOFF)
is_sealed = self.status not in (self.StatusChoices.QUEUED, self.StatusChoices.STARTED)
return {
'total': total,
@@ -1992,6 +1992,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
size = sum(p.stat().st_size for p in abs_path.rglob('*') if p.is_file())
else:
size = abs_path.stat().st_size
plugin_lower = (result.plugin or '').lower()
if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl'):
plugin_dir = snap_dir / result.plugin
if plugin_dir.exists():
try:
size = sum(p.stat().st_size for p in plugin_dir.rglob('*') if p.is_file())
except OSError:
pass
outputs.append({
'name': result.plugin,
'path': embed_path,
@@ -2057,6 +2065,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def to_dict(self, extended: bool = False) -> Dict[str, Any]:
"""Convert Snapshot to a dictionary (replacement for Link._asdict())"""
from archivebox.misc.util import ts_to_date_str
from archivebox.core.host_utils import build_snapshot_url
result = {
'TYPE': 'core.models.Snapshot',
@@ -2078,6 +2087,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'is_static': self.is_static,
'is_archived': self.is_archived,
'archive_path': self.archive_path,
'archive_url': build_snapshot_url(str(self.id), 'index.html'),
'output_dir': self.output_dir,
'link_dir': self.output_dir, # backwards compatibility alias
'archive_size': self.archive_size,
@@ -2129,14 +2139,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
outputs_by_plugin = {out['name']: out for out in outputs}
best_preview_path = 'about:blank'
best_result = {'path': 'about:blank', 'result': None}
for plugin in preview_priority:
out = outputs_by_plugin.get(plugin)
if out and out.get('path'):
best_preview_path = out['path']
best_result = out
break
if best_preview_path == 'about:blank' and outputs:
best_preview_path = outputs[0].get('path') or 'about:blank'
best_result = outputs[0]
context = {
**self.to_dict(extended=True),
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
@@ -2151,6 +2164,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'best_preview_path': best_preview_path,
'best_result': best_result,
'archiveresults': outputs,
}
rendered_html = render_to_string('snapshot.html', context)
@@ -2326,6 +2340,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
app_label = 'core'
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results Log'
indexes = [
models.Index(fields=['snapshot', 'status'], name='archiveresult_snap_status_idx'),
]
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
@@ -2487,6 +2504,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugin_lower = (plugin_name or '').lower()
prefer_media = plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl')
preferred_text = []
if plugin_lower:
preferred_text.extend([
f'{plugin_lower}.jsonl',
f'{plugin_lower}.json',
f'{plugin_lower}.txt',
f'{plugin_lower}.log',
])
preferred_text.extend(['index.jsonl', 'index.json'])
for name in preferred_text:
candidate = dir_path / name
if candidate.exists() and candidate.is_file():
return candidate
if not prefer_media:
for name in ('index.html', 'index.htm'):
candidate = dir_path / name
@@ -2504,6 +2535,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if file_path.is_dir() or file_path.name.startswith('.'):
continue
ext = file_path.suffix.lstrip('.').lower()
if ext in ('pid', 'log', 'sh'):
continue
if ext not in embeddable_exts:
continue
try:
@@ -2547,20 +2580,44 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Fallback: treat output_str as a file path only if it exists on disk
if self.output_str:
try:
output_path = Path(self.output_str)
raw_output = str(self.output_str).strip()
if raw_output in ('.', './', ''):
best_file = self._find_best_output_file(plugin_dir, self.plugin)
if best_file:
return str(best_file.relative_to(snapshot_dir))
output_path = None
else:
output_path = Path(raw_output)
if output_path.is_absolute():
if output_path and output_path.is_absolute():
# If absolute and within snapshot dir, normalize to relative
if snapshot_dir in output_path.parents and output_path.exists():
return str(output_path.relative_to(snapshot_dir))
else:
if output_path.is_file():
return str(output_path.relative_to(snapshot_dir))
if output_path.is_dir():
best_file = self._find_best_output_file(output_path, self.plugin)
if best_file:
return str(best_file.relative_to(snapshot_dir))
elif output_path:
# If relative, prefer plugin-prefixed path, then direct path
if (plugin_dir / output_path).exists():
return f'{self.plugin}/{output_path}'
plugin_candidate = plugin_dir / output_path
if plugin_candidate.exists():
if plugin_candidate.is_file():
return f'{self.plugin}/{output_path}'
if plugin_candidate.is_dir():
best_file = self._find_best_output_file(plugin_candidate, self.plugin)
if best_file:
return str(best_file.relative_to(snapshot_dir))
if output_path.name in ('index.html', 'index.json') and output_path.parent == Path('.'):
return None
if (snapshot_dir / output_path).exists():
return str(output_path)
snapshot_candidate = snapshot_dir / output_path
if snapshot_candidate.exists():
if snapshot_candidate.is_file():
return str(output_path)
if snapshot_candidate.is_dir():
best_file = self._find_best_output_file(snapshot_candidate, self.plugin)
if best_file:
return str(best_file.relative_to(snapshot_dir))
except Exception:
pass
@@ -2569,7 +2626,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
ignored = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid', 'cmd.sh'}
output_candidates = [
f for f in self.output_files.keys()
if Path(f).name not in ignored
if Path(f).name not in ignored and Path(f).suffix not in ('.pid', '.log', '.sh')
]
first_file = output_candidates[0] if output_candidates else None
if first_file and (plugin_dir / first_file).exists():

View File

@@ -12,6 +12,7 @@ import archivebox
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, STORAGE_CONFIG # noqa
from archivebox.core.host_utils import normalize_base_url, get_admin_base_url, get_api_base_url
IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3]
@@ -77,9 +78,11 @@ MIDDLEWARE = [
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"archivebox.api.middleware.ApiCorsMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"archivebox.core.middleware.ReverseProxyAuthMiddleware",
"archivebox.core.middleware.HostRoutingMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"archivebox.core.middleware.CacheControlMiddleware",
# Additional middlewares from plugins (if any)
@@ -347,6 +350,14 @@ SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, "abcdefghijklmnop
ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(",")
CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(",")))
admin_base_url = normalize_base_url(get_admin_base_url())
if admin_base_url and admin_base_url not in CSRF_TRUSTED_ORIGINS:
CSRF_TRUSTED_ORIGINS.append(admin_base_url)
api_base_url = normalize_base_url(get_api_base_url())
if api_base_url and api_base_url not in CSRF_TRUSTED_ORIGINS:
CSRF_TRUSTED_ORIGINS.append(api_base_url)
# automatically fix case when user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
# but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
for hostname in ALLOWED_HOSTS:
@@ -363,6 +374,7 @@ CSRF_COOKIE_SECURE = False
SESSION_COOKIE_SECURE = False
SESSION_COOKIE_HTTPONLY = True
SESSION_COOKIE_DOMAIN = None
CSRF_COOKIE_DOMAIN = None
SESSION_COOKIE_AGE = 1209600 # 2 weeks
SESSION_EXPIRE_AT_BROWSER_CLOSE = False
SESSION_SAVE_EVERY_REQUEST = False

View File

@@ -15,6 +15,6 @@ def get_config(key: str) -> any:
Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
"""
try:
return _get_config(key)
return _get_config().get(key)
except (KeyError, AttributeError):
return None

View File

@@ -9,10 +9,114 @@ from pathlib import Path
from archivebox.hooks import (
get_plugin_icon, get_plugin_template, get_plugin_name,
)
from archivebox.core.host_utils import (
get_admin_base_url,
get_web_base_url,
get_snapshot_base_url,
build_snapshot_url,
)
register = template.Library()
_MEDIA_FILE_EXTS = {
'.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m4v', '.mpg', '.mpeg', '.ts', '.m2ts', '.mts',
'.3gp', '.3g2', '.ogv',
'.mp3', '.m4a', '.aac', '.ogg', '.oga', '.opus', '.wav', '.flac', '.alac', '.aiff', '.wma', '.mka', '.ac3', '.eac3', '.dts',
}
def _count_media_files(result) -> int:
try:
output_files = getattr(result, 'output_files', None) or {}
except Exception:
output_files = {}
count_from_output = 0
if output_files:
count_from_output = sum(
1
for path in output_files.keys()
if Path(path).suffix.lower() in _MEDIA_FILE_EXTS
)
if count_from_output >= 2:
return count_from_output
try:
plugin_dir = Path(result.snapshot_dir) / result.plugin
except Exception:
return 0
if not plugin_dir.exists():
return 0
count = 0
scanned = 0
max_scan = 500
for file_path in plugin_dir.rglob('*'):
if scanned >= max_scan:
break
scanned += 1
if not file_path.is_file():
continue
if file_path.suffix.lower() in _MEDIA_FILE_EXTS:
count += 1
return max(count_from_output, count)
def _list_media_files(result) -> list[dict]:
    """Return [{'name', 'path', 'size'}, ...] for media files, sorted by name.

    Candidate paths come from ``result.output_files`` when available; otherwise
    the plugin output directory is scanned (capped at 2000 entries). Each
    'path' is relative to the snapshot directory when possible.
    """
    try:
        snapshot_dir = Path(result.snapshot_dir)
        plugin_dir = snapshot_dir / result.plugin
    except Exception:
        return []

    recorded = getattr(result, 'output_files', None) or {}
    rel_candidates: list[Path] = [
        Path(name)
        for name in recorded
        if Path(name).suffix.lower() in _MEDIA_FILE_EXTS
    ]

    if not rel_candidates and plugin_dir.exists():
        # Fall back to scanning the plugin dir, bounded to avoid huge walks.
        for index, entry in enumerate(plugin_dir.rglob('*')):
            if index >= 2000:
                break
            if not entry.is_file():
                continue
            if entry.suffix.lower() not in _MEDIA_FILE_EXTS:
                continue
            try:
                rel_candidates.append(entry.relative_to(plugin_dir))
            except ValueError:
                continue

    listing: list[dict] = []
    for rel in rel_candidates:
        abs_path = plugin_dir / rel
        if not (abs_path.exists() and abs_path.is_file()):
            continue
        try:
            byte_size = abs_path.stat().st_size
        except OSError:
            byte_size = None
        try:
            href = str(abs_path.relative_to(snapshot_dir))
        except ValueError:
            href = str(Path(result.plugin) / rel)
        listing.append({'name': abs_path.name, 'path': href, 'size': byte_size})

    listing.sort(key=lambda entry: entry['name'].lower())
    return listing
@register.filter(name='split')
def split(value, separator: str = ','):
    """Template filter: split a (possibly None/empty) string on *separator*."""
    text = value if value else ''
    return text.split(separator)
@@ -52,6 +156,28 @@ def url_replace(context, **kwargs):
return dict_.urlencode()
@register.simple_tag(takes_context=True)
def admin_base_url(context) -> str:
    """Template tag: base URL of the admin UI, derived from the current request."""
    current_request = context.get('request')
    return get_admin_base_url(request=current_request)
@register.simple_tag(takes_context=True)
def web_base_url(context) -> str:
    """Template tag: base URL of the public web UI, derived from the current request."""
    current_request = context.get('request')
    return get_web_base_url(request=current_request)
@register.simple_tag(takes_context=True)
def snapshot_base_url(context, snapshot) -> str:
    """Template tag: base URL for a snapshot; accepts a Snapshot object or a raw id."""
    raw_id = getattr(snapshot, 'id', snapshot)
    return get_snapshot_base_url(str(raw_id), request=context.get('request'))
@register.simple_tag(takes_context=True)
def snapshot_url(context, snapshot, path: str = "") -> str:
    """Template tag: URL to *path* inside a snapshot; accepts a Snapshot object or a raw id."""
    raw_id = getattr(snapshot, 'id', snapshot)
    return build_snapshot_url(str(raw_id), path, request=context.get('request'))
@register.simple_tag
def plugin_icon(plugin: str) -> str:
"""
@@ -82,24 +208,41 @@ def plugin_card(context, result) -> str:
template_str = get_plugin_template(plugin, 'card')
# Use embed_path() for the display path
output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
output_url = build_snapshot_url(
str(getattr(result, 'snapshot_id', '')),
raw_output_path or '',
request=context.get('request'),
)
icon_html = get_plugin_icon(plugin)
plugin_lower = (plugin or '').lower()
media_file_count = _count_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else 0
media_files = _list_media_files(result) if plugin_lower in ('ytdlp', 'yt-dlp', 'youtube-dl') else []
if media_files:
snapshot_id = str(getattr(result, 'snapshot_id', ''))
request = context.get('request')
for item in media_files:
path = item.get('path') or ''
item['url'] = build_snapshot_url(snapshot_id, path, request=request) if path else ''
output_lower = (output_path or '').lower()
output_lower = (raw_output_path or '').lower()
text_preview_exts = ('.json', '.jsonl', '.txt', '.csv', '.tsv', '.xml', '.yml', '.yaml', '.md', '.log')
force_text_preview = output_lower.endswith(text_preview_exts)
# Create a mini template and render it with context
try:
if template_str and output_path and str(output_path).strip() not in ('.', '/', './') and not force_text_preview:
if template_str and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './') and not force_text_preview:
tpl = template.Template(template_str)
ctx = template.Context({
'result': result,
'snapshot': result.snapshot,
'output_path': output_path,
'output_path': output_url,
'output_path_raw': raw_output_path,
'plugin': plugin,
'plugin_icon': icon_html,
'media_file_count': media_file_count,
'media_files': media_files,
})
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
@@ -108,10 +251,10 @@ def plugin_card(context, result) -> str:
except Exception:
pass
if force_text_preview and output_path and str(output_path).strip() not in ('.', '/', './'):
output_file = Path(output_path)
if force_text_preview and raw_output_path and str(raw_output_path).strip() not in ('.', '/', './'):
output_file = Path(raw_output_path)
if not output_file.is_absolute():
output_file = Path(result.snapshot_dir) / output_path
output_file = Path(result.snapshot_dir) / raw_output_path
try:
output_file = output_file.resolve()
snap_dir = Path(result.snapshot_dir).resolve()
@@ -169,14 +312,20 @@ def plugin_full(context, result) -> str:
if not template_str:
return ''
output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
raw_output_path = result.embed_path() if hasattr(result, 'embed_path') else ''
output_url = build_snapshot_url(
str(getattr(result, 'snapshot_id', '')),
raw_output_path or '',
request=context.get('request'),
)
try:
tpl = template.Template(template_str)
ctx = template.Context({
'result': result,
'snapshot': result.snapshot,
'output_path': output_path,
'output_path': output_url,
'output_path_raw': raw_output_path,
'plugin': plugin,
})
rendered = tpl.render(ctx)
@@ -198,3 +347,30 @@ def plugin_name(value: str) -> str:
Usage: {{ result.plugin|plugin_name }}
"""
return get_plugin_name(value)
@register.filter
def plugin_display_name(value: str) -> str:
    """Human-friendly plugin name overrides for UI display."""
    # UI-only relabels; extend this table for future overrides.
    overrides = {'merkletree': 'hashes'}
    resolved = get_plugin_name(value)
    return overrides.get(resolved, resolved)
@register.simple_tag(takes_context=True)
def api_token(context) -> str:
    """Return an API token string for the logged-in user, creating one if needed.

    Anonymous or missing users yield an empty string.
    """
    from archivebox.api.auth import get_or_create_api_token

    current_user = getattr(context.get('request'), 'user', None)
    if not current_user or not current_user.is_authenticated:
        return ''
    token = get_or_create_api_token(current_user)
    return token.token if token else ''

View File

@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from archivebox.core.admin_site import archivebox_admin
from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, WebAddView, HealthCheckView, live_progress_view
from archivebox.workers.views import JobsDashboardView
@@ -29,11 +29,15 @@ urlpatterns = [
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
path('public/', PublicIndexView.as_view(), name='public-index'),
path('public.html', RedirectView.as_view(url='/public/'), name='public-index-html'),
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
re_path(r'^web/(?P<url>(?!\d{4}(?:\d{2})?(?:\d{2})?(?:/|$)).+)$', WebAddView.as_view(), name='web-add'),
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url'),
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path'),
re_path(r'^(?P<username>[^/]+)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url-nodate'),
re_path(r'^(?P<username>[^/]+)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path-nodate'),
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'),

View File

@@ -1,13 +1,16 @@
__package__ = 'archivebox.core'
import os
import posixpath
from glob import glob, escape
from django.utils import timezone
import inspect
from typing import Callable, get_type_hints
from pathlib import Path
from urllib.parse import urlparse
from django.shortcuts import render, redirect
from django.http import HttpRequest, HttpResponse, Http404
from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.utils.html import format_html, mark_safe
from django.views import View
from django.views.generic.list import ListView
@@ -31,6 +34,21 @@ from archivebox.misc.logging_util import printable_filesize
from archivebox.search import query_search_index
from archivebox.core.models import Snapshot
from archivebox.core.host_utils import build_snapshot_url
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
    """Map a requested archive file to the directory the ?files= browser should list.

    'index.html' (or empty input) means the snapshot root; when the target is a
    regular file, its parent directory is listed instead.
    """
    requested = archivefile or ''
    if requested == 'index.html':
        requested = ''
    # A concrete file can't be listed — browse its containing directory.
    if (Path(snapshot.output_dir) / requested).is_file():
        requested = str(Path(requested).parent)
    return '' if requested == '.' else requested
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_enabled_plugins, get_plugin_name
@@ -86,13 +104,95 @@ class SnapshotView(View):
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
outputs = snapshot.discover_outputs()
hidden_card_plugins = {'archivedotorg', 'favicon', 'title'}
outputs = [
out for out in snapshot.discover_outputs()
if (out.get('size') or 0) > 0 and out.get('name') not in hidden_card_plugins
]
archiveresults = {out['name']: out for out in outputs}
snap_dir = Path(snapshot.output_dir)
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]
accounted_entries: set[str] = set()
for output in outputs:
output_name = output.get('name') or ''
if output_name:
accounted_entries.add(output_name)
output_path = output.get('path') or ''
if not output_path:
continue
parts = Path(output_path).parts
if parts:
accounted_entries.add(parts[0])
ignore_names = {
'.DS_Store',
'index.html',
'index.json',
'index.jsonl',
'favicon.ico',
}
ignored_suffixes = {'.log', '.pid', '.sh'}
max_loose_scan = 300
def has_meaningful_files(dir_path: Path) -> bool:
scanned = 0
for file_path in dir_path.rglob('*'):
scanned += 1
if scanned > max_loose_scan:
return True
if file_path.is_dir() or file_path.name.startswith('.'):
continue
if file_path.suffix.lower() in ignored_suffixes:
continue
try:
if file_path.stat().st_size == 0:
continue
except OSError:
continue
return True
return False
unaccounted_entries = []
if snap_dir.exists():
for entry in snap_dir.iterdir():
name = entry.name
if name.startswith('.') or name in ignore_names or name in accounted_entries:
continue
is_dir = entry.is_dir()
is_meaningful = False
size = None
if is_dir:
is_meaningful = has_meaningful_files(entry)
elif entry.is_file():
if entry.suffix.lower() not in ignored_suffixes:
try:
size = entry.stat().st_size
is_meaningful = size > 0
except OSError:
size = None
is_meaningful = False
unaccounted_entries.append({
'name': name,
'path': name,
'is_dir': is_dir,
'size': size,
'is_meaningful': is_meaningful,
})
unaccounted_entries.sort(key=lambda item: item['name'].lower())
loose_items = [item for item in unaccounted_entries if item['is_meaningful']]
failed_exclude_suffixes = {'.json', '.jsonl', '.sh', '.log'}
failed_items = [
item for item in unaccounted_entries
if not item['is_meaningful']
and not (
not item['is_dir']
and Path(item['name']).suffix.lower() in failed_exclude_suffixes
)
]
preview_priority = [
'singlefile',
'screenshot',
@@ -111,12 +211,48 @@ class SnapshotView(View):
break
snapshot_info = snapshot.to_dict(extended=True)
related_snapshots_qs = SnapshotView.find_snapshots_for_url(snapshot.url)
related_snapshots = list(
related_snapshots_qs.exclude(id=snapshot.id).order_by('-bookmarked_at', '-created_at', '-timestamp')[:25]
)
related_years_map: dict[int, list[Snapshot]] = {}
for snap in [snapshot, *related_snapshots]:
snap_dt = snap.bookmarked_at or snap.created_at or snap.downloaded_at
if not snap_dt:
continue
related_years_map.setdefault(snap_dt.year, []).append(snap)
related_years = []
for year, snaps in related_years_map.items():
snaps_sorted = sorted(
snaps,
key=lambda s: (s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now()),
reverse=True,
)
related_years.append({
'year': year,
'latest': snaps_sorted[0],
'snapshots': snaps_sorted,
})
related_years.sort(key=lambda item: item['year'], reverse=True)
try:
warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
except IndexError:
warc_path = 'warc/'
ordered_outputs = sorted(
archiveresults.values(),
key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size'],
)
non_compact_outputs = [
out for out in ordered_outputs
if not out.get('is_compact') and not out.get('is_metadata')
]
compact_outputs = [
out for out in ordered_outputs
if out.get('is_compact') or out.get('is_metadata')
]
context = {
**snapshot_info,
'title': htmlencode(
@@ -131,9 +267,13 @@ class SnapshotView(View):
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
'warc_path': warc_path,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'archiveresults': [*non_compact_outputs, *compact_outputs],
'best_result': best_result,
'snapshot': snapshot, # Pass the snapshot object for template tags
'related_snapshots': related_snapshots,
'related_years': related_years,
'loose_items': loose_items,
'failed_items': failed_items,
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
@@ -168,13 +308,20 @@ class SnapshotView(View):
target_path = f'{target_path}?{query}'
return redirect(target_path)
if archivefile == 'index.html':
if request.GET.get('files'):
target_path = _files_index_target(snapshot, archivefile)
response = serve_static_with_byterange_support(
request, target_path, document_root=snapshot.output_dir, show_indexes=True,
)
elif archivefile == 'index.html':
# if they requested snapshot index, serve live rendered template instead of static html
response = self.render_live_index(request, snapshot)
else:
response = serve_static_with_byterange_support(
request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
)
target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
query = request.META.get('QUERY_STRING')
if query:
target = f'{target}?{query}'
return redirect(target)
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
return response
except Snapshot.DoesNotExist:
@@ -328,13 +475,16 @@ class SnapshotView(View):
class SnapshotPathView(View):
"""Serve snapshots by the new URL scheme: /<username>/<YYYYMMDD>/<domain>/<uuid>/..."""
def get(self, request, username: str, date: str, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
def get(self, request, username: str, date: str | None = None, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
if username == 'system':
return redirect(request.path.replace('/system/', '/web/', 1))
if date and domain and domain == date:
raise Http404
requested_url = url
if not requested_url and domain and domain.startswith(('http://', 'https://')):
requested_url = domain
@@ -358,19 +508,20 @@ class SnapshotPathView(View):
else:
qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup)
try:
if len(date) == 4:
qs = qs.filter(created_at__year=int(date))
elif len(date) == 6:
qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
elif len(date) == 8:
qs = qs.filter(
created_at__year=int(date[:4]),
created_at__month=int(date[4:6]),
created_at__day=int(date[6:8]),
)
except ValueError:
pass
if date:
try:
if len(date) == 4:
qs = qs.filter(created_at__year=int(date))
elif len(date) == 6:
qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
elif len(date) == 8:
qs = qs.filter(
created_at__year=int(date[:4]),
created_at__month=int(date[4:6]),
created_at__day=int(date[6:8]),
)
except ValueError:
pass
if requested_url:
snapshot = qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first()
@@ -401,7 +552,10 @@ class SnapshotPathView(View):
)
canonical_base = snapshot.url_path
requested_base = f'{username}/{date}/{domain or url or ""}'
if date:
requested_base = f'{username}/{date}/{domain or url or ""}'
else:
requested_base = f'{username}/{domain or url or ""}'
if snapshot_id:
requested_base = f'{requested_base}/{snapshot_id}'
if canonical_base != requested_base:
@@ -412,6 +566,18 @@ class SnapshotPathView(View):
return redirect(target)
archivefile = path or "index.html"
if archivefile != "index.html" and not request.GET.get('files'):
target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
query = request.META.get('QUERY_STRING')
if query:
target = f'{target}?{query}'
return redirect(target)
if request.GET.get('files'):
target_path = _files_index_target(snapshot, archivefile)
return serve_static_with_byterange_support(
request, target_path, document_root=snapshot.output_dir, show_indexes=True,
)
if archivefile == "index.html":
return SnapshotView.render_live_index(request, snapshot)
@@ -421,6 +587,202 @@ class SnapshotPathView(View):
)
def _safe_archive_relpath(path: str) -> str | None:
if not path:
return ""
cleaned = posixpath.normpath(path)
cleaned = cleaned.lstrip("/")
if cleaned.startswith("..") or "/../" in f"/{cleaned}/":
return None
return cleaned
def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | None:
    """Find *rel_path* under the newest snapshot's responses/ dir for *domain*.

    Returns (responses_root, path_relative_to_root), or None when no snapshot
    has captured that path.
    """
    if not domain or not rel_path:
        return None
    # Strip any :port suffix and lowercase before matching directory names.
    hostname = domain.split(":", 1)[0].lower()
    # TODO: optimize by querying output_files in DB instead of globbing filesystem
    pattern = str(
        DATA_DIR / "users" / "*" / "snapshots" / "*"
        / escape(hostname) / "*" / "responses" / escape(hostname) / escape(rel_path)
    )
    candidates = glob(pattern)
    if not candidates:
        return None

    def snapshot_date(candidate: str) -> tuple[str, str]:
        # The snapshot date directory sits right after the "snapshots" segment;
        # ties break on the full path string.
        segments = Path(candidate).parts
        try:
            stamp = segments[segments.index("snapshots") + 1]
        except Exception:
            stamp = ""
        return (stamp, candidate)

    winner = Path(max(candidates, key=snapshot_date))
    segments = winner.parts
    try:
        split_at = segments.index("responses") + 1
    except ValueError:
        return None
    return Path(*segments[:split_at]), Path(*segments[split_at:])
def _latest_responses_root(domain: str) -> Path | None:
    """Return the responses/<domain> directory of the newest snapshot for *domain*."""
    if not domain:
        return None
    # Strip any :port suffix and lowercase before matching directory names.
    hostname = domain.split(":", 1)[0].lower()
    pattern = str(
        DATA_DIR / "users" / "*" / "snapshots" / "*"
        / escape(hostname) / "*" / "responses" / escape(hostname)
    )
    candidates = glob(pattern)
    if not candidates:
        return None

    def snapshot_date(candidate: str) -> tuple[str, str]:
        # Newest snapshot wins; the date dir follows the "snapshots" segment.
        segments = Path(candidate).parts
        try:
            stamp = segments[segments.index("snapshots") + 1]
        except Exception:
            stamp = ""
        return (stamp, candidate)

    return Path(max(candidates, key=snapshot_date))
def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool):
    """Serve *rel_path* from a responses/ capture dir, with index.html fallbacks.

    Returns whatever the static server produces on the first hit, or None when
    every candidate 404s.
    """
    _MISS = object()  # sentinel: distinguishes a 404 from a served response

    def attempt(candidate: str, indexes: bool):
        try:
            return serve_static_with_byterange_support(
                request,
                candidate,
                document_root=str(responses_root),
                show_indexes=indexes,
            )
        except Http404:
            return _MISS

    requested = rel_path or ""
    if requested.endswith("/"):
        requested = f"{requested}index.html"

    # Extension-less names are probably directories; try their index.html first.
    if "." not in Path(requested).name:
        outcome = attempt(f"{requested.rstrip('/')}/index.html", show_indexes)
        if outcome is not _MISS:
            return outcome
    outcome = attempt(requested, show_indexes)
    if outcome is not _MISS:
        return outcome
    # Last resort: list the directory itself when index.html was implied.
    if requested.endswith("index.html"):
        outcome = attempt(requested[: -len("index.html")], True)
        return None if outcome is _MISS else outcome
    return None
class SnapshotHostView(View):
    """Serve snapshot directory contents on <snapshot_id>.<listen_host>/<path>."""

    @staticmethod
    def _lookup_snapshot(snapshot_id: str):
        """Resolve a full or prefix snapshot id to a Snapshot, or None."""
        if not snapshot_id:
            return None
        try:
            return Snapshot.objects.get(pk=snapshot_id)
        except Snapshot.DoesNotExist:
            pass
        # Fall back to treating the id as a prefix (short ids in subdomains).
        try:
            return Snapshot.objects.get(id__startswith=snapshot_id)
        except Snapshot.DoesNotExist:
            return None
        except Snapshot.MultipleObjectsReturned:
            return Snapshot.objects.filter(id__startswith=snapshot_id).first()

    def get(self, request, snapshot_id: str, path: str = ""):
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return HttpResponseForbidden("Public snapshots are disabled.")
        snapshot = self._lookup_snapshot(snapshot_id)
        if not snapshot:
            raise Http404

        listing_requested = bool(request.GET.get("files"))
        requested = path or ""
        if not requested or requested.endswith("/"):
            # Bare/trailing-slash paths: show a listing, or fall back to index.html.
            if listing_requested:
                requested = requested.rstrip("/")
            else:
                requested = f"{requested}index.html"
        requested = _safe_archive_relpath(requested)
        if requested is None:
            raise Http404

        try:
            return serve_static_with_byterange_support(
                request,
                requested,
                document_root=snapshot.output_dir,
                show_indexes=listing_requested,
            )
        except Http404:
            pass

        # Fallback to responses/<domain>/<path>
        host = urlparse(snapshot.url).hostname or snapshot.domain
        responses_root = Path(snapshot.output_dir) / "responses" / host
        if responses_root.exists():
            fallback = _serve_responses_path(request, responses_root, requested, listing_requested)
            if fallback is not None:
                return fallback
        raise Http404
class OriginalDomainHostView(View):
    """Serve responses from the most recent snapshot when using <domain>.<listen_host>/<path>."""

    def get(self, request, domain: str, path: str = ""):
        # Anonymous access is only permitted when public snapshots are enabled.
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return HttpResponseForbidden("Public snapshots are disabled.")

        rel_path = path or ""
        if not rel_path or rel_path.endswith("/"):
            rel_path = f"{rel_path}index.html"
        rel_path = _safe_archive_relpath(rel_path)
        if rel_path is None:
            raise Http404

        domain = domain.lower()
        match = _latest_response_match(domain, rel_path)
        if not match and "." not in Path(rel_path).name:
            # Extension-less path: retry as <path>/index.html, then <path>.html.
            for fallback in (f"{rel_path.rstrip('/')}/index.html", f"{rel_path}.html"):
                match = _latest_response_match(domain, fallback)
                if match:
                    break

        show_indexes = bool(request.GET.get("files"))
        if match:
            responses_root, rel_to_root = match
            response = _serve_responses_path(request, responses_root, str(rel_to_root), show_indexes)
            if response is not None:
                return response

        # No direct hit — try serving from the newest responses root instead.
        responses_root = _latest_responses_root(domain)
        if responses_root:
            response = _serve_responses_path(request, responses_root, rel_path, show_indexes)
            if response is not None:
                return response
        raise Http404
class PublicIndexView(ListView):
template_name = 'public_index.html'
model = Snapshot
@@ -508,7 +870,7 @@ class AddView(UserPassesTestMixin, FormView):
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
def form_valid(self, form):
def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
@@ -522,13 +884,21 @@ class AddView(UserPassesTestMixin, FormView):
update = form.cleaned_data.get("update", False)
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
custom_config = form.cleaned_data.get("config", {})
custom_config = form.cleaned_data.get("config") or {}
from archivebox.config.permissions import HOSTNAME
if created_by_id is None:
if self.request.user.is_authenticated:
created_by_id = self.request.user.pk
else:
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
created_by_name = self.request.user.username if self.request.user.is_authenticated else 'web'
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Crawl with the URLs from the file
@@ -552,8 +922,8 @@ class AddView(UserPassesTestMixin, FormView):
max_depth=depth,
tags_str=tag,
notes=notes,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
created_by_id=self.request.user.pk,
label=f'{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}',
created_by_id=created_by_id,
config=config
)
@@ -566,7 +936,7 @@ class AddView(UserPassesTestMixin, FormView):
is_enabled=True,
label=crawl.label,
notes=f"Auto-created from add page. {notes}".strip(),
created_by_id=self.request.user.pk,
created_by_id=created_by_id,
)
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
@@ -576,7 +946,13 @@ class AddView(UserPassesTestMixin, FormView):
# from archivebox.crawls.actors import CrawlActor
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
return crawl
def form_valid(self, form):
crawl = self._create_crawl_from_form(form)
urls = form.cleaned_data["url"]
schedule = form.cleaned_data.get("schedule", "").strip()
rough_url_count = urls.count('://')
# Build success message with schedule link if created
@@ -593,6 +969,74 @@ class AddView(UserPassesTestMixin, FormView):
return redirect(crawl.admin_change_url)
class WebAddView(AddView):
    """Add-or-redirect view: /<url> jumps to an existing Snapshot or archives a new one."""

    def _latest_snapshot_for_url(self, requested_url: str):
        """Return the most recently created Snapshot matching *requested_url*, if any."""
        matches = SnapshotView.find_snapshots_for_url(requested_url)
        return matches.order_by('-created_at', '-bookmarked_at', '-timestamp').first()

    def _normalize_add_url(self, requested_url: str) -> str:
        """Ensure the URL has an explicit scheme, defaulting to https."""
        if requested_url.startswith(('http://', 'https://')):
            return requested_url
        return f'https://{requested_url}'

    def dispatch(self, request, *args, **kwargs):
        target = urldecode(kwargs.get('url', '') or '')
        if target:
            existing = self._latest_snapshot_for_url(target)
            if existing:
                # Already archived: jump straight to the snapshot.
                return redirect(f'/{existing.url_path}')
            if not self.test_func():
                # Unauthorized users may view existing snapshots but not add new ones.
                body = format_html(
                    (
                        '<center><br/><br/><br/>'
                        'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
                        'Return to the <a href="/" target="_top">Main Index</a>'
                        '</center>'
                    ),
                    target or '',
                )
                return HttpResponse(body, content_type="text/html", status=404)
        return super().dispatch(request, *args, **kwargs)

    def get(self, request, url: str):
        requested_url = urldecode(url)
        if not requested_url:
            raise Http404

        existing = self._latest_snapshot_for_url(requested_url)
        if existing:
            return redirect(f'/{existing.url_path}')

        add_url = self._normalize_add_url(requested_url)

        # Seed a form with the add form's declared defaults so this quick-add
        # path behaves like submitting the regular add page untouched.
        defaults = self.form_class()
        payload = {
            'url': add_url,
            'depth': defaults.fields['depth'].initial or '0',
            'persona': defaults.fields['persona'].initial or 'Default',
            'config': {},
        }
        for checkbox in ('update', 'overwrite', 'index_only'):
            if defaults.fields[checkbox].initial:
                payload[checkbox] = 'on'

        form = self.form_class(data=payload)
        if not form.is_valid():
            return self.form_invalid(form)

        crawl = self._create_crawl_from_form(form)
        snapshot = Snapshot.from_json(
            {'url': add_url, 'tags': form.cleaned_data.get('tag', '')},
            overrides={'crawl': crawl},
        )
        return redirect(f'/{snapshot.url_path}')
class HealthCheckView(View):
"""
A Django view that renders plain text "OK" for service discovery tools
@@ -617,11 +1061,19 @@ def live_progress_view(request):
from archivebox.workers.orchestrator import Orchestrator
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.machine.models import Process, Machine
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
machine = Machine.current()
orchestrator_proc = Process.objects.filter(
machine=machine,
process_type=Process.TypeChoices.ORCHESTRATOR,
status=Process.StatusChoices.RUNNING,
).order_by('-started_at').first()
orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None
# Get model counts by status
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
@@ -653,24 +1105,47 @@ def live_progress_view(request):
ext = embed.lower().split('.')[-1] if '.' in embed else ''
is_embeddable = ext in ('png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', 'pdf', 'html')
if is_embeddable or ar.plugin in ('screenshot', 'favicon', 'dom'):
archive_path = embed or ''
recent_thumbnails.append({
'id': str(ar.id),
'plugin': ar.plugin,
'snapshot_id': str(ar.snapshot_id),
'snapshot_url': ar.snapshot.url[:60] if ar.snapshot else '',
'embed_path': embed,
'archive_path': f'/{ar.snapshot.archive_path}/{embed}' if ar.snapshot else '',
'archive_path': archive_path,
'archive_url': build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else '',
'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
})
# Build hierarchical active crawls with nested snapshots and archive results
from django.db.models import Prefetch
running_workers = Process.objects.filter(
machine=machine,
process_type=Process.TypeChoices.WORKER,
status=Process.StatusChoices.RUNNING,
)
crawl_worker_pids: dict[str, int] = {}
snapshot_worker_pids: dict[str, int] = {}
for proc in running_workers:
env = proc.env or {}
if not isinstance(env, dict):
continue
if proc.worker_type == 'crawl':
crawl_id = env.get('CRAWL_ID')
if crawl_id:
crawl_worker_pids[str(crawl_id)] = proc.pid
elif proc.worker_type == 'snapshot':
snapshot_id = env.get('SNAPSHOT_ID')
if snapshot_id:
snapshot_worker_pids[str(snapshot_id)] = proc.pid
active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).prefetch_related(
'snapshot_set',
'snapshot_set__archiveresult_set',
'snapshot_set__archiveresult_set__process',
).distinct().order_by('-modified_at')[:10]
active_crawls = []
@@ -710,8 +1185,9 @@ def live_progress_view(request):
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
# Calculate snapshot progress
snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0
# Calculate snapshot progress using per-plugin progress
now = timezone.now()
plugin_progress_values: list[int] = []
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
# Order: started first, then queued, then completed
@@ -724,14 +1200,42 @@ def live_progress_view(request):
}
return (status_order.get(ar.status, 4), ar.plugin)
all_plugins = [
{
all_plugins = []
for ar in sorted(snapshot_results, key=plugin_sort_key):
status = ar.status
progress_value = 0
if status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
):
progress_value = 100
elif status == ArchiveResult.StatusChoices.STARTED:
started_at = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
timeout = ar.timeout or 120
if started_at and timeout:
elapsed = max(0.0, (now - started_at).total_seconds())
progress_value = int(min(99, max(1, (elapsed / float(timeout)) * 100)))
else:
progress_value = 1
else:
progress_value = 0
plugin_progress_values.append(progress_value)
plugin_payload = {
'id': str(ar.id),
'plugin': ar.plugin,
'status': ar.status,
'status': status,
}
for ar in sorted(snapshot_results, key=plugin_sort_key)
]
if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
plugin_payload['pid'] = ar.process.pid
if status == ArchiveResult.StatusChoices.STARTED:
plugin_payload['progress'] = progress_value
plugin_payload['timeout'] = ar.timeout or 120
all_plugins.append(plugin_payload)
snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0
active_snapshots_for_crawl.append({
'id': str(snapshot.id),
@@ -744,6 +1248,7 @@ def live_progress_view(request):
'failed_plugins': failed_plugins,
'pending_plugins': pending_plugins,
'all_plugins': all_plugins,
'worker_pid': snapshot_worker_pids.get(str(snapshot.id)),
})
# Check if crawl can start (for debugging stuck crawls)
@@ -772,10 +1277,12 @@ def live_progress_view(request):
'urls_preview': urls_preview,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
'worker_pid': crawl_worker_pids.get(str(crawl.id)),
})
return JsonResponse({
'orchestrator_running': orchestrator_running,
'orchestrator_pid': orchestrator_pid,
'total_workers': total_workers,
'crawls_pending': crawls_pending,
'crawls_started': crawls_started,

View File

@@ -1,8 +1,11 @@
__package__ = 'archivebox.core'
import json
import re
import hashlib
from django import forms
from django.utils.html import escape
from django.utils.safestring import mark_safe
class TagEditorWidget(forms.Widget):
@@ -27,6 +30,23 @@ class TagEditorWidget(forms.Widget):
"""Escape HTML entities in value."""
return escape(str(value)) if value else ''
def _normalize_id(self, value):
"""Normalize IDs for HTML + JS usage (letters, digits, underscore; JS-safe start)."""
normalized = re.sub(r'[^A-Za-z0-9_]', '_', str(value))
if not normalized or not re.match(r'[A-Za-z_]', normalized):
normalized = f't_{normalized}'
return normalized
def _tag_style(self, value):
"""Compute a stable pastel color style for a tag value."""
tag = (value or '').strip().lower()
digest = hashlib.md5(tag.encode('utf-8')).hexdigest()
hue = int(digest[:4], 16) % 360
bg = f'hsl({hue}, 70%, 92%)'
border = f'hsl({hue}, 60%, 82%)'
fg = f'hsl({hue}, 35%, 28%)'
return f'--tag-bg: {bg}; --tag-border: {border}; --tag-fg: {fg};'
def render(self, name, value, attrs=None, renderer=None):
"""
Render the tag editor widget.
@@ -67,13 +87,14 @@ class TagEditorWidget(forms.Widget):
elif isinstance(value, str):
tags = sorted([t.strip() for t in value.split(',') if t.strip()])
widget_id = attrs.get('id', name) if attrs else name
widget_id_raw = attrs.get('id', name) if attrs else name
widget_id = self._normalize_id(widget_id_raw)
# Build pills HTML
pills_html = ''
for tag in tags:
pills_html += f'''
<span class="tag-pill" data-tag="{self._escape(tag)}">
<span class="tag-pill" data-tag="{self._escape(tag)}" style="{self._tag_style(tag)}">
{self._escape(tag)}
<button type="button" class="tag-remove-btn" data-tag-name="{self._escape(tag)}">&times;</button>
</span>
@@ -92,6 +113,7 @@ class TagEditorWidget(forms.Widget):
placeholder="Add tag..."
autocomplete="off"
onkeydown="handleTagKeydown_{widget_id}(event)"
onkeypress="if(event.key==='Enter' || event.keyCode===13){{event.preventDefault(); event.stopPropagation();}}"
oninput="fetchTagAutocomplete_{widget_id}(this.value)"
>
<datalist id="{widget_id}_datalist"></datalist>
@@ -112,6 +134,47 @@ class TagEditorWidget(forms.Widget):
document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(',');
}};
function computeTagStyle_{widget_id}(tagName) {{
var hash = 0;
var name = String(tagName || '').toLowerCase();
for (var i = 0; i < name.length; i++) {{
hash = (hash * 31 + name.charCodeAt(i)) % 360;
}}
var bg = 'hsl(' + hash + ', 70%, 92%)';
var border = 'hsl(' + hash + ', 60%, 82%)';
var fg = 'hsl(' + hash + ', 35%, 28%)';
return {{ bg: bg, border: border, fg: fg }};
}}
function applyTagStyle_{widget_id}(el, tagName) {{
var colors = computeTagStyle_{widget_id}(tagName);
el.style.setProperty('--tag-bg', colors.bg);
el.style.setProperty('--tag-border', colors.border);
el.style.setProperty('--tag-fg', colors.fg);
}}
function getApiKey() {{
return (window.ARCHIVEBOX_API_KEY || '').trim();
}}
function buildApiUrl(path) {{
var apiKey = getApiKey();
if (!apiKey) return path;
var sep = path.indexOf('?') !== -1 ? '&' : '?';
return path + sep + 'api_key=' + encodeURIComponent(apiKey);
}}
function buildApiHeaders() {{
var headers = {{
'Content-Type': 'application/json',
}};
var apiKey = getApiKey();
if (apiKey) headers['X-ArchiveBox-API-Key'] = apiKey;
var csrfToken = getCSRFToken();
if (csrfToken) headers['X-CSRFToken'] = csrfToken;
return headers;
}}
window.addTag_{widget_id} = function(tagName) {{
tagName = tagName.trim();
if (!tagName) return;
@@ -139,12 +202,9 @@ class TagEditorWidget(forms.Widget):
document.getElementById('{widget_id}_input').value = '';
// Create tag via API if it doesn't exist (fire and forget)
fetch('/api/v1/core/tags/create/', {{
fetch(buildApiUrl('/api/v1/core/tags/create/'), {{
method: 'POST',
headers: {{
'Content-Type': 'application/json',
'X-CSRFToken': getCSRFToken()
}},
headers: buildApiHeaders(),
body: JSON.stringify({{ name: tagName }})
}}).catch(function(err) {{
console.log('Tag creation note:', err);
@@ -166,6 +226,7 @@ class TagEditorWidget(forms.Widget):
var pill = document.createElement('span');
pill.className = 'tag-pill';
pill.setAttribute('data-tag', tag);
applyTagStyle_{widget_id}(pill, tag);
var tagText = document.createTextNode(tag);
pill.appendChild(tagText);
@@ -195,14 +256,16 @@ class TagEditorWidget(forms.Widget):
var input = event.target;
var value = input.value.trim();
if (event.key === 'Enter' || event.key === ' ' || event.key === ',') {{
if (event.key === 'Enter' || event.keyCode === 13 || event.key === ' ' || event.key === ',') {{
event.preventDefault();
event.stopPropagation();
if (value) {{
// Handle comma-separated values
value.split(',').forEach(function(tag) {{
addTag_{widget_id}(tag.trim());
}});
}}
return false;
}} else if (event.key === 'Backspace' && !value && currentTags_{widget_id}.length > 0) {{
// Remove last tag on backspace when input is empty
var lastTag = currentTags_{widget_id}.pop();
@@ -222,7 +285,7 @@ class TagEditorWidget(forms.Widget):
return;
}}
fetch('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query))
fetch(buildApiUrl('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query)))
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
var datalist = document.getElementById('{widget_id}_datalist');
@@ -261,7 +324,7 @@ class TagEditorWidget(forms.Widget):
</script>
'''
return html
return mark_safe(html)
class InlineTagEditorWidget(TagEditorWidget):
@@ -295,20 +358,23 @@ class InlineTagEditorWidget(TagEditorWidget):
tag_data.sort(key=lambda x: x['name'].lower())
tags = [t['name'] for t in tag_data]
widget_id = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name)
widget_id_raw = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name)
widget_id = self._normalize_id(widget_id_raw)
# Build pills HTML with filter links
pills_html = ''
for td in tag_data:
pills_html += f'''
<span class="tag-pill" data-tag="{self._escape(td['name'])}" data-tag-id="{td['id']}">
<span class="tag-pill" data-tag="{self._escape(td['name'])}" data-tag-id="{td['id']}" style="{self._tag_style(td['name'])}">
<a href="/admin/core/snapshot/?tags__id__exact={td['id']}" class="tag-link">{self._escape(td['name'])}</a>
<button type="button" class="tag-remove-btn" data-tag-id="{td['id']}" data-tag-name="{self._escape(td['name'])}">&times;</button>
</span>
'''
tags_json = escape(json.dumps(tag_data))
html = f'''
<span id="{widget_id}_container" class="tag-editor-inline" onclick="focusInlineTagInput_{widget_id}(event)">
<span id="{widget_id}_container" class="tag-editor-inline" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}">
<span id="{widget_id}_pills" class="tag-pills-inline">
{pills_html}
</span>
@@ -318,195 +384,10 @@ class InlineTagEditorWidget(TagEditorWidget):
list="{widget_id}_datalist"
placeholder="+"
autocomplete="off"
onkeydown="handleInlineTagKeydown_{widget_id}(event)"
oninput="fetchInlineTagAutocomplete_{widget_id}(this.value)"
onfocus="this.placeholder='add tag...'"
onblur="this.placeholder='+'"
data-inline-tag-input="1"
>
<datalist id="{widget_id}_datalist"></datalist>
</span>
<script>
(function() {{
var snapshotId_{widget_id} = '{snapshot_id}';
var currentTagData_{widget_id} = {json.dumps(tag_data)};
var autocompleteTimeout_{widget_id} = null;
window.focusInlineTagInput_{widget_id} = function(event) {{
event.stopPropagation();
if (event.target.classList.contains('tag-remove-btn') || event.target.classList.contains('tag-link')) return;
document.getElementById('{widget_id}_input').focus();
}};
window.addInlineTag_{widget_id} = function(tagName) {{
tagName = tagName.trim();
if (!tagName) return;
// Check if tag already exists
var exists = currentTagData_{widget_id}.some(function(t) {{
return t.name.toLowerCase() === tagName.toLowerCase();
}});
if (exists) {{
document.getElementById('{widget_id}_input').value = '';
return;
}}
// Add via API
fetch('/api/v1/core/tags/add-to-snapshot/', {{
method: 'POST',
headers: {{
'Content-Type': 'application/json',
'X-CSRFToken': getCSRFToken()
}},
body: JSON.stringify({{
snapshot_id: snapshotId_{widget_id},
tag_name: tagName
}})
}})
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
if (data.success) {{
currentTagData_{widget_id}.push({{ id: data.tag_id, name: data.tag_name }});
currentTagData_{widget_id}.sort(function(a, b) {{
return a.name.toLowerCase().localeCompare(b.name.toLowerCase());
}});
rebuildInlinePills_{widget_id}();
}}
}})
.catch(function(err) {{
console.error('Error adding tag:', err);
}});
document.getElementById('{widget_id}_input').value = '';
}};
window.removeInlineTag_{widget_id} = function(tagId) {{
fetch('/api/v1/core/tags/remove-from-snapshot/', {{
method: 'POST',
headers: {{
'Content-Type': 'application/json',
'X-CSRFToken': getCSRFToken()
}},
body: JSON.stringify({{
snapshot_id: snapshotId_{widget_id},
tag_id: tagId
}})
}})
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
if (data.success) {{
currentTagData_{widget_id} = currentTagData_{widget_id}.filter(function(t) {{
return t.id !== tagId;
}});
rebuildInlinePills_{widget_id}();
}}
}})
.catch(function(err) {{
console.error('Error removing tag:', err);
}});
}};
window.rebuildInlinePills_{widget_id} = function() {{
var container = document.getElementById('{widget_id}_pills');
container.innerHTML = '';
currentTagData_{widget_id}.forEach(function(td) {{
var pill = document.createElement('span');
pill.className = 'tag-pill';
pill.setAttribute('data-tag', td.name);
pill.setAttribute('data-tag-id', td.id);
var link = document.createElement('a');
link.href = '/admin/core/snapshot/?tags__id__exact=' + td.id;
link.className = 'tag-link';
link.textContent = td.name;
pill.appendChild(link);
var removeBtn = document.createElement('button');
removeBtn.type = 'button';
removeBtn.className = 'tag-remove-btn';
removeBtn.setAttribute('data-tag-id', td.id);
removeBtn.setAttribute('data-tag-name', td.name);
removeBtn.innerHTML = '&times;';
pill.appendChild(removeBtn);
container.appendChild(pill);
}});
}};
// Add event delegation for remove buttons
document.getElementById('{widget_id}_pills').addEventListener('click', function(event) {{
if (event.target.classList.contains('tag-remove-btn')) {{
event.stopPropagation();
event.preventDefault();
var tagId = parseInt(event.target.getAttribute('data-tag-id'), 10);
if (tagId) {{
removeInlineTag_{widget_id}(tagId);
}}
}}
}});
window.handleInlineTagKeydown_{widget_id} = function(event) {{
event.stopPropagation();
var input = event.target;
var value = input.value.trim();
if (event.key === 'Enter' || event.key === ',') {{
event.preventDefault();
if (value) {{
value.split(',').forEach(function(tag) {{
addInlineTag_{widget_id}(tag.trim());
}});
}}
}}
}};
window.fetchInlineTagAutocomplete_{widget_id} = function(query) {{
if (autocompleteTimeout_{widget_id}) {{
clearTimeout(autocompleteTimeout_{widget_id});
}}
autocompleteTimeout_{widget_id} = setTimeout(function() {{
if (!query || query.length < 1) {{
document.getElementById('{widget_id}_datalist').innerHTML = '';
return;
}}
fetch('/api/v1/core/tags/autocomplete/?q=' + encodeURIComponent(query))
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
var datalist = document.getElementById('{widget_id}_datalist');
datalist.innerHTML = '';
(data.tags || []).forEach(function(tag) {{
var option = document.createElement('option');
option.value = tag.name;
datalist.appendChild(option);
}});
}})
.catch(function(err) {{
console.log('Autocomplete error:', err);
}});
}}, 150);
}};
function escapeHtml(text) {{
var div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}}
function getCSRFToken() {{
var cookies = document.cookie.split(';');
for (var i = 0; i < cookies.length; i++) {{
var cookie = cookies[i].trim();
if (cookie.startsWith('csrftoken=')) {{
return cookie.substring('csrftoken='.length);
}}
}}
var input = document.querySelector('input[name="csrfmiddlewaretoken"]');
return input ? input.value : '';
}}
}})();
</script>
'''
return html
return mark_safe(html)