WIP: checkpoint working tree before rebasing onto dev

This commit is contained in:
Nick Sweeting
2026-03-22 20:23:45 -07:00
parent a6548df8d0
commit f400a2cd67
87 changed files with 12607 additions and 1808 deletions

View File

@@ -1,14 +1,23 @@
__package__ = 'archivebox.core'
import html
import json
import os
import shlex
from pathlib import Path
from urllib.parse import quote
from functools import reduce
from operator import and_
from django.contrib import admin
from django.db.models import Min, Q, TextField
from django.db.models.functions import Cast
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
from django.utils.text import smart_split
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
@@ -16,11 +25,71 @@ from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.widgets import InlineTagEditorWidget
from archivebox.core.views import LIVE_PLUGIN_BASE_URL
from archivebox.core.models import ArchiveResult, Snapshot
def _stringify_env_value(value) -> str:
if value is None:
return ''
if isinstance(value, str):
return value
return json.dumps(value, separators=(',', ':'))
def _quote_shell_string(value: str) -> str:
return "'" + str(value).replace("'", "'\"'\"'") + "'"
def _get_replay_source_url(result: ArchiveResult) -> str:
process_env = getattr(getattr(result, 'process', None), 'env', None) or {}
return str(process_env.get('SOURCE_URL') or result.snapshot.url or '')
def build_abx_dl_display_command(result: ArchiveResult) -> str:
    """Build the short human-readable `abx-dl` command for this result.

    Includes the plugin and (shell-quoted) source URL only when present.
    """
    source_url = _get_replay_source_url(result)
    plugin = str(result.plugin or '').strip()
    if not plugin and not source_url:
        return 'abx-dl'
    if not source_url:
        return f'abx-dl --plugins={plugin}'
    return f'abx-dl --plugins={plugin} {_quote_shell_string(source_url)}'
def build_abx_dl_replay_command(result: ArchiveResult) -> str:
    """Build a copy-pasteable shell command that replays this archive result:
    `cd <snapshot_dir>; [env K=V ...] abx-dl ...`."""
    base_cmd = build_abx_dl_display_command(result)
    process = getattr(result, 'process', None)
    env = getattr(process, 'env', None) or {}
    # Deterministic, sorted KEY=value pairs; unset (None) values are omitted.
    pairs = [
        f'{name}={shlex.quote(_stringify_env_value(val))}'
        for name, val in sorted(env.items())
        if val is not None
    ]
    cd_part = f'cd {shlex.quote(str(result.snapshot_dir))}'
    if pairs:
        return f'{cd_part}; env {" ".join(pairs)} {base_cmd}'
    return f'{cd_part}; {base_cmd}'
def get_plugin_admin_url(plugin_name: str) -> str:
    """Return the live admin URL for *plugin_name*, namespaced by origin.

    URLs are prefixed `builtin.` or `user.` depending on whether the plugin's
    directory lives under the builtin or user plugins root.
    """
    # Imported locally — presumably to avoid an import cycle with hooks; confirm.
    from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, iter_plugin_dirs
    # Find the on-disk directory whose basename matches the plugin name, if any.
    plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None)
    if plugin_dir:
        builtin_root = BUILTIN_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(builtin_root):
            return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
        user_root = USER_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(user_root):
            return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/'
    # Fallback: treat unlocatable/unclassified plugins as builtin.
    return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
def render_archiveresults_list(archiveresults_qs, limit=50):
"""Render a nice inline list view of archive results with status, plugin, output, and actions."""
@@ -35,6 +104,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
'failed': ('#991b1b', '#fee2e2'), # red
'queued': ('#6b7280', '#f3f4f6'), # gray
'started': ('#92400e', '#fef3c7'), # amber
'backoff': ('#92400e', '#fef3c7'),
'skipped': ('#475569', '#f1f5f9'),
'noresults': ('#475569', '#f1f5f9'),
}
rows = []
@@ -54,8 +126,10 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
if len(full_output) > 60:
output_display += '...'
# Get full command as tooltip
cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
display_cmd = build_abx_dl_display_command(result)
replay_cmd = build_abx_dl_replay_command(result)
cmd_str_escaped = html.escape(display_cmd)
cmd_attr = html.escape(replay_cmd, quote=True)
# Build output link - use embed_path() which checks output_files first
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
@@ -77,7 +151,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
title="View/edit archive result">
<code>{str(result.id)[:8]}</code>
<code>{str(result.id)[-8:]}</code>
</a>
</td>
<td style="padding: 10px 12px; white-space: nowrap;">
@@ -140,7 +214,15 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<div style="font-size: 11px; color: #64748b; margin-top: 8px;">
<b>Command:</b>
</div>
<pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_str}</pre>
<div style="position: relative; margin: 0; padding: 8px 56px 8px 8px; background: #1e293b; border-radius: 4px;">
<button type="button"
data-command="{cmd_attr}"
onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
style="position: absolute; top: 6px; right: 6px; padding: 2px 8px; border: 0; border-radius: 4px; background: #334155; color: #e2e8f0; font-size: 11px; cursor: pointer;">
Copy
</button>
<code title="{cmd_attr}" style="display: block; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; color: #e2e8f0; font-size: 11px;">{cmd_str_escaped}</code>
</div>
</div>
</details>
</td>
@@ -165,7 +247,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
<thead>
<tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">ID</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Details</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Plugin</th>
@@ -193,7 +275,7 @@ class ArchiveResultInline(admin.TabularInline):
extra = 0
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str')
# exclude = ('id',)
ordering = ('end_ts',)
show_change_link = True
@@ -259,10 +341,11 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display')
list_display_links = None
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link')
search_fields = ()
autocomplete_fields = ['snapshot']
fieldsets = (
@@ -271,7 +354,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Plugin', {
'fields': ('plugin', 'plugin_with_icon', 'status', 'retry_at'),
'fields': ('plugin_with_icon', 'process_link', 'status'),
'classes': ('card',),
}),
('Timing', {
@@ -305,8 +388,61 @@ class ArchiveResultAdmin(BaseModelAdmin):
self.request = request
return super().change_view(request, object_id, form_url, extra_context)
def get_queryset(self, request):
    """Changelist queryset with the joins/prefetches the list columns need,
    plus `snapshot_first_tag` for ordering the Tags column."""
    qs = super().get_queryset(request)
    qs = qs.select_related('snapshot', 'process')
    qs = qs.prefetch_related('snapshot__tags')
    return qs.annotate(snapshot_first_tag=Min('snapshot__tags__name'))
def get_search_results(self, request, queryset, search_term):
    """Custom admin search: AND together quoted/whitespace-split terms, each
    matched case-insensitively across snapshot id/url/tags/crawl id, plugin,
    hook name, output, and the process command line.

    Returns (queryset, may_have_duplicates) like the stock implementation.
    """
    if not search_term:
        return queryset, False

    # Cast non-text columns so __icontains works uniformly on all of them.
    annotated = queryset.annotate(
        snapshot_id_text=Cast('snapshot__id', output_field=TextField()),
        snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()),
        output_json_text=Cast('output_json', output_field=TextField()),
        cmd_text=Cast('process__cmd', output_field=TextField()),
    )

    def _unquote(bit: str) -> str:
        # smart_split keeps surrounding quotes; strip a matched pair if present.
        if len(bit) >= 2 and bit[0] == bit[-1] and bit[0] in {'"', "'"}:
            return bit[1:-1]
        return bit

    terms = [t for t in (_unquote(b).strip() for b in smart_split(search_term)) if t]
    if not terms:
        return annotated, False

    combined = reduce(and_, (
        Q(snapshot_id_text__icontains=term)
        | Q(snapshot__url__icontains=term)
        | Q(snapshot__tags__name__icontains=term)
        | Q(snapshot_crawl_id_text__icontains=term)
        | Q(plugin__icontains=term)
        | Q(hook_name__icontains=term)
        | Q(output_str__icontains=term)
        | Q(output_json_text__icontains=term)
        | Q(cmd_text__icontains=term)
    ))
    # distinct() because the tags join can duplicate rows; hence the True flag.
    return annotated.filter(combined).distinct(), True
@admin.display(description='Details', ordering='id')
def details_link(self, result):
    """Link to the result's change page, labeled with the id's last 8 chars."""
    change_url = reverse('admin:core_archiveresult_change', args=[result.id])
    short_id = str(result.id)[-8:]
    return format_html('<a href="{}"><code>{}</code></a>', change_url, short_id)
@admin.display(
description='Snapshot Info'
description='Snapshot',
ordering='snapshot__url',
)
def snapshot_info(self, result):
snapshot_id = str(result.snapshot_id)
@@ -325,20 +461,83 @@ class ArchiveResultAdmin(BaseModelAdmin):
def tags_str(self, result):
    """Delegate to the related snapshot's tag-string representation."""
    return result.snapshot.tags_str()
@admin.display(description='Tags', ordering='snapshot_first_tag')
def tags_inline(self, result):
    """Render the snapshot's tags with the read-only inline tag editor widget."""
    sid = str(result.snapshot_id)
    widget = InlineTagEditorWidget(snapshot_id=sid, editable=False)
    rendered = widget.render(
        name=f'tags_{result.snapshot_id}',
        value=result.snapshot.tags.all(),
        attrs={'id': f'tags_{result.snapshot_id}'},
        snapshot_id=sid,
    )
    # Widget output is trusted HTML produced by our own widget class.
    return mark_safe(f'<span class="tags-inline-editor">{rendered}</span>')
@admin.display(description='Status', ordering='status')
def status_badge(self, result):
    """Colored status pill; empty status falls back to QUEUED."""
    status = result.status or ArchiveResult.StatusChoices.QUEUED
    label = result.get_status_display() or status
    return format_html(
        '<span class="status-badge {} status-{}">{}</span>',
        status,
        status,
        label,
    )
@admin.display(description='Plugin', ordering='plugin')
def plugin_with_icon(self, result):
icon = get_plugin_icon(result.plugin)
return format_html(
'<span title="{}">{}</span> {}',
'<a href="{}" title="{}">{}</a> <a href="{}"><code>{}</code></a>',
get_plugin_admin_url(result.plugin),
result.plugin,
icon,
get_plugin_admin_url(result.plugin),
result.plugin,
)
def cmd_str(self, result):
@admin.display(description='Process', ordering='process__pid')
def process_link(self, result):
if not result.process_id:
return '-'
process_label = result.process.pid if result.process and result.process.pid else '-'
return format_html(
'<pre>{}</pre>',
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
'<a href="{}"><code>{}</code></a>',
reverse('admin:machine_process_change', args=[result.process_id]),
process_label,
)
@admin.display(description='Machine', ordering='process__machine__hostname')
def machine_link(self, result):
    """Link to the machine that ran this result's process, or '-' if unknown."""
    process = result.process if result.process_id else None
    if not process or not process.machine_id:
        return '-'
    machine = process.machine
    return format_html(
        '<a href="{}"><code>{}</code> {}</a>',
        reverse('admin:machine_machine_change', args=[machine.id]),
        str(machine.id)[:8],
        machine.hostname,
    )
@admin.display(description='Command')
def cmd_str(self, result):
    """Fixed-width command cell: truncated display command with a Copy button
    that puts the full replay command (cd + env + abx-dl) on the clipboard.
    """
    # Short form for display, full replayable form for copy/tooltip.
    display_cmd = build_abx_dl_display_command(result)
    replay_cmd = build_abx_dl_replay_command(result)
    # format_html escapes all interpolated values, so the command is safe in
    # both the data-attribute and the visible <code> text.
    return format_html(
        '''
        <div style="position: relative; width: 300px; min-width: 300px; max-width: 300px; overflow: hidden; box-sizing: border-box;">
            <button type="button"
                    data-command="{}"
                    onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
                    style="position: absolute; top: 6px; right: 6px; z-index: 1; padding: 2px 8px; border: 0; border-radius: 4px; background: #e2e8f0; color: #334155; font-size: 11px; cursor: pointer;">
                Copy
            </button>
            <code title="{}" style="display: block; width: 100%; max-width: 100%; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; padding: 8px 56px 8px 8px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; font-size: 11px; box-sizing: border-box;">
                {}
            </code>
        </div>
        ''',
        replay_cmd,
        replay_cmd,
        display_cmd,
    )
def output_display(self, result):
@@ -352,6 +551,27 @@ class ArchiveResultAdmin(BaseModelAdmin):
result.output_str,
)
@admin.display(description='Output', ordering='output_str')
def output_str_display(self, result):
    """Output column: link to the embeddable output file when one exists,
    otherwise plain text with a tooltip; '-' when empty."""
    text = str(result.output_str or '').strip()
    if not text:
        return '-'
    embed = result.embed_path() if hasattr(result, 'embed_path') else None
    if not embed:
        return format_html(
            '<span title="{}">{}</span>',
            text,
            text,
        )
    url = build_snapshot_url(str(result.snapshot_id), embed)
    return format_html(
        '<a href="{}" title="{}"><code>{}</code></a>',
        url,
        text,
        text,
    )
def output_summary(self, result):
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
output_html = format_html(

View File

@@ -61,12 +61,14 @@ def register_admin_site():
from archivebox.crawls.admin import register_admin as register_crawls_admin
from archivebox.api.admin import register_admin as register_api_admin
from archivebox.machine.admin import register_admin as register_machine_admin
from archivebox.personas.admin import register_admin as register_personas_admin
from archivebox.workers.admin import register_admin as register_workers_admin
register_core_admin(archivebox_admin)
register_crawls_admin(archivebox_admin)
register_api_admin(archivebox_admin)
register_machine_admin(archivebox_admin)
register_personas_admin(archivebox_admin)
register_workers_admin(archivebox_admin)
return archivebox_admin

View File

@@ -6,6 +6,7 @@ from pathlib import Path
from django.contrib import admin, messages
from django.urls import path
from django.shortcuts import get_object_or_404, redirect
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.utils import timezone
@@ -14,6 +15,7 @@ from django.db.models.functions import Coalesce
from django import forms
from django.template import Template, RequestContext
from django.contrib.admin.helpers import ActionForm
from django.middleware.csrf import get_token
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
@@ -24,7 +26,7 @@ from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.core.host_utils import build_snapshot_url, build_web_url
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from archivebox.workers.tasks import bg_archive_snapshot, bg_archive_snapshots, bg_add
from archivebox.core.models import Tag, Snapshot, ArchiveResult
from archivebox.core.admin_archiveresults import render_archiveresults_list
@@ -215,10 +217,23 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid'),
path('<path:object_id>/redo-failed/', self.admin_site.admin_view(self.redo_failed_view), name='core_snapshot_redo_failed'),
]
return custom_urls + urls
def redo_failed_view(self, request, object_id):
    """Admin view: queue a snapshot for re-archiving of failed/missing outputs.

    On POST, enqueues the snapshot via the background worker (overwrite=False,
    so successful outputs are kept) and shows a success message. All requests
    end with a redirect back to the snapshot's change page.
    """
    snapshot = get_object_or_404(Snapshot, pk=object_id)
    if request.method == 'POST':
        queued = bg_archive_snapshot(snapshot, overwrite=False)
        messages.success(
            request,
            f"Queued {queued} snapshot for re-archiving. The background runner will process it.",
        )
    # NOTE(review): source indentation was lost in this rendering — assuming
    # the redirect applies to GET and POST alike; confirm against the original.
    return redirect(snapshot.admin_change_url)
# def get_queryset(self, request):
# # tags_qs = SnapshotTag.objects.all().select_related('tag')
# # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
@@ -312,6 +327,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def admin_actions(self, obj):
summary_url = build_web_url(f'/{obj.archive_path}')
results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
redo_failed_url = f'/admin/core/snapshot/{obj.pk}/redo-failed/'
csrf_token = get_token(self.request)
return format_html(
'''
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
@@ -344,13 +361,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
onmouseout="this.style.background='#eff6ff';">
🆕 Archive Now
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Redo failed extractors (missing outputs)"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
🔁 Redo Failed
</a>
<form action="{}" method="post" style="display: inline-flex; margin: 0;">
<input type="hidden" name="csrfmiddlewaretoken" value="{}">
<button type="submit" class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s; cursor: pointer;"
title="Redo failed extractors (missing outputs)"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
🔁 Redo Failed
</button>
</form>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Re-run all extractors (overwrite existing)"
@@ -367,14 +386,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
</a>
</div>
<p style="margin-top: 12px; font-size: 12px; color: #64748b;">
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
<b>Tip:</b> Redo Failed runs immediately. The other action buttons link to the list view with this snapshot pre-selected.
</p>
''',
summary_url,
results_url,
obj.url,
obj.pk,
obj.pk,
redo_failed_url,
csrf_token,
obj.pk,
obj.pk,
)

View File

@@ -1,63 +1,74 @@
__package__ = 'archivebox.core'
from django.contrib import admin
from urllib.parse import quote
from django import forms
from django.contrib import admin, messages
from django.contrib.admin.options import IS_POPUP_VAR
from django.http import HttpRequest, HttpResponseRedirect
from django.urls import reverse
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.core.models import SnapshotTag, Tag
from archivebox.core.tag_utils import (
TAG_HAS_SNAPSHOTS_CHOICES,
TAG_SORT_CHOICES,
build_tag_cards,
get_tag_creator_choices,
get_tag_year_choices,
normalize_created_by_filter,
normalize_created_year_filter,
normalize_has_snapshots_filter,
normalize_tag_sort,
)
from archivebox.core.host_utils import build_snapshot_url
class TagInline(admin.TabularInline):
    """Tabular inline for the Snapshot<->Tag through table, with tag autocomplete."""

    model = SnapshotTag
    fields = ('id', 'tag')
    extra = 1
    max_num = 1000
    autocomplete_fields = ('tag',)
# class AutocompleteTags:
# model = Tag
# search_fields = ['name']
# name = 'name'
# # source_field = 'name'
# remote_field = Tag._meta.get_field('name')
# class AutocompleteTagsAdminStub:
# name = 'admin'
# class TaggedItemInline(admin.TabularInline):
# readonly_fields = ('object_link',)
# fields = ('id', 'tag', 'content_type', 'object_id', *readonly_fields)
# model = TaggedItem
# extra = 1
# show_change_link = True
# @admin.display(description='object')
# def object_link(self, obj):
# obj = obj.content_type.get_object_for_this_type(pk=obj.object_id)
# return format_html('<a href="/admin/{}/{}/{}/change"><b>[{}]</b></a>', obj._meta.app_label, obj._meta.model_name, obj.pk, str(obj))
class TagAdminForm(forms.ModelForm):
    """ModelForm for Tag with a friendlier name input and non-blank validation."""

    class Meta:
        model = Tag
        fields = '__all__'
        widgets = {
            'name': forms.TextInput(attrs={
                'placeholder': 'research, receipts, product-design...',
                'autocomplete': 'off',
                'spellcheck': 'false',
                'data-tag-name-input': '1',
            }),
        }

    def clean_name(self):
        """Trim whitespace and reject empty tag names."""
        raw = self.cleaned_data.get('name') or ''
        name = raw.strip()
        if not name:
            raise forms.ValidationError('Tag name is required.')
        return name
class TagAdmin(BaseModelAdmin):
list_display = ('created_at', 'created_by', 'id', 'name', 'num_snapshots', 'snapshots')
form = TagAdminForm
change_list_template = 'admin/core/tag/change_list.html'
change_form_template = 'admin/core/tag/change_form.html'
list_display = ('name', 'num_snapshots', 'created_at', 'created_by')
list_filter = ('created_at', 'created_by')
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
search_fields = ('id', 'name', 'slug')
actions = ['delete_selected', 'merge_tags']
ordering = ['-created_at']
# inlines = [TaggedItemInline]
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
actions = ['delete_selected']
ordering = ['name', 'id']
fieldsets = (
('Tag Info', {
('Tag', {
'fields': ('name', 'slug'),
'classes': ('card',),
}),
@@ -65,112 +76,137 @@ class TagAdmin(BaseModelAdmin):
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Snapshots', {
('Recent Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
paginator = AccelleratedPaginator
add_fieldsets = (
('Tag', {
'fields': ('name',),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)
def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None):
    """Use the slim add-form layout for new tags, the full layout otherwise."""
    if obj:
        return self.fieldsets
    return self.add_fieldsets
def num_snapshots(self, tag):
def changelist_view(self, request: HttpRequest, extra_context=None):
    """Inject tag-browser state into the changelist template context.

    Reads query/sort/filter params from the GET string (normalized through the
    tag_utils helpers), prebuilds the initial tag cards server-side, and exposes
    the search/create API endpoints for the client-side tag browser.
    """
    query = (request.GET.get('q') or '').strip()
    sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip())
    created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip())
    year = normalize_created_year_filter((request.GET.get('year') or '').strip())
    has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip())
    extra_context = {
        **(extra_context or {}),
        'initial_query': query,
        'initial_sort': sort,
        'initial_created_by': created_by,
        'initial_year': year,
        'initial_has_snapshots': has_snapshots,
        'tag_sort_choices': TAG_SORT_CHOICES,
        'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES,
        'tag_created_by_choices': get_tag_creator_choices(),
        'tag_year_choices': get_tag_year_choices(),
        'initial_tag_cards': build_tag_cards(
            query=query,
            request=request,
            sort=sort,
            created_by=created_by,
            year=year,
            has_snapshots=has_snapshots,
        ),
        'tag_search_api_url': reverse('api-1:search_tags'),
        'tag_create_api_url': reverse('api-1:tags_create'),
    }
    return super().changelist_view(request, extra_context=extra_context)
def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None):
    """Add 'similar tags' suggestion cards and the search API URL to the
    tag change-form context."""
    # Prefer the in-flight POSTed name, falling back to the saved name.
    current_name = (request.POST.get('name') or '').strip()
    if not current_name and obj:
        current_name = obj.name
    # Suggest similar tags by name; with no name, show a generic sample.
    similar_tag_cards = build_tag_cards(query=current_name, request=request, limit=12) if current_name else build_tag_cards(request=request, limit=12)
    if obj:
        # Never suggest the tag currently being edited.
        similar_tag_cards = [card for card in similar_tag_cards if card['id'] != obj.pk]
    context.update({
        'tag_search_api_url': reverse('api-1:search_tags'),
        'tag_similar_cards': similar_tag_cards,
        'tag_similar_query': current_name,
    })
    return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None):
    """After adding a tag, jump to the changelist filtered to the new name,
    unless the user asked for popup/continue/add-another behavior."""
    keep_default = (
        IS_POPUP_VAR in request.POST
        or '_continue' in request.POST
        or '_addanother' in request.POST
    )
    if keep_default:
        return super().response_add(request, obj, post_url_continue=post_url_continue)
    self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS)
    return self._redirect_to_changelist(obj.name)
def response_change(self, request: HttpRequest, obj: Tag):
    """After editing a tag, jump to the changelist filtered to its name,
    unless popup/continue/add-another/save-as-new behavior was requested."""
    keep_default = (
        IS_POPUP_VAR in request.POST
        or '_continue' in request.POST
        or '_addanother' in request.POST
        or '_saveasnew' in request.POST
    )
    if keep_default:
        return super().response_change(request, obj)
    self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS)
    return self._redirect_to_changelist(obj.name)
def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect:
    """Redirect to the tag changelist, optionally pre-filtered by ?q=query."""
    base_url = reverse('admin:core_tag_changelist')
    target = f'{base_url}?q={quote(query)}' if query else base_url
    return HttpResponseRedirect(target)
@admin.display(description='Snapshots')
def snapshots(self, tag: Tag):
    """Readonly panel: the 10 most recent snapshots using this tag, as cards,
    plus a link to the fully filtered snapshot changelist."""
    # Most recent first; select_related avoids per-row creator queries.
    snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10]
    total_count = tag.snapshot_set.count()
    if not snapshots:
        # Static HTML only (tag.id is a DB-generated key) — safe to mark_safe.
        return mark_safe(
            f'<p style="margin:0;color:#64748b;">No snapshots use this tag yet. '
            f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>'
        )
    cards = []
    for snapshot in snapshots:
        # Fall back to the URL when the snapshot has no usable title.
        title = (snapshot.title or '').strip() or snapshot.url
        cards.append(format_html(
            '''
            <a href="{}" style="display:flex;align-items:center;gap:10px;padding:10px 12px;border:1px solid #e2e8f0;border-radius:12px;background:#fff;text-decoration:none;color:#0f172a;">
                <img src="{}" alt="" style="width:18px;height:18px;border-radius:4px;flex:0 0 auto;" onerror="this.style.display='none'">
                <span style="min-width:0;">
                    <strong style="display:block;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</strong>
                    <code style="display:block;color:#64748b;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</code>
                </span>
            </a>
            ''',
            reverse('admin:core_snapshot_change', args=[snapshot.pk]),
            build_snapshot_url(str(snapshot.pk), 'favicon.ico'),
            title[:120],
            snapshot.url[:120],
        ))
    cards.append(format_html(
        '<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
        tag.id,
        total_count,
    ))
    # Cards were built with format_html (escaped), so joining is safe.
    return mark_safe('<div style="display:grid;gap:10px;">' + ''.join(cards) + '</div>')
@admin.display(description='Snapshots', ordering='num_snapshots')
def num_snapshots(self, tag: Tag):
count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
return format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
tag.id,
tag.snapshot_set.count(),
count,
)
def snapshots(self, tag):
total_count = tag.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> {}',
snap.pk,
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
snap.url[:64],
)
for snap in tag.snapshot_set.order_by('-downloaded_at')[:10]
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={tag.id}">{total_count} total snapshots...<a>'))
# def get_urls(self):
# urls = super().get_urls()
# custom_urls = [
# path(
# "merge-tags/",
# self.admin_site.admin_view(self.merge_tags_view),
# name="taggit_tag_merge_tags",
# ),
# ]
# return custom_urls + urls
# @admin.action(description="Merge selected tags")
# def merge_tags(self, request, queryset):
# selected = request.POST.getlist(admin.helpers.ACTION_CHECKBOX_NAME)
# if not selected:
# self.message_user(request, "Please select at least one tag.")
# return redirect(request.get_full_path())
# selected_tag_ids = ",".join(selected)
# redirect_url = f"{request.get_full_path()}merge-tags/"
# request.session["selected_tag_ids"] = selected_tag_ids
# return redirect(redirect_url)
# def merge_tags_view(self, request):
# selected_tag_ids = request.session.get("selected_tag_ids", "").split(",")
# if request.method == "POST":
# form = MergeTagsForm(request.POST)
# if form.is_valid():
# new_tag_name = form.cleaned_data["new_tag_name"]
# new_tag, created = Tag.objects.get_or_create(name=new_tag_name)
# with transaction.atomic():
# for tag_id in selected_tag_ids:
# tag = Tag.objects.get(id=tag_id)
# tagged_items = TaggedItem.objects.filter(tag=tag)
# for tagged_item in tagged_items:
# if TaggedItem.objects.filter(
# tag=new_tag,
# content_type=tagged_item.content_type,
# object_id=tagged_item.object_id,
# ).exists():
# # we have the new tag as well, so we can just
# # remove the tag association
# tagged_item.delete()
# else:
# # point this taggedItem to the new one
# tagged_item.tag = new_tag
# tagged_item.save()
# # delete the old tag
# if tag.id != new_tag.id:
# tag.delete()
# self.message_user(request, "Tags have been merged", level="success")
# # clear the selected_tag_ids from session after merge is complete
# request.session.pop("selected_tag_ids", None)
# return redirect("..")
# else:
# self.message_user(request, "Form is invalid.", level="error")
# context = {
# "form": MergeTagsForm(),
# "selected_tag_ids": selected_tag_ids,
# }
# return render(request, "admin/taggit/merge_tags_form.html", context)
# @admin.register(SnapshotTag, site=archivebox_admin)
# class SnapshotTagAdmin(BaseModelAdmin):
# list_display = ('id', 'snapshot', 'tag')
# sort_fields = ('id', 'snapshot', 'tag')
# search_fields = ('id', 'snapshot_id', 'tag_id')
# fields = ('snapshot', 'id')
# actions = ['delete_selected']
# ordering = ['-id']
def register_admin(admin_site):
    """Register the Tag model with its admin on the given admin site."""
    admin_site.register(Tag, TagAdmin)

View File

@@ -1,12 +1,16 @@
__package__ = 'archivebox.core'
from django import forms
from django.utils.html import format_html
from archivebox.misc.util import URL_REGEX
from archivebox.misc.util import URL_REGEX, find_all_urls
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
from archivebox.crawls.schedule_utils import validate_schedule
from archivebox.hooks import get_plugins
from archivebox.config.common import SEARCH_BACKEND_CONFIG
from archivebox.core.widgets import TagEditorWidget, URLFiltersWidget
from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_icon
from archivebox.personas.models import Persona
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
@@ -22,6 +26,22 @@ def get_plugin_choices():
return [(name, name) for name in get_plugins()]
def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str:
schema = plugin_configs.get(plugin_name, {})
description = str(schema.get('description') or '').strip()
if not description:
return plugin_name
icon_html = get_plugin_icon(plugin_name)
return format_html(
'<span class="plugin-choice-icon">{}</span><span class="plugin-choice-name">{}</span><a class="plugin-choice-description" href="https://archivebox.github.io/abx-plugins/#{}" target="_blank" rel="noopener noreferrer">{}</a>',
icon_html,
plugin_name,
plugin_name,
description,
)
def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
field = form.fields[name]
if not isinstance(field, forms.ChoiceField):
@@ -31,22 +51,19 @@ def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
class AddLinkForm(forms.Form):
# Basic fields
url = forms.RegexField(
label="URLs (one per line)",
regex=URL_REGEX,
min_length=6,
url = forms.CharField(
label="URLs",
strip=True,
widget=forms.Textarea,
widget=forms.Textarea(attrs={
'data-url-regex': URL_REGEX.pattern,
}),
required=True
)
tag = forms.CharField(
label="Tags (comma separated tag1,tag2,tag3)",
label="Tags",
strip=True,
required=False,
widget=forms.TextInput(attrs={
'list': 'tag-datalist',
'autocomplete': 'off',
})
widget=TagEditorWidget(),
)
depth = forms.ChoiceField(
label="Archive depth",
@@ -58,11 +75,15 @@ class AddLinkForm(forms.Form):
label="Notes",
strip=True,
required=False,
widget=forms.Textarea(attrs={
'rows': 3,
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
widget=forms.TextInput(attrs={
'placeholder': 'Optional notes about this crawl',
})
)
url_filters = forms.Field(
label="URL allowlist / denylist",
required=False,
widget=URLFiltersWidget(source_selector='textarea[name="url"]'),
)
# Plugin groups
chrome_plugins = forms.MultipleChoiceField(
@@ -111,24 +132,15 @@ class AddLinkForm(forms.Form):
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
})
)
persona = forms.CharField(
persona = forms.ModelChoiceField(
label="Persona (authentication profile)",
max_length=100,
initial='Default',
required=False,
)
overwrite = forms.BooleanField(
label="Overwrite existing snapshots",
initial=False,
required=False,
)
update = forms.BooleanField(
label="Update/retry previously failed URLs",
initial=False,
required=False,
queryset=Persona.objects.none(),
empty_label=None,
to_field_name='name',
)
index_only = forms.BooleanField(
label="Index only (don't archive yet)",
label="Index only dry run (add crawl but don't archive yet)",
initial=False,
required=False,
)
@@ -142,11 +154,13 @@ class AddLinkForm(forms.Form):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Import at runtime to avoid circular imports
from archivebox.config.common import ARCHIVING_CONFIG
default_persona = Persona.get_or_create_default()
self.fields['persona'].queryset = Persona.objects.order_by('name')
self.fields['persona'].initial = default_persona.name
# Get all plugins
all_plugins = get_plugins()
plugin_configs = discover_plugin_configs()
# Define plugin groups
chrome_dependent = {
@@ -170,26 +184,28 @@ class AddLinkForm(forms.Form):
# Populate plugin field choices
get_choice_field(self, 'chrome_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent
]
get_choice_field(self, 'archiving_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in archiving
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving
]
get_choice_field(self, 'parsing_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in parsing
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing
]
get_choice_field(self, 'search_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in search
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search
]
get_choice_field(self, 'binary_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in binary
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary
]
get_choice_field(self, 'extension_plugins').choices = [
(p, p) for p in sorted(all_plugins) if p in extensions
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions
]
# Set update default from config
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices]
if required_search_plugin in search_choices:
get_choice_field(self, 'search_plugins').initial = [required_search_plugin]
def clean(self):
cleaned_data = super().clean() or {}
@@ -207,6 +223,23 @@ class AddLinkForm(forms.Form):
return cleaned_data
def clean_url(self):
value = self.cleaned_data.get('url') or ''
urls = '\n'.join(find_all_urls(value))
if not urls:
raise forms.ValidationError('Enter at least one valid URL.')
return urls
def clean_url_filters(self):
from archivebox.crawls.models import Crawl
value = self.cleaned_data.get('url_filters') or {}
return {
'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
'same_domain_only': bool(value.get('same_domain_only')),
}
def clean_schedule(self):
schedule = (self.cleaned_data.get('schedule') or '').strip()
if not schedule:

View File

@@ -163,6 +163,10 @@ def get_api_base_url(request=None) -> str:
return _build_base_url_for_host(get_api_host(), request=request)
def get_public_base_url(request=None) -> str:
return _build_base_url_for_host(get_public_host(), request=request)
# Backwards-compat aliases (archive == web)
def get_archive_base_url(request=None) -> str:
return get_web_base_url(request=request)

View File

@@ -0,0 +1,15 @@
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("core", "0031_add_archiveresult_snapshot_status_index"),
]
operations = [
migrations.RemoveField(
model_name="archiveresult",
name="retry_at",
),
]

View File

@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
from archivebox.workers.tasks import bg_archive_snapshot
from archivebox.crawls.models import Crawl
from archivebox.machine.models import NetworkInterface, Binary
from archivebox.machine.models import Binary
@@ -60,32 +60,41 @@ class Tag(ModelWithUUID):
def __str__(self):
return self.name
def _generate_unique_slug(self) -> str:
base_slug = slugify(self.name) or 'tag'
existing = Tag.objects.filter(slug__startswith=base_slug)
if self.pk:
existing = existing.exclude(pk=self.pk)
existing_slugs = set(existing.values_list("slug", flat=True))
slug = base_slug
i = 1
while slug in existing_slugs:
slug = f"{base_slug}_{i}"
i += 1
return slug
def save(self, *args, **kwargs):
is_new = self._state.adding
if is_new:
self.slug = slugify(self.name)
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
i = None
while True:
slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
if slug not in existing:
self.slug = slug
break
i = (i or 0) + 1
existing_name = None
if self.pk:
existing_name = Tag.objects.filter(pk=self.pk).values_list('name', flat=True).first()
if not self.slug or existing_name != self.name:
self.slug = self._generate_unique_slug()
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Tag',
indent_level=0,
metadata={
'id': self.id,
'name': self.name,
'slug': self.slug,
},
)
# if is_new:
# from archivebox.misc.logging_util import log_worker_event
# log_worker_event(
# worker_type='DB',
# event='Created Tag',
# indent_level=0,
# metadata={
# 'id': self.id,
# 'name': self.name,
# 'slug': self.slug,
# },
# )
@property
def api_url(self) -> str:
@@ -364,7 +373,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct()
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
@@ -393,24 +401,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
super().save(*args, **kwargs)
self.ensure_legacy_archive_symlink()
if self.url not in self.crawl.urls:
existing_urls = {url for _raw_line, url in self.crawl._iter_url_lines() if url}
if self.crawl.url_passes_filters(self.url, snapshot=self) and self.url not in existing_urls:
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Snapshot',
indent_level=2,
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id),
'depth': self.depth,
'status': self.status,
},
)
# if is_new:
# from archivebox.misc.logging_util import log_worker_event
# log_worker_event(
# worker_type='DB',
# event='Created Snapshot',
# indent_level=2,
# url=self.url,
# metadata={
# 'id': str(self.id),
# 'crawl_id': str(self.crawl_id),
# 'depth': self.depth,
# 'status': self.status,
# },
# )
# =========================================================================
# Filesystem Migration Methods
@@ -1528,16 +1537,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
"""
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
Called by: SnapshotMachine.enter_started()
Hook Lifecycle:
1. discover_hooks('Snapshot') → finds all plugin hooks
2. For each hook:
- Create ArchiveResult with status=QUEUED
- Store hook_name (e.g., 'on_Snapshot__50_wget.py')
3. ArchiveResults execute independently via ArchiveResultMachine
4. Hook execution happens in ArchiveResult.run(), NOT here
Returns:
list[ArchiveResult]: Newly created pending results
"""
@@ -1602,7 +1601,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'url': self.url,
'title': self.title,
'tags': self.tags_str(),
'tags_str': self.tags_str(),
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
'created_at': self.created_at.isoformat() if self.created_at else None,
'timestamp': self.timestamp,
@@ -1672,7 +1670,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# ID not found, fall through to create-by-URL logic
pass
url = record.get('url')
from archivebox.misc.util import fix_url_from_markdown
url = fix_url_from_markdown(str(record.get('url') or '').strip())
if not url:
return None
@@ -1807,7 +1807,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
defaults={
'plugin': plugin,
'status': ArchiveResult.INITIAL_STATE,
'retry_at': timezone.now(),
},
)
if archiveresult.status == ArchiveResult.INITIAL_STATE:
@@ -1853,11 +1852,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
failed = results.filter(status='failed').count()
running = results.filter(status='started').count()
skipped = results.filter(status='skipped').count()
noresults = results.filter(status='noresults').count()
total = results.count()
pending = total - succeeded - failed - running - skipped
pending = total - succeeded - failed - running - skipped - noresults
# Calculate percentage (succeeded + failed + skipped as completed)
completed = succeeded + failed + skipped
# Calculate percentage (succeeded + failed + skipped + noresults as completed)
completed = succeeded + failed + skipped + noresults
percent = int((completed / total * 100) if total > 0 else 0)
# Sum output sizes
@@ -1875,47 +1875,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'running': running,
'pending': pending,
'skipped': skipped,
'noresults': noresults,
'percent': percent,
'output_size': output_size,
'is_sealed': is_sealed,
}
def retry_failed_archiveresults(self, retry_at: Optional[datetime] = None) -> int:
def retry_failed_archiveresults(self) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
This enables seamless retry of the entire extraction pipeline:
- Resets FAILED and SKIPPED results to QUEUED
- Sets retry_at so workers pick them up
- Plugins run in order (numeric prefix)
- Each plugin checks its dependencies at runtime
Dependency handling (e.g., chrome → screenshot):
- Plugins check if required outputs exist before running
- If dependency output missing → plugin returns 'skipped'
- On retry, if dependency now succeeds → dependent can run
Returns count of ArchiveResults reset.
"""
retry_at = retry_at or timezone.now()
count = self.archiveresult_set.filter(
status__in=[
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
]
).update(
status=ArchiveResult.StatusChoices.QUEUED,
retry_at=retry_at,
output=None,
output_str='',
output_json=None,
output_files={},
output_size=0,
output_mimetypes='',
start_ts=None,
end_ts=None,
)
# Also reset the snapshot and current_step so it gets re-checked from the beginning
if count > 0:
self.status = self.StatusChoices.STARTED
self.retry_at = retry_at
self.retry_at = timezone.now()
self.current_step = 0 # Reset to step 0 for retry
self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
@@ -2228,6 +2219,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
best_result = outputs[0]
context = {
**self.to_dict(extended=True),
'snapshot': self,
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
'url_str': htmlencode(urldecode(self.base_url)),
'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
@@ -2275,8 +2267,8 @@ class SnapshotMachine(BaseStateMachine):
│ • discover_hooks('Snapshot') → finds all plugin hooks │
│ • create_pending_archiveresults() → creates ONE │
│ ArchiveResult per hook (NO execution yet) │
│ 2. ArchiveResults process independently with their own
state machines (see ArchiveResultMachine)
│ 2. The shared abx-dl runner executes hooks and the
projector updates ArchiveResult rows from events
│ 3. Advance through steps 0-9 as foreground hooks complete │
└─────────────────────────────────────────────────────────────┘
↓ tick() when is_finished()
@@ -2358,7 +2350,7 @@ class SnapshotMachine(BaseStateMachine):
cast(Any, crawl).sm.seal()
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes):
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
@@ -2366,6 +2358,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
SUCCEEDED = 'succeeded', 'Succeeded'
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
NORESULTS = 'noresults', 'No Results'
INITIAL_STATE = StatusChoices.QUEUED
ACTIVE_STATE = StatusChoices.STARTED
FINAL_STATES = (
StatusChoices.SUCCEEDED,
StatusChoices.FAILED,
StatusChoices.SKIPPED,
StatusChoices.NORESULTS,
)
FINAL_OR_ACTIVE_STATES = (*FINAL_STATES, ACTIVE_STATE)
@classmethod
def get_plugin_choices(cls):
@@ -2404,16 +2407,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
start_ts = models.DateTimeField(default=None, null=True, blank=True)
end_ts = models.DateTimeField(default=None, null=True, blank=True)
status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
notes = models.TextField(blank=True, null=False, default='')
# output_dir is computed via @property from snapshot.output_dir / plugin
state_machine_name = 'archivebox.core.models.ArchiveResultMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
active_state = StatusChoices.STARTED
snapshot_id: uuid.UUID
process_id: uuid.UUID | None
@@ -2421,7 +2418,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
ModelWithOutputDir.Meta,
ModelWithConfig.Meta,
ModelWithNotes.Meta,
ModelWithStateMachine.Meta,
):
app_label = 'core'
verbose_name = 'Archive Result'
@@ -2516,40 +2512,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return None
def save(self, *args, **kwargs):
is_new = self._state.adding
# Create Process record if this is a new ArchiveResult and no process exists yet
if is_new and not self.process_id:
from archivebox.machine.models import Process, Machine
process = Process.objects.create(
machine=Machine.current(),
pwd=str(Path(self.snapshot.output_dir) / self.plugin),
cmd=[], # Will be set by run()
status='queued',
timeout=120,
env={},
)
self.process = process
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
# Call the Django Model.save() directly instead
models.Model.save(self, *args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created ArchiveResult',
indent_level=3,
plugin=self.plugin,
metadata={
'id': str(self.id),
'snapshot_id': str(self.snapshot_id),
'snapshot_url': str(self.snapshot.url)[:64],
'status': self.status,
},
)
# if is_new:
# from archivebox.misc.logging_util import log_worker_event
# log_worker_event(
# worker_type='DB',
# event='Created ArchiveResult',
# indent_level=3,
# plugin=self.plugin,
# metadata={
# 'id': str(self.id),
# 'snapshot_id': str(self.snapshot_id),
# 'snapshot_url': str(self.snapshot.url)[:64],
# 'status': self.status,
# },
# )
@cached_property
def snapshot_dir(self):
@@ -2566,6 +2546,28 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def get_absolute_url(self):
return f'/{self.snapshot.archive_path}/{self.plugin}'
def reset_for_retry(self, *, save: bool = True) -> None:
self.status = self.StatusChoices.QUEUED
self.output_str = ''
self.output_json = None
self.output_files = {}
self.output_size = 0
self.output_mimetypes = ''
self.start_ts = None
self.end_ts = None
if save:
self.save(update_fields=[
'status',
'output_str',
'output_json',
'output_files',
'output_size',
'output_mimetypes',
'start_ts',
'end_ts',
'modified_at',
])
@property
def plugin_module(self) -> Any | None:
# Hook scripts are now used instead of Python plugin modules
@@ -2723,11 +2725,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return None
def create_output_dir(self):
output_dir = Path(self.snapshot_dir) / self.plugin
output_dir.mkdir(parents=True, exist_ok=True)
return output_dir
@property
def output_dir_name(self) -> str:
return self.plugin
@@ -2782,134 +2779,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def save_search_index(self):
pass
def cascade_health_update(self, success: bool):
"""Update health stats for parent Snapshot, Crawl, and execution infrastructure (Binary, Machine, NetworkInterface)."""
# Update archival hierarchy
self.snapshot.increment_health_stats(success)
self.snapshot.crawl.increment_health_stats(success)
# Update execution infrastructure
if self.binary:
self.binary.increment_health_stats(success)
if self.binary.machine:
self.binary.machine.increment_health_stats(success)
if self.iface:
self.iface.increment_health_stats(success)
def run(self):
"""
Execute this ArchiveResult's hook and update status.
If self.hook_name is set, runs only that specific hook.
If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
Updates status/output fields, queues discovered URLs, and triggers indexing.
"""
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
from archivebox.config.configset import get_config
# Get merged config with proper context
config = get_config(
crawl=self.snapshot.crawl,
snapshot=self.snapshot,
)
# Determine which hook(s) to run
hooks = []
if self.hook_name:
# SPECIFIC HOOK MODE: Find the specific hook by name
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
if not base_dir.exists():
continue
plugin_dir = base_dir / self.plugin
if plugin_dir.exists():
hook_path = plugin_dir / self.hook_name
if hook_path.exists():
hooks.append(hook_path)
break
else:
# LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
if not base_dir.exists():
continue
plugin_dir = base_dir / self.plugin
if plugin_dir.exists():
matches = list(plugin_dir.glob('on_Snapshot__*.*'))
if matches:
hooks.extend(sorted(matches))
if not hooks:
self.status = self.StatusChoices.FAILED
if self.hook_name:
self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
else:
self.output_str = f'No hooks found for plugin: {self.plugin}'
self.retry_at = None
self.save()
return
# Output directory is plugin_dir for the hook output
plugin_dir = Path(self.snapshot.output_dir) / self.plugin
start_ts = timezone.now()
process = None
for hook in hooks:
# Run hook using Process.launch() - returns Process model
process = run_hook(
hook,
output_dir=plugin_dir,
config=config,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
crawl_id=str(self.snapshot.crawl.id),
depth=self.snapshot.depth,
)
# Link ArchiveResult to Process
self.process = process
self.start_ts = start_ts
self.save(update_fields=['process_id', 'start_ts', 'modified_at'])
if not process:
# No hooks ran
self.status = self.StatusChoices.FAILED
self.output_str = 'No hooks executed'
self.save()
return
# Update status based on hook execution
if process.status == process.StatusChoices.RUNNING:
# BACKGROUND HOOK - still running, return immediately
# Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
return
# FOREGROUND HOOK - completed, update from filesystem
self.update_from_output()
# Clean up empty output directory if no files were created
if plugin_dir.exists() and not self.output_files:
try:
if not any(plugin_dir.iterdir()):
plugin_dir.rmdir()
except (OSError, RuntimeError):
pass
def update_from_output(self):
"""
Update this ArchiveResult from filesystem logs and output files.
Used for:
- Foreground hooks that completed (called from ArchiveResult.run())
- Background hooks that completed (called from Snapshot.cleanup())
Used for Snapshot cleanup / orphan recovery when a hook's output exists
on disk but the projector did not finalize the row in the database.
Updates:
- status, output_str, output_json from ArchiveResult JSONL record
- output_files, output_size, output_mimetypes by walking filesystem
- end_ts, retry_at, cmd, cmd_version, binary FK
- end_ts, cmd, cmd_version, binary FK
- Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
"""
import mimetypes
@@ -2924,7 +2804,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.status = self.StatusChoices.FAILED
self.output_str = 'Output directory not found'
self.end_ts = timezone.now()
self.retry_at = None
self.save()
return
@@ -2948,6 +2827,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
'succeeded': self.StatusChoices.SUCCEEDED,
'failed': self.StatusChoices.FAILED,
'skipped': self.StatusChoices.SKIPPED,
'noresults': self.StatusChoices.NORESULTS,
}
self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)
@@ -3011,7 +2891,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Update timestamps
self.end_ts = timezone.now()
self.retry_at = None
self.save()
@@ -3095,340 +2974,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
"""
import re
from archivebox.config.configset import get_config
# Get merged config with proper hierarchy
config = get_config(
user=self.created_by,
crawl=self.snapshot.crawl,
snapshot=self.snapshot,
)
# Get allowlist/denylist (can be string or list)
allowlist_raw = config.get('URL_ALLOWLIST', '')
denylist_raw = config.get('URL_DENYLIST', '')
# Normalize to list of patterns
def to_pattern_list(value):
if isinstance(value, list):
return value
if isinstance(value, str):
return [p.strip() for p in value.split(',') if p.strip()]
return []
allowlist = to_pattern_list(allowlist_raw)
denylist = to_pattern_list(denylist_raw)
# Denylist takes precedence
if denylist:
for pattern in denylist:
try:
if re.search(pattern, url):
return False
except re.error:
continue # Skip invalid regex patterns
# If allowlist exists, URL must match at least one pattern
if allowlist:
for pattern in allowlist:
try:
if re.search(pattern, url):
return True
except re.error:
continue # Skip invalid regex patterns
return False # No allowlist patterns matched
return True # No filters or passed filters
return self.snapshot.crawl.url_passes_filters(url, snapshot=self.snapshot)
@property
def output_dir(self) -> Path:
"""Get the output directory for this plugin's results."""
return Path(self.snapshot.output_dir) / self.plugin
def is_background_hook(self) -> bool:
"""Check if this ArchiveResult is for a background hook."""
plugin_dir = Path(self.pwd) if self.pwd else None
if not plugin_dir:
return False
pid_file = plugin_dir / 'hook.pid'
return pid_file.exists()
# =============================================================================
# ArchiveResult State Machine
# =============================================================================
class ArchiveResultMachine(BaseStateMachine):
"""
State machine for managing ArchiveResult (single plugin execution) lifecycle.
Hook Lifecycle:
┌─────────────────────────────────────────────────────────────┐
│ QUEUED State │
│ • Waiting for its turn to run │
└─────────────────────────────────────────────────────────────┘
↓ tick() when can_start()
┌─────────────────────────────────────────────────────────────┐
│ STARTED State → enter_started() │
│ 1. archiveresult.run() │
│ • Find specific hook by hook_name │
│ • run_hook(script, output_dir, ...) → subprocess │
│ │
│ 2a. FOREGROUND hook (returns HookResult): │
│ • update_from_output() immediately │
│ - Read stdout.log │
│ - Parse JSONL records │
│ - Extract 'ArchiveResult' record → update status │
│ - Walk output_dir → populate output_files │
│ - Call process_hook_records() for side effects │
│ │
│ 2b. BACKGROUND hook (returns None): │
│ • Status stays STARTED │
│ • Continues running in background │
│ • Killed by Snapshot.cleanup() when sealed │
└─────────────────────────────────────────────────────────────┘
↓ tick() checks status
┌─────────────────────────────────────────────────────────────┐
│ SUCCEEDED / FAILED / SKIPPED / BACKOFF │
│ • Set by hook's JSONL output during update_from_output() │
│ • Health stats incremented (num_uses_succeeded/failed) │
│ • Parent Snapshot health stats also updated │
└─────────────────────────────────────────────────────────────┘
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
"""
model_attr_name = 'archiveresult'
# States
queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
started = State(value=ArchiveResult.StatusChoices.STARTED)
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
# Tick Event - transitions based on conditions
# Flow: queued → started → (succeeded|failed|skipped)
# queued → skipped (if exceeded max attempts)
# started → backoff → started (retry)
tick = (
queued.to(skipped, cond='is_exceeded_max_attempts') # Check skip first
| queued.to.itself(unless='can_start')
| queued.to(started, cond='can_start')
| started.to(succeeded, cond='is_succeeded')
| started.to(failed, cond='is_failed')
| started.to(skipped, cond='is_skipped')
| started.to(backoff, cond='is_backoff')
| backoff.to(skipped, cond='is_exceeded_max_attempts') # Check skip from backoff too
| backoff.to.itself(unless='can_start')
| backoff.to(started, cond='can_start')
# Removed redundant transitions: backoff.to(succeeded/failed/skipped)
# Reason: backoff should always retry→started, then started→final states
)
archiveresult: ArchiveResult
def can_start(self) -> bool:
"""Pure function - check if AR can start (has valid URL)."""
return bool(self.archiveresult.snapshot.url)
def is_exceeded_max_attempts(self) -> bool:
"""Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
from archivebox.config.configset import get_config
config = get_config(
crawl=self.archiveresult.snapshot.crawl,
snapshot=self.archiveresult.snapshot,
)
max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
# Count failed ArchiveResults for this snapshot (any plugin type)
failed_count = self.archiveresult.snapshot.archiveresult_set.filter(
status=ArchiveResult.StatusChoices.FAILED
).count()
return failed_count >= max_attempts
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
def is_failed(self) -> bool:
"""Check if extractor plugin failed (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
def is_skipped(self) -> bool:
"""Check if extractor plugin was skipped (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
def is_backoff(self) -> bool:
"""Check if we should backoff and retry later."""
# Backoff if status is still started (plugin didn't complete) and output_str is empty
return (
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED
and not self.archiveresult.output_str
)
def is_finished(self) -> bool:
"""
Check if extraction has completed (success, failure, or skipped).
For background hooks in STARTED state, checks if their Process has finished and reaps them.
"""
# If already in final state, return True
if self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
):
return True
# If in STARTED state with a Process, check if Process has finished running
if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
if self.archiveresult.process_id:
process = self.archiveresult.process
# If process is NOT running anymore, reap the background hook
if not process.is_running:
self.archiveresult.update_from_output()
# Check if now in final state after reaping
return self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
return False
@queued.enter
def enter_queued(self):
self.archiveresult.update_and_requeue(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
@started.enter
def enter_started(self):
# Update Process with network interface
if self.archiveresult.process_id:
self.archiveresult.process.iface = NetworkInterface.current()
self.archiveresult.process.save()
# Lock the object and mark start time
self.archiveresult.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
)
# Run the plugin - this updates status, output, timestamps, etc.
self.archiveresult.run()
# Save the updated result
self.archiveresult.save()
@backoff.enter
def enter_backoff(self):
    """On entering BACKOFF: clear end_ts and schedule the next retry in 60s."""
    self.archiveresult.update_and_requeue(
        retry_at=timezone.now() + timedelta(seconds=60),
        status=ArchiveResult.StatusChoices.BACKOFF,
        end_ts=None,  # not terminal yet; end_ts is set on success/failure/skip
    )
def _check_and_seal_parent_snapshot(self):
    """
    Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.

    Note: In the new architecture, the shared runner handles step advancement and sealing.
    This method is kept for direct model-driven edge cases.
    """
    import sys
    snapshot = self.archiveresult.snapshot
    # Check if all archiveresults are finished (in final states)
    remaining_active = snapshot.archiveresult_set.exclude(
        status__in=[
            ArchiveResult.StatusChoices.SUCCEEDED,
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        ]
    ).count()
    if remaining_active == 0:
        print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
        # Seal the parent snapshot
        # (cast(Any, ...) quiets the type checker; .sm is the attached state machine)
        cast(Any, snapshot).sm.seal()
@succeeded.enter
def enter_succeeded(self):
    """On entering SUCCEEDED: finalize the result, update health stats, maybe seal the snapshot."""
    import sys
    self.archiveresult.update_and_requeue(
        retry_at=None,  # terminal state: never retried again
        status=ArchiveResult.StatusChoices.SUCCEEDED,
        end_ts=timezone.now(),
    )
    # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
    self.archiveresult.cascade_health_update(success=True)
    print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)
    # Check if this is the last AR to finish - seal parent snapshot if so
    self._check_and_seal_parent_snapshot()
@failed.enter
def enter_failed(self):
    """On entering FAILED: finalize the result, update health stats, maybe seal the snapshot."""
    import sys
    print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)
    self.archiveresult.update_and_requeue(
        retry_at=None,  # terminal state: never retried again
        status=ArchiveResult.StatusChoices.FAILED,
        end_ts=timezone.now(),
    )
    # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
    self.archiveresult.cascade_health_update(success=False)
    print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)
    # Check if this is the last AR to finish - seal parent snapshot if so
    self._check_and_seal_parent_snapshot()
@skipped.enter
def enter_skipped(self):
    """On entering SKIPPED: record a reason if missing, finalize, maybe seal the snapshot."""
    import sys
    # Set output_str if not already set (e.g., when skipped due to max attempts)
    if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
        from archivebox.config.configset import get_config
        # Resolve config with crawl/snapshot scoping so per-crawl overrides apply
        config = get_config(
            crawl=self.archiveresult.snapshot.crawl,
            snapshot=self.archiveresult.snapshot,
        )
        max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
        self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
    self.archiveresult.update_and_requeue(
        retry_at=None,  # terminal state: never retried again
        status=ArchiveResult.StatusChoices.SKIPPED,
        end_ts=timezone.now(),
    )
    print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)
    # Check if this is the last AR to finish - seal parent snapshot if so
    self._check_and_seal_parent_snapshot()
# =============================================================================
# State Machine Registration
# =============================================================================
@@ -3436,4 +2988,3 @@ class ArchiveResultMachine(BaseStateMachine):
# Manually register state machines with python-statemachine registry
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
registry.register(SnapshotMachine)
registry.register(ArchiveResultMachine)

View File

@@ -232,11 +232,12 @@ SQLITE_CONNECTION_OPTIONS = {
# https://gcollazo.com/optimal-sqlite-settings-for-django/
# https://litestream.io/tips/#busy-timeout
# https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options
"timeout": 10,
"timeout": 30,
"check_same_thread": False,
"transaction_mode": "IMMEDIATE",
"init_command": (
"PRAGMA foreign_keys=ON;"
"PRAGMA busy_timeout = 30000;"
"PRAGMA journal_mode = WAL;"
"PRAGMA synchronous = NORMAL;"
"PRAGMA temp_store = MEMORY;"

View File

@@ -0,0 +1,271 @@
from __future__ import annotations
import json
from collections import defaultdict
from typing import Any
from django.contrib.auth.models import User
from django.db.models import Count, F, Q, QuerySet
from django.db.models.functions import Lower
from django.http import HttpRequest
from django.urls import reverse
from archivebox.core.host_utils import build_snapshot_url, build_web_url
from archivebox.core.models import Snapshot, SnapshotTag, Tag
# Max number of snapshot previews rendered inside one tag card.
TAG_SNAPSHOT_PREVIEW_LIMIT = 10

# (value, label) pairs for the tag-list sort dropdown; keys are validated
# by normalize_tag_sort() and mapped to orderings in get_matching_tags().
TAG_SORT_CHOICES = (
    ('name_asc', 'Name A-Z'),
    ('name_desc', 'Name Z-A'),
    ('created_desc', 'Created newest'),
    ('created_asc', 'Created oldest'),
    ('snapshots_desc', 'Most snapshots'),
    ('snapshots_asc', 'Fewest snapshots'),
)

# (value, label) pairs for the "has snapshots" filter dropdown;
# validated by normalize_has_snapshots_filter().
TAG_HAS_SNAPSHOTS_CHOICES = (
    ('all', 'All'),
    ('yes', 'Has snapshots'),
    ('no', 'No snapshots'),
)
def normalize_tag_name(name: str) -> str:
    """Trim surrounding whitespace from a tag name, mapping None/empty input to ''."""
    if not name:
        return ''
    return name.strip()
def normalize_tag_sort(sort: str = 'created_desc') -> str:
    """Return *sort* when it is one of the TAG_SORT_CHOICES keys, else 'created_desc'."""
    for choice_key, _choice_label in TAG_SORT_CHOICES:
        if sort == choice_key:
            return sort
    return 'created_desc'
def normalize_has_snapshots_filter(value: str = 'all') -> str:
    """Return *value* when it is one of the TAG_HAS_SNAPSHOTS_CHOICES keys, else 'all'."""
    allowed = dict(TAG_HAS_SNAPSHOTS_CHOICES)
    if value in allowed:
        return value
    return 'all'
def normalize_created_by_filter(created_by: str = '') -> str:
    """Validate a created-by user-id filter value.

    Returns the value unchanged when it is a non-empty string of decimal
    digits (i.e. guaranteed safe to pass to ``int()``), otherwise ''.

    Uses ``str.isdecimal()`` rather than ``str.isdigit()``: isdigit() also
    accepts non-decimal digit characters such as superscript '²' that
    ``int()`` rejects, so the old check could admit a value that later
    crashes the ``int(created_by)`` call in get_matching_tags().
    """
    return created_by if str(created_by).isdecimal() else ''
def normalize_created_year_filter(year: str = '') -> str:
    """Validate a 4-character year filter value, returning '' when invalid.

    Uses ``str.isdecimal()`` instead of ``str.isdigit()`` because isdigit()
    accepts non-decimal digit characters (e.g. '²') that ``int()`` cannot
    parse, which would crash the ``int(year)`` call in get_matching_tags().
    """
    year = (year or '').strip()
    return year if len(year) == 4 and year.isdecimal() else ''
def get_matching_tags(
    query: str = '',
    sort: str = 'created_desc',
    created_by: str = '',
    year: str = '',
    has_snapshots: str = 'all',
) -> QuerySet[Tag]:
    """
    Build the Tag queryset backing the tag-list UI.

    All filter arguments are normalized (invalid values fall back to their
    defaults) before being applied, so this is safe to call with raw GET
    params. Every returned Tag carries a ``num_snapshots`` annotation.
    """
    queryset = Tag.objects.select_related('created_by').annotate(
        num_snapshots=Count('snapshot_set', distinct=True),
    )
    # Free-text search matches either name or slug, case-insensitively.
    query = normalize_tag_name(query)
    if query:
        queryset = queryset.filter(
            Q(name__icontains=query) | Q(slug__icontains=query),
        )
    # created_by is a digit-string user id ('' disables the filter).
    created_by = normalize_created_by_filter(created_by)
    if created_by:
        queryset = queryset.filter(created_by_id=int(created_by))
    # year is a 4-digit string ('' disables the filter).
    year = normalize_created_year_filter(year)
    if year:
        queryset = queryset.filter(created_at__year=int(year))
    has_snapshots = normalize_has_snapshots_filter(has_snapshots)
    if has_snapshots == 'yes':
        queryset = queryset.filter(num_snapshots__gt=0)
    elif has_snapshots == 'no':
        queryset = queryset.filter(num_snapshots=0)
    # Orderings include a pk tie-breaker so pagination is deterministic.
    sort = normalize_tag_sort(sort)
    if sort == 'name_asc':
        queryset = queryset.order_by(Lower('name'), 'id')
    elif sort == 'name_desc':
        queryset = queryset.order_by(Lower('name').desc(), '-id')
    elif sort == 'created_asc':
        queryset = queryset.order_by(F('created_at').asc(nulls_first=True), 'id', Lower('name'))
    elif sort == 'snapshots_desc':
        queryset = queryset.order_by(F('num_snapshots').desc(nulls_last=True), F('created_at').desc(nulls_last=True), '-id', Lower('name'))
    elif sort == 'snapshots_asc':
        queryset = queryset.order_by(F('num_snapshots').asc(nulls_first=True), Lower('name'), 'id')
    else:
        # 'created_desc' (the normalized default)
        queryset = queryset.order_by(F('created_at').desc(nulls_last=True), '-id', Lower('name'))
    return queryset
def get_tag_creator_choices() -> list[tuple[str, str]]:
    """Return (user_id_str, username) dropdown choices for users who have created tags."""
    rows = (
        Tag.objects
        .filter(created_by__isnull=False)
        .values_list('created_by_id', 'created_by__username')
        .order_by(Lower('created_by__username'), 'created_by_id')
        .distinct()
    )
    # Fall back to a synthetic 'User <id>' label when the username is blank.
    return [(str(user_id), username or f'User {user_id}') for user_id, username in rows]
def get_tag_year_choices() -> list[str]:
    """List the distinct years (newest first) in which tags were created."""
    year_dates = Tag.objects.exclude(created_at__isnull=True).dates('created_at', 'year', order='DESC')
    return [str(year_date.year) for year_date in year_dates]
def get_tag_by_ref(tag_ref: str | int) -> Tag:
    """
    Resolve a tag reference that may be a primary key (int or digit string) or a slug.

    Raises Tag.DoesNotExist when nothing matches.
    NOTE(review): the final slug__icontains fallback can raise
    Tag.MultipleObjectsReturned when several slugs contain the fragment —
    confirm callers handle that (or tighten the lookup).
    """
    if isinstance(tag_ref, int):
        return Tag.objects.get(pk=tag_ref)
    ref = str(tag_ref).strip()
    if ref.isdigit():
        return Tag.objects.get(pk=int(ref))
    try:
        # Exact (case-insensitive) slug match first...
        return Tag.objects.get(slug__iexact=ref)
    except Tag.DoesNotExist:
        # ...then fall back to a fuzzy substring match.
        return Tag.objects.get(slug__icontains=ref)
def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, bool]:
    """
    Fetch a tag by case-insensitive name, creating it if absent.

    Returns (tag, created) like Django's get_or_create.
    Raises ValueError when the (trimmed) name is empty.

    NOTE(review): this check-then-create is not atomic — two concurrent calls
    can both miss the lookup and attempt the create; if Tag.name is unique,
    one of them will raise IntegrityError. Confirm whether callers need
    transactional get_or_create semantics here.
    """
    normalized_name = normalize_tag_name(name)
    if not normalized_name:
        raise ValueError('Tag name is required')
    existing = Tag.objects.filter(name__iexact=normalized_name).first()
    if existing:
        return existing, False
    tag = Tag.objects.create(
        name=normalized_name,
        created_by=created_by,
    )
    return tag, True
def rename_tag(tag: Tag, name: str) -> Tag:
    """
    Rename *tag* to the whitespace-trimmed *name* and return it.

    Raises ValueError when the name is blank or already taken
    (case-insensitively) by a different tag. Only saves when the
    name actually changes.
    """
    new_name = normalize_tag_name(name)
    if not new_name:
        raise ValueError('Tag name is required')
    clash = (
        Tag.objects
        .filter(name__iexact=new_name)
        .exclude(pk=tag.pk)
        .first()
    )
    if clash is not None:
        raise ValueError(f'Tag "{clash.name}" already exists')
    if tag.name != new_name:
        tag.name = new_name
        tag.save()
    return tag
def delete_tag(tag: Tag) -> tuple[int, dict[str, int]]:
    """Delete *tag*, returning Django's (total_rows_deleted, per-model counts) tuple."""
    return tag.delete()
def export_tag_urls(tag: Tag) -> str:
    """Render the URLs of a tag's snapshots, newest first, one per line."""
    ordered = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk')
    return '\n'.join(ordered.values_list('url', flat=True))
def export_tag_snapshots_jsonl(tag: Tag) -> str:
    """Serialize a tag's snapshots (newest first) as JSONL: one JSON object per line."""
    # prefetch_related('tags') avoids N+1 queries if to_json() includes tag names —
    # presumably it does; verify against Snapshot.to_json().
    snapshots = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk').prefetch_related('tags')
    return '\n'.join(json.dumps(snapshot.to_json()) for snapshot in snapshots)
def _display_snapshot_title(snapshot: Snapshot) -> str:
title = (snapshot.title or '').strip()
url = (snapshot.url or '').strip()
if not title:
return url
normalized_title = title.lower()
if normalized_title == 'pending...' or normalized_title == url.lower():
return url
return title
def _build_snapshot_preview(snapshot: Snapshot, request: HttpRequest | None = None) -> dict[str, Any]:
    """Build the dict rendered as one snapshot row inside a tag card."""
    return {
        'id': str(snapshot.pk),
        'title': _display_snapshot_title(snapshot),
        'url': snapshot.url,
        # request is threaded through so URLs can be built for the current host
        'favicon_url': build_snapshot_url(str(snapshot.pk), 'favicon.ico', request=request),
        'admin_url': reverse('admin:core_snapshot_change', args=[snapshot.pk]),
        'archive_url': build_web_url(f'/{snapshot.archive_path_from_db}/index.html', request=request),
        'downloaded_at': snapshot.downloaded_at.isoformat() if snapshot.downloaded_at else None,
    }
def _build_snapshot_preview_map(tags: list[Tag], request: HttpRequest | None = None, preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT) -> dict[int, list[dict[str, Any]]]:
    """
    Map tag_id -> up to *preview_limit* snapshot preview dicts, newest first.

    NOTE(review): rows past the per-tag limit are skipped with `continue`, so
    ALL SnapshotTag rows for the given tags are still fetched from the DB;
    for tags with many snapshots a per-tag LIMIT (e.g. a window function)
    would be cheaper.
    """
    tag_ids = [tag.pk for tag in tags]
    if not tag_ids:
        return {}
    snapshot_tags = (
        SnapshotTag.objects
        .filter(tag_id__in=tag_ids)
        .select_related('snapshot__crawl__created_by')
        .order_by(
            'tag_id',
            F('snapshot__downloaded_at').desc(nulls_last=True),
            F('snapshot__created_at').desc(nulls_last=True),
            F('snapshot_id').desc(),
        )
    )
    preview_map: dict[int, list[dict[str, Any]]] = defaultdict(list)
    for snapshot_tag in snapshot_tags:
        previews = preview_map[snapshot_tag.tag_id]
        if len(previews) >= preview_limit:
            continue
        previews.append(_build_snapshot_preview(snapshot_tag.snapshot, request=request))
    return preview_map
def build_tag_card(tag: Tag, snapshot_previews: list[dict[str, Any]] | None = None) -> dict[str, Any]:
    """Build the dict rendered as one card in the tag-list UI."""
    # Prefer the num_snapshots annotation from get_matching_tags();
    # falls back to an extra COUNT query for un-annotated Tag instances.
    count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
    return {
        'id': tag.pk,
        'name': tag.name,
        'slug': tag.slug,
        'num_snapshots': count,
        'filter_url': f"{reverse('admin:core_snapshot_changelist')}?tags__id__exact={tag.pk}",
        'edit_url': reverse('admin:core_tag_change', args=[tag.pk]),
        'export_urls_url': reverse('api-1:tag_urls_export', args=[tag.pk]),
        'export_jsonl_url': reverse('api-1:tag_snapshots_export', args=[tag.pk]),
        'rename_url': reverse('api-1:rename_tag', args=[tag.pk]),
        'delete_url': reverse('api-1:delete_tag', args=[tag.pk]),
        'snapshots': snapshot_previews or [],
    }
def build_tag_cards(
    query: str = '',
    request: HttpRequest | None = None,
    limit: int | None = None,
    preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT,
    sort: str = 'created_desc',
    created_by: str = '',
    year: str = '',
    has_snapshots: str = 'all',
) -> list[dict[str, Any]]:
    """
    Top-level entry for the tag-list UI: filter/sort tags, then attach
    up to *preview_limit* snapshot previews to each card.

    All filter args accept raw GET params (normalized downstream);
    *limit* caps the number of tag cards returned (None = no cap).
    """
    queryset = get_matching_tags(
        query=query,
        sort=sort,
        created_by=created_by,
        year=year,
        has_snapshots=has_snapshots,
    )
    if limit is not None:
        queryset = queryset[:limit]
    # Materialize once: the preview map and card building both iterate the tags.
    tags = list(queryset)
    preview_map = _build_snapshot_preview_map(tags, request=request, preview_limit=preview_limit)
    return [
        build_tag_card(tag, snapshot_previews=preview_map.get(tag.pk, []))
        for tag in tags
    ]

View File

@@ -11,6 +11,7 @@ from archivebox.hooks import (
)
from archivebox.core.host_utils import (
get_admin_base_url,
get_public_base_url,
get_web_base_url,
get_snapshot_base_url,
build_snapshot_url,
@@ -166,6 +167,11 @@ def web_base_url(context) -> str:
return get_web_base_url(request=context.get('request'))
@register.simple_tag(takes_context=True)
def public_base_url(context) -> str:
return get_public_base_url(request=context.get('request'))
@register.simple_tag(takes_context=True)
def snapshot_base_url(context, snapshot) -> str:
snapshot_id = getattr(snapshot, 'id', snapshot)

View File

@@ -1,5 +1,6 @@
__package__ = 'archivebox.core'
import json
import os
import posixpath
from glob import glob, escape
@@ -7,7 +8,7 @@ from django.utils import timezone
import inspect
from typing import Callable, cast, get_type_hints
from pathlib import Path
from urllib.parse import urlparse
from urllib.parse import quote, urlparse
from django.shortcuts import render, redirect
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
@@ -26,7 +27,7 @@ from admin_data_views.typing import TableContext, ItemContext, SectionData
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
from archivebox.misc.serve_static import serve_static_with_byterange_support
@@ -37,7 +38,18 @@ from archivebox.core.models import Snapshot
from archivebox.core.host_utils import build_snapshot_url
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_enabled_plugins, get_plugin_name
from archivebox.hooks import (
BUILTIN_PLUGINS_DIR,
USER_PLUGINS_DIR,
discover_plugin_configs,
get_enabled_plugins,
get_plugin_name,
iter_plugin_dirs,
)
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
LIVE_PLUGIN_BASE_URL = '/admin/environment/plugins/'
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
@@ -699,6 +711,9 @@ def _serve_responses_path(request, responses_root: Path, rel_path: str, show_ind
def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""):
rel_path = path or ""
show_indexes = bool(request.GET.get("files"))
if not show_indexes and (not rel_path or rel_path == "index.html"):
return SnapshotView.render_live_index(request, snapshot)
if not rel_path or rel_path.endswith("/"):
if show_indexes:
rel_path = rel_path.rstrip("/")
@@ -784,7 +799,6 @@ class SnapshotHostView(View):
raise Http404
return _serve_snapshot_replay(request, snapshot, path)
class SnapshotReplayView(View):
"""Serve snapshot directory contents on a one-domain replay path."""
@@ -915,8 +929,17 @@ class AddView(UserPassesTestMixin, FormView):
return custom_config
def get_context_data(self, **kwargs):
from archivebox.core.models import Tag
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
plugin_configs = discover_plugin_configs()
plugin_dependency_map = {
plugin_name: [
str(required_plugin).strip()
for required_plugin in (schema.get('required_plugins') or [])
if str(required_plugin).strip()
]
for plugin_name, schema in plugin_configs.items()
if isinstance(schema.get('required_plugins'), list) and schema.get('required_plugins')
}
return {
**super().get_context_data(**kwargs),
'title': "Create Crawl",
@@ -924,8 +947,9 @@ class AddView(UserPassesTestMixin, FormView):
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION,
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
'required_search_plugin': required_search_plugin,
'plugin_dependency_map_json': json.dumps(plugin_dependency_map, sort_keys=True),
'stdout': '',
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
@@ -937,11 +961,10 @@ class AddView(UserPassesTestMixin, FormView):
depth = int(form.cleaned_data["depth"])
plugins = ','.join(form.cleaned_data.get("plugins", []))
schedule = form.cleaned_data.get("schedule", "").strip()
persona = form.cleaned_data.get("persona", "Default")
overwrite = form.cleaned_data.get("overwrite", False)
update = form.cleaned_data.get("update", False)
persona = form.cleaned_data.get("persona")
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
url_filters = form.cleaned_data.get("url_filters") or {}
custom_config = self._get_custom_config_overrides(form)
from archivebox.config.permissions import HOSTNAME
@@ -957,6 +980,7 @@ class AddView(UserPassesTestMixin, FormView):
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Crawl with the URLs from the file
@@ -964,16 +988,18 @@ class AddView(UserPassesTestMixin, FormView):
urls_content = sources_file.read_text()
# Build complete config
config = {
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'DEPTH': depth,
'PLUGINS': plugins or '',
'DEFAULT_PERSONA': persona or 'Default',
'DEFAULT_PERSONA': (persona.name if persona else 'Default'),
}
# Merge custom config overrides
config.update(custom_config)
if url_filters.get('allowlist'):
config['URL_ALLOWLIST'] = url_filters['allowlist']
if url_filters.get('denylist'):
config['URL_DENYLIST'] = url_filters['denylist']
crawl = Crawl.objects.create(
urls=urls_content,
@@ -999,6 +1025,8 @@ class AddView(UserPassesTestMixin, FormView):
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
crawl.create_snapshots_from_urls()
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from archivebox.crawls.actors import CrawlActor
@@ -1011,7 +1039,7 @@ class AddView(UserPassesTestMixin, FormView):
urls = form.cleaned_data["url"]
schedule = form.cleaned_data.get("schedule", "").strip()
rough_url_count = urls.count('://')
rough_url_count = len([url for url in urls.splitlines() if url.strip()])
# Build success message with schedule link if created
schedule_msg = ""
@@ -1080,10 +1108,6 @@ class WebAddView(AddView):
'persona': defaults_form.fields['persona'].initial or 'Default',
'config': {},
}
if defaults_form.fields['update'].initial:
form_data['update'] = 'on'
if defaults_form.fields['overwrite'].initial:
form_data['overwrite'] = 'on'
if defaults_form.fields['index_only'].initial:
form_data['index_only'] = 'on'
@@ -1118,6 +1142,41 @@ def live_progress_view(request):
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.machine.models import Process, Machine
def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]:
    """Derive (plugin, human label, phase, normalized filename) from a hook filename.

    e.g. 'on_Snapshot__01_wget.py' -> ('wget', 'wget', 'snapshot', 'on_Snapshot__01_wget.py')
    """
    filename = Path(hook_name).name if hook_name else ""
    if not filename:
        # No hook name at all: fall back to the plugin name for both labels.
        return (plugin, plugin, "unknown", "")

    # The phase is encoded in the 'on_<Model>__' prefix of the hook filename.
    phase = "unknown"
    for prefix, phase_name in (
        ("on_Crawl__", "crawl"),
        ("on_Snapshot__", "snapshot"),
        ("on_Binary__", "binary"),
    ):
        if filename.startswith(prefix):
            phase = phase_name
            break

    # Label: text after the first '__', minus the file extension and any
    # two-digit 'NN_' ordering prefix, with underscores shown as spaces.
    label = filename.split("__", 1)[1] if "__" in filename else filename
    label = label.rsplit(".", 1)[0]
    if len(label) > 3 and label[:2].isdigit() and label[2] == "_":
        label = label[3:]
    label = label.replace("_", " ").strip() or plugin

    return (plugin, label, phase, filename)
def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]:
    """Derive hook display details from a Process cmd list (first element = hook script path)."""
    hook_path = ""
    if isinstance(cmd, list) and cmd:
        first = cmd[0]
        if isinstance(first, str):
            hook_path = first
    if not hook_path:
        # No usable command: treat as an anonymous setup-phase process.
        return ("", "setup", "unknown", "")
    # The plugin name is taken from the hook file's parent directory name.
    return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup")
machine = Machine.current()
orchestrator_proc = Process.objects.filter(
machine=machine,
@@ -1188,8 +1247,19 @@ def live_progress_view(request):
Process.TypeChoices.BINARY,
],
)
recent_processes = Process.objects.filter(
machine=machine,
process_type__in=[
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
modified_at__gte=timezone.now() - timedelta(minutes=10),
).order_by("-modified_at")
crawl_process_pids: dict[str, int] = {}
snapshot_process_pids: dict[str, int] = {}
process_records_by_crawl: dict[str, list[dict[str, object]]] = {}
process_records_by_snapshot: dict[str, list[dict[str, object]]] = {}
seen_process_records: set[str] = set()
for proc in running_processes:
env = proc.env or {}
if not isinstance(env, dict):
@@ -1197,11 +1267,48 @@ def live_progress_view(request):
crawl_id = env.get('CRAWL_ID')
snapshot_id = env.get('SNAPSHOT_ID')
_plugin, _label, phase, _hook_name = process_label(proc.cmd)
if crawl_id and proc.pid:
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
if snapshot_id and proc.pid:
if phase == "snapshot" and snapshot_id and proc.pid:
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
for proc in recent_processes:
env = proc.env or {}
if not isinstance(env, dict):
env = {}
crawl_id = env.get("CRAWL_ID")
snapshot_id = env.get("SNAPSHOT_ID")
if not crawl_id and not snapshot_id:
continue
plugin, label, phase, hook_name = process_label(proc.cmd)
record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id)
proc_key = f"{record_scope}:{plugin}:{label}:{proc.status}:{proc.exit_code}"
if proc_key in seen_process_records:
continue
seen_process_records.add(proc_key)
status = "started" if proc.status == Process.StatusChoices.RUNNING else ("failed" if proc.exit_code not in (None, 0) else "succeeded")
payload: dict[str, object] = {
"id": str(proc.id),
"plugin": plugin,
"label": label,
"hook_name": hook_name,
"status": status,
"phase": phase,
"source": "process",
"process_id": str(proc.id),
}
if status == "started" and proc.pid:
payload["pid"] = proc.pid
if phase == "snapshot" and snapshot_id:
process_records_by_snapshot.setdefault(str(snapshot_id), []).append(payload)
elif crawl_id:
process_records_by_crawl.setdefault(str(crawl_id), []).append(payload)
active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).prefetch_related(
@@ -1234,6 +1341,11 @@ def live_progress_view(request):
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
crawl_setup_plugins = list(process_records_by_crawl.get(str(crawl.id), []))
crawl_setup_total = len(crawl_setup_plugins)
crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded")
crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed")
crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued")
# Get active snapshots for this crawl (already prefetched)
active_snapshots_for_crawl = []
@@ -1241,28 +1353,21 @@ def live_progress_view(request):
# Get archive results for this snapshot (already prefetched)
snapshot_results = snapshot.archiveresult_set.all()
# Count in memory instead of DB queries
total_plugins = len(snapshot_results)
completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
# Calculate snapshot progress using per-plugin progress
now = timezone.now()
plugin_progress_values: list[int] = []
all_plugins: list[dict[str, object]] = []
seen_plugin_keys: set[str] = set()
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
# Order: started first, then queued, then completed
def plugin_sort_key(ar):
status_order = {
ArchiveResult.StatusChoices.STARTED: 0,
ArchiveResult.StatusChoices.QUEUED: 1,
ArchiveResult.StatusChoices.SUCCEEDED: 2,
ArchiveResult.StatusChoices.FAILED: 3,
ArchiveResult.StatusChoices.NORESULTS: 3,
ArchiveResult.StatusChoices.FAILED: 4,
}
return (status_order.get(ar.status, 4), ar.plugin)
return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "")
all_plugins = []
for ar in sorted(snapshot_results, key=plugin_sort_key):
status = ar.status
progress_value = 0
@@ -1270,6 +1375,7 @@ def live_progress_view(request):
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.NORESULTS,
):
progress_value = 100
elif status == ArchiveResult.StatusChoices.STARTED:
@@ -1284,20 +1390,49 @@ def live_progress_view(request):
progress_value = 0
plugin_progress_values.append(progress_value)
plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin)
plugin_payload = {
'id': str(ar.id),
'plugin': ar.plugin,
'label': label,
'hook_name': hook_name,
'phase': phase,
'status': status,
'process_id': str(ar.process_id) if ar.process_id else None,
}
if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process:
plugin_payload['pid'] = ar.process.pid
if status == ArchiveResult.StatusChoices.STARTED:
plugin_payload['progress'] = progress_value
plugin_payload['timeout'] = ar.timeout or 120
plugin_payload['source'] = 'archiveresult'
all_plugins.append(plugin_payload)
seen_plugin_keys.add(
str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}"
)
snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0
for proc_payload in process_records_by_snapshot.get(str(snapshot.id), []):
proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}")
if proc_key in seen_plugin_keys:
continue
seen_plugin_keys.add(proc_key)
all_plugins.append(proc_payload)
proc_status = proc_payload.get("status")
if proc_status in ("succeeded", "failed", "skipped"):
plugin_progress_values.append(100)
elif proc_status == "started":
plugin_progress_values.append(1)
else:
plugin_progress_values.append(0)
total_plugins = len(all_plugins)
completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded")
failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed")
pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued")
snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0
active_snapshots_for_crawl.append({
'id': str(snapshot.id),
@@ -1334,6 +1469,11 @@ def live_progress_view(request):
'started_snapshots': started_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'setup_plugins': crawl_setup_plugins,
'setup_total_plugins': crawl_setup_total,
'setup_completed_plugins': crawl_setup_completed,
'setup_failed_plugins': crawl_setup_failed,
'setup_pending_plugins': crawl_setup_pending,
'active_snapshots': active_snapshots_for_crawl,
'can_start': can_start,
'urls_preview': urls_preview,
@@ -1461,7 +1601,11 @@ def find_config_source(key: str, merged_config: dict) -> str:
"""Determine where a config value comes from."""
from archivebox.machine.models import Machine
# Check if it's from archivebox.machine.config
# Environment variables override all persistent config sources.
if key in os.environ:
return 'Environment'
# Machine.config overrides ArchiveBox.conf.
try:
machine = Machine.current()
if machine.config and key in machine.config:
@@ -1469,10 +1613,6 @@ def find_config_source(key: str, merged_config: dict) -> str:
except Exception:
pass
# Check if it's from environment variable
if key in os.environ:
return 'Environment'
# Check if it's from archivebox.config.file
from archivebox.config.configset import BaseConfigSet
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
@@ -1483,6 +1623,43 @@ def find_config_source(key: str, merged_config: dict) -> str:
return 'Default'
def find_plugin_for_config_key(key: str) -> str | None:
    """Return the name of the plugin whose config schema declares *key*, or None."""
    for plugin_name, schema in discover_plugin_configs().items():
        properties = schema.get('properties') or {}
        if key in properties:
            return plugin_name
    return None
def get_config_definition_link(key: str) -> tuple[str, str]:
    """
    Return (url, label) pointing at where a config key is defined.

    Falls back to a GitHub code search over archivebox/config when no plugin
    declares the key; otherwise links to the owning plugin's config.json
    (GitHub for builtin plugins, the live plugin browser for user plugins).
    """
    plugin_name = find_plugin_for_config_key(key)
    if not plugin_name:
        return (
            f'https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code',
            'archivebox/config',
        )
    # First plugin dir whose basename matches (None when not found on disk).
    plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None)
    if plugin_dir:
        builtin_root = BUILTIN_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(builtin_root):
            return (
                f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json',
                f'abx_plugins/plugins/{plugin_name}/config.json',
            )
        user_root = USER_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(user_root):
            return (
                f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/',
                f'data/custom_plugins/{plugin_name}/config.json',
            )
    # Plugin is known but its directory is missing or outside both roots:
    # assume builtin and link via the live plugin browser.
    return (
        f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/',
        f'abx_plugins/plugins/{plugin_name}/config.json',
    )
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
CONFIGS = get_all_configs()
@@ -1566,17 +1743,6 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
# Determine all sources for this config value
sources_info = []
# Default value
default_val = find_config_default(key)
if default_val:
sources_info.append(('Default', default_val, 'gray'))
# Config file value
if CONSTANTS.CONFIG_FILE.exists():
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
if key in file_config:
sources_info.append(('Config File', file_config[key], 'green'))
# Environment variable
if key in os.environ:
sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue'))
@@ -1592,6 +1758,17 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
except Exception:
pass
# Config file value
if CONSTANTS.CONFIG_FILE.exists():
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
if key in file_config:
sources_info.append(('Config File', file_config[key], 'green'))
# Default value
default_val = find_config_default(key)
if default_val:
sources_info.append(('Default', default_val, 'gray'))
# Final computed value
final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None)))
if not key_is_safe(key):
@@ -1614,6 +1791,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
section_header = mark_safe(f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
definition_url, definition_label = get_config_definition_link(key)
section_data = cast(SectionData, {
"name": section_header,
"description": None,
@@ -1621,7 +1800,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
'Key': key,
'Type': find_config_type(key),
'Value': final_value,
'Source': find_config_source(key, merged_config),
'Currently read from': find_config_source(key, merged_config),
},
"help_texts": {
'Key': mark_safe(f'''
@@ -1631,14 +1810,14 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
</span>
'''),
'Type': mark_safe(f'''
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
See full definition in <code>archivebox/config</code>...
<a href="{definition_url}" target="_blank" rel="noopener noreferrer">
See full definition in <code>{definition_label}</code>...
</a>
'''),
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
<br/><hr/><br/>
<b>Configuration Sources (in priority order):</b><br/><br/>
<b>Configuration Sources (highest priority first):</b><br/><br/>
{sources_html}
<br/><br/>
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
@@ -1651,15 +1830,15 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
}"</code>
</p>
'''),
'Source': mark_safe(f'''
'Currently read from': mark_safe(f'''
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
<br/><br/>
Priority order (highest to lowest):
<ol>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
</li>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
<li><b style="color: gray">Default</b> - Default value from code</li>
</ol>

View File

@@ -131,7 +131,46 @@ class TagEditorWidget(forms.Widget):
}};
window.updateHiddenInput_{widget_id} = function() {{
document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(',');
var hiddenInput = document.getElementById('{widget_id}');
if (!hiddenInput) {{
return;
}}
hiddenInput.value = currentTags_{widget_id}.join(',');
hiddenInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
hiddenInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
}};
function normalizeTags_{widget_id}(value) {{
var rawTags = Array.isArray(value) ? value : String(value || '').split(',');
var seen = {{}};
return rawTags
.map(function(tag) {{ return String(tag || '').trim(); }})
.filter(function(tag) {{
if (!tag) return false;
var normalized = tag.toLowerCase();
if (seen[normalized]) return false;
seen[normalized] = true;
return true;
}})
.sort(function(a, b) {{
return a.toLowerCase().localeCompare(b.toLowerCase());
}});
}}
window.setTags_{widget_id} = function(value, options) {{
currentTags_{widget_id} = normalizeTags_{widget_id}(value);
rebuildPills_{widget_id}();
if (!(options && options.skipHiddenUpdate)) {{
updateHiddenInput_{widget_id}();
}}
}};
window.syncTagEditorFromHidden_{widget_id} = function() {{
var hiddenInput = document.getElementById('{widget_id}');
if (!hiddenInput) {{
return;
}}
setTags_{widget_id}(hiddenInput.value, {{ skipHiddenUpdate: true }});
}};
function computeTagStyle_{widget_id}(tagName) {{
@@ -190,9 +229,7 @@ class TagEditorWidget(forms.Widget):
// Add to current tags
currentTags_{widget_id}.push(tagName);
currentTags_{widget_id}.sort(function(a, b) {{
return a.toLowerCase().localeCompare(b.toLowerCase());
}});
currentTags_{widget_id} = normalizeTags_{widget_id}(currentTags_{widget_id});
// Rebuild pills
rebuildPills_{widget_id}();
@@ -252,6 +289,14 @@ class TagEditorWidget(forms.Widget):
}}
}});
document.getElementById('{widget_id}').addEventListener('change', function() {{
syncTagEditorFromHidden_{widget_id}();
}});
document.getElementById('{widget_id}').addEventListener('archivebox:sync-tags', function() {{
syncTagEditorFromHidden_{widget_id}();
}});
window.handleTagKeydown_{widget_id} = function(event) {{
var input = event.target;
var value = input.value.trim();
@@ -320,6 +365,8 @@ class TagEditorWidget(forms.Widget):
var input = document.querySelector('input[name="csrfmiddlewaretoken"]');
return input ? input.value : '';
}}
syncTagEditorFromHidden_{widget_id}();
}})();
</script>
'''
@@ -327,15 +374,232 @@ class TagEditorWidget(forms.Widget):
return mark_safe(html)
class URLFiltersWidget(forms.Widget):
"""Render URL allowlist / denylist controls with same-domain autofill."""
template_name = ""
def __init__(self, attrs=None, *, source_selector='textarea[name="url"]'):
self.source_selector = source_selector
super().__init__(attrs)
def render(self, name, value, attrs=None, renderer=None):
value = value if isinstance(value, dict) else {}
widget_id_raw = attrs.get('id', name) if attrs else name
widget_id = re.sub(r'[^A-Za-z0-9_]', '_', str(widget_id_raw)) or name
allowlist = escape(value.get('allowlist', '') or '')
denylist = escape(value.get('denylist', '') or '')
return mark_safe(f'''
<div id="{widget_id}_container" class="url-filters-widget">
<input type="hidden" name="{name}" value="">
<div class="url-filters-grid">
<div class="url-filters-column">
<div class="url-filter-label-row">
<label for="{widget_id}_allowlist" class="url-filter-label"><span class="url-filter-label-main">🟢 URL_ALLOWLIST</span></label>
<span class="url-filter-label-note">Regex patterns or domains to exclude, one pattern per line.</span>
</div>
<textarea id="{widget_id}_allowlist"
name="{name}_allowlist"
rows="2"
placeholder="^https?://([^/]+\\.)?(example\\.com|example\\.org)([:/]|$)">{allowlist}</textarea>
</div>
<div class="url-filters-column">
<div class="url-filter-label-row">
<label for="{widget_id}_denylist" class="url-filter-label"><span class="url-filter-label-main">⛔ URL_DENYLIST</span></label>
<span class="url-filter-label-note">Regex patterns or domains to exclude, one pattern per line.</span>
</div>
<textarea id="{widget_id}_denylist"
name="{name}_denylist"
rows="2"
placeholder="^https?://([^/]+\\.)?(cdn\\.example\\.com|analytics\\.example\\.org)([:/]|$)">{denylist}</textarea>
</div>
</div>
<label class="url-filters-toggle" for="{widget_id}_same_domain_only">
<input type="checkbox" id="{widget_id}_same_domain_only" name="{name}_same_domain_only" value="1">
<span>Same domain only</span>
</label>
<div class="help-text">These values can be one regex pattern or domain per line. URL_DENYLIST takes precedence over URL_ALLOWLIST.</div>
<script>
(function() {{
var allowlistField = document.getElementById('{widget_id}_allowlist');
var denylistField = document.getElementById('{widget_id}_denylist');
var sameDomainOnly = document.getElementById('{widget_id}_same_domain_only');
var sourceField = document.querySelector({json.dumps(self.source_selector)});
var lastAutoGeneratedAllowlist = '';
if (!allowlistField || !sameDomainOnly || !sourceField) {{
return;
}}
function extractUrl(line) {{
var trimmed = String(line || '').trim();
if (!trimmed || trimmed.charAt(0) === '#') {{
return '';
}}
if (trimmed.charAt(0) === '{{') {{
try {{
var record = JSON.parse(trimmed);
return String(record.url || '').trim();
}} catch (error) {{
return '';
}}
}}
return trimmed;
}}
function escapeRegex(text) {{
return String(text || '').replace(/[.*+?^${{}}()|[\\]\\\\]/g, '\\\\$&');
}}
function buildHostRegex(domains) {{
if (!domains.length) {{
return '';
}}
return '^https?://(' + domains.map(escapeRegex).join('|') + ')([:/]|$)';
}}
function getConfigEditorRows() {{
return document.getElementById('id_config_rows');
}}
function getConfigUpdater() {{
return window.updateHiddenField_id_config || null;
}}
function findConfigRow(key) {{
var rows = getConfigEditorRows();
if (!rows) {{
return null;
}}
var matches = Array.prototype.filter.call(rows.querySelectorAll('.key-value-row'), function(row) {{
var keyInput = row.querySelector('.kv-key');
return keyInput && keyInput.value.trim() === key;
}});
return matches.length ? matches[0] : null;
}}
function addConfigRow() {{
if (typeof window.addKeyValueRow_id_config === 'function') {{
window.addKeyValueRow_id_config();
var rows = getConfigEditorRows();
return rows ? rows.lastElementChild : null;
}}
return null;
}}
function setConfigRow(key, value) {{
var rows = getConfigEditorRows();
var updater = getConfigUpdater();
if (!rows || !updater) {{
return;
}}
var row = findConfigRow(key);
if (!value) {{
if (row) {{
row.remove();
updater();
}}
return;
}}
if (!row) {{
row = addConfigRow();
}}
if (!row) {{
return;
}}
var keyInput = row.querySelector('.kv-key');
var valueInput = row.querySelector('.kv-value');
if (!keyInput || !valueInput) {{
return;
}}
keyInput.value = key;
valueInput.value = value;
keyInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
valueInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
updater();
}}
function syncConfigEditor() {{
setConfigRow('URL_ALLOWLIST', allowlistField.value.trim());
setConfigRow('URL_DENYLIST', denylistField ? denylistField.value.trim() : '');
}}
function syncAllowlistFromUrls() {{
if (!sameDomainOnly.checked) {{
if (allowlistField.value.trim() === lastAutoGeneratedAllowlist) {{
allowlistField.value = '';
syncConfigEditor();
}}
lastAutoGeneratedAllowlist = '';
return;
}}
var seen = Object.create(null);
var domains = [];
sourceField.value.split(/\\n+/).forEach(function(line) {{
var url = extractUrl(line);
if (!url) {{
return;
}}
try {{
var parsed = new URL(url);
var domain = String(parsed.hostname || '').toLowerCase();
if (!domain || seen[domain]) {{
return;
}}
seen[domain] = true;
domains.push(domain);
}} catch (error) {{
return;
}}
}});
lastAutoGeneratedAllowlist = buildHostRegex(domains);
allowlistField.value = lastAutoGeneratedAllowlist;
syncConfigEditor();
}}
sameDomainOnly.addEventListener('change', syncAllowlistFromUrls);
sourceField.addEventListener('input', syncAllowlistFromUrls);
sourceField.addEventListener('change', syncAllowlistFromUrls);
allowlistField.addEventListener('input', syncConfigEditor);
allowlistField.addEventListener('change', syncConfigEditor);
if (denylistField) {{
denylistField.addEventListener('input', syncConfigEditor);
denylistField.addEventListener('change', syncConfigEditor);
}}
if (document.readyState === 'loading') {{
document.addEventListener('DOMContentLoaded', syncConfigEditor, {{ once: true }});
}} else {{
syncConfigEditor();
}}
}})();
</script>
</div>
''')
def value_from_datadict(self, data, files, name):
return {
'allowlist': data.get(f'{name}_allowlist', ''),
'denylist': data.get(f'{name}_denylist', ''),
'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'),
}
class InlineTagEditorWidget(TagEditorWidget):
"""
Inline version of TagEditorWidget for use in list views.
Includes AJAX save functionality for immediate persistence.
"""
def __init__(self, attrs=None, snapshot_id=None):
def __init__(self, attrs=None, snapshot_id=None, editable=True):
super().__init__(attrs, snapshot_id)
self.snapshot_id = snapshot_id
self.editable = editable
def render(self, name, value, attrs=None, renderer=None, snapshot_id=None):
"""Render inline tag editor with AJAX save."""
@@ -361,20 +625,24 @@ class InlineTagEditorWidget(TagEditorWidget):
# Build pills HTML with filter links
pills_html = ''
for td in tag_data:
remove_button = ''
if self.editable:
remove_button = (
f'<button type="button" class="tag-remove-btn" '
f'data-tag-id="{td["id"]}" data-tag-name="{self._escape(td["name"])}">&times;</button>'
)
pills_html += f'''
<span class="tag-pill" data-tag="{self._escape(td['name'])}" data-tag-id="{td['id']}" style="{self._tag_style(td['name'])}">
<a href="/admin/core/snapshot/?tags__id__exact={td['id']}" class="tag-link">{self._escape(td['name'])}</a>
<button type="button" class="tag-remove-btn" data-tag-id="{td['id']}" data-tag-name="{self._escape(td['name'])}">&times;</button>
{remove_button}
</span>
'''
tags_json = escape(json.dumps(tag_data))
html = f'''
<span id="{widget_id}_container" class="tag-editor-inline" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}">
<span id="{widget_id}_pills" class="tag-pills-inline">
{pills_html}
</span>
input_html = ''
readonly_class = ' readonly' if not self.editable else ''
if self.editable:
input_html = f'''
<input type="text"
id="{widget_id}_input"
class="tag-inline-input-sm"
@@ -384,6 +652,14 @@ class InlineTagEditorWidget(TagEditorWidget):
data-inline-tag-input="1"
>
<datalist id="{widget_id}_datalist"></datalist>
'''
html = f'''
<span id="{widget_id}_container" class="tag-editor-inline{readonly_class}" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}" data-readonly="{int(not self.editable)}">
<span id="{widget_id}_pills" class="tag-pills-inline">
{pills_html}
</span>
{input_html}
</span>
'''