mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
WIP: checkpoint working tree before rebasing onto dev
This commit is contained in:
@@ -1,14 +1,23 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
import shlex
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
from functools import reduce
|
||||
from operator import and_
|
||||
|
||||
from django.contrib import admin
|
||||
from django.db.models import Min, Q, TextField
|
||||
from django.db.models.functions import Cast
|
||||
from django.utils.html import format_html
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.core.exceptions import ValidationError
|
||||
from django.urls import reverse, resolve
|
||||
from django.utils import timezone
|
||||
from django.utils.text import smart_split
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
@@ -16,11 +25,71 @@ from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
from archivebox.hooks import get_plugin_icon
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
from archivebox.core.widgets import InlineTagEditorWidget
|
||||
from archivebox.core.views import LIVE_PLUGIN_BASE_URL
|
||||
|
||||
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
|
||||
|
||||
def _stringify_env_value(value) -> str:
|
||||
if value is None:
|
||||
return ''
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
return json.dumps(value, separators=(',', ':'))
|
||||
|
||||
|
||||
def _quote_shell_string(value: str) -> str:
|
||||
return "'" + str(value).replace("'", "'\"'\"'") + "'"
|
||||
|
||||
|
||||
def _get_replay_source_url(result: ArchiveResult) -> str:
    """Return the URL that was originally archived for *result*.

    Prefers the SOURCE_URL recorded in the process env (if any), then falls
    back to the snapshot's URL, then to an empty string.
    """
    process = getattr(result, 'process', None)
    env = getattr(process, 'env', None) or {}
    return str(env.get('SOURCE_URL') or result.snapshot.url or '')
|
||||
|
||||
|
||||
def build_abx_dl_display_command(result: ArchiveResult) -> str:
    """Build the human-readable `abx-dl` command line shown in the admin UI.

    Degrades gracefully when the result lacks a plugin name and/or a source
    URL, always returning at least the bare `abx-dl` invocation.
    """
    url = _get_replay_source_url(result)
    plugin = str(result.plugin or '').strip()

    if not url:
        # No URL available: show just the binary, or binary + plugin filter.
        return f'abx-dl --plugins={plugin}' if plugin else 'abx-dl'
    return f'abx-dl --plugins={plugin} {_quote_shell_string(url)}'
|
||||
|
||||
|
||||
def build_abx_dl_replay_command(result: ArchiveResult) -> str:
    """Build a fully-reproducible shell command for re-running *result*.

    Prefixes the display command with a `cd` into the snapshot directory and
    an `env` invocation restoring every recorded (non-None) env var, sorted
    by key so output is deterministic.
    """
    base_cmd = build_abx_dl_display_command(result)
    proc = getattr(result, 'process', None)
    recorded_env = getattr(proc, 'env', None) or {}

    env_prefix = ' '.join(
        f'{name}={shlex.quote(_stringify_env_value(val))}'
        for name, val in sorted(recorded_env.items())
        if val is not None
    )
    workdir = shlex.quote(str(result.snapshot_dir))

    if not env_prefix:
        return f'cd {workdir}; {base_cmd}'
    return f'cd {workdir}; env {env_prefix} {base_cmd}'
|
||||
|
||||
|
||||
def get_plugin_admin_url(plugin_name: str) -> str:
    """Return the live-plugin admin URL for *plugin_name*.

    Resolves the plugin's on-disk directory to decide whether it lives under
    the builtin or user plugin tree; unknown plugins default to the builtin
    URL scheme.
    """
    from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, iter_plugin_dirs

    matching_dir = None
    for candidate in iter_plugin_dirs():
        if candidate.name == plugin_name:
            matching_dir = candidate.resolve()
            break

    if matching_dir is not None:
        if matching_dir.is_relative_to(BUILTIN_PLUGINS_DIR.resolve()):
            return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'

        if matching_dir.is_relative_to(USER_PLUGINS_DIR.resolve()):
            return f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/'

    # Fallback: assume builtin when the plugin dir can't be located.
    return f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/'
|
||||
|
||||
|
||||
def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
"""Render a nice inline list view of archive results with status, plugin, output, and actions."""
|
||||
|
||||
@@ -35,6 +104,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
'failed': ('#991b1b', '#fee2e2'), # red
|
||||
'queued': ('#6b7280', '#f3f4f6'), # gray
|
||||
'started': ('#92400e', '#fef3c7'), # amber
|
||||
'backoff': ('#92400e', '#fef3c7'),
|
||||
'skipped': ('#475569', '#f1f5f9'),
|
||||
'noresults': ('#475569', '#f1f5f9'),
|
||||
}
|
||||
|
||||
rows = []
|
||||
@@ -54,8 +126,10 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
if len(full_output) > 60:
|
||||
output_display += '...'
|
||||
|
||||
# Get full command as tooltip
|
||||
cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
|
||||
display_cmd = build_abx_dl_display_command(result)
|
||||
replay_cmd = build_abx_dl_replay_command(result)
|
||||
cmd_str_escaped = html.escape(display_cmd)
|
||||
cmd_attr = html.escape(replay_cmd, quote=True)
|
||||
|
||||
# Build output link - use embed_path() which checks output_files first
|
||||
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||
@@ -77,7 +151,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
|
||||
style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 11px;"
|
||||
title="View/edit archive result">
|
||||
<code>{str(result.id)[:8]}</code>
|
||||
<code>{str(result.id)[-8:]}</code>
|
||||
</a>
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap;">
|
||||
@@ -140,7 +214,15 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<div style="font-size: 11px; color: #64748b; margin-top: 8px;">
|
||||
<b>Command:</b>
|
||||
</div>
|
||||
<pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_str}</pre>
|
||||
<div style="position: relative; margin: 0; padding: 8px 56px 8px 8px; background: #1e293b; border-radius: 4px;">
|
||||
<button type="button"
|
||||
data-command="{cmd_attr}"
|
||||
onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
|
||||
style="position: absolute; top: 6px; right: 6px; padding: 2px 8px; border: 0; border-radius: 4px; background: #334155; color: #e2e8f0; font-size: 11px; cursor: pointer;">
|
||||
Copy
|
||||
</button>
|
||||
<code title="{cmd_attr}" style="display: block; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; color: #e2e8f0; font-size: 11px;">{cmd_str_escaped}</code>
|
||||
</div>
|
||||
</div>
|
||||
</details>
|
||||
</td>
|
||||
@@ -165,7 +247,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<table style="width: 100%; border-collapse: collapse; font-size: 14px;">
|
||||
<thead>
|
||||
<tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">ID</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Details</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Plugin</th>
|
||||
@@ -193,7 +275,7 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
extra = 0
|
||||
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
|
||||
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'output_str')
|
||||
# exclude = ('id',)
|
||||
ordering = ('end_ts',)
|
||||
show_change_link = True
|
||||
@@ -259,10 +341,11 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
|
||||
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
|
||||
list_display = ('details_link', 'created_at', 'snapshot_info', 'tags_inline', 'status_badge', 'plugin_with_icon', 'process_link', 'machine_link', 'cmd_str', 'output_str_display')
|
||||
list_display_links = None
|
||||
sort_fields = ('id', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
readonly_fields = ('cmd', 'cmd_version', 'pwd', 'cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process_link')
|
||||
search_fields = ()
|
||||
autocomplete_fields = ['snapshot']
|
||||
|
||||
fieldsets = (
|
||||
@@ -271,7 +354,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Plugin', {
|
||||
'fields': ('plugin', 'plugin_with_icon', 'status', 'retry_at'),
|
||||
'fields': ('plugin_with_icon', 'process_link', 'status'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timing', {
|
||||
@@ -305,8 +388,61 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
self.request = request
|
||||
return super().change_view(request, object_id, form_url, extra_context)
|
||||
|
||||
def get_queryset(self, request):
    """Base queryset with snapshot/process joins and tag prefetching.

    Annotates each row with its snapshot's first tag name so the tags
    column can be sorted server-side.
    """
    qs = super().get_queryset(request)
    qs = qs.select_related('snapshot', 'process')
    qs = qs.prefetch_related('snapshot__tags')
    return qs.annotate(snapshot_first_tag=Min('snapshot__tags__name'))
|
||||
|
||||
def get_search_results(self, request, queryset, search_term):
    """Custom multi-field admin search across snapshot/plugin/output/cmd columns.

    Each whitespace-separated term (surrounding quotes respected) must match
    at least one searchable field; all terms are ANDed together. Returns the
    (queryset, may_have_duplicates) pair the Django admin expects.
    """
    if not search_term:
        return queryset, False

    # UUID/JSON columns are cast to text so __icontains works on them.
    queryset = queryset.annotate(
        snapshot_id_text=Cast('snapshot__id', output_field=TextField()),
        snapshot_crawl_id_text=Cast('snapshot__crawl_id', output_field=TextField()),
        output_json_text=Cast('output_json', output_field=TextField()),
        cmd_text=Cast('process__cmd', output_field=TextField()),
    )

    def _unquote(token):
        # smart_split keeps surrounding quotes; strip a matched pair.
        if len(token) >= 2 and token[0] == token[-1] and token[0] in {'"', "'"}:
            return token[1:-1]
        return token

    terms = [t for t in (_unquote(raw).strip() for raw in smart_split(search_term)) if t]
    if not terms:
        return queryset, False

    combined = reduce(and_, (
        Q(snapshot_id_text__icontains=term)
        | Q(snapshot__url__icontains=term)
        | Q(snapshot__tags__name__icontains=term)
        | Q(snapshot_crawl_id_text__icontains=term)
        | Q(plugin__icontains=term)
        | Q(hook_name__icontains=term)
        | Q(output_str__icontains=term)
        | Q(output_json_text__icontains=term)
        | Q(cmd_text__icontains=term)
        for term in terms
    ))

    # distinct() because the tags join can yield duplicate rows.
    return queryset.filter(combined).distinct(), True
|
||||
|
||||
@admin.display(description='Details', ordering='id')
def details_link(self, result):
    """Link to the ArchiveResult change page, labelled with the ID's last 8 chars."""
    change_url = reverse('admin:core_archiveresult_change', args=[result.id])
    short_id = str(result.id)[-8:]
    return format_html('<a href="{}"><code>{}</code></a>', change_url, short_id)
|
||||
|
||||
@admin.display(
|
||||
description='Snapshot Info'
|
||||
description='Snapshot',
|
||||
ordering='snapshot__url',
|
||||
)
|
||||
def snapshot_info(self, result):
|
||||
snapshot_id = str(result.snapshot_id)
|
||||
@@ -325,20 +461,83 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
def tags_str(self, result):
    """Delegate to the parent snapshot's tags_str()."""
    snapshot = result.snapshot
    return snapshot.tags_str()
|
||||
|
||||
@admin.display(description='Tags', ordering='snapshot_first_tag')
def tags_inline(self, result):
    """Render a read-only inline tag editor widget for the result's snapshot."""
    snapshot_id = str(result.snapshot_id)
    widget = InlineTagEditorWidget(snapshot_id=snapshot_id, editable=False)
    rendered = widget.render(
        name=f'tags_{result.snapshot_id}',
        value=result.snapshot.tags.all(),
        attrs={'id': f'tags_{result.snapshot_id}'},
        snapshot_id=snapshot_id,
    )
    return mark_safe(f'<span class="tags-inline-editor">{rendered}</span>')
|
||||
|
||||
@admin.display(description='Status', ordering='status')
def status_badge(self, result):
    """Render a styled status badge; missing statuses fall back to 'queued'."""
    status = result.status or ArchiveResult.StatusChoices.QUEUED
    label = result.get_status_display() or status
    return format_html(
        '<span class="status-badge {} status-{}">{}</span>',
        status,
        status,
        label,
    )
|
||||
|
||||
@admin.display(description='Plugin', ordering='plugin')
|
||||
def plugin_with_icon(self, result):
|
||||
icon = get_plugin_icon(result.plugin)
|
||||
return format_html(
|
||||
'<span title="{}">{}</span> {}',
|
||||
'<a href="{}" title="{}">{}</a> <a href="{}"><code>{}</code></a>',
|
||||
get_plugin_admin_url(result.plugin),
|
||||
result.plugin,
|
||||
icon,
|
||||
get_plugin_admin_url(result.plugin),
|
||||
result.plugin,
|
||||
)
|
||||
|
||||
def cmd_str(self, result):
|
||||
@admin.display(description='Process', ordering='process__pid')
|
||||
def process_link(self, result):
|
||||
if not result.process_id:
|
||||
return '-'
|
||||
process_label = result.process.pid if result.process and result.process.pid else '-'
|
||||
return format_html(
|
||||
'<pre>{}</pre>',
|
||||
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
|
||||
'<a href="{}"><code>{}</code></a>',
|
||||
reverse('admin:machine_process_change', args=[result.process_id]),
|
||||
process_label,
|
||||
)
|
||||
|
||||
@admin.display(description='Machine', ordering='process__machine__hostname')
def machine_link(self, result):
    """Link to the machine this result's process ran on, or '-' when unknown."""
    has_machine = result.process_id and result.process and result.process.machine_id
    if not has_machine:
        return '-'

    machine = result.process.machine
    return format_html(
        '<a href="{}"><code>{}</code> {}</a>',
        reverse('admin:machine_machine_change', args=[machine.id]),
        str(machine.id)[:8],
        machine.hostname,
    )
|
||||
|
||||
@admin.display(description='Command')
def cmd_str(self, result):
    # Renders a fixed-width (300px) cell showing the short display command;
    # the full replay command (cd + env restore) is exposed via the tooltip
    # and a copy-to-clipboard button.
    display_cmd = build_abx_dl_display_command(result)
    replay_cmd = build_abx_dl_replay_command(result)
    return format_html(
        '''
        <div style="position: relative; width: 300px; min-width: 300px; max-width: 300px; overflow: hidden; box-sizing: border-box;">
            <button type="button"
                    data-command="{}"
                    onclick="(function(btn){{var text=btn.dataset.command||''; if(navigator.clipboard&&navigator.clipboard.writeText){{navigator.clipboard.writeText(text);}} else {{var ta=document.createElement('textarea'); ta.value=text; document.body.appendChild(ta); ta.select(); document.execCommand('copy'); document.body.removeChild(ta);}}}})(this); return false;"
                    style="position: absolute; top: 6px; right: 6px; z-index: 1; padding: 2px 8px; border: 0; border-radius: 4px; background: #e2e8f0; color: #334155; font-size: 11px; cursor: pointer;">
                Copy
            </button>
            <code title="{}" style="display: block; width: 100%; max-width: 100%; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; padding: 8px 56px 8px 8px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; font-size: 11px; box-sizing: border-box;">
                {}
            </code>
        </div>
        ''',
        replay_cmd,   # copied to the clipboard by the button
        replay_cmd,   # shown as the hover tooltip
        display_cmd,  # visible (truncated) cell text
    )
|
||||
|
||||
def output_display(self, result):
|
||||
@@ -352,6 +551,27 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
result.output_str,
|
||||
)
|
||||
|
||||
@admin.display(description='Output', ordering='output_str')
def output_str_display(self, result):
    """Show the result's output string, linked to its live file when embeddable."""
    text = str(result.output_str or '').strip()
    if not text:
        return '-'

    embed = result.embed_path() if hasattr(result, 'embed_path') else None
    if not embed:
        # No embeddable file: plain text with a full-value tooltip.
        return format_html(
            '<span title="{}">{}</span>',
            text,
            text,
        )

    url = build_snapshot_url(str(result.snapshot_id), embed)
    return format_html(
        '<a href="{}" title="{}"><code>{}</code></a>',
        url,
        text,
        text,
    )
|
||||
|
||||
def output_summary(self, result):
|
||||
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
|
||||
output_html = format_html(
|
||||
|
||||
@@ -61,12 +61,14 @@ def register_admin_site():
|
||||
from archivebox.crawls.admin import register_admin as register_crawls_admin
|
||||
from archivebox.api.admin import register_admin as register_api_admin
|
||||
from archivebox.machine.admin import register_admin as register_machine_admin
|
||||
from archivebox.personas.admin import register_admin as register_personas_admin
|
||||
from archivebox.workers.admin import register_admin as register_workers_admin
|
||||
|
||||
register_core_admin(archivebox_admin)
|
||||
register_crawls_admin(archivebox_admin)
|
||||
register_api_admin(archivebox_admin)
|
||||
register_machine_admin(archivebox_admin)
|
||||
register_personas_admin(archivebox_admin)
|
||||
register_workers_admin(archivebox_admin)
|
||||
|
||||
return archivebox_admin
|
||||
|
||||
@@ -6,6 +6,7 @@ from pathlib import Path
|
||||
|
||||
from django.contrib import admin, messages
|
||||
from django.urls import path
|
||||
from django.shortcuts import get_object_or_404, redirect
|
||||
from django.utils.html import format_html
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.utils import timezone
|
||||
@@ -14,6 +15,7 @@ from django.db.models.functions import Coalesce
|
||||
from django import forms
|
||||
from django.template import Template, RequestContext
|
||||
from django.contrib.admin.helpers import ActionForm
|
||||
from django.middleware.csrf import get_token
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
@@ -24,7 +26,7 @@ from archivebox.search.admin import SearchResultsAdminMixin
|
||||
from archivebox.core.host_utils import build_snapshot_url, build_web_url
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
||||
from archivebox.workers.tasks import bg_archive_snapshot, bg_archive_snapshots, bg_add
|
||||
|
||||
from archivebox.core.models import Tag, Snapshot, ArchiveResult
|
||||
from archivebox.core.admin_archiveresults import render_archiveresults_list
|
||||
@@ -215,10 +217,23 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
def get_urls(self):
|
||||
urls = super().get_urls()
|
||||
custom_urls = [
|
||||
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
|
||||
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid'),
|
||||
path('<path:object_id>/redo-failed/', self.admin_site.admin_view(self.redo_failed_view), name='core_snapshot_redo_failed'),
|
||||
]
|
||||
return custom_urls + urls
|
||||
|
||||
def redo_failed_view(self, request, object_id):
    """Queue a snapshot for re-archiving on POST, then redirect back.

    Non-POST requests simply redirect to the snapshot's change page
    without queuing anything.
    """
    snapshot = get_object_or_404(Snapshot, pk=object_id)

    if request.method == 'POST':
        queued_count = bg_archive_snapshot(snapshot, overwrite=False)
        messages.success(
            request,
            f"Queued {queued_count} snapshot for re-archiving. The background runner will process it.",
        )

    return redirect(snapshot.admin_change_url)
|
||||
|
||||
# def get_queryset(self, request):
|
||||
# # tags_qs = SnapshotTag.objects.all().select_related('tag')
|
||||
# # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
|
||||
@@ -312,6 +327,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
def admin_actions(self, obj):
|
||||
summary_url = build_web_url(f'/{obj.archive_path}')
|
||||
results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
|
||||
redo_failed_url = f'/admin/core/snapshot/{obj.pk}/redo-failed/'
|
||||
csrf_token = get_token(self.request)
|
||||
return format_html(
|
||||
'''
|
||||
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
|
||||
@@ -344,13 +361,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
onmouseout="this.style.background='#eff6ff';">
|
||||
🆕 Archive Now
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Redo failed extractors (missing outputs)"
|
||||
onmouseover="this.style.background='#d1fae5';"
|
||||
onmouseout="this.style.background='#ecfdf5';">
|
||||
🔁 Redo Failed
|
||||
</a>
|
||||
<form action="{}" method="post" style="display: inline-flex; margin: 0;">
|
||||
<input type="hidden" name="csrfmiddlewaretoken" value="{}">
|
||||
<button type="submit" class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s; cursor: pointer;"
|
||||
title="Redo failed extractors (missing outputs)"
|
||||
onmouseover="this.style.background='#d1fae5';"
|
||||
onmouseout="this.style.background='#ecfdf5';">
|
||||
🔁 Redo Failed
|
||||
</button>
|
||||
</form>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Re-run all extractors (overwrite existing)"
|
||||
@@ -367,14 +386,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
</a>
|
||||
</div>
|
||||
<p style="margin-top: 12px; font-size: 12px; color: #64748b;">
|
||||
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
|
||||
<b>Tip:</b> Redo Failed runs immediately. The other action buttons link to the list view with this snapshot pre-selected.
|
||||
</p>
|
||||
''',
|
||||
summary_url,
|
||||
results_url,
|
||||
obj.url,
|
||||
obj.pk,
|
||||
obj.pk,
|
||||
redo_failed_url,
|
||||
csrf_token,
|
||||
obj.pk,
|
||||
obj.pk,
|
||||
)
|
||||
|
||||
@@ -1,63 +1,74 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from django.contrib import admin
|
||||
from urllib.parse import quote
|
||||
|
||||
from django import forms
|
||||
from django.contrib import admin, messages
|
||||
from django.contrib.admin.options import IS_POPUP_VAR
|
||||
from django.http import HttpRequest, HttpResponseRedirect
|
||||
from django.urls import reverse
|
||||
from django.utils.html import format_html
|
||||
from django.utils.safestring import mark_safe
|
||||
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
|
||||
from archivebox.core.models import SnapshotTag, Tag
|
||||
from archivebox.core.tag_utils import (
|
||||
TAG_HAS_SNAPSHOTS_CHOICES,
|
||||
TAG_SORT_CHOICES,
|
||||
build_tag_cards,
|
||||
get_tag_creator_choices,
|
||||
get_tag_year_choices,
|
||||
normalize_created_by_filter,
|
||||
normalize_created_year_filter,
|
||||
normalize_has_snapshots_filter,
|
||||
normalize_tag_sort,
|
||||
)
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
|
||||
|
||||
class TagInline(admin.TabularInline):
|
||||
model = SnapshotTag
|
||||
# fk_name = 'snapshot'
|
||||
fields = ('id', 'tag')
|
||||
extra = 1
|
||||
# min_num = 1
|
||||
max_num = 1000
|
||||
autocomplete_fields = (
|
||||
'tag',
|
||||
)
|
||||
|
||||
|
||||
# class AutocompleteTags:
|
||||
# model = Tag
|
||||
# search_fields = ['name']
|
||||
# name = 'name'
|
||||
# # source_field = 'name'
|
||||
# remote_field = Tag._meta.get_field('name')
|
||||
|
||||
# class AutocompleteTagsAdminStub:
|
||||
# name = 'admin'
|
||||
|
||||
|
||||
# class TaggedItemInline(admin.TabularInline):
|
||||
# readonly_fields = ('object_link',)
|
||||
# fields = ('id', 'tag', 'content_type', 'object_id', *readonly_fields)
|
||||
# model = TaggedItem
|
||||
# extra = 1
|
||||
# show_change_link = True
|
||||
|
||||
# @admin.display(description='object')
|
||||
# def object_link(self, obj):
|
||||
# obj = obj.content_type.get_object_for_this_type(pk=obj.object_id)
|
||||
# return format_html('<a href="/admin/{}/{}/{}/change"><b>[{}]</b></a>', obj._meta.app_label, obj._meta.model_name, obj.pk, str(obj))
|
||||
class TagAdminForm(forms.ModelForm):
    """ModelForm for Tag with a friendlier name input and required-name validation."""

    class Meta:
        model = Tag
        fields = '__all__'
        widgets = {
            'name': forms.TextInput(attrs={
                'placeholder': 'research, receipts, product-design...',
                'autocomplete': 'off',
                'spellcheck': 'false',
                'data-tag-name-input': '1',
            }),
        }

    def clean_name(self):
        """Strip surrounding whitespace and reject empty tag names."""
        raw = self.cleaned_data.get('name')
        cleaned = (raw or '').strip()
        if not cleaned:
            raise forms.ValidationError('Tag name is required.')
        return cleaned
|
||||
|
||||
|
||||
|
||||
class TagAdmin(BaseModelAdmin):
|
||||
list_display = ('created_at', 'created_by', 'id', 'name', 'num_snapshots', 'snapshots')
|
||||
form = TagAdminForm
|
||||
change_list_template = 'admin/core/tag/change_list.html'
|
||||
change_form_template = 'admin/core/tag/change_form.html'
|
||||
list_display = ('name', 'num_snapshots', 'created_at', 'created_by')
|
||||
list_filter = ('created_at', 'created_by')
|
||||
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
|
||||
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
|
||||
search_fields = ('id', 'name', 'slug')
|
||||
actions = ['delete_selected', 'merge_tags']
|
||||
ordering = ['-created_at']
|
||||
# inlines = [TaggedItemInline]
|
||||
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
|
||||
actions = ['delete_selected']
|
||||
ordering = ['name', 'id']
|
||||
|
||||
fieldsets = (
|
||||
('Tag Info', {
|
||||
('Tag', {
|
||||
'fields': ('name', 'slug'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
@@ -65,112 +76,137 @@ class TagAdmin(BaseModelAdmin):
|
||||
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Snapshots', {
|
||||
('Recent Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
)
|
||||
|
||||
paginator = AccelleratedPaginator
|
||||
add_fieldsets = (
|
||||
('Tag', {
|
||||
'fields': ('name',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
def get_fieldsets(self, request: HttpRequest, obj: Tag | None = None):
    """Full fieldsets on the change form; trimmed add_fieldsets on the add form."""
    if obj:
        return self.fieldsets
    return self.add_fieldsets
|
||||
|
||||
def num_snapshots(self, tag):
|
||||
def changelist_view(self, request: HttpRequest, extra_context=None):
    """Tag changelist with custom card-grid context (query, sort, filters).

    All filter params are read from the querystring and normalized through
    the tag_utils helpers so the template always receives valid values.
    """
    query = (request.GET.get('q') or '').strip()
    sort = normalize_tag_sort((request.GET.get('sort') or 'created_desc').strip())
    created_by = normalize_created_by_filter((request.GET.get('created_by') or '').strip())
    year = normalize_created_year_filter((request.GET.get('year') or '').strip())
    has_snapshots = normalize_has_snapshots_filter((request.GET.get('has_snapshots') or 'all').strip())
    extra_context = {
        **(extra_context or {}),
        # Echo the active filters back to the template.
        'initial_query': query,
        'initial_sort': sort,
        'initial_created_by': created_by,
        'initial_year': year,
        'initial_has_snapshots': has_snapshots,
        # Choice lists for the filter dropdowns.
        'tag_sort_choices': TAG_SORT_CHOICES,
        'tag_has_snapshots_choices': TAG_HAS_SNAPSHOTS_CHOICES,
        'tag_created_by_choices': get_tag_creator_choices(),
        'tag_year_choices': get_tag_year_choices(),
        # Pre-rendered tag cards for the initial page load.
        'initial_tag_cards': build_tag_cards(
            query=query,
            request=request,
            sort=sort,
            created_by=created_by,
            year=year,
            has_snapshots=has_snapshots,
        ),
        # API endpoints used by the client-side search/create UI.
        'tag_search_api_url': reverse('api-1:search_tags'),
        'tag_create_api_url': reverse('api-1:tags_create'),
    }
    return super().changelist_view(request, extra_context=extra_context)
|
||||
|
||||
def render_change_form(self, request, context, add=False, change=False, form_url='', obj=None):
    """Inject similar-tag suggestion cards into the tag add/change form context."""
    # Prefer the name being typed in the POSTed form; fall back to the saved name.
    name = (request.POST.get('name') or '').strip()
    if obj and not name:
        name = obj.name

    if name:
        cards = build_tag_cards(query=name, request=request, limit=12)
    else:
        cards = build_tag_cards(request=request, limit=12)
    if obj:
        # Don't suggest the tag currently being edited as "similar" to itself.
        cards = [card for card in cards if card['id'] != obj.pk]

    context.update({
        'tag_search_api_url': reverse('api-1:search_tags'),
        'tag_similar_cards': cards,
        'tag_similar_query': name,
    })
    return super().render_change_form(request, context, add=add, change=change, form_url=form_url, obj=obj)
|
||||
|
||||
def response_add(self, request: HttpRequest, obj: Tag, post_url_continue=None):
    """After adding a tag, jump to the changelist filtered to the new tag's name.

    Popup and save-and-continue/add-another flows keep Django's default behavior.
    """
    keep_default = (
        IS_POPUP_VAR in request.POST
        or '_continue' in request.POST
        or '_addanother' in request.POST
    )
    if keep_default:
        return super().response_add(request, obj, post_url_continue=post_url_continue)

    self.message_user(request, f'Tag "{obj.name}" saved.', level=messages.SUCCESS)
    return self._redirect_to_changelist(obj.name)
|
||||
|
||||
def response_change(self, request: HttpRequest, obj: Tag):
    """After editing a tag, jump to the changelist filtered to the tag's name.

    Popup, save-and-continue, add-another, and save-as-new flows keep
    Django's default behavior.
    """
    keep_default = (
        IS_POPUP_VAR in request.POST
        or '_continue' in request.POST
        or '_addanother' in request.POST
        or '_saveasnew' in request.POST
    )
    if keep_default:
        return super().response_change(request, obj)

    self.message_user(request, f'Tag "{obj.name}" updated.', level=messages.SUCCESS)
    return self._redirect_to_changelist(obj.name)
|
||||
|
||||
def _redirect_to_changelist(self, query: str = '') -> HttpResponseRedirect:
    """Redirect to the tag changelist, optionally pre-filtered by *query*."""
    url = reverse('admin:core_tag_changelist')
    if not query:
        return HttpResponseRedirect(url)
    return HttpResponseRedirect(f'{url}?q={quote(query)}')
|
||||
|
||||
@admin.display(description='Snapshots')
def snapshots(self, tag: Tag):
    """Render up to 10 recent snapshots using this tag as linked cards.

    Falls back to a short hint (with a filtered-list link) when the tag is
    unused, and always appends a "view all" link with the total count.
    """
    snapshots = tag.snapshot_set.select_related('crawl__created_by').order_by('-downloaded_at', '-created_at', '-pk')[:10]
    total_count = tag.snapshot_set.count()
    if not snapshots:
        return mark_safe(
            f'<p style="margin:0;color:#64748b;">No snapshots use this tag yet. '
            f'<a href="/admin/core/snapshot/?tags__id__exact={tag.id}">Open filtered snapshot list</a>.</p>'
        )

    cards = []
    for snapshot in snapshots:
        # Prefer the page title; fall back to the URL when the title is blank.
        title = (snapshot.title or '').strip() or snapshot.url
        cards.append(format_html(
            '''
            <a href="{}" style="display:flex;align-items:center;gap:10px;padding:10px 12px;border:1px solid #e2e8f0;border-radius:12px;background:#fff;text-decoration:none;color:#0f172a;">
                <img src="{}" alt="" style="width:18px;height:18px;border-radius:4px;flex:0 0 auto;" onerror="this.style.display='none'">
                <span style="min-width:0;">
                    <strong style="display:block;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</strong>
                    <code style="display:block;color:#64748b;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">{}</code>
                </span>
            </a>
            ''',
            reverse('admin:core_snapshot_change', args=[snapshot.pk]),
            build_snapshot_url(str(snapshot.pk), 'favicon.ico'),
            title[:120],
            snapshot.url[:120],
        ))

    # Trailing card: link to the full filtered snapshot list with the total count.
    cards.append(format_html(
        '<a href="/admin/core/snapshot/?tags__id__exact={}" style="display:inline-flex;margin-top:10px;font-weight:600;">View all {} tagged snapshots</a>',
        tag.id,
        total_count,
    ))
    return mark_safe('<div style="display:grid;gap:10px;">' + ''.join(cards) + '</div>')
|
||||
|
||||
@admin.display(description='Snapshots', ordering='num_snapshots')
|
||||
def num_snapshots(self, tag: Tag):
|
||||
count = getattr(tag, 'num_snapshots', tag.snapshot_set.count())
|
||||
return format_html(
|
||||
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
|
||||
tag.id,
|
||||
tag.snapshot_set.count(),
|
||||
count,
|
||||
)
|
||||
|
||||
def snapshots(self, tag):
|
||||
total_count = tag.snapshot_set.count()
|
||||
return mark_safe('<br/>'.join(
|
||||
format_html(
|
||||
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> {}',
|
||||
snap.pk,
|
||||
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
|
||||
snap.url[:64],
|
||||
)
|
||||
for snap in tag.snapshot_set.order_by('-downloaded_at')[:10]
|
||||
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={tag.id}">{total_count} total snapshots...<a>'))
|
||||
|
||||
# def get_urls(self):
|
||||
# urls = super().get_urls()
|
||||
# custom_urls = [
|
||||
# path(
|
||||
# "merge-tags/",
|
||||
# self.admin_site.admin_view(self.merge_tags_view),
|
||||
# name="taggit_tag_merge_tags",
|
||||
# ),
|
||||
# ]
|
||||
# return custom_urls + urls
|
||||
|
||||
# @admin.action(description="Merge selected tags")
|
||||
# def merge_tags(self, request, queryset):
|
||||
# selected = request.POST.getlist(admin.helpers.ACTION_CHECKBOX_NAME)
|
||||
# if not selected:
|
||||
# self.message_user(request, "Please select at least one tag.")
|
||||
# return redirect(request.get_full_path())
|
||||
|
||||
# selected_tag_ids = ",".join(selected)
|
||||
# redirect_url = f"{request.get_full_path()}merge-tags/"
|
||||
|
||||
# request.session["selected_tag_ids"] = selected_tag_ids
|
||||
|
||||
# return redirect(redirect_url)
|
||||
|
||||
# def merge_tags_view(self, request):
|
||||
# selected_tag_ids = request.session.get("selected_tag_ids", "").split(",")
|
||||
# if request.method == "POST":
|
||||
# form = MergeTagsForm(request.POST)
|
||||
# if form.is_valid():
|
||||
# new_tag_name = form.cleaned_data["new_tag_name"]
|
||||
# new_tag, created = Tag.objects.get_or_create(name=new_tag_name)
|
||||
# with transaction.atomic():
|
||||
# for tag_id in selected_tag_ids:
|
||||
# tag = Tag.objects.get(id=tag_id)
|
||||
# tagged_items = TaggedItem.objects.filter(tag=tag)
|
||||
# for tagged_item in tagged_items:
|
||||
# if TaggedItem.objects.filter(
|
||||
# tag=new_tag,
|
||||
# content_type=tagged_item.content_type,
|
||||
# object_id=tagged_item.object_id,
|
||||
# ).exists():
|
||||
# # we have the new tag as well, so we can just
|
||||
# # remove the tag association
|
||||
# tagged_item.delete()
|
||||
# else:
|
||||
# # point this taggedItem to the new one
|
||||
# tagged_item.tag = new_tag
|
||||
# tagged_item.save()
|
||||
|
||||
# # delete the old tag
|
||||
# if tag.id != new_tag.id:
|
||||
# tag.delete()
|
||||
|
||||
# self.message_user(request, "Tags have been merged", level="success")
|
||||
# # clear the selected_tag_ids from session after merge is complete
|
||||
# request.session.pop("selected_tag_ids", None)
|
||||
|
||||
# return redirect("..")
|
||||
# else:
|
||||
# self.message_user(request, "Form is invalid.", level="error")
|
||||
|
||||
# context = {
|
||||
# "form": MergeTagsForm(),
|
||||
# "selected_tag_ids": selected_tag_ids,
|
||||
# }
|
||||
# return render(request, "admin/taggit/merge_tags_form.html", context)
|
||||
|
||||
|
||||
# @admin.register(SnapshotTag, site=archivebox_admin)
|
||||
# class SnapshotTagAdmin(BaseModelAdmin):
|
||||
# list_display = ('id', 'snapshot', 'tag')
|
||||
# sort_fields = ('id', 'snapshot', 'tag')
|
||||
# search_fields = ('id', 'snapshot_id', 'tag_id')
|
||||
# fields = ('snapshot', 'id')
|
||||
# actions = ['delete_selected']
|
||||
# ordering = ['-id']
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
admin_site.register(Tag, TagAdmin)
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
from django import forms
|
||||
from django.utils.html import format_html
|
||||
|
||||
from archivebox.misc.util import URL_REGEX
|
||||
from archivebox.misc.util import URL_REGEX, find_all_urls
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
from archivebox.base_models.admin import KeyValueWidget
|
||||
from archivebox.crawls.schedule_utils import validate_schedule
|
||||
from archivebox.hooks import get_plugins
|
||||
from archivebox.config.common import SEARCH_BACKEND_CONFIG
|
||||
from archivebox.core.widgets import TagEditorWidget, URLFiltersWidget
|
||||
from archivebox.hooks import get_plugins, discover_plugin_configs, get_plugin_icon
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
DEPTH_CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
@@ -22,6 +26,22 @@ def get_plugin_choices():
|
||||
return [(name, name) for name in get_plugins()]
|
||||
|
||||
|
||||
def get_plugin_choice_label(plugin_name: str, plugin_configs: dict[str, dict]) -> str:
|
||||
schema = plugin_configs.get(plugin_name, {})
|
||||
description = str(schema.get('description') or '').strip()
|
||||
if not description:
|
||||
return plugin_name
|
||||
icon_html = get_plugin_icon(plugin_name)
|
||||
|
||||
return format_html(
|
||||
'<span class="plugin-choice-icon">{}</span><span class="plugin-choice-name">{}</span><a class="plugin-choice-description" href="https://archivebox.github.io/abx-plugins/#{}" target="_blank" rel="noopener noreferrer">{}</a>',
|
||||
icon_html,
|
||||
plugin_name,
|
||||
plugin_name,
|
||||
description,
|
||||
)
|
||||
|
||||
|
||||
def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
|
||||
field = form.fields[name]
|
||||
if not isinstance(field, forms.ChoiceField):
|
||||
@@ -31,22 +51,19 @@ def get_choice_field(form: forms.Form, name: str) -> forms.ChoiceField:
|
||||
|
||||
class AddLinkForm(forms.Form):
|
||||
# Basic fields
|
||||
url = forms.RegexField(
|
||||
label="URLs (one per line)",
|
||||
regex=URL_REGEX,
|
||||
min_length=6,
|
||||
url = forms.CharField(
|
||||
label="URLs",
|
||||
strip=True,
|
||||
widget=forms.Textarea,
|
||||
widget=forms.Textarea(attrs={
|
||||
'data-url-regex': URL_REGEX.pattern,
|
||||
}),
|
||||
required=True
|
||||
)
|
||||
tag = forms.CharField(
|
||||
label="Tags (comma separated tag1,tag2,tag3)",
|
||||
label="Tags",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'list': 'tag-datalist',
|
||||
'autocomplete': 'off',
|
||||
})
|
||||
widget=TagEditorWidget(),
|
||||
)
|
||||
depth = forms.ChoiceField(
|
||||
label="Archive depth",
|
||||
@@ -58,11 +75,15 @@ class AddLinkForm(forms.Form):
|
||||
label="Notes",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.Textarea(attrs={
|
||||
'rows': 3,
|
||||
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
|
||||
widget=forms.TextInput(attrs={
|
||||
'placeholder': 'Optional notes about this crawl',
|
||||
})
|
||||
)
|
||||
url_filters = forms.Field(
|
||||
label="URL allowlist / denylist",
|
||||
required=False,
|
||||
widget=URLFiltersWidget(source_selector='textarea[name="url"]'),
|
||||
)
|
||||
|
||||
# Plugin groups
|
||||
chrome_plugins = forms.MultipleChoiceField(
|
||||
@@ -111,24 +132,15 @@ class AddLinkForm(forms.Form):
|
||||
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
|
||||
})
|
||||
)
|
||||
persona = forms.CharField(
|
||||
persona = forms.ModelChoiceField(
|
||||
label="Persona (authentication profile)",
|
||||
max_length=100,
|
||||
initial='Default',
|
||||
required=False,
|
||||
)
|
||||
overwrite = forms.BooleanField(
|
||||
label="Overwrite existing snapshots",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
update = forms.BooleanField(
|
||||
label="Update/retry previously failed URLs",
|
||||
initial=False,
|
||||
required=False,
|
||||
queryset=Persona.objects.none(),
|
||||
empty_label=None,
|
||||
to_field_name='name',
|
||||
)
|
||||
index_only = forms.BooleanField(
|
||||
label="Index only (don't archive yet)",
|
||||
label="Index only dry run (add crawl but don't archive yet)",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
@@ -142,11 +154,13 @@ class AddLinkForm(forms.Form):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# Import at runtime to avoid circular imports
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
default_persona = Persona.get_or_create_default()
|
||||
self.fields['persona'].queryset = Persona.objects.order_by('name')
|
||||
self.fields['persona'].initial = default_persona.name
|
||||
|
||||
# Get all plugins
|
||||
all_plugins = get_plugins()
|
||||
plugin_configs = discover_plugin_configs()
|
||||
|
||||
# Define plugin groups
|
||||
chrome_dependent = {
|
||||
@@ -170,26 +184,28 @@ class AddLinkForm(forms.Form):
|
||||
|
||||
# Populate plugin field choices
|
||||
get_choice_field(self, 'chrome_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in chrome_dependent
|
||||
]
|
||||
get_choice_field(self, 'archiving_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in archiving
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in archiving
|
||||
]
|
||||
get_choice_field(self, 'parsing_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in parsing
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in parsing
|
||||
]
|
||||
get_choice_field(self, 'search_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in search
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in search
|
||||
]
|
||||
get_choice_field(self, 'binary_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in binary
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in binary
|
||||
]
|
||||
get_choice_field(self, 'extension_plugins').choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in extensions
|
||||
(p, get_plugin_choice_label(p, plugin_configs)) for p in sorted(all_plugins) if p in extensions
|
||||
]
|
||||
|
||||
# Set update default from config
|
||||
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
|
||||
search_choices = [choice[0] for choice in get_choice_field(self, 'search_plugins').choices]
|
||||
if required_search_plugin in search_choices:
|
||||
get_choice_field(self, 'search_plugins').initial = [required_search_plugin]
|
||||
|
||||
def clean(self):
|
||||
cleaned_data = super().clean() or {}
|
||||
@@ -207,6 +223,23 @@ class AddLinkForm(forms.Form):
|
||||
|
||||
return cleaned_data
|
||||
|
||||
def clean_url(self):
|
||||
value = self.cleaned_data.get('url') or ''
|
||||
urls = '\n'.join(find_all_urls(value))
|
||||
if not urls:
|
||||
raise forms.ValidationError('Enter at least one valid URL.')
|
||||
return urls
|
||||
|
||||
def clean_url_filters(self):
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
value = self.cleaned_data.get('url_filters') or {}
|
||||
return {
|
||||
'allowlist': '\n'.join(Crawl.split_filter_patterns(value.get('allowlist', ''))),
|
||||
'denylist': '\n'.join(Crawl.split_filter_patterns(value.get('denylist', ''))),
|
||||
'same_domain_only': bool(value.get('same_domain_only')),
|
||||
}
|
||||
|
||||
def clean_schedule(self):
|
||||
schedule = (self.cleaned_data.get('schedule') or '').strip()
|
||||
if not schedule:
|
||||
|
||||
@@ -163,6 +163,10 @@ def get_api_base_url(request=None) -> str:
|
||||
return _build_base_url_for_host(get_api_host(), request=request)
|
||||
|
||||
|
||||
def get_public_base_url(request=None) -> str:
|
||||
return _build_base_url_for_host(get_public_host(), request=request)
|
||||
|
||||
|
||||
# Backwards-compat aliases (archive == web)
|
||||
def get_archive_base_url(request=None) -> str:
|
||||
return get_web_base_url(request=request)
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("core", "0031_add_archiveresult_snapshot_status_index"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name="archiveresult",
|
||||
name="retry_at",
|
||||
),
|
||||
]
|
||||
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
|
||||
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
|
||||
from archivebox.workers.tasks import bg_archive_snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.machine.models import NetworkInterface, Binary
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
|
||||
|
||||
@@ -60,32 +60,41 @@ class Tag(ModelWithUUID):
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
def _generate_unique_slug(self) -> str:
|
||||
base_slug = slugify(self.name) or 'tag'
|
||||
existing = Tag.objects.filter(slug__startswith=base_slug)
|
||||
if self.pk:
|
||||
existing = existing.exclude(pk=self.pk)
|
||||
existing_slugs = set(existing.values_list("slug", flat=True))
|
||||
|
||||
slug = base_slug
|
||||
i = 1
|
||||
while slug in existing_slugs:
|
||||
slug = f"{base_slug}_{i}"
|
||||
i += 1
|
||||
return slug
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
if is_new:
|
||||
self.slug = slugify(self.name)
|
||||
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
|
||||
i = None
|
||||
while True:
|
||||
slug = f"{slugify(self.name)}_{i}" if i else slugify(self.name)
|
||||
if slug not in existing:
|
||||
self.slug = slug
|
||||
break
|
||||
i = (i or 0) + 1
|
||||
existing_name = None
|
||||
if self.pk:
|
||||
existing_name = Tag.objects.filter(pk=self.pk).values_list('name', flat=True).first()
|
||||
|
||||
if not self.slug or existing_name != self.name:
|
||||
self.slug = self._generate_unique_slug()
|
||||
super().save(*args, **kwargs)
|
||||
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created Tag',
|
||||
indent_level=0,
|
||||
metadata={
|
||||
'id': self.id,
|
||||
'name': self.name,
|
||||
'slug': self.slug,
|
||||
},
|
||||
)
|
||||
# if is_new:
|
||||
# from archivebox.misc.logging_util import log_worker_event
|
||||
# log_worker_event(
|
||||
# worker_type='DB',
|
||||
# event='Created Tag',
|
||||
# indent_level=0,
|
||||
# metadata={
|
||||
# 'id': self.id,
|
||||
# 'name': self.name,
|
||||
# 'slug': self.slug,
|
||||
# },
|
||||
# )
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
@@ -364,7 +373,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return Binary.objects.filter(process_set__archiveresult__snapshot_id=self.id).distinct()
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
if not self.bookmarked_at:
|
||||
self.bookmarked_at = self.created_at or timezone.now()
|
||||
if not self.timestamp:
|
||||
@@ -393,24 +401,25 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
super().save(*args, **kwargs)
|
||||
self.ensure_legacy_archive_symlink()
|
||||
if self.url not in self.crawl.urls:
|
||||
existing_urls = {url for _raw_line, url in self.crawl._iter_url_lines() if url}
|
||||
if self.crawl.url_passes_filters(self.url, snapshot=self) and self.url not in existing_urls:
|
||||
self.crawl.urls += f'\n{self.url}'
|
||||
self.crawl.save()
|
||||
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created Snapshot',
|
||||
indent_level=2,
|
||||
url=self.url,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'crawl_id': str(self.crawl_id),
|
||||
'depth': self.depth,
|
||||
'status': self.status,
|
||||
},
|
||||
)
|
||||
# if is_new:
|
||||
# from archivebox.misc.logging_util import log_worker_event
|
||||
# log_worker_event(
|
||||
# worker_type='DB',
|
||||
# event='Created Snapshot',
|
||||
# indent_level=2,
|
||||
# url=self.url,
|
||||
# metadata={
|
||||
# 'id': str(self.id),
|
||||
# 'crawl_id': str(self.crawl_id),
|
||||
# 'depth': self.depth,
|
||||
# 'status': self.status,
|
||||
# },
|
||||
# )
|
||||
|
||||
# =========================================================================
|
||||
# Filesystem Migration Methods
|
||||
@@ -1528,16 +1537,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
"""
|
||||
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
|
||||
|
||||
Called by: SnapshotMachine.enter_started()
|
||||
|
||||
Hook Lifecycle:
|
||||
1. discover_hooks('Snapshot') → finds all plugin hooks
|
||||
2. For each hook:
|
||||
- Create ArchiveResult with status=QUEUED
|
||||
- Store hook_name (e.g., 'on_Snapshot__50_wget.py')
|
||||
3. ArchiveResults execute independently via ArchiveResultMachine
|
||||
4. Hook execution happens in ArchiveResult.run(), NOT here
|
||||
|
||||
Returns:
|
||||
list[ArchiveResult]: Newly created pending results
|
||||
"""
|
||||
@@ -1602,7 +1601,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'url': self.url,
|
||||
'title': self.title,
|
||||
'tags': self.tags_str(),
|
||||
'tags_str': self.tags_str(),
|
||||
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||
'timestamp': self.timestamp,
|
||||
@@ -1672,7 +1670,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# ID not found, fall through to create-by-URL logic
|
||||
pass
|
||||
|
||||
url = record.get('url')
|
||||
from archivebox.misc.util import fix_url_from_markdown
|
||||
|
||||
url = fix_url_from_markdown(str(record.get('url') or '').strip())
|
||||
if not url:
|
||||
return None
|
||||
|
||||
@@ -1807,7 +1807,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
defaults={
|
||||
'plugin': plugin,
|
||||
'status': ArchiveResult.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
},
|
||||
)
|
||||
if archiveresult.status == ArchiveResult.INITIAL_STATE:
|
||||
@@ -1853,11 +1852,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
failed = results.filter(status='failed').count()
|
||||
running = results.filter(status='started').count()
|
||||
skipped = results.filter(status='skipped').count()
|
||||
noresults = results.filter(status='noresults').count()
|
||||
total = results.count()
|
||||
pending = total - succeeded - failed - running - skipped
|
||||
pending = total - succeeded - failed - running - skipped - noresults
|
||||
|
||||
# Calculate percentage (succeeded + failed + skipped as completed)
|
||||
completed = succeeded + failed + skipped
|
||||
# Calculate percentage (succeeded + failed + skipped + noresults as completed)
|
||||
completed = succeeded + failed + skipped + noresults
|
||||
percent = int((completed / total * 100) if total > 0 else 0)
|
||||
|
||||
# Sum output sizes
|
||||
@@ -1875,47 +1875,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'running': running,
|
||||
'pending': pending,
|
||||
'skipped': skipped,
|
||||
'noresults': noresults,
|
||||
'percent': percent,
|
||||
'output_size': output_size,
|
||||
'is_sealed': is_sealed,
|
||||
}
|
||||
|
||||
def retry_failed_archiveresults(self, retry_at: Optional[datetime] = None) -> int:
|
||||
def retry_failed_archiveresults(self) -> int:
|
||||
"""
|
||||
Reset failed/skipped ArchiveResults to queued for retry.
|
||||
|
||||
This enables seamless retry of the entire extraction pipeline:
|
||||
- Resets FAILED and SKIPPED results to QUEUED
|
||||
- Sets retry_at so workers pick them up
|
||||
- Plugins run in order (numeric prefix)
|
||||
- Each plugin checks its dependencies at runtime
|
||||
|
||||
Dependency handling (e.g., chrome → screenshot):
|
||||
- Plugins check if required outputs exist before running
|
||||
- If dependency output missing → plugin returns 'skipped'
|
||||
- On retry, if dependency now succeeds → dependent can run
|
||||
|
||||
Returns count of ArchiveResults reset.
|
||||
"""
|
||||
retry_at = retry_at or timezone.now()
|
||||
|
||||
count = self.archiveresult_set.filter(
|
||||
status__in=[
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
]
|
||||
).update(
|
||||
status=ArchiveResult.StatusChoices.QUEUED,
|
||||
retry_at=retry_at,
|
||||
output=None,
|
||||
output_str='',
|
||||
output_json=None,
|
||||
output_files={},
|
||||
output_size=0,
|
||||
output_mimetypes='',
|
||||
start_ts=None,
|
||||
end_ts=None,
|
||||
)
|
||||
|
||||
# Also reset the snapshot and current_step so it gets re-checked from the beginning
|
||||
if count > 0:
|
||||
self.status = self.StatusChoices.STARTED
|
||||
self.retry_at = retry_at
|
||||
self.retry_at = timezone.now()
|
||||
self.current_step = 0 # Reset to step 0 for retry
|
||||
self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
|
||||
|
||||
@@ -2228,6 +2219,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
best_result = outputs[0]
|
||||
context = {
|
||||
**self.to_dict(extended=True),
|
||||
'snapshot': self,
|
||||
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
|
||||
'url_str': htmlencode(urldecode(self.base_url)),
|
||||
'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
|
||||
@@ -2275,8 +2267,8 @@ class SnapshotMachine(BaseStateMachine):
|
||||
│ • discover_hooks('Snapshot') → finds all plugin hooks │
|
||||
│ • create_pending_archiveresults() → creates ONE │
|
||||
│ ArchiveResult per hook (NO execution yet) │
|
||||
│ 2. ArchiveResults process independently with their own │
|
||||
│ state machines (see ArchiveResultMachine) │
|
||||
│ 2. The shared abx-dl runner executes hooks and the │
|
||||
│ projector updates ArchiveResult rows from events │
|
||||
│ 3. Advance through steps 0-9 as foreground hooks complete │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when is_finished()
|
||||
@@ -2358,7 +2350,7 @@ class SnapshotMachine(BaseStateMachine):
|
||||
cast(Any, crawl).sm.seal()
|
||||
|
||||
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes):
|
||||
class StatusChoices(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
STARTED = 'started', 'Started'
|
||||
@@ -2366,6 +2358,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
SUCCEEDED = 'succeeded', 'Succeeded'
|
||||
FAILED = 'failed', 'Failed'
|
||||
SKIPPED = 'skipped', 'Skipped'
|
||||
NORESULTS = 'noresults', 'No Results'
|
||||
|
||||
INITIAL_STATE = StatusChoices.QUEUED
|
||||
ACTIVE_STATE = StatusChoices.STARTED
|
||||
FINAL_STATES = (
|
||||
StatusChoices.SUCCEEDED,
|
||||
StatusChoices.FAILED,
|
||||
StatusChoices.SKIPPED,
|
||||
StatusChoices.NORESULTS,
|
||||
)
|
||||
FINAL_OR_ACTIVE_STATES = (*FINAL_STATES, ACTIVE_STATE)
|
||||
|
||||
@classmethod
|
||||
def get_plugin_choices(cls):
|
||||
@@ -2404,16 +2407,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
start_ts = models.DateTimeField(default=None, null=True, blank=True)
|
||||
end_ts = models.DateTimeField(default=None, null=True, blank=True)
|
||||
|
||||
status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
|
||||
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
||||
status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
|
||||
notes = models.TextField(blank=True, null=False, default='')
|
||||
# output_dir is computed via @property from snapshot.output_dir / plugin
|
||||
|
||||
state_machine_name = 'archivebox.core.models.ArchiveResultMachine'
|
||||
retry_at_field_name = 'retry_at'
|
||||
state_field_name = 'status'
|
||||
active_state = StatusChoices.STARTED
|
||||
|
||||
snapshot_id: uuid.UUID
|
||||
process_id: uuid.UUID | None
|
||||
|
||||
@@ -2421,7 +2418,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
ModelWithOutputDir.Meta,
|
||||
ModelWithConfig.Meta,
|
||||
ModelWithNotes.Meta,
|
||||
ModelWithStateMachine.Meta,
|
||||
):
|
||||
app_label = 'core'
|
||||
verbose_name = 'Archive Result'
|
||||
@@ -2516,40 +2512,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
return None
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
|
||||
# Create Process record if this is a new ArchiveResult and no process exists yet
|
||||
if is_new and not self.process_id:
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
process = Process.objects.create(
|
||||
machine=Machine.current(),
|
||||
pwd=str(Path(self.snapshot.output_dir) / self.plugin),
|
||||
cmd=[], # Will be set by run()
|
||||
status='queued',
|
||||
timeout=120,
|
||||
env={},
|
||||
)
|
||||
self.process = process
|
||||
|
||||
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
|
||||
# Call the Django Model.save() directly instead
|
||||
models.Model.save(self, *args, **kwargs)
|
||||
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created ArchiveResult',
|
||||
indent_level=3,
|
||||
plugin=self.plugin,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'snapshot_id': str(self.snapshot_id),
|
||||
'snapshot_url': str(self.snapshot.url)[:64],
|
||||
'status': self.status,
|
||||
},
|
||||
)
|
||||
# if is_new:
|
||||
# from archivebox.misc.logging_util import log_worker_event
|
||||
# log_worker_event(
|
||||
# worker_type='DB',
|
||||
# event='Created ArchiveResult',
|
||||
# indent_level=3,
|
||||
# plugin=self.plugin,
|
||||
# metadata={
|
||||
# 'id': str(self.id),
|
||||
# 'snapshot_id': str(self.snapshot_id),
|
||||
# 'snapshot_url': str(self.snapshot.url)[:64],
|
||||
# 'status': self.status,
|
||||
# },
|
||||
# )
|
||||
|
||||
@cached_property
|
||||
def snapshot_dir(self):
|
||||
@@ -2566,6 +2546,28 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
def get_absolute_url(self):
|
||||
return f'/{self.snapshot.archive_path}/{self.plugin}'
|
||||
|
||||
def reset_for_retry(self, *, save: bool = True) -> None:
|
||||
self.status = self.StatusChoices.QUEUED
|
||||
self.output_str = ''
|
||||
self.output_json = None
|
||||
self.output_files = {}
|
||||
self.output_size = 0
|
||||
self.output_mimetypes = ''
|
||||
self.start_ts = None
|
||||
self.end_ts = None
|
||||
if save:
|
||||
self.save(update_fields=[
|
||||
'status',
|
||||
'output_str',
|
||||
'output_json',
|
||||
'output_files',
|
||||
'output_size',
|
||||
'output_mimetypes',
|
||||
'start_ts',
|
||||
'end_ts',
|
||||
'modified_at',
|
||||
])
|
||||
|
||||
@property
|
||||
def plugin_module(self) -> Any | None:
|
||||
# Hook scripts are now used instead of Python plugin modules
|
||||
@@ -2723,11 +2725,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
return None
|
||||
|
||||
def create_output_dir(self):
|
||||
output_dir = Path(self.snapshot_dir) / self.plugin
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
return output_dir
|
||||
|
||||
@property
|
||||
def output_dir_name(self) -> str:
|
||||
return self.plugin
|
||||
@@ -2782,134 +2779,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
def save_search_index(self):
|
||||
pass
|
||||
|
||||
def cascade_health_update(self, success: bool):
|
||||
"""Update health stats for parent Snapshot, Crawl, and execution infrastructure (Binary, Machine, NetworkInterface)."""
|
||||
# Update archival hierarchy
|
||||
self.snapshot.increment_health_stats(success)
|
||||
self.snapshot.crawl.increment_health_stats(success)
|
||||
|
||||
# Update execution infrastructure
|
||||
if self.binary:
|
||||
self.binary.increment_health_stats(success)
|
||||
if self.binary.machine:
|
||||
self.binary.machine.increment_health_stats(success)
|
||||
|
||||
if self.iface:
|
||||
self.iface.increment_health_stats(success)
|
||||
|
||||
def run(self):
    """
    Execute this ArchiveResult's hook and update status.

    If self.hook_name is set, runs only that specific hook.
    If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).

    Updates status/output fields, queues discovered URLs, and triggers indexing.
    """
    from django.utils import timezone
    from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
    from archivebox.config.configset import get_config

    # Get merged config with proper context (crawl-level then snapshot-level overrides)
    config = get_config(
        crawl=self.snapshot.crawl,
        snapshot=self.snapshot,
    )

    # Determine which hook(s) to run
    hooks = []

    if self.hook_name:
        # SPECIFIC HOOK MODE: Find the specific hook by name.
        # Builtins are checked before user plugins; first match wins (break).
        for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
            if not base_dir.exists():
                continue
            plugin_dir = base_dir / self.plugin
            if plugin_dir.exists():
                hook_path = plugin_dir / self.hook_name
                if hook_path.exists():
                    hooks.append(hook_path)
                    break
    else:
        # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility).
        # Unlike specific mode, this collects matches from BOTH plugin dirs.
        for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
            if not base_dir.exists():
                continue
            plugin_dir = base_dir / self.plugin
            if plugin_dir.exists():
                matches = list(plugin_dir.glob('on_Snapshot__*.*'))
                if matches:
                    hooks.extend(sorted(matches))

    if not hooks:
        # Nothing to execute: mark failed and clear retry_at so we stop retrying.
        self.status = self.StatusChoices.FAILED
        if self.hook_name:
            self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
        else:
            self.output_str = f'No hooks found for plugin: {self.plugin}'
        self.retry_at = None
        self.save()
        return

    # Output directory is plugin_dir for the hook output
    plugin_dir = Path(self.snapshot.output_dir) / self.plugin

    start_ts = timezone.now()
    process = None

    for hook in hooks:
        # Run hook using Process.launch() - returns Process model
        process = run_hook(
            hook,
            output_dir=plugin_dir,
            config=config,
            url=self.snapshot.url,
            snapshot_id=str(self.snapshot.id),
            crawl_id=str(self.snapshot.crawl.id),
            depth=self.snapshot.depth,
        )

        # Link ArchiveResult to Process.
        # NOTE(review): in legacy multi-hook mode only the LAST hook's Process
        # is retained on this row — confirm that is intentional.
        self.process = process
        self.start_ts = start_ts
        self.save(update_fields=['process_id', 'start_ts', 'modified_at'])

    if not process:
        # No hooks ran
        self.status = self.StatusChoices.FAILED
        self.output_str = 'No hooks executed'
        self.save()
        return

    # Update status based on hook execution
    if process.status == process.StatusChoices.RUNNING:
        # BACKGROUND HOOK - still running, return immediately
        # Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
        return

    # FOREGROUND HOOK - completed, update from filesystem
    self.update_from_output()

    # Clean up empty output directory if no files were created
    if plugin_dir.exists() and not self.output_files:
        try:
            if not any(plugin_dir.iterdir()):
                plugin_dir.rmdir()
        except (OSError, RuntimeError):
            pass
|
||||
|
||||
def update_from_output(self):
|
||||
"""
|
||||
Update this ArchiveResult from filesystem logs and output files.
|
||||
|
||||
Used for:
|
||||
- Foreground hooks that completed (called from ArchiveResult.run())
|
||||
- Background hooks that completed (called from Snapshot.cleanup())
|
||||
Used for Snapshot cleanup / orphan recovery when a hook's output exists
|
||||
on disk but the projector did not finalize the row in the database.
|
||||
|
||||
Updates:
|
||||
- status, output_str, output_json from ArchiveResult JSONL record
|
||||
- output_files, output_size, output_mimetypes by walking filesystem
|
||||
- end_ts, retry_at, cmd, cmd_version, binary FK
|
||||
- end_ts, cmd, cmd_version, binary FK
|
||||
- Processes side-effect records (Snapshot, Tag, etc.) via process_hook_records()
|
||||
"""
|
||||
import mimetypes
|
||||
@@ -2924,7 +2804,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
self.status = self.StatusChoices.FAILED
|
||||
self.output_str = 'Output directory not found'
|
||||
self.end_ts = timezone.now()
|
||||
self.retry_at = None
|
||||
self.save()
|
||||
return
|
||||
|
||||
@@ -2948,6 +2827,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
'succeeded': self.StatusChoices.SUCCEEDED,
|
||||
'failed': self.StatusChoices.FAILED,
|
||||
'skipped': self.StatusChoices.SKIPPED,
|
||||
'noresults': self.StatusChoices.NORESULTS,
|
||||
}
|
||||
self.status = status_map.get(hook_data.get('status', 'failed'), self.StatusChoices.FAILED)
|
||||
|
||||
@@ -3011,7 +2891,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Update timestamps
|
||||
self.end_ts = timezone.now()
|
||||
self.retry_at = None
|
||||
|
||||
self.save()
|
||||
|
||||
@@ -3095,340 +2974,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
Uses proper config hierarchy: defaults -> file -> env -> machine -> user -> crawl -> snapshot
|
||||
"""
|
||||
import re
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
# Get merged config with proper hierarchy
|
||||
config = get_config(
|
||||
user=self.created_by,
|
||||
crawl=self.snapshot.crawl,
|
||||
snapshot=self.snapshot,
|
||||
)
|
||||
|
||||
# Get allowlist/denylist (can be string or list)
|
||||
allowlist_raw = config.get('URL_ALLOWLIST', '')
|
||||
denylist_raw = config.get('URL_DENYLIST', '')
|
||||
|
||||
# Normalize to list of patterns
|
||||
def to_pattern_list(value):
|
||||
if isinstance(value, list):
|
||||
return value
|
||||
if isinstance(value, str):
|
||||
return [p.strip() for p in value.split(',') if p.strip()]
|
||||
return []
|
||||
|
||||
allowlist = to_pattern_list(allowlist_raw)
|
||||
denylist = to_pattern_list(denylist_raw)
|
||||
|
||||
# Denylist takes precedence
|
||||
if denylist:
|
||||
for pattern in denylist:
|
||||
try:
|
||||
if re.search(pattern, url):
|
||||
return False
|
||||
except re.error:
|
||||
continue # Skip invalid regex patterns
|
||||
|
||||
# If allowlist exists, URL must match at least one pattern
|
||||
if allowlist:
|
||||
for pattern in allowlist:
|
||||
try:
|
||||
if re.search(pattern, url):
|
||||
return True
|
||||
except re.error:
|
||||
continue # Skip invalid regex patterns
|
||||
return False # No allowlist patterns matched
|
||||
|
||||
return True # No filters or passed filters
|
||||
return self.snapshot.crawl.url_passes_filters(url, snapshot=self.snapshot)
|
||||
|
||||
@property
def output_dir(self) -> Path:
    """Get the output directory for this plugin's results."""
    snapshot_root = Path(self.snapshot.output_dir)
    return snapshot_root / self.plugin
|
||||
|
||||
def is_background_hook(self) -> bool:
    """Check if this ArchiveResult is for a background hook.

    Background hooks leave a 'hook.pid' file in the result's working
    directory; no pwd means we cannot tell, so report False.
    """
    if not self.pwd:
        return False
    return (Path(self.pwd) / 'hook.pid').exists()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ArchiveResult State Machine
|
||||
# =============================================================================
|
||||
|
||||
class ArchiveResultMachine(BaseStateMachine):
    """
    State machine for managing ArchiveResult (single plugin execution) lifecycle.

    Hook Lifecycle:
    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │ • Waiting for its turn to run                               │
    └─────────────────────────────────────────────────────────────┘
                ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │ 1. archiveresult.run()                                      │
    │    • Find specific hook by hook_name                        │
    │    • run_hook(script, output_dir, ...) → subprocess         │
    │                                                             │
    │ 2a. FOREGROUND hook (returns HookResult):                   │
    │    • update_from_output() immediately                       │
    │      - Read stdout.log                                      │
    │      - Parse JSONL records                                  │
    │      - Extract 'ArchiveResult' record → update status       │
    │      - Walk output_dir → populate output_files              │
    │      - Call process_hook_records() for side effects         │
    │                                                             │
    │ 2b. BACKGROUND hook (returns None):                         │
    │    • Status stays STARTED                                   │
    │    • Continues running in background                        │
    │    • Killed by Snapshot.cleanup() when sealed               │
    └─────────────────────────────────────────────────────────────┘
                ↓ tick() checks status
    ┌─────────────────────────────────────────────────────────────┐
    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
    │ • Set by hook's JSONL output during update_from_output()    │
    │ • Health stats incremented (num_uses_succeeded/failed)      │
    │ • Parent Snapshot health stats also updated                 │
    └─────────────────────────────────────────────────────────────┘

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    model_attr_name = 'archiveresult'

    # States — values mirror ArchiveResult.StatusChoices so the DB row and
    # the machine state stay in lockstep.
    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
    started = State(value=ArchiveResult.StatusChoices.STARTED)
    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)

    # Tick Event - transitions based on conditions
    # Flow: queued → started → (succeeded|failed|skipped)
    #       queued → skipped (if exceeded max attempts)
    #       started → backoff → started (retry)
    tick = (
        queued.to(skipped, cond='is_exceeded_max_attempts')  # Check skip first
        | queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to(succeeded, cond='is_succeeded')
        | started.to(failed, cond='is_failed')
        | started.to(skipped, cond='is_skipped')
        | started.to(backoff, cond='is_backoff')
        | backoff.to(skipped, cond='is_exceeded_max_attempts')  # Check skip from backoff too
        | backoff.to.itself(unless='can_start')
        | backoff.to(started, cond='can_start')
        # Removed redundant transitions: backoff.to(succeeded/failed/skipped)
        # Reason: backoff should always retry→started, then started→final states
    )

    # The model instance this machine drives (set by BaseStateMachine via model_attr_name).
    archiveresult: ArchiveResult

    def can_start(self) -> bool:
        """Pure function - check if AR can start (has valid URL)."""
        return bool(self.archiveresult.snapshot.url)

    def is_exceeded_max_attempts(self) -> bool:
        """Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
        from archivebox.config.configset import get_config

        config = get_config(
            crawl=self.archiveresult.snapshot.crawl,
            snapshot=self.archiveresult.snapshot,
        )
        max_attempts = config.get('MAX_URL_ATTEMPTS', 50)

        # Count failed ArchiveResults for this snapshot (any plugin type)
        failed_count = self.archiveresult.snapshot.archiveresult_set.filter(
            status=ArchiveResult.StatusChoices.FAILED
        ).count()

        return failed_count >= max_attempts

    def is_succeeded(self) -> bool:
        """Check if extractor plugin succeeded (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Check if extractor plugin failed (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED

    def is_skipped(self) -> bool:
        """Check if extractor plugin was skipped (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED

    def is_backoff(self) -> bool:
        """Check if we should backoff and retry later."""
        # Backoff if status is still started (plugin didn't complete) and output_str is empty
        return (
            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED
            and not self.archiveresult.output_str
        )

    def is_finished(self) -> bool:
        """
        Check if extraction has completed (success, failure, or skipped).

        For background hooks in STARTED state, checks if their Process has finished and reaps them.
        """
        # If already in final state, return True
        if self.archiveresult.status in (
            ArchiveResult.StatusChoices.SUCCEEDED,
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        ):
            return True

        # If in STARTED state with a Process, check if Process has finished running
        if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
            if self.archiveresult.process_id:
                process = self.archiveresult.process

                # If process is NOT running anymore, reap the background hook
                if not process.is_running:
                    self.archiveresult.update_from_output()
                    # Check if now in final state after reaping
                    return self.archiveresult.status in (
                        ArchiveResult.StatusChoices.SUCCEEDED,
                        ArchiveResult.StatusChoices.FAILED,
                        ArchiveResult.StatusChoices.SKIPPED,
                    )

        return False

    @queued.enter
    def enter_queued(self):
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now(),
            status=ArchiveResult.StatusChoices.QUEUED,
            start_ts=None,
        )  # bump the snapshot's retry_at so they pickup any new changes

    @started.enter
    def enter_started(self):
        # Update Process with network interface (recorded for health/audit stats)
        if self.archiveresult.process_id:
            self.archiveresult.process.iface = NetworkInterface.current()
            self.archiveresult.process.save()

        # Lock the object and mark start time
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
            status=ArchiveResult.StatusChoices.STARTED,
            start_ts=timezone.now(),
        )

        # Run the plugin - this updates status, output, timestamps, etc.
        self.archiveresult.run()

        # Save the updated result
        self.archiveresult.save()

    @backoff.enter
    def enter_backoff(self):
        # Retry in 60s; clear end_ts since the attempt is not over yet.
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=60),
            status=ArchiveResult.StatusChoices.BACKOFF,
            end_ts=None,
        )

    def _check_and_seal_parent_snapshot(self):
        """
        Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.

        Note: In the new architecture, the shared runner handles step advancement and sealing.
        This method is kept for direct model-driven edge cases.
        """
        import sys

        snapshot = self.archiveresult.snapshot

        # Check if all archiveresults are finished (in final states)
        remaining_active = snapshot.archiveresult_set.exclude(
            status__in=[
                ArchiveResult.StatusChoices.SUCCEEDED,
                ArchiveResult.StatusChoices.FAILED,
                ArchiveResult.StatusChoices.SKIPPED,
            ]
        ).count()

        if remaining_active == 0:
            print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
            # Seal the parent snapshot
            cast(Any, snapshot).sm.seal()

    @succeeded.enter
    def enter_succeeded(self):
        import sys

        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SUCCEEDED,
            end_ts=timezone.now(),
        )

        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=True)

        print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)

        # Check if this is the last AR to finish - seal parent snapshot if so
        self._check_and_seal_parent_snapshot()

    @failed.enter
    def enter_failed(self):
        import sys

        print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)

        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=timezone.now(),
        )

        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=False)

        print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)

        # Check if this is the last AR to finish - seal parent snapshot if so
        self._check_and_seal_parent_snapshot()

    @skipped.enter
    def enter_skipped(self):
        import sys

        # Set output_str if not already set (e.g., when skipped due to max attempts)
        if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
            from archivebox.config.configset import get_config
            config = get_config(
                crawl=self.archiveresult.snapshot.crawl,
                snapshot=self.archiveresult.snapshot,
            )
            max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
            self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'

        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SKIPPED,
            end_ts=timezone.now(),
        )

        print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)

        # Check if this is the last AR to finish - seal parent snapshot if so
        self._check_and_seal_parent_snapshot()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# State Machine Registration
|
||||
# =============================================================================
|
||||
@@ -3436,4 +2988,3 @@ class ArchiveResultMachine(BaseStateMachine):
|
||||
# Manually register state machines with python-statemachine registry
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
# NOTE: registration must happen after both machine classes are defined above.
registry.register(SnapshotMachine)
registry.register(ArchiveResultMachine)
|
||||
|
||||
@@ -232,11 +232,12 @@ SQLITE_CONNECTION_OPTIONS = {
|
||||
# https://gcollazo.com/optimal-sqlite-settings-for-django/
|
||||
# https://litestream.io/tips/#busy-timeout
|
||||
# https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options
|
||||
"timeout": 10,
|
||||
"timeout": 30,
|
||||
"check_same_thread": False,
|
||||
"transaction_mode": "IMMEDIATE",
|
||||
"init_command": (
|
||||
"PRAGMA foreign_keys=ON;"
|
||||
"PRAGMA busy_timeout = 30000;"
|
||||
"PRAGMA journal_mode = WAL;"
|
||||
"PRAGMA synchronous = NORMAL;"
|
||||
"PRAGMA temp_store = MEMORY;"
|
||||
|
||||
271
archivebox/core/tag_utils.py
Normal file
271
archivebox/core/tag_utils.py
Normal file
@@ -0,0 +1,271 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
from django.contrib.auth.models import User
|
||||
from django.db.models import Count, F, Q, QuerySet
|
||||
from django.db.models.functions import Lower
|
||||
from django.http import HttpRequest
|
||||
from django.urls import reverse
|
||||
|
||||
from archivebox.core.host_utils import build_snapshot_url, build_web_url
|
||||
from archivebox.core.models import Snapshot, SnapshotTag, Tag
|
||||
|
||||
|
||||
# Maximum number of snapshot previews embedded in each tag card.
TAG_SNAPSHOT_PREVIEW_LIMIT = 10

# (value, label) pairs for the tag-list sort dropdown.
# Values are validated by normalize_tag_sort(); 'created_desc' is the default.
TAG_SORT_CHOICES = (
    ('name_asc', 'Name A-Z'),
    ('name_desc', 'Name Z-A'),
    ('created_desc', 'Created newest'),
    ('created_asc', 'Created oldest'),
    ('snapshots_desc', 'Most snapshots'),
    ('snapshots_asc', 'Fewest snapshots'),
)

# (value, label) pairs for the has-snapshots filter.
# Values are validated by normalize_has_snapshots_filter(); 'all' is the default.
TAG_HAS_SNAPSHOTS_CHOICES = (
    ('all', 'All'),
    ('yes', 'Has snapshots'),
    ('no', 'No snapshots'),
)
|
||||
|
||||
|
||||
def normalize_tag_name(name: str) -> str:
    """Trim surrounding whitespace from a tag name; None/empty becomes ''."""
    if not name:
        return ''
    return name.strip()
|
||||
|
||||
|
||||
def normalize_tag_sort(sort: str = 'created_desc') -> str:
    """Return *sort* unchanged when it is a known TAG_SORT_CHOICES key, else the default."""
    for key, _label in TAG_SORT_CHOICES:
        if key == sort:
            return sort
    return 'created_desc'
|
||||
|
||||
|
||||
def normalize_has_snapshots_filter(value: str = 'all') -> str:
    """Return *value* unchanged when it is a known TAG_HAS_SNAPSHOTS_CHOICES key, else 'all'."""
    for key, _label in TAG_HAS_SNAPSHOTS_CHOICES:
        if key == value:
            return value
    return 'all'
|
||||
|
||||
|
||||
def normalize_created_by_filter(created_by: str = '') -> str:
    """Keep only values whose string form is all digits (a user id); anything else becomes ''."""
    if str(created_by).isdigit():
        return created_by
    return ''
|
||||
|
||||
|
||||
def normalize_created_year_filter(year: str = '') -> str:
    """Return a plausible 4-digit ASCII year string, or '' for anything else.

    Requires ASCII digits explicitly: str.isdigit() alone also accepts
    Unicode digits such as superscripts ('²³⁴⁵'), which would pass this
    filter but crash int(year) downstream in get_matching_tags().
    """
    year = (year or '').strip()
    if len(year) == 4 and year.isascii() and year.isdigit():
        return year
    return ''
|
||||
|
||||
|
||||
def get_matching_tags(
    query: str = '',
    sort: str = 'created_desc',
    created_by: str = '',
    year: str = '',
    has_snapshots: str = 'all',
) -> QuerySet[Tag]:
    """Build a filtered + sorted Tag queryset for the tag list UI.

    Every filter argument is normalized first, so invalid values silently
    fall back to their defaults instead of raising.
    """
    # num_snapshots is annotated so it can be both filtered and sorted on below.
    queryset = Tag.objects.select_related('created_by').annotate(
        num_snapshots=Count('snapshot_set', distinct=True),
    )

    # Free-text search: case-insensitive substring over name OR slug.
    query = normalize_tag_name(query)
    if query:
        queryset = queryset.filter(
            Q(name__icontains=query) | Q(slug__icontains=query),
        )

    # Only all-digit strings survive normalization, so int() is safe here.
    created_by = normalize_created_by_filter(created_by)
    if created_by:
        queryset = queryset.filter(created_by_id=int(created_by))

    # Only 4-digit strings survive normalization.
    year = normalize_created_year_filter(year)
    if year:
        queryset = queryset.filter(created_at__year=int(year))

    has_snapshots = normalize_has_snapshots_filter(has_snapshots)
    if has_snapshots == 'yes':
        queryset = queryset.filter(num_snapshots__gt=0)
    elif has_snapshots == 'no':
        queryset = queryset.filter(num_snapshots=0)

    # Every ordering includes a unique tiebreaker ('id') for stable pagination.
    sort = normalize_tag_sort(sort)
    if sort == 'name_asc':
        queryset = queryset.order_by(Lower('name'), 'id')
    elif sort == 'name_desc':
        queryset = queryset.order_by(Lower('name').desc(), '-id')
    elif sort == 'created_asc':
        queryset = queryset.order_by(F('created_at').asc(nulls_first=True), 'id', Lower('name'))
    elif sort == 'snapshots_desc':
        queryset = queryset.order_by(F('num_snapshots').desc(nulls_last=True), F('created_at').desc(nulls_last=True), '-id', Lower('name'))
    elif sort == 'snapshots_asc':
        queryset = queryset.order_by(F('num_snapshots').asc(nulls_first=True), Lower('name'), 'id')
    else:
        # 'created_desc' — the default ordering
        queryset = queryset.order_by(F('created_at').desc(nulls_last=True), '-id', Lower('name'))

    return queryset
|
||||
|
||||
|
||||
def get_tag_creator_choices() -> list[tuple[str, str]]:
    """Distinct (user_id_str, username) pairs for every user that has created a tag,
    ordered case-insensitively by username."""
    rows = (
        Tag.objects
        .filter(created_by__isnull=False)
        .values_list('created_by_id', 'created_by__username')
        .order_by(Lower('created_by__username'), 'created_by_id')
        .distinct()
    )
    choices = []
    for user_id, username in rows:
        label = username or f'User {user_id}'
        choices.append((str(user_id), label))
    return choices
|
||||
|
||||
|
||||
def get_tag_year_choices() -> list[str]:
    """Distinct tag-creation years (newest first) as strings, for filter dropdowns."""
    year_dates = Tag.objects.exclude(created_at__isnull=True).dates('created_at', 'year', order='DESC')
    return [str(d.year) for d in year_dates]
|
||||
|
||||
|
||||
def get_tag_by_ref(tag_ref: str | int) -> Tag:
    """Resolve a tag by primary key (int or digit string) or by slug
    (case-insensitive exact match first, then substring match).

    Raises Tag.DoesNotExist when nothing matches.
    """
    if not isinstance(tag_ref, int):
        ref = str(tag_ref).strip()
        if not ref.isdigit():
            try:
                return Tag.objects.get(slug__iexact=ref)
            except Tag.DoesNotExist:
                return Tag.objects.get(slug__icontains=ref)
        tag_ref = int(ref)
    return Tag.objects.get(pk=tag_ref)
|
||||
|
||||
|
||||
def get_or_create_tag(name: str, created_by: User | None = None) -> tuple[Tag, bool]:
    """Return (tag, created) matching *name* case-insensitively, creating it if needed.

    Raises ValueError when the name is empty after normalization.
    """
    normalized = normalize_tag_name(name)
    if not normalized:
        raise ValueError('Tag name is required')

    match = Tag.objects.filter(name__iexact=normalized).first()
    if match is not None:
        return match, False

    new_tag = Tag.objects.create(name=normalized, created_by=created_by)
    return new_tag, True
|
||||
|
||||
|
||||
def rename_tag(tag: Tag, name: str) -> Tag:
    """Rename *tag* to *name*, enforcing case-insensitive uniqueness.

    Raises ValueError for an empty name or a collision with another tag.
    Saves only when the name actually changes.
    """
    normalized = normalize_tag_name(name)
    if not normalized:
        raise ValueError('Tag name is required')

    collision = (
        Tag.objects
        .filter(name__iexact=normalized)
        .exclude(pk=tag.pk)
        .first()
    )
    if collision:
        raise ValueError(f'Tag "{collision.name}" already exists')

    if tag.name != normalized:
        tag.name = normalized
        tag.save()
    return tag
|
||||
|
||||
|
||||
def delete_tag(tag: Tag) -> tuple[int, dict[str, int]]:
    """Delete *tag*; returns Django's (total_deleted, per-model counts) tuple."""
    deletion_result = tag.delete()
    return deletion_result
|
||||
|
||||
|
||||
def export_tag_urls(tag: Tag) -> str:
    """Newline-joined URLs of the tag's snapshots, most recently downloaded first."""
    ordered = tag.snapshot_set.order_by('-downloaded_at', '-created_at', '-pk')
    return '\n'.join(ordered.values_list('url', flat=True))
|
||||
|
||||
|
||||
def export_tag_snapshots_jsonl(tag: Tag) -> str:
    """JSONL dump (one JSON object per line) of the tag's snapshots, newest first."""
    ordered = (
        tag.snapshot_set
        .order_by('-downloaded_at', '-created_at', '-pk')
        .prefetch_related('tags')
    )
    lines = [json.dumps(snap.to_json()) for snap in ordered]
    return '\n'.join(lines)
|
||||
|
||||
|
||||
def _display_snapshot_title(snapshot: Snapshot) -> str:
    """Best human-readable label for a snapshot: its title, unless the title is
    missing, the 'pending...' placeholder, or just a copy of the URL — in
    which case fall back to the URL itself."""
    url = (snapshot.url or '').strip()
    title = (snapshot.title or '').strip()
    if not title:
        return url

    lowered = title.lower()
    if lowered in ('pending...', url.lower()):
        return url
    return title
|
||||
|
||||
|
||||
def _build_snapshot_preview(snapshot: Snapshot, request: HttpRequest | None = None) -> dict[str, Any]:
    """Serialize one snapshot into the small preview dict consumed by tag cards."""
    snapshot_id = str(snapshot.pk)
    downloaded = snapshot.downloaded_at
    return {
        'id': snapshot_id,
        'title': _display_snapshot_title(snapshot),
        'url': snapshot.url,
        'favicon_url': build_snapshot_url(snapshot_id, 'favicon.ico', request=request),
        'admin_url': reverse('admin:core_snapshot_change', args=[snapshot.pk]),
        'archive_url': build_web_url(f'/{snapshot.archive_path_from_db}/index.html', request=request),
        'downloaded_at': downloaded.isoformat() if downloaded else None,
    }
|
||||
|
||||
|
||||
def _build_snapshot_preview_map(tags: list[Tag], request: HttpRequest | None = None, preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT) -> dict[int, list[dict[str, Any]]]:
    """Batch-fetch up to *preview_limit* snapshot previews per tag.

    Returns {tag_id: [preview_dict, ...]} built from a single SnapshotTag
    query instead of one query per tag.
    """
    tag_ids = [tag.pk for tag in tags]
    if not tag_ids:
        return {}

    # One query, ordered newest-download-first within each tag; select_related
    # pre-fetches the relations read when previews are serialized below.
    snapshot_tags = (
        SnapshotTag.objects
        .filter(tag_id__in=tag_ids)
        .select_related('snapshot__crawl__created_by')
        .order_by(
            'tag_id',
            F('snapshot__downloaded_at').desc(nulls_last=True),
            F('snapshot__created_at').desc(nulls_last=True),
            F('snapshot_id').desc(),
        )
    )

    preview_map: dict[int, list[dict[str, Any]]] = defaultdict(list)
    # NOTE(review): rows beyond the per-tag limit are skipped but still fetched
    # and iterated; fine for modest tag sizes, could be capped in SQL if tags
    # ever hold very many snapshots.
    for snapshot_tag in snapshot_tags:
        previews = preview_map[snapshot_tag.tag_id]
        if len(previews) >= preview_limit:
            continue
        previews.append(_build_snapshot_preview(snapshot_tag.snapshot, request=request))
    return preview_map
|
||||
|
||||
|
||||
def build_tag_card(tag: Tag, snapshot_previews: list[dict[str, Any]] | None = None) -> dict[str, Any]:
    """Serialize a Tag (plus optional snapshot previews) into the tag-card dict.

    Fix: the original `getattr(tag, 'num_snapshots', tag.snapshot_set.count())`
    evaluated the `.count()` fallback eagerly — Python computes getattr's
    default argument before the lookup — issuing one extra COUNT query per tag
    even when the `num_snapshots` annotation was present. Only run the query
    when the annotation is actually missing.
    """
    count = getattr(tag, 'num_snapshots', None)
    if count is None:
        count = tag.snapshot_set.count()

    return {
        'id': tag.pk,
        'name': tag.name,
        'slug': tag.slug,
        'num_snapshots': count,
        'filter_url': f"{reverse('admin:core_snapshot_changelist')}?tags__id__exact={tag.pk}",
        'edit_url': reverse('admin:core_tag_change', args=[tag.pk]),
        'export_urls_url': reverse('api-1:tag_urls_export', args=[tag.pk]),
        'export_jsonl_url': reverse('api-1:tag_snapshots_export', args=[tag.pk]),
        'rename_url': reverse('api-1:rename_tag', args=[tag.pk]),
        'delete_url': reverse('api-1:delete_tag', args=[tag.pk]),
        'snapshots': snapshot_previews or [],
    }
|
||||
|
||||
|
||||
def build_tag_cards(
    query: str = '',
    request: HttpRequest | None = None,
    limit: int | None = None,
    preview_limit: int = TAG_SNAPSHOT_PREVIEW_LIMIT,
    sort: str = 'created_desc',
    created_by: str = '',
    year: str = '',
    has_snapshots: str = 'all',
) -> list[dict[str, Any]]:
    """Fetch tags matching the given filters and serialize each into a card dict,
    including up to *preview_limit* snapshot previews per tag."""
    matching = get_matching_tags(
        query=query,
        sort=sort,
        created_by=created_by,
        year=year,
        has_snapshots=has_snapshots,
    )
    if limit is not None:
        matching = matching[:limit]

    tags = list(matching)
    previews_by_tag = _build_snapshot_preview_map(tags, request=request, preview_limit=preview_limit)

    cards = []
    for tag in tags:
        cards.append(build_tag_card(tag, snapshot_previews=previews_by_tag.get(tag.pk, [])))
    return cards
|
||||
@@ -11,6 +11,7 @@ from archivebox.hooks import (
|
||||
)
|
||||
from archivebox.core.host_utils import (
|
||||
get_admin_base_url,
|
||||
get_public_base_url,
|
||||
get_web_base_url,
|
||||
get_snapshot_base_url,
|
||||
build_snapshot_url,
|
||||
@@ -166,6 +167,11 @@ def web_base_url(context) -> str:
|
||||
return get_web_base_url(request=context.get('request'))
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
def public_base_url(context) -> str:
    """Template tag: the site's public base URL, derived from the current
    request when one is present in the template context."""
    return get_public_base_url(request=context.get('request'))
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def snapshot_base_url(context, snapshot) -> str:
|
||||
snapshot_id = getattr(snapshot, 'id', snapshot)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import json
|
||||
import os
|
||||
import posixpath
|
||||
from glob import glob, escape
|
||||
@@ -7,7 +8,7 @@ from django.utils import timezone
|
||||
import inspect
|
||||
from typing import Callable, cast, get_type_hints
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import quote, urlparse
|
||||
|
||||
from django.shortcuts import render, redirect
|
||||
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
|
||||
@@ -26,7 +27,7 @@ from admin_data_views.typing import TableContext, ItemContext, SectionData
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG
|
||||
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
|
||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode
|
||||
from archivebox.misc.serve_static import serve_static_with_byterange_support
|
||||
@@ -37,7 +38,18 @@ from archivebox.core.models import Snapshot
|
||||
from archivebox.core.host_utils import build_snapshot_url
|
||||
from archivebox.core.forms import AddLinkForm
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.hooks import get_enabled_plugins, get_plugin_name
|
||||
from archivebox.hooks import (
|
||||
BUILTIN_PLUGINS_DIR,
|
||||
USER_PLUGINS_DIR,
|
||||
discover_plugin_configs,
|
||||
get_enabled_plugins,
|
||||
get_plugin_name,
|
||||
iter_plugin_dirs,
|
||||
)
|
||||
|
||||
|
||||
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
|
||||
LIVE_PLUGIN_BASE_URL = '/admin/environment/plugins/'
|
||||
|
||||
|
||||
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
|
||||
@@ -699,6 +711,9 @@ def _serve_responses_path(request, responses_root: Path, rel_path: str, show_ind
|
||||
def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""):
|
||||
rel_path = path or ""
|
||||
show_indexes = bool(request.GET.get("files"))
|
||||
if not show_indexes and (not rel_path or rel_path == "index.html"):
|
||||
return SnapshotView.render_live_index(request, snapshot)
|
||||
|
||||
if not rel_path or rel_path.endswith("/"):
|
||||
if show_indexes:
|
||||
rel_path = rel_path.rstrip("/")
|
||||
@@ -784,7 +799,6 @@ class SnapshotHostView(View):
|
||||
raise Http404
|
||||
return _serve_snapshot_replay(request, snapshot, path)
|
||||
|
||||
|
||||
class SnapshotReplayView(View):
|
||||
"""Serve snapshot directory contents on a one-domain replay path."""
|
||||
|
||||
@@ -915,8 +929,17 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
return custom_config
|
||||
|
||||
def get_context_data(self, **kwargs):
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
required_search_plugin = f'search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}'.strip()
|
||||
plugin_configs = discover_plugin_configs()
|
||||
plugin_dependency_map = {
|
||||
plugin_name: [
|
||||
str(required_plugin).strip()
|
||||
for required_plugin in (schema.get('required_plugins') or [])
|
||||
if str(required_plugin).strip()
|
||||
]
|
||||
for plugin_name, schema in plugin_configs.items()
|
||||
if isinstance(schema.get('required_plugins'), list) and schema.get('required_plugins')
|
||||
}
|
||||
return {
|
||||
**super().get_context_data(**kwargs),
|
||||
'title': "Create Crawl",
|
||||
@@ -924,8 +947,9 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
|
||||
'VERSION': VERSION,
|
||||
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
|
||||
'required_search_plugin': required_search_plugin,
|
||||
'plugin_dependency_map_json': json.dumps(plugin_dependency_map, sort_keys=True),
|
||||
'stdout': '',
|
||||
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
|
||||
}
|
||||
|
||||
def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
|
||||
@@ -937,11 +961,10 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
depth = int(form.cleaned_data["depth"])
|
||||
plugins = ','.join(form.cleaned_data.get("plugins", []))
|
||||
schedule = form.cleaned_data.get("schedule", "").strip()
|
||||
persona = form.cleaned_data.get("persona", "Default")
|
||||
overwrite = form.cleaned_data.get("overwrite", False)
|
||||
update = form.cleaned_data.get("update", False)
|
||||
persona = form.cleaned_data.get("persona")
|
||||
index_only = form.cleaned_data.get("index_only", False)
|
||||
notes = form.cleaned_data.get("notes", "")
|
||||
url_filters = form.cleaned_data.get("url_filters") or {}
|
||||
custom_config = self._get_custom_config_overrides(form)
|
||||
|
||||
from archivebox.config.permissions import HOSTNAME
|
||||
@@ -957,6 +980,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
|
||||
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||
|
||||
# 2. create a new Crawl with the URLs from the file
|
||||
@@ -964,16 +988,18 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
urls_content = sources_file.read_text()
|
||||
# Build complete config
|
||||
config = {
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'DEPTH': depth,
|
||||
'PLUGINS': plugins or '',
|
||||
'DEFAULT_PERSONA': persona or 'Default',
|
||||
'DEFAULT_PERSONA': (persona.name if persona else 'Default'),
|
||||
}
|
||||
|
||||
# Merge custom config overrides
|
||||
config.update(custom_config)
|
||||
if url_filters.get('allowlist'):
|
||||
config['URL_ALLOWLIST'] = url_filters['allowlist']
|
||||
if url_filters.get('denylist'):
|
||||
config['URL_DENYLIST'] = url_filters['denylist']
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
@@ -999,6 +1025,8 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
crawl.schedule = crawl_schedule
|
||||
crawl.save(update_fields=['schedule'])
|
||||
|
||||
crawl.create_snapshots_from_urls()
|
||||
|
||||
# 4. start the Orchestrator & wait until it completes
|
||||
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
|
||||
# from archivebox.crawls.actors import CrawlActor
|
||||
@@ -1011,7 +1039,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
|
||||
urls = form.cleaned_data["url"]
|
||||
schedule = form.cleaned_data.get("schedule", "").strip()
|
||||
rough_url_count = urls.count('://')
|
||||
rough_url_count = len([url for url in urls.splitlines() if url.strip()])
|
||||
|
||||
# Build success message with schedule link if created
|
||||
schedule_msg = ""
|
||||
@@ -1080,10 +1108,6 @@ class WebAddView(AddView):
|
||||
'persona': defaults_form.fields['persona'].initial or 'Default',
|
||||
'config': {},
|
||||
}
|
||||
if defaults_form.fields['update'].initial:
|
||||
form_data['update'] = 'on'
|
||||
if defaults_form.fields['overwrite'].initial:
|
||||
form_data['overwrite'] = 'on'
|
||||
if defaults_form.fields['index_only'].initial:
|
||||
form_data['index_only'] = 'on'
|
||||
|
||||
@@ -1118,6 +1142,41 @@ def live_progress_view(request):
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]:
|
||||
normalized_hook_name = Path(hook_name).name if hook_name else ""
|
||||
if not normalized_hook_name:
|
||||
return (plugin, plugin, "unknown", "")
|
||||
|
||||
phase = "unknown"
|
||||
if normalized_hook_name.startswith("on_Crawl__"):
|
||||
phase = "crawl"
|
||||
elif normalized_hook_name.startswith("on_Snapshot__"):
|
||||
phase = "snapshot"
|
||||
elif normalized_hook_name.startswith("on_Binary__"):
|
||||
phase = "binary"
|
||||
|
||||
label = normalized_hook_name
|
||||
if "__" in normalized_hook_name:
|
||||
label = normalized_hook_name.split("__", 1)[1]
|
||||
label = label.rsplit(".", 1)[0]
|
||||
if len(label) > 3 and label[:2].isdigit() and label[2] == "_":
|
||||
label = label[3:]
|
||||
label = label.replace("_", " ").strip() or plugin
|
||||
|
||||
return (plugin, label, phase, normalized_hook_name)
|
||||
|
||||
def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]:
|
||||
hook_path = ""
|
||||
if isinstance(cmd, list) and cmd:
|
||||
first = cmd[0]
|
||||
if isinstance(first, str):
|
||||
hook_path = first
|
||||
|
||||
if not hook_path:
|
||||
return ("", "setup", "unknown", "")
|
||||
|
||||
return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup")
|
||||
|
||||
machine = Machine.current()
|
||||
orchestrator_proc = Process.objects.filter(
|
||||
machine=machine,
|
||||
@@ -1188,8 +1247,19 @@ def live_progress_view(request):
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
)
|
||||
recent_processes = Process.objects.filter(
|
||||
machine=machine,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
modified_at__gte=timezone.now() - timedelta(minutes=10),
|
||||
).order_by("-modified_at")
|
||||
crawl_process_pids: dict[str, int] = {}
|
||||
snapshot_process_pids: dict[str, int] = {}
|
||||
process_records_by_crawl: dict[str, list[dict[str, object]]] = {}
|
||||
process_records_by_snapshot: dict[str, list[dict[str, object]]] = {}
|
||||
seen_process_records: set[str] = set()
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
@@ -1197,11 +1267,48 @@ def live_progress_view(request):
|
||||
|
||||
crawl_id = env.get('CRAWL_ID')
|
||||
snapshot_id = env.get('SNAPSHOT_ID')
|
||||
_plugin, _label, phase, _hook_name = process_label(proc.cmd)
|
||||
if crawl_id and proc.pid:
|
||||
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
|
||||
if snapshot_id and proc.pid:
|
||||
if phase == "snapshot" and snapshot_id and proc.pid:
|
||||
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
|
||||
|
||||
for proc in recent_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
env = {}
|
||||
|
||||
crawl_id = env.get("CRAWL_ID")
|
||||
snapshot_id = env.get("SNAPSHOT_ID")
|
||||
if not crawl_id and not snapshot_id:
|
||||
continue
|
||||
|
||||
plugin, label, phase, hook_name = process_label(proc.cmd)
|
||||
|
||||
record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id)
|
||||
proc_key = f"{record_scope}:{plugin}:{label}:{proc.status}:{proc.exit_code}"
|
||||
if proc_key in seen_process_records:
|
||||
continue
|
||||
seen_process_records.add(proc_key)
|
||||
|
||||
status = "started" if proc.status == Process.StatusChoices.RUNNING else ("failed" if proc.exit_code not in (None, 0) else "succeeded")
|
||||
payload: dict[str, object] = {
|
||||
"id": str(proc.id),
|
||||
"plugin": plugin,
|
||||
"label": label,
|
||||
"hook_name": hook_name,
|
||||
"status": status,
|
||||
"phase": phase,
|
||||
"source": "process",
|
||||
"process_id": str(proc.id),
|
||||
}
|
||||
if status == "started" and proc.pid:
|
||||
payload["pid"] = proc.pid
|
||||
if phase == "snapshot" and snapshot_id:
|
||||
process_records_by_snapshot.setdefault(str(snapshot_id), []).append(payload)
|
||||
elif crawl_id:
|
||||
process_records_by_crawl.setdefault(str(crawl_id), []).append(payload)
|
||||
|
||||
active_crawls_qs = Crawl.objects.filter(
|
||||
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
|
||||
).prefetch_related(
|
||||
@@ -1234,6 +1341,11 @@ def live_progress_view(request):
|
||||
|
||||
# Calculate crawl progress
|
||||
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
|
||||
crawl_setup_plugins = list(process_records_by_crawl.get(str(crawl.id), []))
|
||||
crawl_setup_total = len(crawl_setup_plugins)
|
||||
crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded")
|
||||
crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed")
|
||||
crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued")
|
||||
|
||||
# Get active snapshots for this crawl (already prefetched)
|
||||
active_snapshots_for_crawl = []
|
||||
@@ -1241,28 +1353,21 @@ def live_progress_view(request):
|
||||
# Get archive results for this snapshot (already prefetched)
|
||||
snapshot_results = snapshot.archiveresult_set.all()
|
||||
|
||||
# Count in memory instead of DB queries
|
||||
total_plugins = len(snapshot_results)
|
||||
completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
|
||||
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
|
||||
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
|
||||
|
||||
# Calculate snapshot progress using per-plugin progress
|
||||
now = timezone.now()
|
||||
plugin_progress_values: list[int] = []
|
||||
all_plugins: list[dict[str, object]] = []
|
||||
seen_plugin_keys: set[str] = set()
|
||||
|
||||
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
|
||||
# Order: started first, then queued, then completed
|
||||
def plugin_sort_key(ar):
|
||||
status_order = {
|
||||
ArchiveResult.StatusChoices.STARTED: 0,
|
||||
ArchiveResult.StatusChoices.QUEUED: 1,
|
||||
ArchiveResult.StatusChoices.SUCCEEDED: 2,
|
||||
ArchiveResult.StatusChoices.FAILED: 3,
|
||||
ArchiveResult.StatusChoices.NORESULTS: 3,
|
||||
ArchiveResult.StatusChoices.FAILED: 4,
|
||||
}
|
||||
return (status_order.get(ar.status, 4), ar.plugin)
|
||||
return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "")
|
||||
|
||||
all_plugins = []
|
||||
for ar in sorted(snapshot_results, key=plugin_sort_key):
|
||||
status = ar.status
|
||||
progress_value = 0
|
||||
@@ -1270,6 +1375,7 @@ def live_progress_view(request):
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.NORESULTS,
|
||||
):
|
||||
progress_value = 100
|
||||
elif status == ArchiveResult.StatusChoices.STARTED:
|
||||
@@ -1284,20 +1390,49 @@ def live_progress_view(request):
|
||||
progress_value = 0
|
||||
|
||||
plugin_progress_values.append(progress_value)
|
||||
plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin)
|
||||
|
||||
plugin_payload = {
|
||||
'id': str(ar.id),
|
||||
'plugin': ar.plugin,
|
||||
'label': label,
|
||||
'hook_name': hook_name,
|
||||
'phase': phase,
|
||||
'status': status,
|
||||
'process_id': str(ar.process_id) if ar.process_id else None,
|
||||
}
|
||||
if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process:
|
||||
plugin_payload['pid'] = ar.process.pid
|
||||
if status == ArchiveResult.StatusChoices.STARTED:
|
||||
plugin_payload['progress'] = progress_value
|
||||
plugin_payload['timeout'] = ar.timeout or 120
|
||||
plugin_payload['source'] = 'archiveresult'
|
||||
all_plugins.append(plugin_payload)
|
||||
seen_plugin_keys.add(
|
||||
str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}"
|
||||
)
|
||||
|
||||
snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0
|
||||
for proc_payload in process_records_by_snapshot.get(str(snapshot.id), []):
|
||||
proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}")
|
||||
if proc_key in seen_plugin_keys:
|
||||
continue
|
||||
seen_plugin_keys.add(proc_key)
|
||||
all_plugins.append(proc_payload)
|
||||
|
||||
proc_status = proc_payload.get("status")
|
||||
if proc_status in ("succeeded", "failed", "skipped"):
|
||||
plugin_progress_values.append(100)
|
||||
elif proc_status == "started":
|
||||
plugin_progress_values.append(1)
|
||||
else:
|
||||
plugin_progress_values.append(0)
|
||||
|
||||
total_plugins = len(all_plugins)
|
||||
completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded")
|
||||
failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed")
|
||||
pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued")
|
||||
|
||||
snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0
|
||||
|
||||
active_snapshots_for_crawl.append({
|
||||
'id': str(snapshot.id),
|
||||
@@ -1334,6 +1469,11 @@ def live_progress_view(request):
|
||||
'started_snapshots': started_snapshots,
|
||||
'failed_snapshots': 0,
|
||||
'pending_snapshots': pending_snapshots,
|
||||
'setup_plugins': crawl_setup_plugins,
|
||||
'setup_total_plugins': crawl_setup_total,
|
||||
'setup_completed_plugins': crawl_setup_completed,
|
||||
'setup_failed_plugins': crawl_setup_failed,
|
||||
'setup_pending_plugins': crawl_setup_pending,
|
||||
'active_snapshots': active_snapshots_for_crawl,
|
||||
'can_start': can_start,
|
||||
'urls_preview': urls_preview,
|
||||
@@ -1461,7 +1601,11 @@ def find_config_source(key: str, merged_config: dict) -> str:
|
||||
"""Determine where a config value comes from."""
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
# Check if it's from archivebox.machine.config
|
||||
# Environment variables override all persistent config sources.
|
||||
if key in os.environ:
|
||||
return 'Environment'
|
||||
|
||||
# Machine.config overrides ArchiveBox.conf.
|
||||
try:
|
||||
machine = Machine.current()
|
||||
if machine.config and key in machine.config:
|
||||
@@ -1469,10 +1613,6 @@ def find_config_source(key: str, merged_config: dict) -> str:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Check if it's from environment variable
|
||||
if key in os.environ:
|
||||
return 'Environment'
|
||||
|
||||
# Check if it's from archivebox.config.file
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
|
||||
@@ -1483,6 +1623,43 @@ def find_config_source(key: str, merged_config: dict) -> str:
|
||||
return 'Default'
|
||||
|
||||
|
||||
def find_plugin_for_config_key(key: str) -> str | None:
|
||||
for plugin_name, schema in discover_plugin_configs().items():
|
||||
if key in (schema.get('properties') or {}):
|
||||
return plugin_name
|
||||
return None
|
||||
|
||||
|
||||
def get_config_definition_link(key: str) -> tuple[str, str]:
|
||||
plugin_name = find_plugin_for_config_key(key)
|
||||
if not plugin_name:
|
||||
return (
|
||||
f'https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code',
|
||||
'archivebox/config',
|
||||
)
|
||||
|
||||
plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None)
|
||||
if plugin_dir:
|
||||
builtin_root = BUILTIN_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(builtin_root):
|
||||
return (
|
||||
f'{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json',
|
||||
f'abx_plugins/plugins/{plugin_name}/config.json',
|
||||
)
|
||||
|
||||
user_root = USER_PLUGINS_DIR.resolve()
|
||||
if plugin_dir.is_relative_to(user_root):
|
||||
return (
|
||||
f'{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/',
|
||||
f'data/custom_plugins/{plugin_name}/config.json',
|
||||
)
|
||||
|
||||
return (
|
||||
f'{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/',
|
||||
f'abx_plugins/plugins/{plugin_name}/config.json',
|
||||
)
|
||||
|
||||
|
||||
@render_with_table_view
|
||||
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
CONFIGS = get_all_configs()
|
||||
@@ -1566,17 +1743,6 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
# Determine all sources for this config value
|
||||
sources_info = []
|
||||
|
||||
# Default value
|
||||
default_val = find_config_default(key)
|
||||
if default_val:
|
||||
sources_info.append(('Default', default_val, 'gray'))
|
||||
|
||||
# Config file value
|
||||
if CONSTANTS.CONFIG_FILE.exists():
|
||||
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
|
||||
if key in file_config:
|
||||
sources_info.append(('Config File', file_config[key], 'green'))
|
||||
|
||||
# Environment variable
|
||||
if key in os.environ:
|
||||
sources_info.append(('Environment', os.environ[key] if key_is_safe(key) else '********', 'blue'))
|
||||
@@ -1592,6 +1758,17 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Config file value
|
||||
if CONSTANTS.CONFIG_FILE.exists():
|
||||
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
|
||||
if key in file_config:
|
||||
sources_info.append(('Config File', file_config[key], 'green'))
|
||||
|
||||
# Default value
|
||||
default_val = find_config_default(key)
|
||||
if default_val:
|
||||
sources_info.append(('Default', default_val, 'gray'))
|
||||
|
||||
# Final computed value
|
||||
final_value = merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None)))
|
||||
if not key_is_safe(key):
|
||||
@@ -1614,6 +1791,8 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
section_header = mark_safe(f'[DYNAMIC CONFIG] <b><code style="color: lightgray">{key}</code></b> <small>(read-only, calculated at runtime)</small>')
|
||||
|
||||
|
||||
definition_url, definition_label = get_config_definition_link(key)
|
||||
|
||||
section_data = cast(SectionData, {
|
||||
"name": section_header,
|
||||
"description": None,
|
||||
@@ -1621,7 +1800,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
'Key': key,
|
||||
'Type': find_config_type(key),
|
||||
'Value': final_value,
|
||||
'Source': find_config_source(key, merged_config),
|
||||
'Currently read from': find_config_source(key, merged_config),
|
||||
},
|
||||
"help_texts": {
|
||||
'Key': mark_safe(f'''
|
||||
@@ -1631,14 +1810,14 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
</span>
|
||||
'''),
|
||||
'Type': mark_safe(f'''
|
||||
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
|
||||
See full definition in <code>archivebox/config</code>...
|
||||
<a href="{definition_url}" target="_blank" rel="noopener noreferrer">
|
||||
See full definition in <code>{definition_label}</code>...
|
||||
</a>
|
||||
'''),
|
||||
'Value': mark_safe(f'''
|
||||
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
|
||||
<br/><hr/><br/>
|
||||
<b>Configuration Sources (in priority order):</b><br/><br/>
|
||||
<b>Configuration Sources (highest priority first):</b><br/><br/>
|
||||
{sources_html}
|
||||
<br/><br/>
|
||||
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
|
||||
@@ -1651,15 +1830,15 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
}"</code>
|
||||
</p>
|
||||
'''),
|
||||
'Source': mark_safe(f'''
|
||||
'Currently read from': mark_safe(f'''
|
||||
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
|
||||
<br/><br/>
|
||||
Priority order (highest to lowest):
|
||||
<ol>
|
||||
<li><b style="color: blue">Environment</b> - Environment variables</li>
|
||||
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
|
||||
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
|
||||
</li>
|
||||
<li><b style="color: blue">Environment</b> - Environment variables</li>
|
||||
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
|
||||
<li><b style="color: gray">Default</b> - Default value from code</li>
|
||||
</ol>
|
||||
|
||||
@@ -131,7 +131,46 @@ class TagEditorWidget(forms.Widget):
|
||||
}};
|
||||
|
||||
window.updateHiddenInput_{widget_id} = function() {{
|
||||
document.getElementById('{widget_id}').value = currentTags_{widget_id}.join(',');
|
||||
var hiddenInput = document.getElementById('{widget_id}');
|
||||
if (!hiddenInput) {{
|
||||
return;
|
||||
}}
|
||||
hiddenInput.value = currentTags_{widget_id}.join(',');
|
||||
hiddenInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
hiddenInput.dispatchEvent(new Event('change', {{ bubbles: true }}));
|
||||
}};
|
||||
|
||||
function normalizeTags_{widget_id}(value) {{
|
||||
var rawTags = Array.isArray(value) ? value : String(value || '').split(',');
|
||||
var seen = {{}};
|
||||
return rawTags
|
||||
.map(function(tag) {{ return String(tag || '').trim(); }})
|
||||
.filter(function(tag) {{
|
||||
if (!tag) return false;
|
||||
var normalized = tag.toLowerCase();
|
||||
if (seen[normalized]) return false;
|
||||
seen[normalized] = true;
|
||||
return true;
|
||||
}})
|
||||
.sort(function(a, b) {{
|
||||
return a.toLowerCase().localeCompare(b.toLowerCase());
|
||||
}});
|
||||
}}
|
||||
|
||||
window.setTags_{widget_id} = function(value, options) {{
|
||||
currentTags_{widget_id} = normalizeTags_{widget_id}(value);
|
||||
rebuildPills_{widget_id}();
|
||||
if (!(options && options.skipHiddenUpdate)) {{
|
||||
updateHiddenInput_{widget_id}();
|
||||
}}
|
||||
}};
|
||||
|
||||
window.syncTagEditorFromHidden_{widget_id} = function() {{
|
||||
var hiddenInput = document.getElementById('{widget_id}');
|
||||
if (!hiddenInput) {{
|
||||
return;
|
||||
}}
|
||||
setTags_{widget_id}(hiddenInput.value, {{ skipHiddenUpdate: true }});
|
||||
}};
|
||||
|
||||
function computeTagStyle_{widget_id}(tagName) {{
|
||||
@@ -190,9 +229,7 @@ class TagEditorWidget(forms.Widget):
|
||||
|
||||
// Add to current tags
|
||||
currentTags_{widget_id}.push(tagName);
|
||||
currentTags_{widget_id}.sort(function(a, b) {{
|
||||
return a.toLowerCase().localeCompare(b.toLowerCase());
|
||||
}});
|
||||
currentTags_{widget_id} = normalizeTags_{widget_id}(currentTags_{widget_id});
|
||||
|
||||
// Rebuild pills
|
||||
rebuildPills_{widget_id}();
|
||||
@@ -252,6 +289,14 @@ class TagEditorWidget(forms.Widget):
|
||||
}}
|
||||
}});
|
||||
|
||||
document.getElementById('{widget_id}').addEventListener('change', function() {{
|
||||
syncTagEditorFromHidden_{widget_id}();
|
||||
}});
|
||||
|
||||
document.getElementById('{widget_id}').addEventListener('archivebox:sync-tags', function() {{
|
||||
syncTagEditorFromHidden_{widget_id}();
|
||||
}});
|
||||
|
||||
window.handleTagKeydown_{widget_id} = function(event) {{
|
||||
var input = event.target;
|
||||
var value = input.value.trim();
|
||||
@@ -320,6 +365,8 @@ class TagEditorWidget(forms.Widget):
|
||||
var input = document.querySelector('input[name="csrfmiddlewaretoken"]');
|
||||
return input ? input.value : '';
|
||||
}}
|
||||
|
||||
syncTagEditorFromHidden_{widget_id}();
|
||||
}})();
|
||||
</script>
|
||||
'''
|
||||
@@ -327,15 +374,232 @@ class TagEditorWidget(forms.Widget):
|
||||
return mark_safe(html)
|
||||
|
||||
|
||||
class URLFiltersWidget(forms.Widget):
|
||||
"""Render URL allowlist / denylist controls with same-domain autofill."""
|
||||
|
||||
template_name = ""
|
||||
|
||||
def __init__(self, attrs=None, *, source_selector='textarea[name="url"]'):
|
||||
self.source_selector = source_selector
|
||||
super().__init__(attrs)
|
||||
|
||||
def render(self, name, value, attrs=None, renderer=None):
|
||||
value = value if isinstance(value, dict) else {}
|
||||
widget_id_raw = attrs.get('id', name) if attrs else name
|
||||
widget_id = re.sub(r'[^A-Za-z0-9_]', '_', str(widget_id_raw)) or name
|
||||
allowlist = escape(value.get('allowlist', '') or '')
|
||||
denylist = escape(value.get('denylist', '') or '')
|
||||
|
||||
return mark_safe(f'''
|
||||
<div id="{widget_id}_container" class="url-filters-widget">
|
||||
<input type="hidden" name="{name}" value="">
|
||||
<div class="url-filters-grid">
|
||||
<div class="url-filters-column">
|
||||
<div class="url-filter-label-row">
|
||||
<label for="{widget_id}_allowlist" class="url-filter-label"><span class="url-filter-label-main">🟢 URL_ALLOWLIST</span></label>
|
||||
<span class="url-filter-label-note">Regex patterns or domains to exclude, one pattern per line.</span>
|
||||
</div>
|
||||
<textarea id="{widget_id}_allowlist"
|
||||
name="{name}_allowlist"
|
||||
rows="2"
|
||||
placeholder="^https?://([^/]+\\.)?(example\\.com|example\\.org)([:/]|$)">{allowlist}</textarea>
|
||||
</div>
|
||||
<div class="url-filters-column">
|
||||
<div class="url-filter-label-row">
|
||||
<label for="{widget_id}_denylist" class="url-filter-label"><span class="url-filter-label-main">⛔ URL_DENYLIST</span></label>
|
||||
<span class="url-filter-label-note">Regex patterns or domains to exclude, one pattern per line.</span>
|
||||
</div>
|
||||
<textarea id="{widget_id}_denylist"
|
||||
name="{name}_denylist"
|
||||
rows="2"
|
||||
placeholder="^https?://([^/]+\\.)?(cdn\\.example\\.com|analytics\\.example\\.org)([:/]|$)">{denylist}</textarea>
|
||||
</div>
|
||||
</div>
|
||||
<label class="url-filters-toggle" for="{widget_id}_same_domain_only">
|
||||
<input type="checkbox" id="{widget_id}_same_domain_only" name="{name}_same_domain_only" value="1">
|
||||
<span>Same domain only</span>
|
||||
</label>
|
||||
<div class="help-text">These values can be one regex pattern or domain per line. URL_DENYLIST takes precedence over URL_ALLOWLIST.</div>
|
||||
<script>
|
||||
(function() {{
|
||||
var allowlistField = document.getElementById('{widget_id}_allowlist');
|
||||
var denylistField = document.getElementById('{widget_id}_denylist');
|
||||
var sameDomainOnly = document.getElementById('{widget_id}_same_domain_only');
|
||||
var sourceField = document.querySelector({json.dumps(self.source_selector)});
|
||||
var lastAutoGeneratedAllowlist = '';
|
||||
if (!allowlistField || !sameDomainOnly || !sourceField) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
function extractUrl(line) {{
|
||||
var trimmed = String(line || '').trim();
|
||||
if (!trimmed || trimmed.charAt(0) === '#') {{
|
||||
return '';
|
||||
}}
|
||||
if (trimmed.charAt(0) === '{{') {{
|
||||
try {{
|
||||
var record = JSON.parse(trimmed);
|
||||
return String(record.url || '').trim();
|
||||
}} catch (error) {{
|
||||
return '';
|
||||
}}
|
||||
}}
|
||||
return trimmed;
|
||||
}}
|
||||
|
||||
function escapeRegex(text) {{
|
||||
return String(text || '').replace(/[.*+?^${{}}()|[\\]\\\\]/g, '\\\\$&');
|
||||
}}
|
||||
|
||||
function buildHostRegex(domains) {{
|
||||
if (!domains.length) {{
|
||||
return '';
|
||||
}}
|
||||
return '^https?://(' + domains.map(escapeRegex).join('|') + ')([:/]|$)';
|
||||
}}
|
||||
|
||||
function getConfigEditorRows() {{
|
||||
return document.getElementById('id_config_rows');
|
||||
}}
|
||||
|
||||
function getConfigUpdater() {{
|
||||
return window.updateHiddenField_id_config || null;
|
||||
}}
|
||||
|
||||
function findConfigRow(key) {{
|
||||
var rows = getConfigEditorRows();
|
||||
if (!rows) {{
|
||||
return null;
|
||||
}}
|
||||
var matches = Array.prototype.filter.call(rows.querySelectorAll('.key-value-row'), function(row) {{
|
||||
var keyInput = row.querySelector('.kv-key');
|
||||
return keyInput && keyInput.value.trim() === key;
|
||||
}});
|
||||
return matches.length ? matches[0] : null;
|
||||
}}
|
||||
|
||||
function addConfigRow() {{
|
||||
if (typeof window.addKeyValueRow_id_config === 'function') {{
|
||||
window.addKeyValueRow_id_config();
|
||||
var rows = getConfigEditorRows();
|
||||
return rows ? rows.lastElementChild : null;
|
||||
}}
|
||||
return null;
|
||||
}}
|
||||
|
||||
function setConfigRow(key, value) {{
|
||||
var rows = getConfigEditorRows();
|
||||
var updater = getConfigUpdater();
|
||||
if (!rows || !updater) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
var row = findConfigRow(key);
|
||||
if (!value) {{
|
||||
if (row) {{
|
||||
row.remove();
|
||||
updater();
|
||||
}}
|
||||
return;
|
||||
}}
|
||||
|
||||
if (!row) {{
|
||||
row = addConfigRow();
|
||||
}}
|
||||
if (!row) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
var keyInput = row.querySelector('.kv-key');
|
||||
var valueInput = row.querySelector('.kv-value');
|
||||
if (!keyInput || !valueInput) {{
|
||||
return;
|
||||
}}
|
||||
|
||||
keyInput.value = key;
|
||||
valueInput.value = value;
|
||||
keyInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
valueInput.dispatchEvent(new Event('input', {{ bubbles: true }}));
|
||||
updater();
|
||||
}}
|
||||
|
||||
function syncConfigEditor() {{
|
||||
setConfigRow('URL_ALLOWLIST', allowlistField.value.trim());
|
||||
setConfigRow('URL_DENYLIST', denylistField ? denylistField.value.trim() : '');
|
||||
}}
|
||||
|
||||
function syncAllowlistFromUrls() {{
|
||||
if (!sameDomainOnly.checked) {{
|
||||
if (allowlistField.value.trim() === lastAutoGeneratedAllowlist) {{
|
||||
allowlistField.value = '';
|
||||
syncConfigEditor();
|
||||
}}
|
||||
lastAutoGeneratedAllowlist = '';
|
||||
return;
|
||||
}}
|
||||
|
||||
var seen = Object.create(null);
|
||||
var domains = [];
|
||||
sourceField.value.split(/\\n+/).forEach(function(line) {{
|
||||
var url = extractUrl(line);
|
||||
if (!url) {{
|
||||
return;
|
||||
}}
|
||||
try {{
|
||||
var parsed = new URL(url);
|
||||
var domain = String(parsed.hostname || '').toLowerCase();
|
||||
if (!domain || seen[domain]) {{
|
||||
return;
|
||||
}}
|
||||
seen[domain] = true;
|
||||
domains.push(domain);
|
||||
}} catch (error) {{
|
||||
return;
|
||||
}}
|
||||
}});
|
||||
lastAutoGeneratedAllowlist = buildHostRegex(domains);
|
||||
allowlistField.value = lastAutoGeneratedAllowlist;
|
||||
syncConfigEditor();
|
||||
}}
|
||||
|
||||
sameDomainOnly.addEventListener('change', syncAllowlistFromUrls);
|
||||
sourceField.addEventListener('input', syncAllowlistFromUrls);
|
||||
sourceField.addEventListener('change', syncAllowlistFromUrls);
|
||||
allowlistField.addEventListener('input', syncConfigEditor);
|
||||
allowlistField.addEventListener('change', syncConfigEditor);
|
||||
if (denylistField) {{
|
||||
denylistField.addEventListener('input', syncConfigEditor);
|
||||
denylistField.addEventListener('change', syncConfigEditor);
|
||||
}}
|
||||
|
||||
if (document.readyState === 'loading') {{
|
||||
document.addEventListener('DOMContentLoaded', syncConfigEditor, {{ once: true }});
|
||||
}} else {{
|
||||
syncConfigEditor();
|
||||
}}
|
||||
}})();
|
||||
</script>
|
||||
</div>
|
||||
''')
|
||||
|
||||
def value_from_datadict(self, data, files, name):
|
||||
return {
|
||||
'allowlist': data.get(f'{name}_allowlist', ''),
|
||||
'denylist': data.get(f'{name}_denylist', ''),
|
||||
'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'),
|
||||
}
|
||||
|
||||
|
||||
class InlineTagEditorWidget(TagEditorWidget):
|
||||
"""
|
||||
Inline version of TagEditorWidget for use in list views.
|
||||
Includes AJAX save functionality for immediate persistence.
|
||||
"""
|
||||
|
||||
def __init__(self, attrs=None, snapshot_id=None):
|
||||
def __init__(self, attrs=None, snapshot_id=None, editable=True):
|
||||
super().__init__(attrs, snapshot_id)
|
||||
self.snapshot_id = snapshot_id
|
||||
self.editable = editable
|
||||
|
||||
def render(self, name, value, attrs=None, renderer=None, snapshot_id=None):
|
||||
"""Render inline tag editor with AJAX save."""
|
||||
@@ -361,20 +625,24 @@ class InlineTagEditorWidget(TagEditorWidget):
|
||||
# Build pills HTML with filter links
|
||||
pills_html = ''
|
||||
for td in tag_data:
|
||||
remove_button = ''
|
||||
if self.editable:
|
||||
remove_button = (
|
||||
f'<button type="button" class="tag-remove-btn" '
|
||||
f'data-tag-id="{td["id"]}" data-tag-name="{self._escape(td["name"])}">×</button>'
|
||||
)
|
||||
pills_html += f'''
|
||||
<span class="tag-pill" data-tag="{self._escape(td['name'])}" data-tag-id="{td['id']}" style="{self._tag_style(td['name'])}">
|
||||
<a href="/admin/core/snapshot/?tags__id__exact={td['id']}" class="tag-link">{self._escape(td['name'])}</a>
|
||||
<button type="button" class="tag-remove-btn" data-tag-id="{td['id']}" data-tag-name="{self._escape(td['name'])}">×</button>
|
||||
{remove_button}
|
||||
</span>
|
||||
'''
|
||||
|
||||
tags_json = escape(json.dumps(tag_data))
|
||||
|
||||
html = f'''
|
||||
<span id="{widget_id}_container" class="tag-editor-inline" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}">
|
||||
<span id="{widget_id}_pills" class="tag-pills-inline">
|
||||
{pills_html}
|
||||
</span>
|
||||
input_html = ''
|
||||
readonly_class = ' readonly' if not self.editable else ''
|
||||
if self.editable:
|
||||
input_html = f'''
|
||||
<input type="text"
|
||||
id="{widget_id}_input"
|
||||
class="tag-inline-input-sm"
|
||||
@@ -384,6 +652,14 @@ class InlineTagEditorWidget(TagEditorWidget):
|
||||
data-inline-tag-input="1"
|
||||
>
|
||||
<datalist id="{widget_id}_datalist"></datalist>
|
||||
'''
|
||||
|
||||
html = f'''
|
||||
<span id="{widget_id}_container" class="tag-editor-inline{readonly_class}" data-snapshot-id="{snapshot_id}" data-tags="{tags_json}" data-readonly="{int(not self.editable)}">
|
||||
<span id="{widget_id}_pills" class="tag-pills-inline">
|
||||
{pills_html}
|
||||
</span>
|
||||
{input_html}
|
||||
</span>
|
||||
'''
|
||||
|
||||
|
||||
Reference in New Issue
Block a user