__package__ = "archivebox.core" import html import json import os import shlex from pathlib import Path from urllib.parse import quote from functools import reduce from operator import and_ from django.contrib import admin from django.db.models import Min, Q, TextField from django.db.models.functions import Cast from django.utils.html import format_html from django.utils.safestring import mark_safe from django.core.exceptions import ValidationError from django.urls import reverse, resolve from django.utils import timezone from django.utils.text import smart_split from archivebox.config import DATA_DIR from archivebox.config.common import SERVER_CONFIG from archivebox.misc.paginators import AcceleratedPaginator from archivebox.base_models.admin import BaseModelAdmin from archivebox.hooks import get_plugin_icon from archivebox.core.host_utils import build_snapshot_url from archivebox.core.widgets import InlineTagEditorWidget from archivebox.core.views import LIVE_PLUGIN_BASE_URL from archivebox.machine.env_utils import env_to_shell_exports from archivebox.core.models import ArchiveResult, Snapshot def _quote_shell_string(value: str) -> str: return "'" + str(value).replace("'", "'\"'\"'") + "'" def _get_replay_source_url(result: ArchiveResult) -> str: process_env = getattr(getattr(result, "process", None), "env", None) or {} return str(process_env.get("SOURCE_URL") or result.snapshot.url or "") def build_abx_dl_display_command(result: ArchiveResult) -> str: source_url = _get_replay_source_url(result) plugin_name = str(result.plugin or "").strip() if not plugin_name and not source_url: return "abx-dl" if not source_url: return f"abx-dl --plugins={plugin_name}" return f"abx-dl --plugins={plugin_name} {_quote_shell_string(source_url)}" def build_abx_dl_replay_command(result: ArchiveResult) -> str: display_command = build_abx_dl_display_command(result) process = getattr(result, "process", None) env_items = env_to_shell_exports(getattr(process, "env", None) or {}) 
snapshot_dir = shlex.quote(str(result.snapshot_dir)) if env_items: return f"cd {snapshot_dir}; env {env_items} {display_command}" return f"cd {snapshot_dir}; {display_command}" def get_plugin_admin_url(plugin_name: str) -> str: from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, iter_plugin_dirs plugin_dir = next((path.resolve() for path in iter_plugin_dirs() if path.name == plugin_name), None) if plugin_dir: builtin_root = BUILTIN_PLUGINS_DIR.resolve() if plugin_dir.is_relative_to(builtin_root): return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/" user_root = USER_PLUGINS_DIR.resolve() if plugin_dir.is_relative_to(user_root): return f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/" return f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/" def render_archiveresults_list(archiveresults_qs, limit=50): """Render a nice inline list view of archive results with status, plugin, output, and actions.""" result_ids = list(archiveresults_qs.order_by("plugin").values_list("pk", flat=True)[:limit]) if not result_ids: return mark_safe('
{str(result.id)[-8:]}
{str(result.id)}
Version: {version}
PWD: {result.pwd or "-"}
{full_output}
{}
', obj.end_ts.strftime("%Y-%m-%d %H:%M:%S")) def result_id(self, obj): return format_html( '[{}]',
reverse("admin:core_archiveresult_change", args=(obj.id,)),
str(obj.id)[:8],
)
def command(self, obj):
    """Render the stored command as a single space-joined string (empty when no cmd is set)."""
    argv = obj.cmd or []
    return format_html("{}", " ".join(argv))
def version(self, obj):
    """Render the recorded command version, or "-" when none is stored."""
    cmd_version = obj.cmd_version or "-"
    return format_html("{}", cmd_version)
def get_formset(self, request, obj=None, **kwargs):
    """Build the formset, then seed initial values and hide bookkeeping fields.

    Initial values are always applied to the form's base fields. When *obj*
    is not None, the timing/command fields are additionally switched to
    their hidden widgets.
    """
    formset = super().get_formset(request, obj, **kwargs)
    snapshot = self.get_parent_object_from_request(request)
    base_fields = getattr(getattr(formset, "form", None), "base_fields", {})
    snapshot_output_dir = str(snapshot.output_dir) if snapshot else ""

    # default values for new entries
    initial_values = (
        ("status", "succeeded"),
        ("start_ts", timezone.now()),
        ("end_ts", timezone.now()),
        ("cmd_version", "-"),
        ("pwd", snapshot_output_dir),
        ("cmd", '["-"]'),
        ("output_str", "Manually recorded cmd output..."),
    )
    for field_name, initial in initial_values:
        base_fields[field_name].initial = initial

    if obj is not None:
        # hidden values for existing entries and new entries
        for field_name in ("start_ts", "end_ts", "cmd", "pwd", "cmd_version"):
            field = base_fields[field_name]
            field.widget = field.hidden_widget()

    return formset
def get_readonly_fields(self, request, obj=None):
    """Return the configured read-only fields when *obj* is given, otherwise an empty list."""
    if obj is None:
        return []
    return self.readonly_fields
# Django admin configuration for ArchiveResult (registered on the site by register_admin).
class ArchiveResultAdmin(BaseModelAdmin):
# Changelist columns, in display order; each entry names a method or field of this admin/model.
list_display = (
"details_link",
"zip_link",
"created_at",
"snapshot_info",
"tags_inline",
"status_badge",
"plugin_with_icon",
"process_link",
"machine_link",
"cmd_str",
"output_str_display",
)
# None means no changelist column is auto-linked to the change form;
# details_link builds its own cell from the change URL instead.
list_display_links = None
# presumably consumed by BaseModelAdmin to whitelist sortable columns — confirm against the base class
sort_fields = ("id", "created_at", "plugin", "status")
# Fields rendered read-only on the change form (several are computed display methods).
readonly_fields = (
"admin_actions",
"cmd",
"cmd_version",
"pwd",
"cmd_str",
"snapshot_info",
"tags_str",
"created_at",
"modified_at",
"output_summary",
"plugin_with_icon",
"process_link",
)
# Declared search lookups; get_search_results below builds its own query over the same set of columns.
search_fields = (
"snapshot__id",
"snapshot__url",
"snapshot__tags__name",
"snapshot__crawl_id",
"plugin",
"hook_name",
"output_str",
"output_json",
"process__cmd",
)
autocomplete_fields = ["snapshot"]
# Change-form layout: one (title, options) pair per section.
fieldsets = (
(
"Actions",
{
"fields": ("admin_actions",),
"classes": ("card", "wide"),
},
),
(
"Snapshot",
{
"fields": ("snapshot", "snapshot_info", "tags_str"),
"classes": ("card", "wide"),
},
),
(
"Plugin",
{
"fields": ("plugin_with_icon", "process_link", "status"),
"classes": ("card",),
},
),
(
"Timing",
{
"fields": ("start_ts", "end_ts", "created_at", "modified_at"),
"classes": ("card",),
},
),
(
"Command",
{
"fields": ("cmd", "cmd_str", "cmd_version", "pwd"),
"classes": ("card",),
},
),
(
"Output",
{
"fields": ("output_str", "output_json", "output_files", "output_size", "output_mimetypes", "output_summary"),
"classes": ("card", "wide"),
},
),
)
list_filter = ("status", "plugin", "start_ts")
# Newest results first by default.
ordering = ["-start_ts"]
# Page size reuses the snapshots-per-page server setting.
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
paginator = AcceleratedPaginator
save_on_top = True
# Only the bulk delete action is offered on the changelist.
actions = ["delete_selected"]
# NOTE(review): stock Django ModelAdmin does not read an inner Meta class; verbose names
# normally live on the model's Meta — confirm BaseModelAdmin actually uses this.
class Meta:
verbose_name = "Archive Result"
verbose_name_plural = "Archive Results"
def change_view(self, request, object_id, form_url="", extra_context=None):
    """Remember the current request on the admin instance, then delegate to the parent change view."""
    # The URL helper methods read this back via getattr(self, "request", None).
    # NOTE(review): admin instances are typically shared across requests, so a concurrent
    # request may overwrite this attribute — confirm that is acceptable here.
    self.request = request
    response = super().change_view(request, object_id, form_url, extra_context)
    return response
def changelist_view(self, request, extra_context=None):
    """Remember the current request on the admin instance, then delegate to the parent changelist view."""
    # The URL helper methods read this back via getattr(self, "request", None).
    self.request = request
    response = super().changelist_view(request, extra_context)
    return response
def get_queryset(self, request):
    """Return the base queryset with snapshot/process joined, tags prefetched and a first-tag annotation."""
    base_qs = super().get_queryset(request)
    with_relations = base_qs.select_related("snapshot", "process").prefetch_related("snapshot__tags")
    # Min() over the tag names yields the alphabetically first tag of each result's snapshot.
    return with_relations.annotate(snapshot_first_tag=Min("snapshot__tags__name"))
def get_search_results(self, request, queryset, search_term):
    """Filter *queryset* so that every search term matches at least one searchable column.

    Terms are split with smart_split; a term wrapped in matching single or
    double quotes has the quotes removed. Each term is matched
    case-insensitively (icontains) against the columns below, and the
    per-term conditions are AND-ed together.

    Returns:
        (queryset, may_have_duplicates): the unfiltered queryset and False
        when there is nothing to search for, otherwise the filtered,
        de-duplicated queryset and True.
    """
    if not search_term:
        return queryset, False

    # Non-text columns are cast to text so that icontains can be applied to them.
    queryset = queryset.annotate(
        snapshot_id_text=Cast("snapshot__id", output_field=TextField()),
        snapshot_crawl_id_text=Cast("snapshot__crawl_id", output_field=TextField()),
        output_json_text=Cast("output_json", output_field=TextField()),
        cmd_text=Cast("process__cmd", output_field=TextField()),
    )

    terms = []
    for raw_bit in smart_split(search_term):
        is_quoted = len(raw_bit) >= 2 and raw_bit[0] == raw_bit[-1] and raw_bit[0] in {'"', "'"}
        term = (raw_bit[1:-1] if is_quoted else raw_bit).strip()
        if term:
            terms.append(term)
    if not terms:
        return queryset, False

    searchable_columns = (
        "snapshot_id_text",
        "snapshot__url",
        "snapshot__tags__name",
        "snapshot_crawl_id_text",
        "plugin",
        "hook_name",
        "output_str",
        "output_json_text",
        "cmd_text",
    )
    filters = []
    for term in terms:
        any_column_matches = Q()
        for column in searchable_columns:
            any_column_matches |= Q(**{f"{column}__icontains": term})
        filters.append(any_column_matches)

    # distinct() collapses duplicate rows produced by the join on snapshot tags.
    return queryset.filter(reduce(and_, filters)).distinct(), True
def get_snapshot_view_url(self, result: ArchiveResult) -> str:
    """Return the URL of the snapshot that *result* belongs to."""
    current_request = getattr(self, "request", None)
    return build_snapshot_url(str(result.snapshot_id), request=current_request)
def get_output_view_url(self, result: ArchiveResult) -> str:
    """Return the snapshot URL of the result's output.

    The path is the result's embed path when it has one, otherwise the plugin
    name, otherwise the empty string.
    """
    embed_path = result.embed_path() if hasattr(result, "embed_path") else None
    output_path = embed_path or result.plugin or ""
    current_request = getattr(self, "request", None)
    return build_snapshot_url(str(result.snapshot_id), output_path, request=current_request)
def get_output_files_url(self, result: ArchiveResult) -> str:
    """Return the URL of the plugin's output directory with the `files=1` query flag appended."""
    plugin_dir_url = build_snapshot_url(
        str(result.snapshot_id),
        result.plugin,
        request=getattr(self, "request", None),
    )
    return f"{plugin_dir_url}/?files=1"
def get_output_zip_url(self, result: ArchiveResult) -> str:
    """Return the output files URL with `download=zip` added as a further query parameter."""
    files_url = self.get_output_files_url(result)
    return f"{files_url}&download=zip"
@admin.display(description="Details", ordering="id")
def details_link(self, result):
    """Build the "Details" cell from the result's admin change URL and the last 8 characters of its id."""
    change_url = reverse("admin:core_archiveresult_change", args=[result.id])
    short_id = str(result.id)[-8:]
    return format_html('{}', change_url, short_id)
@admin.display(description="Zip")
def zip_link(self, result):
    """Build the "Zip" cell, passing the result's zip download URL to the cell template."""
    zip_url = self.get_output_zip_url(result)
    return format_html('⬇ ZIP', zip_url)
@admin.display(
description="Snapshot",
ordering="snapshot__url",
)
def snapshot_info(self, result):
# NOTE(review): this body looks incomplete. `icon` is never assigned in the visible code,
# `snapshot_id` is assigned but not used, and five values are passed to a template with
# four placeholders. The plugin URL/name arguments suggest that lines belonging to a
# separate plugin display method were merged in here — restore from version control
# before changing any logic.
snapshot_id = str(result.snapshot_id)
return format_html(
'[{}] {} {}{}',
get_plugin_admin_url(result.plugin),
result.plugin,
icon,
get_plugin_admin_url(result.plugin),
result.plugin,
)
@admin.display(description="Process", ordering="process__pid")
def process_link(self, result):
    """Build the "Process" cell from the process admin URL and its pid; "-" when no process is linked."""
    if not result.process_id:
        return "-"

    process = result.process
    # A linked process without a pid is still linked, labelled "-".
    pid_label = process.pid if process and process.pid else "-"
    process_url = reverse("admin:machine_process_change", args=[result.process_id])
    return format_html('{}', process_url, pid_label)
@admin.display(description="Machine", ordering="process__machine__hostname")
def machine_link(self, result):
    """Build the "Machine" cell from the machine admin URL, short id and hostname.

    Returns "-" when the result has no process or the process has no machine.
    """
    has_machine = result.process_id and result.process and result.process.machine_id
    if not has_machine:
        return "-"

    machine = result.process.machine
    machine_url = reverse("admin:machine_machine_change", args=[machine.id])
    short_id = str(machine.id)[:8]
    return format_html('{} {}', machine_url, short_id, machine.hostname)
@admin.display(description="Command")
def cmd_str(self, result):
    """Build the "Command" cell from the replay command (passed twice) and the short display command."""
    display_cmd = build_abx_dl_display_command(result)
    replay_cmd = build_abx_dl_replay_command(result)
    # NOTE(review): the template below contains only a newline, so none of the three
    # arguments are rendered — confirm the intended markup is present in the real file.
    template = """
"""
    return format_html(template, replay_cmd, replay_cmd, display_cmd)
def output_display(self, result):
    """Build an output cell from the snapshot URL of the result's output and its output_str.

    The embed path is used only for succeeded results that have one;
    everything else points at "index.html".
    """
    # Determine output link path - use embed_path() which checks output_files
    output_path = "index.html"
    if hasattr(result, "embed_path"):
        embed_path = result.embed_path()
        if result.status == "succeeded" and embed_path:
            output_path = embed_path

    output_url = build_snapshot_url(str(result.snapshot_id), output_path)
    return format_html('↗️{}', output_url, result.output_str)
@admin.display(description="Output", ordering="output_str")
def output_str_display(self, result):
    """Build the "Output" cell from the stripped output_str; "-" when it is empty.

    When the result has an embed path, the snapshot URL for that path is
    passed to the template ahead of the text.
    """
    output_text = str(result.output_str or "").strip()
    if not output_text:
        return "-"

    live_path = result.embed_path() if hasattr(result, "embed_path") else None
    if not live_path:
        return format_html(
            '{}',
            output_text,
            output_text,
        )

    live_url = build_snapshot_url(str(result.snapshot_id), live_path)
    return format_html(
        '{}',
        live_url,
        output_text,
        output_text,
    )
@admin.display(description="")
def admin_actions(self, result):
    """Build the actions row from the output view, output files, zip download and snapshot URLs."""
    output_view_url = self.get_output_view_url(result)
    output_files_url = self.get_output_files_url(result)
    output_zip_url = self.get_output_zip_url(result)
    snapshot_view_url = self.get_snapshot_view_url(result)
    # NOTE(review): the template below contains only a newline, so none of the four
    # URLs are rendered — confirm the intended markup is present in the real file.
    template = """
"""
    return format_html(template, output_view_url, output_files_url, output_zip_url, snapshot_view_url)
def output_summary(self, result):
# Builds an HTML fragment listing the files found on disk for this result.
# NOTE(review): several string literals in this method span a line break inside single
# quotes, which is not valid Python — the markup they contained appears to be missing.
# Restore the literals before changing any logic here.
# The directory is derived from result.pwd: whatever follows the first "data/" is
# re-rooted under DATA_DIR (if "data/" is absent, the whole pwd string is used).
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split("data/", 1)[-1]
# NOTE(review): `snapshot_id` is not assigned anywhere in the visible method body.
output_html = format_html(
'{}',
build_snapshot_url(snapshot_id, "index.html"),
)
embed_path = result.embed_path() if hasattr(result, "embed_path") else ""
path_from_embed = snapshot_dir / (embed_path or "")
output_html += format_html(
'{}/{}
',
str(snapshot_dir),
str(embed_path),
)
# Walk the embed path when it is readable, otherwise fall back to the snapshot directory.
if os.access(path_from_embed, os.R_OK):
root_dir = str(path_from_embed)
else:
root_dir = str(snapshot_dir)
# print(root_dir, str(list(os.walk(root_dir))))
for root, dirs, files in os.walk(root_dir):
# depth is 1 for root_dir itself and 2 for its direct children; deeper directories are
# skipped from the listing (os.walk still descends into them, since dirs is not pruned).
depth = root.replace(root_dir, "").count(os.sep) + 1
if depth > 2:
continue
indent = " " * 4 * (depth)
output_html += format_html('{}{}/
', indent, os.path.basename(root))
indentation_str = " " * 4 * (depth + 1)
for filename in sorted(files):
# Dotfiles are flagged: the first template argument is 0 for hidden files and 1 otherwise.
is_hidden = filename.startswith(".")
output_html += format_html(
'{}{}
',
int(not is_hidden),
indentation_str,
filename.strip(),
)
return output_html + mark_safe("")
def register_admin(admin_site):
    """Register the ArchiveResult model with ArchiveResultAdmin on *admin_site*."""
    admin_site.register(ArchiveResult, admin_class=ArchiveResultAdmin)