Files
ArchiveBox/archivebox/crawls/admin.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

833 lines
33 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
__package__ = "archivebox.crawls"
from django import forms
from django.http import JsonResponse, HttpRequest, HttpResponseNotAllowed
from django.shortcuts import get_object_or_404, redirect
from django.urls import path, reverse
from django.utils.html import escape, format_html, format_html_join
from django.utils import timezone
from django.utils.safestring import mark_safe
from django.contrib import admin, messages
from django.db.models import Count, Q
from django_object_actions import action
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.core.models import Snapshot
from archivebox.core.widgets import TagEditorWidget
from archivebox.crawls.models import Crawl, CrawlSchedule
def render_snapshots_list(snapshots_qs, limit=20, crawl=None):
"""Render a nice inline list view of snapshots with status, title, URL, and progress."""
snapshots = snapshots_qs.order_by("-created_at")[:limit].annotate(
total_results=Count("archiveresult"),
succeeded_results=Count("archiveresult", filter=Q(archiveresult__status="succeeded")),
failed_results=Count("archiveresult", filter=Q(archiveresult__status="failed")),
started_results=Count("archiveresult", filter=Q(archiveresult__status="started")),
skipped_results=Count("archiveresult", filter=Q(archiveresult__status="skipped")),
)
if not snapshots:
return mark_safe('<div style="color: #666; font-style: italic; padding: 8px 0;">No Snapshots yet...</div>')
# Status colors matching Django admin and progress monitor
status_colors = {
"queued": ("#6c757d", "#f8f9fa"), # gray
"started": ("#856404", "#fff3cd"), # amber
"sealed": ("#155724", "#d4edda"), # green
"failed": ("#721c24", "#f8d7da"), # red
}
rows = []
for snapshot in snapshots:
status = snapshot.status or "queued"
color, bg = status_colors.get(status, ("#6c757d", "#f8f9fa"))
# Calculate progress
total = snapshot.total_results
succeeded = snapshot.succeeded_results
failed = snapshot.failed_results
running = snapshot.started_results
skipped = snapshot.skipped_results
done = succeeded + failed + skipped
pending = max(total - done - running, 0)
progress_pct = int((done / total) * 100) if total > 0 else 0
progress_text = f"{done}/{total}" if total > 0 else "-"
progress_title = f"{succeeded} succeeded, {failed} failed, {running} running, {pending} pending, {skipped} skipped"
progress_color = "#28a745"
if failed:
progress_color = "#dc3545"
elif running:
progress_color = "#17a2b8"
elif pending:
progress_color = "#ffc107"
# Truncate title and URL
snapshot_title = snapshot.title or "Untitled"
title = snapshot_title[:60]
if len(snapshot_title) > 60:
title += "..."
url_display = snapshot.url[:50]
if len(snapshot.url) > 50:
url_display += "..."
delete_button = ""
exclude_button = ""
if crawl is not None:
delete_url = reverse("admin:crawls_crawl_snapshot_delete", args=[crawl.pk, snapshot.pk])
exclude_url = reverse("admin:crawls_crawl_snapshot_exclude_domain", args=[crawl.pk, snapshot.pk])
delete_button = f'''
<button type="button"
class="crawl-snapshots-action"
data-post-url="{escape(delete_url)}"
data-confirm="Delete this snapshot from the crawl?"
title="Delete this snapshot from the crawl and remove its URL from the crawl queue."
aria-label="Delete snapshot"
style="border: 1px solid #ddd; background: #fff; color: #666; border-radius: 4px; width: 28px; height: 28px; cursor: pointer;">🗑</button>
'''
exclude_button = f'''
<button type="button"
class="crawl-snapshots-action"
data-post-url="{escape(exclude_url)}"
data-confirm="Exclude this domain from the crawl? This removes matching queued URLs, deletes pending matching snapshots, and blocks future matches."
title="Exclude this domain from this crawl. This removes matching URLs from the crawl queue, deletes pending matching snapshots, and blocks future matches."
aria-label="Exclude domain from crawl"
style="border: 1px solid #ddd; background: #fff; color: #666; border-radius: 4px; width: 28px; height: 28px; cursor: pointer;">⊘</button>
'''
# Format date
date_str = snapshot.created_at.strftime("%Y-%m-%d %H:%M") if snapshot.created_at else "-"
rows.append(f'''
<tr style="border-bottom: 1px solid #eee;">
<td style="padding: 6px 8px; white-space: nowrap;">
<span style="display: inline-block; padding: 2px 8px; border-radius: 10px;
font-size: 11px; font-weight: 500; text-transform: uppercase;
color: {color}; background: {bg};">{status}</span>
</td>
<td style="padding: 6px 8px; white-space: nowrap;">
<a href="/{snapshot.archive_path}/" style="text-decoration: none;">
<img src="/{snapshot.archive_path}/favicon.ico"
style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;"
onerror="this.style.display='none'"/>
</a>
</td>
<td style="padding: 6px 8px; max-width: 300px;">
<a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
title="{escape(snapshot_title)}">{escape(title)}</a>
</td>
<td style="padding: 6px 8px; max-width: 250px;">
<a href="{escape(snapshot.url)}" target="_blank"
style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
title="{escape(snapshot.url)}">{escape(url_display)}</a>
</td>
<td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
<div style="display: inline-flex; align-items: center; gap: 6px;" title="{escape(progress_title)}">
<div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
<div style="width: {progress_pct}%; height: 100%;
background: {progress_color};
transition: width 0.3s;"></div>
</div>
<a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
style="font-size: 11px; color: #417690; min-width: 35px; text-decoration: none;"
title="View archive results">{progress_text}</a>
</div>
</td>
<td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
{date_str}
</td>
{f'<td style="padding: 6px 8px; white-space: nowrap; text-align: right;"><div style="display: inline-flex; gap: 6px;">{exclude_button}{delete_button}</div></td>' if crawl is not None else ""}
</tr>
''')
total_count = snapshots_qs.count()
footer = ""
if total_count > limit:
footer = f"""
<tr>
<td colspan="6" style="padding: 8px; text-align: center; color: #666; font-size: 12px; background: #f8f9fa;">
Showing {limit} of {total_count} snapshots
</td>
</tr>
"""
return mark_safe(f"""
<div data-crawl-snapshots-list style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
<table style="width: 100%; border-collapse: collapse; font-size: 13px;">
<thead>
<tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Status</th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333; width: 24px;"></th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Title</th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
<th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
{
'<th style="padding: 8px; text-align: right; font-weight: 600; color: #333;">Actions</th>' if crawl is not None else ""
}
</tr>
</thead>
<tbody>
{"".join(rows)}
{footer}
</tbody>
</table>
</div>
{
'''
<script>
(function() {
if (window.__archiveboxCrawlSnapshotActionsBound) {
return;
}
window.__archiveboxCrawlSnapshotActionsBound = true;
function getCookie(name) {
var cookieValue = null;
if (!document.cookie) {
return cookieValue;
}
var cookies = document.cookie.split(';');
for (var i = 0; i < cookies.length; i++) {
var cookie = cookies[i].trim();
if (cookie.substring(0, name.length + 1) === (name + '=')) {
cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
break;
}
}
return cookieValue;
}
document.addEventListener('click', function(event) {
var button = event.target.closest('.crawl-snapshots-action');
if (!button) {
return;
}
event.preventDefault();
var confirmMessage = button.getAttribute('data-confirm');
if (confirmMessage && !window.confirm(confirmMessage)) {
return;
}
button.disabled = true;
fetch(button.getAttribute('data-post-url'), {
method: 'POST',
credentials: 'same-origin',
headers: {
'X-CSRFToken': getCookie('csrftoken') || '',
'X-Requested-With': 'XMLHttpRequest'
}
}).then(function(response) {
return response.json().then(function(data) {
if (!response.ok) {
throw new Error(data.error || 'Request failed');
}
return data;
});
}).then(function() {
window.location.reload();
}).catch(function(error) {
button.disabled = false;
window.alert(error.message || 'Request failed');
});
});
})();
</script>
'''
if crawl is not None
else ""
}
""")
class URLFiltersWidget(forms.Widget):
def render(self, name, value, attrs=None, renderer=None):
value = value if isinstance(value, dict) else {}
widget_id = (attrs or {}).get("id", name)
allowlist = escape(value.get("allowlist", "") or "")
denylist = escape(value.get("denylist", "") or "")
return mark_safe(f'''
<div id="{widget_id}_container" style="min-width: 420px;">
<input type="hidden" name="{name}" value="">
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;">
<div>
<label for="{widget_id}_allowlist" style="display: block; font-weight: 600; margin-bottom: 4px;">Allowlist</label>
<textarea id="{widget_id}_allowlist" name="{name}_allowlist" rows="3"
style="width: 100%; font-family: monospace; font-size: 12px;"
placeholder="example.com&#10;*.example.com">{allowlist}</textarea>
</div>
<div>
<label for="{widget_id}_denylist" style="display: block; font-weight: 600; margin-bottom: 4px;">Denylist</label>
<textarea id="{widget_id}_denylist" name="{name}_denylist" rows="3"
style="width: 100%; font-family: monospace; font-size: 12px;"
placeholder="static.example.com">{denylist}</textarea>
</div>
</div>
<label style="display: inline-flex; align-items: center; gap: 6px; margin-top: 8px; font-weight: 500;">
<input type="checkbox" id="{widget_id}_same_domain_only" name="{name}_same_domain_only" value="1">
Same domain only
</label>
<p style="color: #666; font-size: 11px; margin: 6px 0 0 0;">
Enter domains, wildcards, or regex patterns. Denylist takes precedence over allowlist.
</p>
<script>
(function() {{
if (window.__archiveboxUrlFilterEditors && window.__archiveboxUrlFilterEditors['{widget_id}']) {{
return;
}}
window.__archiveboxUrlFilterEditors = window.__archiveboxUrlFilterEditors || {{}};
window.__archiveboxUrlFilterEditors['{widget_id}'] = true;
var urlsField = document.getElementById('id_urls');
var allowlistField = document.getElementById('{widget_id}_allowlist');
var sameDomainOnly = document.getElementById('{widget_id}_same_domain_only');
function extractUrl(line) {{
var trimmed = (line || '').trim();
if (!trimmed || trimmed.charAt(0) === '#') {{
return '';
}}
if (trimmed.charAt(0) === '{{') {{
try {{
var record = JSON.parse(trimmed);
return String(record.url || '').trim();
}} catch (error) {{
return '';
}}
}}
return trimmed;
}}
function syncAllowlistFromUrls() {{
if (!urlsField || !allowlistField || !sameDomainOnly || !sameDomainOnly.checked) {{
return;
}}
var domains = [];
var seen = Object.create(null);
urlsField.value.split(/\\n+/).forEach(function(line) {{
var url = extractUrl(line);
if (!url) {{
return;
}}
try {{
var parsed = new URL(url);
var domain = (parsed.hostname || '').toLowerCase();
if (domain && !seen[domain]) {{
seen[domain] = true;
domains.push(domain);
}}
}} catch (error) {{
return;
}}
}});
allowlistField.value = domains.join('\\n');
}}
if (sameDomainOnly) {{
sameDomainOnly.addEventListener('change', syncAllowlistFromUrls);
}}
if (urlsField) {{
urlsField.addEventListener('input', syncAllowlistFromUrls);
urlsField.addEventListener('change', syncAllowlistFromUrls);
}}
}})();
</script>
</div>
''')
def value_from_datadict(self, data, files, name):
return {
"allowlist": data.get(f"{name}_allowlist", ""),
"denylist": data.get(f"{name}_denylist", ""),
"same_domain_only": data.get(f"{name}_same_domain_only") in ("1", "on", "true"),
}
class URLFiltersField(forms.Field):
widget = URLFiltersWidget
def to_python(self, value):
if isinstance(value, dict):
return value
return {"allowlist": "", "denylist": "", "same_domain_only": False}
class CrawlAdminForm(forms.ModelForm):
"""Custom form for Crawl admin to render urls field as textarea."""
tags_editor = forms.CharField(
label="Tags",
required=False,
widget=TagEditorWidget(),
help_text="Type tag names and press Enter or Space to add. Click × to remove.",
)
url_filters = URLFiltersField(
label="URL Filters",
required=False,
help_text="Set URL_ALLOWLIST / URL_DENYLIST for this crawl.",
)
class Meta:
model = Crawl
fields = "__all__"
widgets = {
"urls": forms.Textarea(
attrs={
"rows": 8,
"style": "width: 100%; font-family: monospace; font-size: 13px;",
"placeholder": "https://example.com\nhttps://example2.com\n# Comments start with #",
},
),
"notes": forms.Textarea(
attrs={
"rows": 1,
"style": "width: 100%; min-height: 0; resize: vertical;",
},
),
}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
config = dict(self.instance.config or {}) if self.instance and self.instance.pk else {}
if self.instance and self.instance.pk:
self.initial["tags_editor"] = self.instance.tags_str
self.initial["url_filters"] = {
"allowlist": config.get("URL_ALLOWLIST", ""),
"denylist": config.get("URL_DENYLIST", ""),
"same_domain_only": False,
}
def clean_tags_editor(self):
tags_str = self.cleaned_data.get("tags_editor", "")
tag_names = []
seen = set()
for raw_name in tags_str.split(","):
name = raw_name.strip()
if not name:
continue
lowered = name.lower()
if lowered in seen:
continue
seen.add(lowered)
tag_names.append(name)
return ",".join(tag_names)
def clean_url_filters(self):
value = self.cleaned_data.get("url_filters") or {}
return {
"allowlist": "\n".join(Crawl.split_filter_patterns(value.get("allowlist", ""))),
"denylist": "\n".join(Crawl.split_filter_patterns(value.get("denylist", ""))),
"same_domain_only": bool(value.get("same_domain_only")),
}
def save(self, commit=True):
instance = super().save(commit=False)
instance.tags_str = self.cleaned_data.get("tags_editor", "")
url_filters = self.cleaned_data.get("url_filters") or {}
instance.set_url_filters(
url_filters.get("allowlist", ""),
url_filters.get("denylist", ""),
)
if commit:
instance.save()
instance.apply_crawl_config_filters()
save_m2m = getattr(self, "_save_m2m", None)
if callable(save_m2m):
save_m2m()
return instance
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
form = CrawlAdminForm
list_display = (
"id",
"created_at",
"created_by",
"max_depth",
"max_urls",
"max_size",
"label",
"notes",
"urls_preview",
"schedule_str",
"status",
"retry_at",
"health_display",
"num_snapshots",
)
sort_fields = (
"id",
"created_at",
"created_by",
"max_depth",
"max_urls",
"max_size",
"label",
"notes",
"schedule_str",
"status",
"retry_at",
)
search_fields = ("id", "created_by__username", "max_depth", "max_urls", "max_size", "label", "notes", "schedule_id", "status", "urls")
readonly_fields = ("created_at", "modified_at", "snapshots")
fieldsets = (
(
"URLs",
{
"fields": ("urls",),
"classes": ("card", "wide"),
},
),
(
"Info",
{
"fields": ("label", "notes", "tags_editor"),
"classes": ("card",),
},
),
(
"Settings",
{
"fields": (("max_depth", "max_urls", "max_size"), "url_filters", "config"),
"classes": ("card",),
},
),
(
"Status",
{
"fields": ("status", "retry_at"),
"classes": ("card",),
},
),
(
"Relations",
{
"fields": ("schedule", "created_by"),
"classes": ("card",),
},
),
(
"Timestamps",
{
"fields": ("created_at", "modified_at"),
"classes": ("card",),
},
),
(
"Snapshots",
{
"fields": ("snapshots",),
"classes": ("card", "wide"),
},
),
)
add_fieldsets = (
(
"URLs",
{
"fields": ("urls",),
"classes": ("card", "wide"),
},
),
(
"Info",
{
"fields": ("label", "notes", "tags_editor"),
"classes": ("card",),
},
),
(
"Settings",
{
"fields": (("max_depth", "max_urls", "max_size"), "url_filters", "config"),
"classes": ("card",),
},
),
(
"Status",
{
"fields": ("status", "retry_at"),
"classes": ("card",),
},
),
(
"Relations",
{
"fields": ("schedule", "created_by"),
"classes": ("card",),
},
),
)
list_filter = ("max_depth", "max_urls", "schedule", "created_by", "status", "retry_at")
ordering = ["-created_at", "-retry_at"]
list_per_page = 100
actions = ["delete_selected_batched"]
change_actions = ["recrawl"]
def get_queryset(self, request):
"""Optimize queries with select_related and annotations."""
qs = super().get_queryset(request)
return qs.select_related("schedule", "created_by").annotate(num_snapshots_cached=Count("snapshot_set"))
def get_fieldsets(self, request, obj=None):
return self.fieldsets if obj else self.add_fieldsets
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path(
"<path:object_id>/snapshot/<path:snapshot_id>/delete/",
self.admin_site.admin_view(self.delete_snapshot_view),
name="crawls_crawl_snapshot_delete",
),
path(
"<path:object_id>/snapshot/<path:snapshot_id>/exclude-domain/",
self.admin_site.admin_view(self.exclude_domain_view),
name="crawls_crawl_snapshot_exclude_domain",
),
]
return custom_urls + urls
@admin.action(description="Delete selected crawls")
def delete_selected_batched(self, request, queryset):
"""Delete crawls in a single transaction to avoid SQLite concurrency issues."""
from django.db import transaction
total = queryset.count()
# Get list of IDs to delete first (outside transaction)
ids_to_delete = list(queryset.values_list("pk", flat=True))
# Delete everything in a single atomic transaction
with transaction.atomic():
deleted_count, _ = Crawl.objects.filter(pk__in=ids_to_delete).delete()
messages.success(request, f"Successfully deleted {total} crawls ({deleted_count} total objects including related records).")
@action(label="Recrawl", description="Create a new crawl with the same settings")
def recrawl(self, request, obj):
"""Duplicate this crawl as a new crawl with the same URLs and settings."""
# Validate URLs (required for crawl to start)
if not obj.urls:
messages.error(request, "Cannot recrawl: original crawl has no URLs.")
return redirect("admin:crawls_crawl_change", obj.id)
new_crawl = Crawl.objects.create(
urls=obj.urls,
max_depth=obj.max_depth,
max_urls=obj.max_urls,
max_size=obj.max_size,
tags_str=obj.tags_str,
config=obj.config,
schedule=obj.schedule,
label=f"{obj.label} (recrawl)" if obj.label else "",
notes=obj.notes,
created_by=request.user,
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
messages.success(request, f"Created new crawl {new_crawl.id} with the same settings. It will start processing shortly.")
return redirect("admin:crawls_crawl_change", new_crawl.id)
def num_snapshots(self, obj):
# Use cached annotation from get_queryset to avoid N+1
return getattr(obj, "num_snapshots_cached", obj.snapshot_set.count())
def snapshots(self, obj):
return render_snapshots_list(obj.snapshot_set.all(), crawl=obj)
def delete_snapshot_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
if request.method != "POST":
return HttpResponseNotAllowed(["POST"])
crawl = get_object_or_404(Crawl, pk=object_id)
snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
if snapshot.status == Snapshot.StatusChoices.STARTED:
snapshot.cancel_running_hooks()
removed_urls = crawl.prune_url(snapshot.url)
snapshot.delete()
return JsonResponse(
{
"ok": True,
"snapshot_id": str(snapshot.id),
"removed_urls": removed_urls,
},
)
def exclude_domain_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
if request.method != "POST":
return HttpResponseNotAllowed(["POST"])
crawl = get_object_or_404(Crawl, pk=object_id)
snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
result = crawl.exclude_domain(snapshot.url)
return JsonResponse(
{
"ok": True,
**result,
},
)
@admin.display(description="Schedule", ordering="schedule")
def schedule_str(self, obj):
if not obj.schedule:
return mark_safe("<i>None</i>")
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
@admin.display(description="URLs", ordering="urls")
def urls_preview(self, obj):
first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ""
return first_url[:80] + "..." if len(first_url) > 80 else first_url
@admin.display(description="Health", ordering="health")
def health_display(self, obj):
h = obj.health
color = "green" if h >= 80 else "orange" if h >= 50 else "red"
return format_html('<span style="color: {};">{}</span>', color, h)
@admin.display(description="URLs")
def urls_editor(self, obj):
"""Editor for crawl URLs."""
widget_id = f"crawl_urls_{obj.pk}"
# Escape for safe HTML embedding
escaped_urls = (obj.urls or "").replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")
# Count lines for auto-expand logic
line_count = len((obj.urls or "").split("\n"))
uri_rows = min(max(3, line_count), 10)
html = f'''
<div id="{widget_id}_container" style="max-width: 900px;">
<!-- URLs input -->
<div style="margin-bottom: 12px;">
<label style="font-weight: bold; display: block; margin-bottom: 4px;">URLs (one per line):</label>
<textarea id="{widget_id}_urls"
style="width: 100%; font-family: monospace; font-size: 13px;
padding: 8px; border: 1px solid #ccc; border-radius: 4px;
resize: vertical;"
rows="{uri_rows}"
placeholder="https://example.com&#10;https://example2.com&#10;# Comments start with #"
readonly>{escaped_urls}</textarea>
<p style="color: #666; font-size: 12px; margin: 4px 0 0 0;">
{line_count} URL{"s" if line_count != 1 else ""} · Note: URLs displayed here for reference only
</p>
</div>
</div>
'''
return mark_safe(html)
class CrawlScheduleAdmin(BaseModelAdmin):
list_display = ("id", "created_at", "created_by", "label", "notes", "template_str", "crawls", "num_crawls", "num_snapshots")
sort_fields = ("id", "created_at", "created_by", "label", "notes", "template_str")
search_fields = ("id", "created_by__username", "label", "notes", "schedule_id", "template_id", "template__urls")
readonly_fields = ("created_at", "modified_at", "crawls", "snapshots")
fieldsets = (
(
"Schedule Info",
{
"fields": ("label", "notes"),
"classes": ("card",),
},
),
(
"Configuration",
{
"fields": ("schedule", "template"),
"classes": ("card",),
},
),
(
"Metadata",
{
"fields": ("created_by", "created_at", "modified_at"),
"classes": ("card",),
},
),
(
"Crawls",
{
"fields": ("crawls",),
"classes": ("card", "wide"),
},
),
(
"Snapshots",
{
"fields": ("snapshots",),
"classes": ("card", "wide"),
},
),
)
list_filter = ("created_by",)
ordering = ["-created_at"]
list_per_page = 100
actions = ["delete_selected"]
def get_queryset(self, request):
return (
super()
.get_queryset(request)
.select_related("created_by", "template")
.annotate(
crawl_count=Count("crawl", distinct=True),
snapshot_count=Count("crawl__snapshot_set", distinct=True),
)
)
def get_fieldsets(self, request, obj=None):
if obj is None:
return tuple(fieldset for fieldset in self.fieldsets if fieldset[0] not in {"Crawls", "Snapshots"})
return self.fieldsets
def save_model(self, request, obj, form, change):
if not obj.created_by_id and getattr(request, "user", None) and request.user.is_authenticated:
obj.created_by = request.user
super().save_model(request, obj, form, change)
@admin.display(description="Template", ordering="template")
def template_str(self, obj):
return format_html('<a href="{}">{}</a>', obj.template.admin_change_url, obj.template)
@admin.display(description="# Crawls", ordering="crawl_count")
def num_crawls(self, obj):
return getattr(obj, "crawl_count", obj.crawl_set.count())
@admin.display(description="# Snapshots", ordering="snapshot_count")
def num_snapshots(self, obj):
return getattr(obj, "snapshot_count", Snapshot.objects.filter(crawl__schedule=obj).count())
def crawls(self, obj):
return format_html_join(
"<br/>",
' - <a href="{}">{}</a>',
((crawl.admin_change_url, crawl) for crawl in obj.crawl_set.all().order_by("-created_at")[:20]),
) or mark_safe("<i>No Crawls yet...</i>")
def snapshots(self, obj):
crawl_ids = obj.crawl_set.values_list("pk", flat=True)
return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids))
def register_admin(admin_site):
admin_site.register(Crawl, CrawlAdmin)
admin_site.register(CrawlSchedule, CrawlScheduleAdmin)