__package__ = 'archivebox.crawls'

import json
from pathlib import Path

from django.utils.html import escape, format_html, format_html_join, mark_safe
from django.contrib import admin, messages
from django.urls import path, reverse
from django.http import JsonResponse
from django.views.decorators.http import require_POST

from archivebox import DATA_DIR

from django_object_actions import action

from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from core.models import Snapshot
from crawls.models import Seed, Crawl, CrawlSchedule


class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
    """Admin for Seed objects: the root URIs (or local files full of URLs) that crawls start from."""

    list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
    sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
    search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')

    readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)

    list_filter = ('extractor', 'created_by')
    ordering = ['-created_at']
    list_per_page = 100
    actions = ["delete_selected"]

    def num_crawls(self, obj):
        """Count of Crawls started from this seed (list column)."""
        return obj.crawl_set.count()

    def num_snapshots(self, obj):
        """Count of Snapshots produced from this seed (list column)."""
        return obj.snapshot_set.count()

    def scheduled_crawls(self, obj):
        """Linked list of the 20 most recent CrawlSchedules pointing at this seed."""
        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
            (scheduledcrawl.admin_change_url, scheduledcrawl)
            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
        )) or mark_safe('<i>No Scheduled Crawls yet...</i>')

    def crawls(self, obj):
        """Linked list of the 20 most recent Crawls started from this seed."""
        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
            (crawl.admin_change_url, crawl)
            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
        )) or mark_safe('<i>No Crawls yet...</i>')

    def snapshots(self, obj):
        """Linked list of the 20 most recent Snapshots produced from this seed."""
        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
            (snapshot.admin_change_url, snapshot)
            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
        )) or mark_safe('<i>No Snapshots yet...</i>')

    def contents(self, obj):
        """Read-only preview: file contents for local file:///data/ seeds, a link otherwise."""
        if obj.uri.startswith('file:///data/'):
            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
            contents = ""
            try:
                # Cap the preview at ~14KB so huge URL lists don't bloat the admin page.
                contents = source_file.read_text().strip()[:14_000]
            except Exception as e:
                contents = f'Error reading {source_file}: {e}'
            # format_html() escapes both interpolated values for us.
            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)

        return format_html('See URLs here: <a href="{}" target="_blank">{}</a>', obj.uri, obj.uri)


class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
    """Admin for Crawl objects: one crawler run over a seed's URLs, down to max_depth."""

    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
    sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
    search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')

    readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
    fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')

    list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
    ordering = ['-created_at', '-retry_at']
    list_per_page = 100
    actions = ["delete_selected"]
    change_actions = ['recrawl']

    @action(label='Recrawl', description='Create a new crawl with the same settings')
    def recrawl(self, request, obj):
        """Duplicate this crawl as a new crawl with the same seed and settings."""
        from django.utils import timezone
        from django.shortcuts import redirect

        new_crawl = Crawl.objects.create(
            seed=obj.seed,
            urls=obj.urls,
            max_depth=obj.max_depth,
            config=obj.config,
            schedule=obj.schedule,
            label=f"{obj.label} (recrawl)" if obj.label else "",
            notes=obj.notes,
            created_by=request.user,
            status=Crawl.StatusChoices.QUEUED,
            retry_at=timezone.now(),      # due immediately, so workers pick it up right away
        )
        messages.success(
            request,
            f'Created new crawl {new_crawl.id} with the same settings. '
            f'It will start processing shortly.'
        )
        # Redirect to the new crawl's change page
        return redirect('admin:crawls_crawl_change', new_crawl.id)

    def get_urls(self):
        """Register the AJAX endpoint used by the seed_urls_editor widget.

        BUGFIX: the route must capture object_id (save_seed_contents_view requires
        it) and must not begin with '/' (Django path() routes are relative).
        """
        urls = super().get_urls()
        custom_urls = [
            path(
                '<path:object_id>/save_seed_contents/',
                self.admin_site.admin_view(self.save_seed_contents_view),
                name='crawls_crawl_save_seed_contents',
            ),
        ]
        # Custom routes first so they take precedence over the default admin routes.
        return custom_urls + urls

    def save_seed_contents_view(self, request, object_id):
        """Handle saving seed file contents via AJAX.

        Accepts only POST with a JSON body {"contents": "..."}, and only for
        crawls whose seed URI is a local file under file:///data/.
        Returns a JsonResponse with success/error details.
        """
        if request.method != 'POST':
            return JsonResponse({'success': False, 'error': 'POST required'}, status=405)

        try:
            crawl = Crawl.objects.get(pk=object_id)
        except Crawl.DoesNotExist:
            return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)

        if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
            return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)

        try:
            data = json.loads(request.body)
            contents = data.get('contents', '')
        except json.JSONDecodeError:
            return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)

        # NOTE(review): seed.uri comes from the DB; if untrusted users can create
        # seeds, a '..' path here could escape DATA_DIR — confirm upstream validation.
        source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
        try:
            # Ensure parent directory exists
            source_file.parent.mkdir(parents=True, exist_ok=True)
            source_file.write_text(contents)
            return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
        except Exception as e:
            return JsonResponse({'success': False, 'error': str(e)}, status=500)

    def num_snapshots(self, obj):
        """Count of Snapshots produced by this crawl (list column)."""
        return obj.snapshot_set.count()

    def snapshots(self, obj):
        """Linked list of the 20 most recent Snapshots produced by this crawl."""
        return format_html_join('<br/>', '<a href="{}">{}</a>', (
            (snapshot.admin_change_url, snapshot)
            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
        )) or mark_safe('<i>No Snapshots yet...</i>')

    @admin.display(description='Schedule', ordering='schedule')
    def schedule_str(self, obj):
        """Link to this crawl's CrawlSchedule, or a placeholder when it has none."""
        if not obj.schedule:
            return mark_safe('<i>None</i>')
        return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)

    @admin.display(description='Seed', ordering='seed')
    def seed_str(self, obj):
        """Link to this crawl's Seed, or a placeholder when it has none."""
        if not obj.seed:
            return mark_safe('<i>None</i>')
        return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)

    @admin.display(description='URLs')
    def seed_urls_editor(self, obj):
        """Combined widget showing the seed URI and, for local files, an editable
        contents textarea that saves back via the save_seed_contents AJAX endpoint."""
        widget_id = f'seed_urls_{obj.pk}'

        # Prefer the seed's URI; fall back to the crawl's own urls field.
        seed_uri = ''
        if obj.seed and obj.seed.uri:
            seed_uri = obj.seed.uri
        elif obj.urls:
            seed_uri = obj.urls

        # Check if it's a local file we can edit
        is_file = seed_uri.startswith('file:///data/')
        contents = ""
        error = None
        source_file = None
        if is_file:
            source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
            try:
                contents = source_file.read_text().strip()
            except Exception as e:
                error = f'Error reading {source_file}: {e}'

        # Escape for safe HTML embedding (we assemble raw HTML by hand below).
        escaped_uri = escape(seed_uri)
        escaped_contents = escape(contents or '')

        # Auto-size the editors to their contents, within sane bounds.
        line_count = len(contents.split('\n')) if contents else 0
        uri_rows = min(max(1, seed_uri.count('\n') + 1), 3)
        contents_rows = min(max(5, line_count + 1), 30)

        if is_file:
            save_url = reverse('admin:crawls_crawl_save_seed_contents', args=[obj.pk])
            error_html = f'<p style="color: red;">{escape(error)}</p>' if error else ''
            body_html = f'''
                {error_html}
                <textarea id="{widget_id}_contents" rows="{contents_rows}"
                          style="width: 100%; font-family: monospace;">{escaped_contents}</textarea><br/>
                <button type="button" onclick="saveSeedContents_{obj.pk}()">&#128190; Save</button>
                <span id="{widget_id}_status"></span>
                <script>
                function saveSeedContents_{obj.pk}() {{
                    const status = document.getElementById('{widget_id}_status');
                    status.textContent = 'Saving...';
                    fetch('{save_url}', {{
                        method: 'POST',
                        headers: {{
                            'Content-Type': 'application/json',
                            'X-CSRFToken': (document.cookie.match(/csrftoken=([^;]+)/) || [])[1] || '',
                        }},
                        body: JSON.stringify({{contents: document.getElementById('{widget_id}_contents').value}}),
                    }})
                    .then(resp => resp.json())
                    .then(data => {{
                        status.textContent = data.success ? data.message : ('Error: ' + data.error);
                    }})
                    .catch(err => {{ status.textContent = 'Error: ' + err; }});
                }}
                </script>
            '''
        else:
            # Remote URI: nothing editable, just link out to it.
            body_html = f'<a href="{escaped_uri}" target="_blank">{escaped_uri}</a>'

        html = f'''
        <div id="{widget_id}_container" style="width: 100%; max-width: 880px;">
            <textarea rows="{uri_rows}" readonly
                      style="width: 100%; font-family: monospace;">{escaped_uri}</textarea>
            {body_html}
        </div>
        '''
        return mark_safe(html)


class CrawlScheduleAdmin(BaseModelAdmin):
    """Admin for CrawlSchedule objects: recurring schedules that re-run a template Crawl."""

    list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
    sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str')
    search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')

    readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
    fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)

    list_filter = ('created_by',)
    ordering = ['-created_at']
    list_per_page = 100
    actions = ["delete_selected"]

    @admin.display(description='Template', ordering='template')
    def template_str(self, obj):
        """Link to the template Crawl, or a placeholder when it has none
        (guards against a null template, consistent with CrawlAdmin.seed_str)."""
        if not obj.template:
            return mark_safe('<i>None</i>')
        return format_html('<a href="{}">{}</a>', obj.template.admin_change_url, obj.template)

    def num_crawls(self, obj):
        """Count of Crawls spawned by this schedule (list column)."""
        return obj.crawl_set.count()

    def num_snapshots(self, obj):
        """Count of Snapshots produced across all of this schedule's crawls."""
        return obj.snapshot_set.count()

    def crawls(self, obj):
        """Linked list of the 20 most recent Crawls spawned by this schedule."""
        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
            (crawl.admin_change_url, crawl)
            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
        )) or mark_safe('<i>No Crawls yet...</i>')

    def snapshots(self, obj):
        """Linked list of the 20 most recent Snapshots across all of this schedule's crawls."""
        crawl_ids = obj.crawl_set.values_list('pk', flat=True)
        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
            (snapshot.admin_change_url, snapshot)
            for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
        )) or mark_safe('<i>No Snapshots yet...</i>')


def register_admin(admin_site):
    """Attach the crawls app's model admins to the given AdminSite."""
    admin_site.register(Seed, SeedAdmin)
    admin_site.register(Crawl, CrawlAdmin)
    admin_site.register(CrawlSchedule, CrawlScheduleAdmin)