__package__ = 'archivebox.crawls'

import json
from pathlib import Path

from django.utils.html import format_html, format_html_join, mark_safe
from django.contrib import admin, messages
from django.urls import path
from django.http import JsonResponse
from django.views.decorators.http import require_POST

from django_object_actions import action

from archivebox import DATA_DIR
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin

from core.models import Snapshot
from crawls.models import Seed, Crawl, CrawlSchedule
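

# Seed admin: a Seed is the starting URI (or local file of URLs) that crawls
# grow from; the readonly panels below link through to its Crawls and Snapshots.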
class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
    list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
    sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
    search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
    readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
    list_filter = ('extractor', 'created_by')
    ordering = ['-created_at']
    list_per_page = 100
    actions = ["delete_selected"]
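
    # computed columns for list_display and readonly panels for the detail view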
    def num_crawls(self, obj):
        return obj.crawl_set.count()

    def num_snapshots(self, obj):
        return obj.snapshot_set.count()

    def scheduled_crawls(self, obj):
        """Readonly panel linking to this seed's 20 most recent CrawlSchedules."""
        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
            (scheduledcrawl.admin_change_url, scheduledcrawl)
            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
        )) or mark_safe('No Scheduled Crawls yet...')

    def crawls(self, obj):
        """Readonly panel linking to this seed's 20 most recent Crawls."""
        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
            (crawl.admin_change_url, crawl)
            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
        )) or mark_safe('No Crawls yet...')

    def snapshots(self, obj):
        """Readonly panel linking to this seed's 20 most recent Snapshots."""
        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
            (snapshot.admin_change_url, snapshot)
            for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
        )) or mark_safe('No Snapshots yet...')

    def contents(self, obj):
        """Preview the seed source: inline file contents for local file:// URIs, otherwise a link."""
        if obj.uri.startswith('file:///data/'):
            source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
            contents = ""
            try:
                contents = source_file.read_text().strip()[:14_000]
            except Exception as e:
                contents = f'Error reading {source_file}: {e}'
            return format_html('<b><code>{}</code></b>:<br/><pre>{}</pre>', source_file, contents)
        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
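

# Crawl admin: a Crawl is a single run that expands a Seed's URLs up to
# max_depth, tracked through the status/retry_at lifecycle fields.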
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
    sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
    search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
    readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
    fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
    list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
    ordering = ['-created_at', '-retry_at']
    list_per_page = 100
    actions = ["delete_selected"]
    change_actions = ['recrawl']
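
    # "Recrawl" button rendered on the change page by django_object_actions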
    @action(label='Recrawl', description='Create a new crawl with the same settings')
    def recrawl(self, request, obj):
        """Duplicate this crawl as a new crawl with the same seed and settings."""
        from django.utils import timezone
        from django.shortcuts import redirect

        new_crawl = Crawl.objects.create(
            seed=obj.seed,
            urls=obj.urls,
            max_depth=obj.max_depth,
            config=obj.config,
            schedule=obj.schedule,
            label=f"{obj.label} (recrawl)" if obj.label else "",
            notes=obj.notes,
            created_by=request.user,
            status=Crawl.StatusChoices.QUEUED,
            retry_at=timezone.now(),
        )
        messages.success(
            request,
            f'Created new crawl {new_crawl.id} with the same settings. '
            'It will start processing shortly.'
        )
        # redirect straight to the new crawl's change page
        return redirect('admin:crawls_crawl_change', new_crawl.id)
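
    # custom admin route backing the seed_urls_editor field; judging by the
    # require_POST/JsonResponse imports above, it is saved via a JSON POST view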
    def get_urls(self):
        urls = super().get_urls()
        custom_urls = [
            path('