mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-04-05 15:27:53 +10:00)

Commit: remove Seed model in favor of Crawl as template
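In short: URL ingestion that previously created a standalone Seed record (and then a Crawl pointing at it) now lives directly on Crawl, which doubles as the template for scheduled crawls. A minimal before/after sketch of the call flow, pieced together from the diff below (illustrative only, file path made up):

```python
from pathlib import Path

from crawls.models import Crawl  # Seed no longer exists after this commit

# Before: two objects per import
#   seed = Seed.from_file(Path('/data/sources/urls.txt'), parser='auto')
#   crawl = Crawl.from_seed(seed, max_depth=1)

# After: one object; the newline-separated URLs live on the Crawl itself
crawl = Crawl.from_file(
    Path('/data/sources/urls.txt'),  # file contents are copied into crawl.urls
    extractor='auto',                # parser hint, replaces Seed.extractor
    max_depth=1,
)
```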
@@ -17,7 +17,7 @@ from django_object_actions import action
 from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin

 from core.models import Snapshot
-from crawls.models import Seed, Crawl, CrawlSchedule
+from crawls.models import Crawl, CrawlSchedule


 def render_snapshots_list(snapshots_qs, limit=20):
@@ -136,16 +136,16 @@ def render_snapshots_list(snapshots_qs, limit=20):
     ''')


-class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
-    list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
-    sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-    search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
+    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
+    sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
+    search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')

-    readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
+    readonly_fields = ('created_at', 'modified_at', 'snapshots', 'urls_editor')

     fieldsets = (
-        ('Source', {
-            'fields': ('uri', 'contents'),
+        ('URLs', {
+            'fields': ('urls_editor',),
             'classes': ('card', 'wide'),
         }),
         ('Info', {
@@ -153,83 +153,7 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
             'classes': ('card',),
         }),
-        ('Settings', {
-            'fields': ('extractor', 'config'),
-            'classes': ('card',),
-        }),
-        ('Metadata', {
-            'fields': ('created_by', 'created_at', 'modified_at'),
-            'classes': ('card',),
-        }),
-        ('Crawls', {
-            'fields': ('scheduled_crawls', 'crawls'),
-            'classes': ('card',),
-        }),
-        ('Snapshots', {
-            'fields': ('snapshots',),
-            'classes': ('card',),
-        }),
-    )
-
-    list_filter = ('extractor', 'created_by')
-    ordering = ['-created_at']
-    list_per_page = 100
-    actions = ["delete_selected"]
-
-    def num_crawls(self, obj):
-        return obj.crawl_set.count()
-
-    def num_snapshots(self, obj):
-        return obj.snapshot_set.count()
-
-    def scheduled_crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (scheduledcrawl.admin_change_url, scheduledcrawl)
-            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
-        )) or mark_safe('<i>No Scheduled Crawls yet...</i>')
-
-    def crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (crawl.admin_change_url, crawl)
-            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
-        )) or mark_safe('<i>No Crawls yet...</i>')
-
-    def snapshots(self, obj):
-        return render_snapshots_list(obj.snapshot_set.all())
-
-    def contents(self, obj):
-        source_file = obj.get_file_path()
-        if source_file:
-            contents = ""
-            try:
-                contents = source_file.read_text().strip()[:14_000]
-            except Exception as e:
-                contents = f'Error reading {source_file}: {e}'
-
-            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
-
-        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
-
-
-class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
-    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
-    sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
-    search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
-
-    readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
-
-    fieldsets = (
-        ('URLs', {
-            'fields': ('seed_urls_editor',),
-            'classes': ('card', 'wide'),
-        }),
-        ('Info', {
-            'fields': ('label', 'notes'),
-            'classes': ('card',),
-        }),
         ('Settings', {
-            'fields': ('max_depth', 'config'),
+            'fields': ('max_depth', 'extractor', 'config'),
             'classes': ('card',),
         }),
         ('Status', {
@@ -237,7 +161,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
             'classes': ('card',),
         }),
         ('Relations', {
-            'fields': ('seed', 'schedule', 'created_by'),
+            'fields': ('schedule', 'created_by'),
             'classes': ('card',),
         }),
         ('Timestamps', {
@@ -250,7 +174,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
         }),
     )

-    list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
+    list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
     ordering = ['-created_at', '-retry_at']
     list_per_page = 100
     actions = ["delete_selected"]
@@ -258,23 +182,20 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):

     @action(label='Recrawl', description='Create a new crawl with the same settings')
     def recrawl(self, request, obj):
-        """Duplicate this crawl as a new crawl with the same seed and settings."""
+        """Duplicate this crawl as a new crawl with the same URLs and settings."""
         from django.utils import timezone
         from django.shortcuts import redirect

-        # Validate seed has a URI (required for crawl to start)
-        if not obj.seed:
-            messages.error(request, 'Cannot recrawl: original crawl has no seed.')
-            return redirect('admin:crawls_crawl_change', obj.id)
-
-        if not obj.seed.uri:
-            messages.error(request, 'Cannot recrawl: seed has no URI.')
+        # Validate URLs (required for crawl to start)
+        if not obj.urls:
+            messages.error(request, 'Cannot recrawl: original crawl has no URLs.')
             return redirect('admin:crawls_crawl_change', obj.id)

         new_crawl = Crawl.objects.create(
-            seed=obj.seed,
+            urls=obj.urls,
+            extractor=obj.extractor,
             max_depth=obj.max_depth,
             tags_str=obj.tags_str,
             config=obj.config,
             schedule=obj.schedule,
             label=f"{obj.label} (recrawl)" if obj.label else "",
@@ -292,43 +213,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):

         return redirect('admin:crawls_crawl_change', new_crawl.id)

-    def get_urls(self):
-        urls = super().get_urls()
-        custom_urls = [
-            path('<path:object_id>/save_seed_contents/',
-                 self.admin_site.admin_view(self.save_seed_contents_view),
-                 name='crawls_crawl_save_seed_contents'),
-        ]
-        return custom_urls + urls
-
-    def save_seed_contents_view(self, request, object_id):
-        """Handle saving seed file contents via AJAX."""
-        if request.method != 'POST':
-            return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
-
-        try:
-            crawl = Crawl.objects.get(pk=object_id)
-        except Crawl.DoesNotExist:
-            return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
-
-        source_file = crawl.seed.get_file_path() if crawl.seed else None
-        if not source_file:
-            return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
-
-        try:
-            data = json.loads(request.body)
-            contents = data.get('contents', '')
-        except json.JSONDecodeError:
-            return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
-
-        try:
-            # Ensure parent directory exists
-            source_file.parent.mkdir(parents=True, exist_ok=True)
-            source_file.write_text(contents)
-            return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
-        except Exception as e:
-            return JsonResponse({'success': False, 'error': str(e)}, status=500)
-
     def num_snapshots(self, obj):
         return obj.snapshot_set.count()
@@ -341,163 +225,68 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
             return mark_safe('<i>None</i>')
         return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)

-    @admin.display(description='Seed', ordering='seed')
-    def seed_str(self, obj):
-        if not obj.seed:
-            return mark_safe('<i>None</i>')
-        return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
+    @admin.display(description='URLs', ordering='urls')
+    def urls_preview(self, obj):
+        first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ''
+        return first_url[:80] + '...' if len(first_url) > 80 else first_url

     @admin.display(description='URLs')
-    def seed_urls_editor(self, obj):
-        """Combined editor showing seed URL and file contents."""
-        widget_id = f'seed_urls_{obj.pk}'
-
-        # Get the seed URI (or use urls field if no seed)
-        seed_uri = ''
-        if obj.seed and obj.seed.uri:
-            seed_uri = obj.seed.uri
-        elif obj.urls:
-            seed_uri = obj.urls
+    def urls_editor(self, obj):
+        """Editor for crawl URLs."""
+        widget_id = f'crawl_urls_{obj.pk}'

         # Check if it's a local file we can edit
-        source_file = obj.seed.get_file_path() if obj.seed else None
+        source_file = obj.get_file_path()
         is_file = source_file is not None
-        contents = ""
+        file_contents = ""
         error = None

         if is_file and source_file:
             try:
-                contents = source_file.read_text().strip()
+                file_contents = source_file.read_text().strip()
             except Exception as e:
                 error = f'Error reading {source_file}: {e}'

         # Escape for safe HTML embedding
-        escaped_uri = seed_uri.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
-        escaped_contents = (contents or '').replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
+        escaped_urls = (obj.urls or '').replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
+        escaped_file_contents = file_contents.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')

         # Count lines for auto-expand logic
-        line_count = len(contents.split('\n')) if contents else 0
-        uri_rows = min(max(1, seed_uri.count('\n') + 1), 3)
+        line_count = len((obj.urls or '').split('\n'))
+        file_line_count = len(file_contents.split('\n')) if file_contents else 0
+        uri_rows = min(max(3, line_count), 10)

         html = f'''
         <div id="{widget_id}_container" style="max-width: 900px;">
-            <!-- Seed URL input (auto-expands) -->
+            <!-- URLs input -->
             <div style="margin-bottom: 12px;">
-                <label style="font-weight: bold; display: block; margin-bottom: 4px;">Seed URL:</label>
-                <textarea id="{widget_id}_uri"
+                <label style="font-weight: bold; display: block; margin-bottom: 4px;">URLs (one per line):</label>
+                <textarea id="{widget_id}_urls"
                     style="width: 100%; font-family: monospace; font-size: 13px;
                         padding: 8px; border: 1px solid #ccc; border-radius: 4px;
-                        resize: vertical; min-height: 32px; overflow: hidden;"
+                        resize: vertical;"
                     rows="{uri_rows}"
-                    placeholder="file:///data/sources/... or https://..."
-                    {"readonly" if not obj.pk else ""}>{escaped_uri}</textarea>
+                    placeholder="https://example.com&#10;https://example2.com&#10;# Comments start with #"
+                    readonly>{escaped_urls}</textarea>
+                <p style="color: #666; font-size: 12px; margin: 4px 0 0 0;">
+                    {line_count} URL{'s' if line_count != 1 else ''} · URLs are read-only in admin, edit via API or CLI
+                </p>
             </div>

             {"" if not is_file else f'''
-            <!-- File contents editor -->
+            <!-- File contents preview (if first URL is a file://) -->
             <div style="margin-bottom: 8px;">
                 <label style="font-weight: bold; display: block; margin-bottom: 4px;">
-                    File Contents: <code style="font-weight: normal; color: #666;">{source_file}</code>
+                    File Preview: <code style="font-weight: normal; color: #666;">{source_file}</code>
                 </label>
                 {"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
-                <textarea id="{widget_id}_contents"
-                    style="width: 100%; height: {min(400, max(150, line_count * 18))}px; font-family: monospace; font-size: 12px;
-                        padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical;"
-                    placeholder="Enter URLs, one per line...">{escaped_contents}</textarea>
-            </div>
-
-            <div style="display: flex; gap: 12px; align-items: center; flex-wrap: wrap;">
-                <button type="button" id="{widget_id}_save_btn"
-                    onclick="saveSeedUrls_{widget_id}()"
-                    style="padding: 8px 20px; background: #417690; color: white; border: none;
-                        border-radius: 4px; cursor: pointer; font-weight: bold;">
-                    Save URLs
-                </button>
-                <span id="{widget_id}_line_count" style="color: #666; font-size: 12px;"></span>
-                <span id="{widget_id}_status" style="color: #666; font-size: 12px;"></span>
+                <textarea id="{widget_id}_file_preview"
+                    style="width: 100%; height: {min(400, max(150, file_line_count * 18))}px; font-family: monospace; font-size: 12px;
+                        padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical; background: #f9f9f9;"
+                    readonly>{escaped_file_contents}</textarea>
             </div>
             '''}

-            {"" if is_file else f'''
-            <div style="margin-top: 8px; color: #666;">
-                <a href="{seed_uri}" target="_blank">{seed_uri}</a>
-            </div>
-            '''}
-
-            <script>
-            (function() {{
-                var uriInput = document.getElementById('{widget_id}_uri');
-                var contentsInput = document.getElementById('{widget_id}_contents');
-                var status = document.getElementById('{widget_id}_status');
-                var lineCount = document.getElementById('{widget_id}_line_count');
-                var saveBtn = document.getElementById('{widget_id}_save_btn');
-
-                // Auto-resize URI input
-                function autoResizeUri() {{
-                    uriInput.style.height = 'auto';
-                    uriInput.style.height = Math.min(100, uriInput.scrollHeight) + 'px';
-                }}
-                uriInput.addEventListener('input', autoResizeUri);
-                autoResizeUri();
-
-                if (contentsInput) {{
-                    function updateLineCount() {{
-                        var lines = contentsInput.value.split('\\n').filter(function(l) {{ return l.trim(); }});
-                        lineCount.textContent = lines.length + ' URLs';
-                    }}
-
-                    contentsInput.addEventListener('input', function() {{
-                        updateLineCount();
-                        if (status) {{
-                            status.textContent = '(unsaved changes)';
-                            status.style.color = '#c4820e';
-                        }}
-                    }});
-
-                    updateLineCount();
-                }}
-
-                window.saveSeedUrls_{widget_id} = function() {{
-                    if (!saveBtn) return;
-                    saveBtn.disabled = true;
-                    saveBtn.textContent = 'Saving...';
-                    if (status) status.textContent = '';
-
-                    fetch(window.location.pathname + 'save_seed_contents/', {{
-                        method: 'POST',
-                        headers: {{
-                            'Content-Type': 'application/json',
-                            'X-CSRFToken': document.querySelector('[name=csrfmiddlewaretoken]').value
-                        }},
-                        body: JSON.stringify({{ contents: contentsInput ? contentsInput.value : '' }})
-                    }})
-                    .then(function(response) {{ return response.json(); }})
-                    .then(function(data) {{
-                        if (data.success) {{
-                            if (status) {{
-                                status.textContent = '✓ ' + data.message;
-                                status.style.color = '#28a745';
-                            }}
-                        }} else {{
-                            if (status) {{
-                                status.textContent = '✗ ' + data.error;
-                                status.style.color = '#dc3545';
-                            }}
-                        }}
-                    }})
-                    .catch(function(err) {{
-                        if (status) {{
-                            status.textContent = '✗ Error: ' + err;
-                            status.style.color = '#dc3545';
-                        }}
-                    }})
-                    .finally(function() {{
-                        saveBtn.disabled = false;
-                        saveBtn.textContent = 'Save URLs';
-                    }});
-                }};
-            }})();
-            </script>
         </div>
         '''
         return mark_safe(html)
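Side note on the escaping above: the chained .replace() calls escape the four HTML-special characters by hand. For reference only (not what the diff uses), Python's stdlib offers an equivalent one-liner; note html.escape additionally escapes single quotes, which is harmless here:

```python
import html

def escape_for_textarea(text: str) -> str:
    # Equivalent to the four chained .replace() calls in urls_editor above;
    # quote=True (the default) also covers the double-quote case.
    return html.escape(text or '', quote=True)
```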
@@ -507,7 +296,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
 class CrawlScheduleAdmin(BaseModelAdmin):
     list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
     sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str')
-    search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
+    search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__urls')

     readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
@@ -561,6 +350,5 @@ class CrawlScheduleAdmin(BaseModelAdmin):


 def register_admin(admin_site):
-    admin_site.register(Seed, SeedAdmin)
     admin_site.register(Crawl, CrawlAdmin)
     admin_site.register(CrawlSchedule, CrawlScheduleAdmin)

archivebox/crawls/migrations/0002_drop_seed_model.py (new file, +61 lines)

@@ -0,0 +1,61 @@
+# Generated by Django 6.0 on 2025-12-25 09:34
+
+import archivebox.base_models.models
+import django.db.models.deletion
+import pathlib
+import uuid
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('crawls', '0001_initial'),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='crawl',
+            name='seed',
+        ),
+        migrations.AddField(
+            model_name='crawl',
+            name='extractor',
+            field=models.CharField(default='auto', help_text='Parser for reading URLs (auto, html, json, rss, etc)', max_length=32),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='created_by',
+            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='output_dir',
+            field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='urls',
+            field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
+        ),
+        migrations.AlterField(
+            model_name='crawlschedule',
+            name='created_by',
+            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AlterField(
+            model_name='crawlschedule',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+        migrations.DeleteModel(
+            name='Seed',
+        ),
+    ]
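Note that this migration drops crawl.seed and the Seed table without copying data across. Purely as a sketch (not part of this commit), a deployment that needed to preserve existing seed URIs could run a RunPython step before the RemoveField/DeleteModel operations:

```python
from django.db import migrations

def copy_seed_uris_to_urls(apps, schema_editor):
    # Hypothetical backfill: move each crawl's old seed.uri into crawl.urls
    # before the seed FK and the Seed model are dropped.
    Crawl = apps.get_model('crawls', 'Crawl')
    for crawl in Crawl.objects.select_related('seed').iterator():
        if crawl.seed and not crawl.urls:
            crawl.urls = crawl.seed.uri
            crawl.save(update_fields=['urls'])

# would be registered as:
#     migrations.RunPython(copy_seed_uris_to_urls, migrations.RunPython.noop),
```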
@@ -20,91 +20,6 @@ if TYPE_CHECKING:
     from core.models import Snapshot, ArchiveResult


-class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
-    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
-    created_at = models.DateTimeField(default=timezone.now, db_index=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
-    modified_at = models.DateTimeField(auto_now=True)
-
-    uri = models.URLField(max_length=2048)
-    extractor = models.CharField(default='auto', max_length=32)
-    tags_str = models.CharField(max_length=255, null=False, blank=True, default='')
-    label = models.CharField(max_length=255, null=False, blank=True, default='')
-    config = models.JSONField(default=dict)
-    output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='')
-    notes = models.TextField(blank=True, null=False, default='')
-
-    crawl_set: models.Manager['Crawl']
-
-    class Meta:
-        verbose_name = 'Seed'
-        verbose_name_plural = 'Seeds'
-        unique_together = (('created_by', 'uri', 'extractor'), ('created_by', 'label'))
-
-    def __str__(self):
-        return f'[{self.id}] {self.uri[:64]}'
-
-    def save(self, *args, **kwargs):
-        is_new = self._state.adding
-        super().save(*args, **kwargs)
-        if is_new:
-            from archivebox.misc.logging_util import log_worker_event
-            log_worker_event(
-                worker_type='DB',
-                event='Created Seed',
-                indent_level=0,
-                metadata={
-                    'id': str(self.id),
-                    'uri': str(self.uri)[:64],
-                    'extractor': self.extractor,
-                    'label': self.label or None,
-                },
-            )
-
-    @classmethod
-    def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
-        # Use absolute path for file:// URLs so extractors can find the files
-        source_path = str(source_file.resolve())
-        seed, _ = cls.objects.get_or_create(
-            label=label or source_file.name, uri=f'file://{source_path}',
-            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
-            extractor=parser, tags_str=tag, config=config or {},
-        )
-        return seed
-
-    @property
-    def source_type(self):
-        return self.uri.split('://', 1)[0].lower()
-
-    @property
-    def api_url(self) -> str:
-        return reverse_lazy('api-1:get_seed', args=[self.id])
-
-    def get_file_path(self) -> Path | None:
-        """
-        Get the filesystem path for file:// URIs.
-        Handles both old format (file:///data/...) and new format (file:///absolute/path).
-        Returns None if URI is not a file:// URI.
-        """
-        if not self.uri.startswith('file://'):
-            return None
-
-        # Remove file:// prefix
-        path_str = self.uri.replace('file://', '', 1)
-
-        # Handle old format: file:///data/... -> DATA_DIR/...
-        if path_str.startswith('/data/'):
-            return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1)
-
-        # Handle new format: file:///absolute/path
-        return Path(path_str)
-
-    @property
-    def snapshot_set(self) -> QuerySet['Snapshot']:
-        from core.models import Snapshot
-        return Snapshot.objects.filter(crawl_id__in=self.crawl_set.values_list('pk', flat=True))
-
-
 class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
     id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -124,14 +39,15 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
         verbose_name_plural = 'Scheduled Crawls'

     def __str__(self) -> str:
-        return f'[{self.id}] {self.template.seed.uri[:64] if self.template and self.template.seed else ""} @ {self.schedule}'
+        urls_preview = self.template.urls[:64] if self.template and self.template.urls else ""
+        return f'[{self.id}] {urls_preview} @ {self.schedule}'

     @property
     def api_url(self) -> str:
         return reverse_lazy('api-1:get_any', args=[self.id])

     def save(self, *args, **kwargs):
-        self.label = self.label or (self.template.seed.label if self.template and self.template.seed else '')
+        self.label = self.label or (self.template.label if self.template else '')
         super().save(*args, **kwargs)
         if self.template:
             self.template.schedule = self
@@ -144,8 +60,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
     modified_at = models.DateTimeField(auto_now=True)

-    seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
-    urls = models.TextField(blank=True, null=False, default='')
+    urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
+    extractor = models.CharField(default='auto', max_length=32, help_text='Parser for reading URLs (auto, html, json, rss, etc)')
     config = models.JSONField(default=dict)
     max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
     tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
@@ -171,31 +87,40 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         verbose_name_plural = 'Crawls'

     def __str__(self):
-        return f'[{self.id}] {self.seed.uri[:64] if self.seed else ""}'
+        first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
+        return f'[{self.id}] {first_url[:64]}'

     def save(self, *args, **kwargs):
         is_new = self._state.adding
         super().save(*args, **kwargs)
         if is_new:
             from archivebox.misc.logging_util import log_worker_event
+            first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
             log_worker_event(
                 worker_type='DB',
                 event='Created Crawl',
                 indent_level=1,
                 metadata={
                     'id': str(self.id),
-                    'seed_uri': str(self.seed.uri)[:64] if self.seed else None,
+                    'first_url': first_url[:64],
                     'max_depth': self.max_depth,
                     'status': self.status,
                 },
             )

     @classmethod
-    def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None):
-        crawl, _ = cls.objects.get_or_create(
-            seed=seed, max_depth=max_depth, tags_str=tags_str or seed.tags_str,
-            config=seed.config or config or {},
-            created_by_id=getattr(created_by, 'pk', created_by) or seed.created_by_id,
+    def from_file(cls, source_file: Path, max_depth: int = 0, label: str = '', extractor: str = 'auto',
+                  tags_str: str = '', config=None, created_by=None):
+        """Create a crawl from a file containing URLs."""
+        urls_content = source_file.read_text()
+        crawl = cls.objects.create(
+            urls=urls_content,
+            extractor=extractor,
+            max_depth=max_depth,
+            tags_str=tags_str,
+            label=label or source_file.name,
+            config=config or {},
+            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
         )
         return crawl
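For illustration, the new classmethod could be exercised like this (file path and contents are made up):

```python
from pathlib import Path
from crawls.models import Crawl

source = Path('/data/sources/nightly-urls.txt')
source.write_text('# nightly list\nhttps://example.com\nhttps://example.org\n')

crawl = Crawl.from_file(source, max_depth=1, extractor='auto', tags_str='nightly')
assert crawl.label == 'nightly-urls.txt'  # label falls back to the filename
assert crawl.get_urls_list() == ['https://example.com', 'https://example.org']
```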
@@ -203,14 +128,47 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     def api_url(self) -> str:
         return reverse_lazy('api-1:get_crawl', args=[self.id])

+    def get_urls_list(self) -> list[str]:
+        """Get list of URLs from urls field, filtering out comments and empty lines."""
+        if not self.urls:
+            return []
+        return [
+            url.strip()
+            for url in self.urls.split('\n')
+            if url.strip() and not url.strip().startswith('#')
+        ]
+
+    def get_file_path(self) -> Path | None:
+        """
+        Get filesystem path if this crawl references a local file.
+        Checks if the first URL is a file:// URI.
+        """
+        urls = self.get_urls_list()
+        if not urls:
+            return None
+
+        first_url = urls[0]
+        if not first_url.startswith('file://'):
+            return None
+
+        # Remove file:// prefix
+        path_str = first_url.replace('file://', '', 1)
+        return Path(path_str)
+
     def create_root_snapshot(self) -> 'Snapshot':
         from core.models import Snapshot

+        first_url = self.get_urls_list()[0] if self.get_urls_list() else None
+        if not first_url:
+            raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')
+
         try:
-            return Snapshot.objects.get(crawl=self, url=self.seed.uri)
+            return Snapshot.objects.get(crawl=self, url=first_url)
         except Snapshot.DoesNotExist:
             pass

         root_snapshot, _ = Snapshot.objects.update_or_create(
-            crawl=self, url=self.seed.uri,
+            crawl=self, url=first_url,
             defaults={
                 'status': Snapshot.INITIAL_STATE,
                 'retry_at': timezone.now(),
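A quick illustration of the parsing rules above (assumed behavior, using a hypothetical crawl instance):

```python
from pathlib import Path

crawl.urls = '\n'.join([
    '# comments and blank lines are skipped',
    '',
    'file:///data/sources/bookmarks.txt',
    'https://example.com',
])
assert crawl.get_urls_list() == ['file:///data/sources/bookmarks.txt', 'https://example.com']

# get_file_path() only inspects the FIRST parsed URL; note that unlike the
# removed Seed.get_file_path(), there is no remapping of file:///data/...
# paths onto DATA_DIR -- the path is used verbatim.
assert crawl.get_file_path() == Path('/data/sources/bookmarks.txt')
```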
@@ -42,11 +42,12 @@ class CrawlMachine(StateMachine, strict_states=True):
         return self.__repr__()

     def can_start(self) -> bool:
-        if not self.crawl.seed:
-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
+        if not self.crawl.urls:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
             return False
-        if not self.crawl.seed.uri:
-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
+        urls_list = self.crawl.get_urls_list()
+        if not urls_list:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
             return False
         return True
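Why two separate guards? The urls field can be non-empty while containing no usable URLs (only comments or blank lines), so can_start() checks both the raw field and the parsed list. A small illustration with a hypothetical crawl:

```python
crawl.urls = '# TODO: fill in URLs later\n\n'
assert bool(crawl.urls) is True      # first guard passes: field is non-empty
assert crawl.get_urls_list() == []   # second guard fails: nothing parseable
# => CrawlMachine.can_start() returns False for this crawl
```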
@@ -121,13 +122,14 @@ class CrawlMachine(StateMachine, strict_states=True):
         output_dir.mkdir(parents=True, exist_ok=True)

         # Run all on_Crawl hooks
+        first_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ''
         results = run_hooks(
             event_name='Crawl',
             output_dir=output_dir,
             timeout=60,
-            config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
+            config_objects=[self.crawl],
             crawl_id=str(self.crawl.id),
-            seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
+            seed_uri=first_url,
         )

         # Process hook results - parse JSONL output and create DB objects