mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-04-05 15:27:53 +10:00)

Commit: remove Seed model in favor of Crawl as template
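In short: URL ingestion that previously created a standalone Seed record (and then a Crawl pointing at it) now lives directly on Crawl, which doubles as the template for scheduled crawls. A minimal before/after sketch of the call flow, pieced together from the diff below (illustrative only, file path made up):

```python
from pathlib import Path

from crawls.models import Crawl  # Seed no longer exists after this commit

# Before: two objects per import
#   seed = Seed.from_file(Path('/data/sources/urls.txt'), parser='auto')
#   crawl = Crawl.from_seed(seed, max_depth=1)

# After: one object; the newline-separated URLs live on the Crawl itself
crawl = Crawl.from_file(
    Path('/data/sources/urls.txt'),  # file contents are copied into crawl.urls
    extractor='auto',                # parser hint, replaces Seed.extractor
    max_depth=1,
)
```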
@@ -17,7 +17,7 @@ from django_object_actions import action
 from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin

 from core.models import Snapshot
-from crawls.models import Seed, Crawl, CrawlSchedule
+from crawls.models import Crawl, CrawlSchedule


 def render_snapshots_list(snapshots_qs, limit=20):
@@ -136,16 +136,16 @@ def render_snapshots_list(snapshots_qs, limit=20):
     ''')


-class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
-    list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
-    sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
-    search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
+class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
+    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
+    sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
+    search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')

-    readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
+    readonly_fields = ('created_at', 'modified_at', 'snapshots', 'urls_editor')

     fieldsets = (
-        ('Source', {
-            'fields': ('uri', 'contents'),
+        ('URLs', {
+            'fields': ('urls_editor',),
             'classes': ('card', 'wide'),
         }),
         ('Info', {
@@ -153,83 +153,7 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
             'classes': ('card',),
         }),
-        ('Settings', {
-            'fields': ('extractor', 'config'),
-            'classes': ('card',),
-        }),
-        ('Metadata', {
-            'fields': ('created_by', 'created_at', 'modified_at'),
-            'classes': ('card',),
-        }),
-        ('Crawls', {
-            'fields': ('scheduled_crawls', 'crawls'),
-            'classes': ('card',),
-        }),
-        ('Snapshots', {
-            'fields': ('snapshots',),
-            'classes': ('card',),
-        }),
-    )
-
-    list_filter = ('extractor', 'created_by')
-    ordering = ['-created_at']
-    list_per_page = 100
-    actions = ["delete_selected"]
-
-    def num_crawls(self, obj):
-        return obj.crawl_set.count()
-
-    def num_snapshots(self, obj):
-        return obj.snapshot_set.count()
-
-    def scheduled_crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (scheduledcrawl.admin_change_url, scheduledcrawl)
-            for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
-        )) or mark_safe('<i>No Scheduled Crawls yet...</i>')
-
-    def crawls(self, obj):
-        return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
-            (crawl.admin_change_url, crawl)
-            for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
-        )) or mark_safe('<i>No Crawls yet...</i>')
-
-    def snapshots(self, obj):
-        return render_snapshots_list(obj.snapshot_set.all())
-
-    def contents(self, obj):
-        source_file = obj.get_file_path()
-        if source_file:
-            contents = ""
-            try:
-                contents = source_file.read_text().strip()[:14_000]
-            except Exception as e:
-                contents = f'Error reading {source_file}: {e}'
-
-            return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
-
-        return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
-
-
-class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
-    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
-    sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
-    search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
-
-    readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
-
-    fieldsets = (
-        ('URLs', {
-            'fields': ('seed_urls_editor',),
-            'classes': ('card', 'wide'),
-        }),
-        ('Info', {
-            'fields': ('label', 'notes'),
-            'classes': ('card',),
-        }),
         ('Settings', {
-            'fields': ('max_depth', 'config'),
+            'fields': ('max_depth', 'extractor', 'config'),
             'classes': ('card',),
         }),
         ('Status', {
@@ -237,7 +161,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
             'classes': ('card',),
         }),
         ('Relations', {
-            'fields': ('seed', 'schedule', 'created_by'),
+            'fields': ('schedule', 'created_by'),
             'classes': ('card',),
         }),
         ('Timestamps', {
@@ -250,7 +174,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
         }),
     )

-    list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
+    list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
     ordering = ['-created_at', '-retry_at']
     list_per_page = 100
     actions = ["delete_selected"]
@@ -258,23 +182,20 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):

     @action(label='Recrawl', description='Create a new crawl with the same settings')
     def recrawl(self, request, obj):
-        """Duplicate this crawl as a new crawl with the same seed and settings."""
+        """Duplicate this crawl as a new crawl with the same URLs and settings."""
         from django.utils import timezone
         from django.shortcuts import redirect

-        # Validate seed has a URI (required for crawl to start)
-        if not obj.seed:
-            messages.error(request, 'Cannot recrawl: original crawl has no seed.')
-            return redirect('admin:crawls_crawl_change', obj.id)
-
-        if not obj.seed.uri:
-            messages.error(request, 'Cannot recrawl: seed has no URI.')
+        # Validate URLs (required for crawl to start)
+        if not obj.urls:
+            messages.error(request, 'Cannot recrawl: original crawl has no URLs.')
             return redirect('admin:crawls_crawl_change', obj.id)

         new_crawl = Crawl.objects.create(
-            seed=obj.seed,
+            urls=obj.urls,
+            extractor=obj.extractor,
             max_depth=obj.max_depth,
             tags_str=obj.tags_str,
             config=obj.config,
             schedule=obj.schedule,
             label=f"{obj.label} (recrawl)" if obj.label else "",
@@ -292,43 +213,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):

         return redirect('admin:crawls_crawl_change', new_crawl.id)

-    def get_urls(self):
-        urls = super().get_urls()
-        custom_urls = [
-            path('<path:object_id>/save_seed_contents/',
-                 self.admin_site.admin_view(self.save_seed_contents_view),
-                 name='crawls_crawl_save_seed_contents'),
-        ]
-        return custom_urls + urls
-
-    def save_seed_contents_view(self, request, object_id):
-        """Handle saving seed file contents via AJAX."""
-        if request.method != 'POST':
-            return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
-
-        try:
-            crawl = Crawl.objects.get(pk=object_id)
-        except Crawl.DoesNotExist:
-            return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
-
-        source_file = crawl.seed.get_file_path() if crawl.seed else None
-        if not source_file:
-            return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
-
-        try:
-            data = json.loads(request.body)
-            contents = data.get('contents', '')
-        except json.JSONDecodeError:
-            return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
-
-        try:
-            # Ensure parent directory exists
-            source_file.parent.mkdir(parents=True, exist_ok=True)
-            source_file.write_text(contents)
-            return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
-        except Exception as e:
-            return JsonResponse({'success': False, 'error': str(e)}, status=500)
-
     def num_snapshots(self, obj):
         return obj.snapshot_set.count()
@@ -341,163 +225,68 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
             return mark_safe('<i>None</i>')
         return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)

-    @admin.display(description='Seed', ordering='seed')
-    def seed_str(self, obj):
-        if not obj.seed:
-            return mark_safe('<i>None</i>')
-        return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
+    @admin.display(description='URLs', ordering='urls')
+    def urls_preview(self, obj):
+        first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ''
+        return first_url[:80] + '...' if len(first_url) > 80 else first_url

     @admin.display(description='URLs')
-    def seed_urls_editor(self, obj):
-        """Combined editor showing seed URL and file contents."""
-        widget_id = f'seed_urls_{obj.pk}'
-
-        # Get the seed URI (or use urls field if no seed)
-        seed_uri = ''
-        if obj.seed and obj.seed.uri:
-            seed_uri = obj.seed.uri
-        elif obj.urls:
-            seed_uri = obj.urls
+    def urls_editor(self, obj):
+        """Editor for crawl URLs."""
+        widget_id = f'crawl_urls_{obj.pk}'

         # Check if it's a local file we can edit
-        source_file = obj.seed.get_file_path() if obj.seed else None
+        source_file = obj.get_file_path()
         is_file = source_file is not None
-        contents = ""
+        file_contents = ""
         error = None

         if is_file and source_file:
             try:
-                contents = source_file.read_text().strip()
+                file_contents = source_file.read_text().strip()
             except Exception as e:
                 error = f'Error reading {source_file}: {e}'

         # Escape for safe HTML embedding
-        escaped_uri = seed_uri.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
-        escaped_contents = (contents or '').replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
+        escaped_urls = (obj.urls or '').replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
+        escaped_file_contents = file_contents.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')

         # Count lines for auto-expand logic
-        line_count = len(contents.split('\n')) if contents else 0
-        uri_rows = min(max(1, seed_uri.count('\n') + 1), 3)
+        line_count = len((obj.urls or '').split('\n'))
+        file_line_count = len(file_contents.split('\n')) if file_contents else 0
+        uri_rows = min(max(3, line_count), 10)

         html = f'''
         <div id="{widget_id}_container" style="max-width: 900px;">
-            <!-- Seed URL input (auto-expands) -->
+            <!-- URLs input -->
             <div style="margin-bottom: 12px;">
-                <label style="font-weight: bold; display: block; margin-bottom: 4px;">Seed URL:</label>
-                <textarea id="{widget_id}_uri"
+                <label style="font-weight: bold; display: block; margin-bottom: 4px;">URLs (one per line):</label>
+                <textarea id="{widget_id}_urls"
                     style="width: 100%; font-family: monospace; font-size: 13px;
                         padding: 8px; border: 1px solid #ccc; border-radius: 4px;
-                        resize: vertical; min-height: 32px; overflow: hidden;"
+                        resize: vertical;"
                     rows="{uri_rows}"
-                    placeholder="file:///data/sources/... or https://..."
-                    {"readonly" if not obj.pk else ""}>{escaped_uri}</textarea>
+                    placeholder="https://example.com&#10;https://example2.com&#10;# Comments start with #"
+                    readonly>{escaped_urls}</textarea>
+                <p style="color: #666; font-size: 12px; margin: 4px 0 0 0;">
+                    {line_count} URL{'s' if line_count != 1 else ''} · URLs are read-only in admin, edit via API or CLI
+                </p>
             </div>

             {"" if not is_file else f'''
-            <!-- File contents editor -->
+            <!-- File contents preview (if first URL is a file://) -->
             <div style="margin-bottom: 8px;">
                 <label style="font-weight: bold; display: block; margin-bottom: 4px;">
-                    File Contents: <code style="font-weight: normal; color: #666;">{source_file}</code>
+                    File Preview: <code style="font-weight: normal; color: #666;">{source_file}</code>
                 </label>
                 {"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
-                <textarea id="{widget_id}_contents"
-                    style="width: 100%; height: {min(400, max(150, line_count * 18))}px; font-family: monospace; font-size: 12px;
-                        padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical;"
-                    placeholder="Enter URLs, one per line...">{escaped_contents}</textarea>
-            </div>
-
-            <div style="display: flex; gap: 12px; align-items: center; flex-wrap: wrap;">
-                <button type="button" id="{widget_id}_save_btn"
-                    onclick="saveSeedUrls_{widget_id}()"
-                    style="padding: 8px 20px; background: #417690; color: white; border: none;
-                        border-radius: 4px; cursor: pointer; font-weight: bold;">
-                    Save URLs
-                </button>
-                <span id="{widget_id}_line_count" style="color: #666; font-size: 12px;"></span>
-                <span id="{widget_id}_status" style="color: #666; font-size: 12px;"></span>
+                <textarea id="{widget_id}_file_preview"
+                    style="width: 100%; height: {min(400, max(150, file_line_count * 18))}px; font-family: monospace; font-size: 12px;
+                        padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical; background: #f9f9f9;"
+                    readonly>{escaped_file_contents}</textarea>
             </div>
             '''}

-            {"" if is_file else f'''
-            <div style="margin-top: 8px; color: #666;">
-                <a href="{seed_uri}" target="_blank">{seed_uri}</a>
-            </div>
-            '''}
-
-            <script>
-            (function() {{
-                var uriInput = document.getElementById('{widget_id}_uri');
-                var contentsInput = document.getElementById('{widget_id}_contents');
-                var status = document.getElementById('{widget_id}_status');
-                var lineCount = document.getElementById('{widget_id}_line_count');
-                var saveBtn = document.getElementById('{widget_id}_save_btn');
-
-                // Auto-resize URI input
-                function autoResizeUri() {{
-                    uriInput.style.height = 'auto';
-                    uriInput.style.height = Math.min(100, uriInput.scrollHeight) + 'px';
-                }}
-                uriInput.addEventListener('input', autoResizeUri);
-                autoResizeUri();
-
-                if (contentsInput) {{
-                    function updateLineCount() {{
-                        var lines = contentsInput.value.split('\\n').filter(function(l) {{ return l.trim(); }});
-                        lineCount.textContent = lines.length + ' URLs';
-                    }}
-
-                    contentsInput.addEventListener('input', function() {{
-                        updateLineCount();
-                        if (status) {{
-                            status.textContent = '(unsaved changes)';
-                            status.style.color = '#c4820e';
-                        }}
-                    }});
-
-                    updateLineCount();
-                }}
-
-                window.saveSeedUrls_{widget_id} = function() {{
-                    if (!saveBtn) return;
-                    saveBtn.disabled = true;
-                    saveBtn.textContent = 'Saving...';
-                    if (status) status.textContent = '';
-
-                    fetch(window.location.pathname + 'save_seed_contents/', {{
-                        method: 'POST',
-                        headers: {{
-                            'Content-Type': 'application/json',
-                            'X-CSRFToken': document.querySelector('[name=csrfmiddlewaretoken]').value
-                        }},
-                        body: JSON.stringify({{ contents: contentsInput ? contentsInput.value : '' }})
-                    }})
-                    .then(function(response) {{ return response.json(); }})
-                    .then(function(data) {{
-                        if (data.success) {{
-                            if (status) {{
-                                status.textContent = '✓ ' + data.message;
-                                status.style.color = '#28a745';
-                            }}
-                        }} else {{
-                            if (status) {{
-                                status.textContent = '✗ ' + data.error;
-                                status.style.color = '#dc3545';
-                            }}
-                        }}
-                    }})
-                    .catch(function(err) {{
-                        if (status) {{
-                            status.textContent = '✗ Error: ' + err;
-                            status.style.color = '#dc3545';
-                        }}
-                    }})
-                    .finally(function() {{
-                        saveBtn.disabled = false;
-                        saveBtn.textContent = 'Save URLs';
-                    }});
-                }};
-            }})();
-            </script>
         </div>
         '''
         return mark_safe(html)
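Side note on the escaping above: the chained .replace() calls escape the four HTML-special characters by hand. For reference only (not what the diff uses), Python's stdlib offers an equivalent one-liner; note html.escape additionally escapes single quotes, which is harmless here:

```python
import html

def escape_for_textarea(text: str) -> str:
    # Equivalent to the four chained .replace() calls in urls_editor above;
    # quote=True (the default) also covers the double-quote case.
    return html.escape(text or '', quote=True)
```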
@@ -507,7 +296,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
 class CrawlScheduleAdmin(BaseModelAdmin):
     list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots')
     sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str')
-    search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
+    search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__urls')

     readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
@@ -561,6 +350,5 @@ class CrawlScheduleAdmin(BaseModelAdmin):


 def register_admin(admin_site):
-    admin_site.register(Seed, SeedAdmin)
     admin_site.register(Crawl, CrawlAdmin)
     admin_site.register(CrawlSchedule, CrawlScheduleAdmin)

archivebox/crawls/migrations/0002_drop_seed_model.py (new file, +61 lines)

@@ -0,0 +1,61 @@
+# Generated by Django 6.0 on 2025-12-25 09:34
+
+import archivebox.base_models.models
+import django.db.models.deletion
+import pathlib
+import uuid
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('crawls', '0001_initial'),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='crawl',
+            name='seed',
+        ),
+        migrations.AddField(
+            model_name='crawl',
+            name='extractor',
+            field=models.CharField(default='auto', help_text='Parser for reading URLs (auto, html, json, rss, etc)', max_length=32),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='created_by',
+            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='output_dir',
+            field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='urls',
+            field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
+        ),
+        migrations.AlterField(
+            model_name='crawlschedule',
+            name='created_by',
+            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AlterField(
+            model_name='crawlschedule',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+        migrations.DeleteModel(
+            name='Seed',
+        ),
+    ]
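Note that this migration drops crawl.seed and the Seed table without copying data across. Purely as a sketch (not part of this commit), a deployment that needed to preserve existing seed URIs could run a RunPython step before the RemoveField/DeleteModel operations:

```python
from django.db import migrations

def copy_seed_uris_to_urls(apps, schema_editor):
    # Hypothetical backfill: move each crawl's old seed.uri into crawl.urls
    # before the seed FK and the Seed model are dropped.
    Crawl = apps.get_model('crawls', 'Crawl')
    for crawl in Crawl.objects.select_related('seed').iterator():
        if crawl.seed and not crawl.urls:
            crawl.urls = crawl.seed.uri
            crawl.save(update_fields=['urls'])

# would be registered as:
#     migrations.RunPython(copy_seed_uris_to_urls, migrations.RunPython.noop),
```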
@@ -20,91 +20,6 @@ if TYPE_CHECKING:
     from core.models import Snapshot, ArchiveResult


-class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
-    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
-    created_at = models.DateTimeField(default=timezone.now, db_index=True)
-    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
-    modified_at = models.DateTimeField(auto_now=True)
-
-    uri = models.URLField(max_length=2048)
-    extractor = models.CharField(default='auto', max_length=32)
-    tags_str = models.CharField(max_length=255, null=False, blank=True, default='')
-    label = models.CharField(max_length=255, null=False, blank=True, default='')
-    config = models.JSONField(default=dict)
-    output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='')
-    notes = models.TextField(blank=True, null=False, default='')
-
-    crawl_set: models.Manager['Crawl']
-
-    class Meta:
-        verbose_name = 'Seed'
-        verbose_name_plural = 'Seeds'
-        unique_together = (('created_by', 'uri', 'extractor'), ('created_by', 'label'))
-
-    def __str__(self):
-        return f'[{self.id}] {self.uri[:64]}'
-
-    def save(self, *args, **kwargs):
-        is_new = self._state.adding
-        super().save(*args, **kwargs)
-        if is_new:
-            from archivebox.misc.logging_util import log_worker_event
-            log_worker_event(
-                worker_type='DB',
-                event='Created Seed',
-                indent_level=0,
-                metadata={
-                    'id': str(self.id),
-                    'uri': str(self.uri)[:64],
-                    'extractor': self.extractor,
-                    'label': self.label or None,
-                },
-            )
-
-    @classmethod
-    def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
-        # Use absolute path for file:// URLs so extractors can find the files
-        source_path = str(source_file.resolve())
-        seed, _ = cls.objects.get_or_create(
-            label=label or source_file.name, uri=f'file://{source_path}',
-            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
-            extractor=parser, tags_str=tag, config=config or {},
-        )
-        return seed
-
-    @property
-    def source_type(self):
-        return self.uri.split('://', 1)[0].lower()
-
-    @property
-    def api_url(self) -> str:
-        return reverse_lazy('api-1:get_seed', args=[self.id])
-
-    def get_file_path(self) -> Path | None:
-        """
-        Get the filesystem path for file:// URIs.
-        Handles both old format (file:///data/...) and new format (file:///absolute/path).
-        Returns None if URI is not a file:// URI.
-        """
-        if not self.uri.startswith('file://'):
-            return None
-
-        # Remove file:// prefix
-        path_str = self.uri.replace('file://', '', 1)
-
-        # Handle old format: file:///data/... -> DATA_DIR/...
-        if path_str.startswith('/data/'):
-            return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1)
-
-        # Handle new format: file:///absolute/path
-        return Path(path_str)
-
-    @property
-    def snapshot_set(self) -> QuerySet['Snapshot']:
-        from core.models import Snapshot
-        return Snapshot.objects.filter(crawl_id__in=self.crawl_set.values_list('pk', flat=True))
-
-
 class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
     id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -124,14 +39,15 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
         verbose_name_plural = 'Scheduled Crawls'

     def __str__(self) -> str:
-        return f'[{self.id}] {self.template.seed.uri[:64] if self.template and self.template.seed else ""} @ {self.schedule}'
+        urls_preview = self.template.urls[:64] if self.template and self.template.urls else ""
+        return f'[{self.id}] {urls_preview} @ {self.schedule}'

     @property
     def api_url(self) -> str:
         return reverse_lazy('api-1:get_any', args=[self.id])

     def save(self, *args, **kwargs):
-        self.label = self.label or (self.template.seed.label if self.template and self.template.seed else '')
+        self.label = self.label or (self.template.label if self.template else '')
         super().save(*args, **kwargs)
         if self.template:
             self.template.schedule = self
@@ -144,8 +60,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
     modified_at = models.DateTimeField(auto_now=True)

-    seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
-    urls = models.TextField(blank=True, null=False, default='')
+    urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
+    extractor = models.CharField(default='auto', max_length=32, help_text='Parser for reading URLs (auto, html, json, rss, etc)')
     config = models.JSONField(default=dict)
     max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
     tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
@@ -171,31 +87,40 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         verbose_name_plural = 'Crawls'

     def __str__(self):
-        return f'[{self.id}] {self.seed.uri[:64] if self.seed else ""}'
+        first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
+        return f'[{self.id}] {first_url[:64]}'

     def save(self, *args, **kwargs):
         is_new = self._state.adding
         super().save(*args, **kwargs)
         if is_new:
             from archivebox.misc.logging_util import log_worker_event
+            first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
             log_worker_event(
                 worker_type='DB',
                 event='Created Crawl',
                 indent_level=1,
                 metadata={
                     'id': str(self.id),
-                    'seed_uri': str(self.seed.uri)[:64] if self.seed else None,
+                    'first_url': first_url[:64],
                     'max_depth': self.max_depth,
                     'status': self.status,
                 },
             )

     @classmethod
-    def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None):
-        crawl, _ = cls.objects.get_or_create(
-            seed=seed, max_depth=max_depth, tags_str=tags_str or seed.tags_str,
-            config=seed.config or config or {},
-            created_by_id=getattr(created_by, 'pk', created_by) or seed.created_by_id,
+    def from_file(cls, source_file: Path, max_depth: int = 0, label: str = '', extractor: str = 'auto',
+                  tags_str: str = '', config=None, created_by=None):
+        """Create a crawl from a file containing URLs."""
+        urls_content = source_file.read_text()
+        crawl = cls.objects.create(
+            urls=urls_content,
+            extractor=extractor,
+            max_depth=max_depth,
+            tags_str=tags_str,
+            label=label or source_file.name,
+            config=config or {},
+            created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
         )
         return crawl
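For illustration, the new classmethod could be exercised like this (file path and contents are made up):

```python
from pathlib import Path
from crawls.models import Crawl

source = Path('/data/sources/nightly-urls.txt')
source.write_text('# nightly list\nhttps://example.com\nhttps://example.org\n')

crawl = Crawl.from_file(source, max_depth=1, extractor='auto', tags_str='nightly')
assert crawl.label == 'nightly-urls.txt'  # label falls back to the filename
assert crawl.get_urls_list() == ['https://example.com', 'https://example.org']
```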
@@ -203,14 +128,47 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
     def api_url(self) -> str:
         return reverse_lazy('api-1:get_crawl', args=[self.id])

+    def get_urls_list(self) -> list[str]:
+        """Get list of URLs from urls field, filtering out comments and empty lines."""
+        if not self.urls:
+            return []
+        return [
+            url.strip()
+            for url in self.urls.split('\n')
+            if url.strip() and not url.strip().startswith('#')
+        ]
+
+    def get_file_path(self) -> Path | None:
+        """
+        Get filesystem path if this crawl references a local file.
+        Checks if the first URL is a file:// URI.
+        """
+        urls = self.get_urls_list()
+        if not urls:
+            return None
+
+        first_url = urls[0]
+        if not first_url.startswith('file://'):
+            return None
+
+        # Remove file:// prefix
+        path_str = first_url.replace('file://', '', 1)
+        return Path(path_str)
+
     def create_root_snapshot(self) -> 'Snapshot':
         from core.models import Snapshot

+        first_url = self.get_urls_list()[0] if self.get_urls_list() else None
+        if not first_url:
+            raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')
+
         try:
-            return Snapshot.objects.get(crawl=self, url=self.seed.uri)
+            return Snapshot.objects.get(crawl=self, url=first_url)
         except Snapshot.DoesNotExist:
             pass

         root_snapshot, _ = Snapshot.objects.update_or_create(
-            crawl=self, url=self.seed.uri,
+            crawl=self, url=first_url,
             defaults={
                 'status': Snapshot.INITIAL_STATE,
                 'retry_at': timezone.now(),
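A quick illustration of the parsing rules above (assumed behavior, using a hypothetical crawl instance):

```python
from pathlib import Path

crawl.urls = '\n'.join([
    '# comments and blank lines are skipped',
    '',
    'file:///data/sources/bookmarks.txt',
    'https://example.com',
])
assert crawl.get_urls_list() == ['file:///data/sources/bookmarks.txt', 'https://example.com']

# get_file_path() only inspects the FIRST parsed URL; note that unlike the
# removed Seed.get_file_path(), there is no remapping of file:///data/...
# paths onto DATA_DIR -- the path is used verbatim.
assert crawl.get_file_path() == Path('/data/sources/bookmarks.txt')
```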
@@ -42,11 +42,12 @@ class CrawlMachine(StateMachine, strict_states=True):
         return self.__repr__()

     def can_start(self) -> bool:
-        if not self.crawl.seed:
-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
+        if not self.crawl.urls:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
             return False
-        if not self.crawl.seed.uri:
-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
+        urls_list = self.crawl.get_urls_list()
+        if not urls_list:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
             return False
         return True
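Why two separate guards? The urls field can be non-empty while containing no usable URLs (only comments or blank lines), so can_start() checks both the raw field and the parsed list. A small illustration with a hypothetical crawl:

```python
crawl.urls = '# TODO: fill in URLs later\n\n'
assert bool(crawl.urls) is True      # first guard passes: field is non-empty
assert crawl.get_urls_list() == []   # second guard fails: nothing parseable
# => CrawlMachine.can_start() returns False for this crawl
```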
@@ -121,13 +122,14 @@ class CrawlMachine(StateMachine, strict_states=True):
         output_dir.mkdir(parents=True, exist_ok=True)

         # Run all on_Crawl hooks
+        first_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ''
         results = run_hooks(
             event_name='Crawl',
             output_dir=output_dir,
             timeout=60,
-            config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
+            config_objects=[self.crawl],
             crawl_id=str(self.crawl.id),
-            seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
+            seed_uri=first_url,
         )

         # Process hook results - parse JSONL output and create DB objects