remove Seed model in favor of Crawl as template

2026-04-05 23:37:58 +10:00 · 2025-12-25 01:52:38 -08:00
parent 28e6c5bb65
commit bb53228ebf
30 changed files with 785 additions and 690 deletions
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -33,15 +33,18 @@ GLOBAL_CONTEXT = {}


 class SnapshotActionForm(ActionForm):
-    tags = forms.ModelMultipleChoiceField(
-        label='Edit tags',
-        queryset=Tag.objects.all(),
-        required=False,
-        widget=FilteredSelectMultiple(
-            'core_tag__name',
-            False,
-        ),
-    )
+    def __init__(self, *args, **kwargs):
+        """Build the form, then attach the ``tags`` field at instantiation time."""
+        super().__init__(*args, **kwargs)
+        # Declared per-instance instead of on the class body; the stated intent
+        # is to avoid database access during app initialization.
+        tag_picker = FilteredSelectMultiple('core_tag__name', False)
+        self.fields['tags'] = forms.ModelMultipleChoiceField(
+            label='Edit tags',
+            queryset=Tag.objects.all(),
+            required=False,
+            widget=tag_picker,
+        )

    # TODO: allow selecting actions for specific extractors? is this useful?
    # extractor = forms.ChoiceField(
@@ -165,14 +168,77 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):

    def admin_actions(self, obj):
+        """Return the HTML action-button bar for one snapshot ``obj``.
+
+        The summary/result-file links are built from ``obj.timestamp``, the
+        original-URL link from ``obj.url``, and the four action buttons point
+        at the snapshot list view filtered by ``obj.pk``.  The original-URL
+        link opens in a new tab, so it carries ``rel="noopener noreferrer"``
+        to keep the linked page from reaching back via ``window.opener``.
+        """
        return format_html(
-            # URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
            '''
-            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page ➡️</a> &nbsp; &nbsp;
-            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
-            <a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions ⚙️</a>
+            <div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
+                <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
+                   href="/archive/{}"
+                   onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
+                   onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
+                    📄 Summary Page
+                </a>
+                <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
+                   href="/archive/{}/index.html#all"
+                   onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
+                   onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
+                    📁 Result Files
+                </a>
+                <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
+                   href="{}"
+                   target="_blank" rel="noopener noreferrer"
+                   onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
+                   onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
+                    🔗 Original URL
+                </a>
+
+                <span style="border-left: 1px solid #e2e8f0; height: 24px; margin: 0 4px;"></span>
+
+                <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
+                   href="/admin/core/snapshot/?id__exact={}"
+                   title="Get missing extractors"
+                   onmouseover="this.style.background='#d1fae5';"
+                   onmouseout="this.style.background='#ecfdf5';">
+                    ⬇️ Get Missing
+                </a>
+                <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
+                   href="/admin/core/snapshot/?id__exact={}"
+                   title="Create a fresh new snapshot of this URL"
+                   onmouseover="this.style.background='#dbeafe';"
+                   onmouseout="this.style.background='#eff6ff';">
+                    🆕 Archive Again
+                </a>
+                <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
+                   href="/admin/core/snapshot/?id__exact={}"
+                   title="Re-run all extractors (overwrite existing)"
+                   onmouseover="this.style.background='#fef3c7';"
+                   onmouseout="this.style.background='#fffbeb';">
+                    🔄 Redo All
+                </a>
+                <a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; color: #991b1b; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
+                   href="/admin/core/snapshot/?id__exact={}"
+                   title="Permanently delete this snapshot"
+                   onmouseover="this.style.background='#fee2e2';"
+                   onmouseout="this.style.background='#fef2f2';">
+                    ☠️ Delete
+                </a>
+            </div>
+            <p style="margin-top: 12px; font-size: 12px; color: #64748b;">
+                <b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
+            </p>
            ''',
            obj.timestamp,
            obj.timestamp,
+            obj.url,
+            obj.pk,
+            obj.pk,
+            obj.pk,
            obj.pk,
        )

--- a/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py
+++ b/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py
@@ -0,0 +1,101 @@
+# Generated by Django 6.0 on 2025-12-25 09:34
+
+import archivebox.base_models.models
+import django.db.models.deletion
+import django.utils.timezone
+import uuid
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # drops output_dir from ArchiveResult/Snapshot and re-declares field options on core models
+
+    dependencies = [  # runs after core.0025 and after the app providing the swappable user model
+        ('core', '0025_allow_duplicate_urls_per_crawl'),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.RemoveField(  # drop ArchiveResult.output_dir
+            model_name='archiveresult',
+            name='output_dir',
+        ),
+        migrations.RemoveField(  # drop Snapshot.output_dir
+            model_name='snapshot',
+            name='output_dir',
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='created_at',
+            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='created_by',
+            field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),  # NOTE(review): default=None with no null=True shown - presumably created_by is always set explicitly; confirm
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(db_index=True, max_length=32),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='id',
+            field=models.AutoField(editable=False, primary_key=True, serialize=False),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='status',
+            field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='uuid',
+            field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True, unique=True),  # NOTE(review): stdlib uuid.uuid7 exists only on Python 3.14+ - confirm the minimum supported interpreter
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='bookmarked_at',
+            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='created_at',
+            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='created_by',
+            field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),  # NOTE(review): same default=None / non-null FK combination as ArchiveResult.created_by; confirm
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='downloaded_at',
+            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),  # NOTE(review): uuid.uuid7 needs Python 3.14+ (see ArchiveResult.uuid); confirm
+        ),
+        # migrations.AlterField(
+        #     model_name='snapshot',
+        #     name='tags',
+        #     field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
+        # ),
+        migrations.AlterField(
+            model_name='snapshottag',
+            name='id',
+            field=models.AutoField(primary_key=True, serialize=False),
+        ),
+        migrations.AlterField(
+            model_name='tag',
+            name='created_by',
+            field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
+        ),
+        migrations.AlterUniqueTogether(  # at most one SnapshotTag row per (snapshot, tag) pair
+            name='snapshottag',
+            unique_together={('snapshot', 'tag')},
+        ),
+    ]
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -59,7 +59,7 @@ INSTALLED_APPS = [
    "config",  # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
    "machine",  # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
    "workers",  # handles starting and managing background workers and processes (orchestrators and actors)
-    "crawls",  # handles Seed, Crawl, and CrawlSchedule models and management
+    "crawls",  # handles Crawl and CrawlSchedule models and management
    "personas",  # handles Persona and session management
    "core",  # core django model with Snapshot, ArchiveResult, etc.
    "api",  # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
@@ -194,10 +194,6 @@ DATABASES = {
        "NAME": DATABASE_NAME,
        **SQLITE_CONNECTION_OPTIONS,
    },
-    "queue": {
-        "NAME": CONSTANTS.QUEUE_DATABASE_FILE,
-        **SQLITE_CONNECTION_OPTIONS,
-    },
    # "filestore": {
    #     "NAME": CONSTANTS.FILESTORE_DATABASE_FILE,
    #     **SQLITE_CONNECTION_OPTIONS,
--- a/archivebox/core/settings_logging.py
+++ b/archivebox/core/settings_logging.py
@@ -2,8 +2,6 @@ __package__ = 'archivebox.core'

 import re
 import os
-
-import shutil
 import tempfile
 import logging

@@ -11,7 +9,6 @@ import pydantic
 import django.template

 from archivebox.config import CONSTANTS
-from archivebox.misc.logging import IS_TTY


 IGNORABLE_URL_PATTERNS = [
@@ -79,7 +76,6 @@ SETTINGS_LOGGING = {
    "formatters": {
        "rich": {
            "datefmt": "[%Y-%m-%d %H:%M:%S]",
-            # "format": "{asctime} {levelname} {module} {name} {message} {username}",
            "format": "%(name)s %(message)s",
        },
        "outbound_webhooks": {
@@ -99,26 +95,13 @@ SETTINGS_LOGGING = {
        },
    },
    "handlers": {
-        # "console": {
-        #     "level": "DEBUG",
-        #     'formatter': 'simple',
-        #     "class": "logging.StreamHandler",
-        #     'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
-        # },
        "default": {
            "class": "rich.logging.RichHandler",
            "formatter": "rich",
            "level": "DEBUG",
            "markup": False,
-            "rich_tracebacks": IS_TTY,
+            "rich_tracebacks": False,  # Use standard Python tracebacks (no frame/box)
            "filters": ["noisyrequestsfilter"],
-            "tracebacks_suppress": [
-                django,
-                pydantic,
-            ],
-            "tracebacks_width": shutil.get_terminal_size((100, 10)).columns - 1,
-            "tracebacks_word_wrap": False,
-            "tracebacks_show_locals": False,
        },
        "logfile": {
            "level": "INFO",
@@ -132,7 +115,7 @@ SETTINGS_LOGGING = {
        "outbound_webhooks": {
            "class": "rich.logging.RichHandler",
            "markup": False,
-            "rich_tracebacks": True,
+            "rich_tracebacks": False,  # Use standard Python tracebacks (no frame/box)
            "formatter": "outbound_webhooks",
        },
        # "mail_admins": {
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -15,7 +15,7 @@ from statemachine import State, StateMachine
 # from workers.actor import ActorType

 from core.models import Snapshot, ArchiveResult
-from crawls.models import Crawl, Seed
+from crawls.models import Crawl


 class SnapshotMachine(StateMachine, strict_states=True):
@@ -247,17 +247,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
        )
        self.archiveresult.save(write_indexes=True)

-        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
+        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
        ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
        Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)

-        # Also update Crawl and Seed health stats if snapshot has a crawl
+        # Also update Crawl health stats if snapshot has a crawl
        snapshot = self.archiveresult.snapshot
        if snapshot.crawl_id:
            Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
-            crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
-            if crawl:
-                Seed.objects.filter(pk=crawl).update(num_uses_succeeded=F('num_uses_succeeded') + 1)

    @failed.enter
    def enter_failed(self):
@@ -268,17 +265,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
            end_ts=timezone.now(),
        )

-        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
+        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
        ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
        Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)

-        # Also update Crawl and Seed health stats if snapshot has a crawl
+        # Also update Crawl health stats if snapshot has a crawl
        snapshot = self.archiveresult.snapshot
        if snapshot.crawl_id:
            Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
-            crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
-            if crawl:
-                Seed.objects.filter(pk=crawl).update(num_uses_failed=F('num_uses_failed') + 1)

    @skipped.enter
    def enter_skipped(self):
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -33,7 +33,7 @@ from archivebox.search import query_search_index

 from core.models import Snapshot
 from core.forms import AddLinkForm
-from crawls.models import Seed, Crawl
+from crawls.models import Crawl
 from archivebox.hooks import get_extractors, get_extractor_name


@@ -119,7 +119,11 @@ class SnapshotView(View):
            if result_file.name in existing_files or result_file.name == 'index.html':
                continue

-            file_size = result_file.stat().st_size or 0
+            # Skip circular symlinks and other stat() failures
+            try:
+                file_size = result_file.stat().st_size or 0
+            except OSError:
+                continue

            if file_size > min_size_threshold:
                archiveresults[result_file.name] = {
@@ -471,14 +475,16 @@ class AddView(UserPassesTestMixin, FormView):
        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
        sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))

-        # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
+        # 2. create a new Crawl with the URLs from the file
        timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
-        seed = Seed.from_file(
-            sources_file,
+        urls_content = sources_file.read_text()
+        crawl = Crawl.objects.create(
+            urls=urls_content,
+            extractor=parser,
+            max_depth=depth,
+            tags_str=tag,
            label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
-            parser=parser,
-            tag=tag,
-            created_by=self.request.user.pk,
+            created_by_id=self.request.user.pk,
            config={
                # 'ONLY_NEW': not update,
                # 'INDEX_ONLY': index_only,
@@ -486,9 +492,8 @@ class AddView(UserPassesTestMixin, FormView):
                'DEPTH': depth,
                'EXTRACTORS': extractors or '',
                # 'DEFAULT_PERSONA': persona or 'Default',
-            })
-        # 3. create a new Crawl pointing to the Seed
-        crawl = Crawl.from_seed(seed, max_depth=depth)
+            }
+        )
        
        # 4. start the Orchestrator & wait until it completes
        #    ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
@@ -569,19 +574,7 @@ def live_progress_view(request):
            # Count URLs in the crawl (for when snapshots haven't been created yet)
            urls_count = 0
            if crawl.urls:
-                urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
-            elif crawl.seed and crawl.seed.uri:
-                # Try to get URL count from seed
-                if crawl.seed.uri.startswith('file:///'):
-                    try:
-                        from pathlib import Path
-                        seed_file = Path(crawl.seed.uri.replace('file://', ''))
-                        if seed_file.exists():
-                            urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
-                    except:
-                        pass
-                else:
-                    urls_count = 1  # Single URL seed
+                urls_count = len([u for u in crawl.urls.split('\n') if u.strip() and not u.startswith('#')])

            # Calculate crawl progress
            crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
@@ -635,8 +628,8 @@ def live_progress_view(request):
                })

            # Check if crawl can start (for debugging stuck crawls)
-            can_start = bool(crawl.seed and crawl.seed.uri)
-            seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
+            can_start = bool(crawl.urls)
+            urls_preview = crawl.urls[:60] if crawl.urls else None

            # Check if retry_at is in the future (would prevent worker from claiming)
            retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
@@ -657,7 +650,7 @@ def live_progress_view(request):
                'pending_snapshots': pending_snapshots,
                'active_snapshots': active_snapshots_for_crawl,
                'can_start': can_start,
-                'seed_uri': seed_uri,
+                'urls_preview': urls_preview,
                'retry_at_future': retry_at_future,
                'seconds_until_retry': seconds_until_retry,
            })