use full dotted paths for all archivebox imports, add migrations and more fixes

This commit is contained in:
Nick Sweeting
2025-12-29 00:47:08 -08:00
parent 1e4d3ffd11
commit f4e7820533
61 changed files with 1082 additions and 2985 deletions

View File

@@ -5,8 +5,12 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
    """Django AppConfig for the main archivebox.core app."""
    name = 'archivebox.core'
    label = 'core'

    def ready(self):
        """Register the archivebox.core.admin_site as the main django admin site"""
        # Deferred import: admin_site pulls in Django admin machinery that is
        # only safe to import once the app registry is ready.
        from archivebox.core.admin_site import register_admin_site
        register_admin_site()
        # Import models to register state machines with the registry
        from archivebox.core import models  # noqa: F401

View File

@@ -0,0 +1,57 @@
# Data migration to clear config fields that may contain invalid JSON
# This runs before 0025 to prevent CHECK constraint failures
from django.db import migrations
def clear_config_fields(apps, schema_editor):
"""Clear all config fields in related tables to avoid JSON validation errors."""
db_alias = schema_editor.connection.alias
# Disable foreign key checks temporarily to allow updates
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=OFF")
tables_to_clear = [
('crawls_seed', 'config'),
('crawls_crawl', 'config'),
('crawls_crawlschedule', 'config') if 'crawlschedule' in dir() else None,
('machine_machine', 'stats'),
('machine_machine', 'config'),
]
for table_info in tables_to_clear:
if table_info is None:
continue
table_name, field_name = table_info
try:
with schema_editor.connection.cursor() as cursor:
# Check if table exists first
cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
if not cursor.fetchone():
print(f" Skipping {table_name}.{field_name}: table does not exist")
continue
# Set all to empty JSON object
cursor.execute(f"UPDATE {table_name} SET {field_name} = '{{}}' WHERE {field_name} IS NOT NULL")
print(f" Cleared {field_name} in {table_name}: {cursor.rowcount} rows")
except Exception as e:
print(f" Skipping {table_name}.{field_name}: {e}")
# Re-enable foreign key checks
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=ON")
class Migration(migrations.Migration):
    """Data migration: blank out config/stats JSON columns before 0025 runs."""

    # Must run after the initial schemas of all three apps exist so the raw-SQL
    # cleanup above can see (or safely skip) each table.
    dependencies = [
        ('core', '0023_new_schema'),
        ('crawls', '0001_initial'),
        ('machine', '0001_squashed'),
    ]

    operations = [
        # Reverse is a no-op: the cleared values were invalid JSON anyway and
        # cannot be restored.
        migrations.RunPython(clear_config_fields, reverse_code=migrations.RunPython.noop),
    ]

View File

@@ -0,0 +1,28 @@
# Disable foreign key checks before 0025 to prevent CHECK constraint validation errors
from django.db import migrations
def disable_fk_checks(apps, schema_editor):
    """Switch SQLite foreign-key enforcement off on the migration connection."""
    connection = schema_editor.connection
    with connection.cursor() as cursor:
        cursor.execute("PRAGMA foreign_keys=OFF")
        print(" Disabled foreign key checks")
def enable_fk_checks(apps, schema_editor):
    """Switch SQLite foreign-key enforcement back on for the migration connection."""
    connection = schema_editor.connection
    with connection.cursor() as cursor:
        cursor.execute("PRAGMA foreign_keys=ON")
        print(" Enabled foreign key checks")
class Migration(migrations.Migration):
    """Disable FK checks before 0025 to prevent CHECK constraint validation errors."""

    dependencies = [
        ('core', '0024_b_clear_config_fields'),
    ]

    operations = [
        # Reverse re-enables FK checks so a rollback leaves the connection in
        # its normal enforcing state.
        migrations.RunPython(disable_fk_checks, reverse_code=enable_fk_checks),
    ]

View File

@@ -0,0 +1,93 @@
# Fix crawls_crawl config field to avoid CHECK constraint errors during table rebuilds
from django.db import migrations
def fix_crawls_config(apps, schema_editor):
    """
    Rebuild crawls_crawl table to fix CHECK constraints and make seed_id nullable.

    Only runs for UPGRADES from 0.8.x (when crawls.0001_initial didn't exist yet).
    For fresh installs, crawls.0001_initial creates the correct schema.

    Args:
        apps: historical app registry (unused; raw SQL is used instead).
        schema_editor: active schema editor whose connection is used for all SQL.
    """
    with schema_editor.connection.cursor() as cursor:
        # Check if this is an upgrade from old 0.8.x or a fresh install
        # In fresh installs, crawls.0001_initial was applied, creating seed FK
        # In upgrades, the table was created by old migrations before 0001_initial existed
        cursor.execute("""
            SELECT COUNT(*) FROM django_migrations
            WHERE app='crawls' AND name='0001_initial'
        """)
        has_crawls_0001 = cursor.fetchone()[0] > 0

        if has_crawls_0001:
            # Fresh install - crawls.0001_initial already created the correct schema
            # Just clear config to avoid CHECK constraint issues
            print(" Fresh install detected - clearing config field only")
            try:
                cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
            except Exception as e:
                # Best-effort: swallow and report so the migration never aborts here.
                print(f" Skipping config clear: {e}")
            return

        # Upgrade from 0.8.x - rebuild table to make seed_id nullable and remove CHECK constraint
        print(" Upgrading from 0.8.x - rebuilding crawls_crawl table")
        # FK checks must be off while the table is dropped/recreated; 0024_c
        # already disabled them, this is defensive for the DROP below.
        cursor.execute("PRAGMA foreign_keys=OFF")
        # Backup
        cursor.execute("CREATE TABLE crawls_crawl_backup AS SELECT * FROM crawls_crawl")
        # Recreate without config CHECK constraint, with nullable seed_id
        cursor.execute("DROP TABLE crawls_crawl")
        cursor.execute("""
            CREATE TABLE "crawls_crawl" (
                "num_uses_failed" integer unsigned NOT NULL CHECK ("num_uses_failed" >= 0),
                "num_uses_succeeded" integer unsigned NOT NULL CHECK ("num_uses_succeeded" >= 0),
                "id" char(32) NOT NULL PRIMARY KEY,
                "created_at" datetime NOT NULL,
                "modified_at" datetime NOT NULL,
                "urls" text NOT NULL,
                "config" text,
                "max_depth" smallint unsigned NOT NULL CHECK ("max_depth" >= 0),
                "tags_str" varchar(1024) NOT NULL,
                "persona_id" char(32) NULL,
                "label" varchar(64) NOT NULL,
                "notes" text NOT NULL,
                "output_dir" varchar(512) NOT NULL,
                "status" varchar(15) NOT NULL,
                "retry_at" datetime NULL,
                "created_by_id" integer NOT NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED,
                "seed_id" char(32) NULL DEFAULT NULL,
                "schedule_id" char(32) NULL REFERENCES "crawls_crawlschedule" ("id") DEFERRABLE INITIALLY DEFERRED
            )
        """)
        # Restore data
        cursor.execute("""
            INSERT INTO "crawls_crawl" (
                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
            )
            SELECT
                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
            FROM crawls_crawl_backup
        """)
        cursor.execute("DROP TABLE crawls_crawl_backup")
        # NULL out config to avoid any invalid JSON
        cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
        # NOTE(review): foreign_keys is intentionally left OFF here — 0024_c
        # disabled it for the remainder of this migration run; confirm a later
        # step re-enables it.
class Migration(migrations.Migration):
    """Fix crawls_crawl config field to avoid CHECK constraint errors during rebuilds."""

    # Runs after FK checks are disabled (0024_c) and after crawls' initial
    # schema exists, so the fresh-install/upgrade detection above is reliable.
    dependencies = [
        ('core', '0024_c_disable_fk_checks'),
        ('crawls', '0001_initial'),
    ]

    operations = [
        # Irreversible: the rebuilt table and NULLed config cannot be restored.
        migrations.RunPython(fix_crawls_config, reverse_code=migrations.RunPython.noop),
    ]

View File

@@ -8,9 +8,7 @@ import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('core', '0023_new_schema'),
('crawls', '0001_initial'),
('machine', '0001_squashed'),
('core', '0024_d_fix_crawls_config'),
]
operations = [

View File

@@ -10,6 +10,13 @@ from django.db import migrations, models
def populate_archiveresult_uuids(apps, schema_editor):
"""Generate unique UUIDs for ArchiveResults that don't have one."""
# Check if uuid column exists before trying to populate it
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA table_info(core_archiveresult)")
columns = [row[1] for row in cursor.fetchall()]
if 'uuid' not in columns:
return # uuid column doesn't exist, skip this data migration
ArchiveResult = apps.get_model('core', 'ArchiveResult')
for result in ArchiveResult.objects.filter(uuid__isnull=True):
result.uuid = uuid_compat.uuid7()
@@ -21,6 +28,22 @@ def reverse_populate_uuids(apps, schema_editor):
pass
def remove_output_dir_if_exists(apps, schema_editor):
    """Drop the output_dir column from both tables, if present (idempotent)."""
    with schema_editor.connection.cursor() as cursor:
        for table in ("core_archiveresult", "core_snapshot"):
            cursor.execute(f"PRAGMA table_info({table})")
            column_names = {row[1] for row in cursor.fetchall()}
            if 'output_dir' in column_names:
                cursor.execute(f"ALTER TABLE {table} DROP COLUMN output_dir")
class Migration(migrations.Migration):
dependencies = [
@@ -33,82 +56,90 @@ class Migration(migrations.Migration):
migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids),
# Remove output_dir fields (not needed, computed from snapshot)
migrations.RemoveField(
model_name='archiveresult',
name='output_dir',
),
migrations.RemoveField(
model_name='snapshot',
name='output_dir',
migrations.RunPython(remove_output_dir_if_exists, reverse_code=migrations.RunPython.noop),
# Update Django's migration state to match 0.9.x schema
# Database already has correct types from 0.8.x, just update state
migrations.SeparateDatabaseAndState(
state_operations=[
# Archiveresult field alterations
migrations.AlterField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(db_index=True, max_length=32),
),
# Convert id from AutoField to UUIDField (database already has UUID CHAR(32))
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
# Snapshot field alterations
migrations.AlterField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
],
database_operations=[
# No actual database changes needed - schema is already correct from 0.8.x
],
),
# Archiveresult field alterations
migrations.AlterField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(db_index=True, max_length=32),
),
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
# Snapshot field alterations
migrations.AlterField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
# SnapshotTag and Tag alterations
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
# SnapshotTag and Tag alterations - state only, DB already correct
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
),
],
database_operations=[],
),
]

View File

@@ -13,68 +13,79 @@ class Migration(migrations.Migration):
]
operations = [
# Add new output fields (keep old 'output' temporarily for migration)
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(
blank=True,
default='',
help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(
null=True,
blank=True,
default=None,
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(
default=dict,
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(
default=0,
help_text='Total recursive size in bytes of all output files'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(
max_length=512,
blank=True,
default='',
help_text='CSV of mimetypes sorted by size descending'
),
),
# Add binary FK (optional)
migrations.AddField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(
'machine.Binary',
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name='archiveresults',
help_text='Primary binary used by this hook (optional)'
),
# Add new output fields using SeparateDatabaseAndState to avoid table rebuilds
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(
blank=True,
default='',
help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(
null=True,
blank=True,
default=None,
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(
default=dict,
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(
default=0,
help_text='Total recursive size in bytes of all output files'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(
max_length=512,
blank=True,
default='',
help_text='CSV of mimetypes sorted by size descending'
),
),
migrations.AddField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(
'machine.Binary',
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name='archiveresults',
help_text='Primary binary used by this hook (optional)'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_archiveresult ADD COLUMN output_str TEXT DEFAULT '';
ALTER TABLE core_archiveresult ADD COLUMN output_json TEXT;
ALTER TABLE core_archiveresult ADD COLUMN output_files TEXT DEFAULT '{}';
ALTER TABLE core_archiveresult ADD COLUMN output_size BIGINT DEFAULT 0;
ALTER TABLE core_archiveresult ADD COLUMN output_mimetypes VARCHAR(512) DEFAULT '';
ALTER TABLE core_archiveresult ADD COLUMN binary_id CHAR(32) REFERENCES machine_binary(id);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

View File

@@ -12,27 +12,46 @@ def migrate_output_field(apps, schema_editor):
Logic:
- If output contains JSON {...}, move to output_json
- Otherwise, move to output_str
Use raw SQL to avoid CHECK constraint issues during migration.
"""
ArchiveResult = apps.get_model('core', 'ArchiveResult')
# Use raw SQL to migrate data without triggering CHECK constraints
with schema_editor.connection.cursor() as cursor:
# Get all archive results
cursor.execute("""
SELECT id, output FROM core_archiveresult
""")
for ar in ArchiveResult.objects.all().iterator():
old_output = ar.output or ''
for row in cursor.fetchall():
ar_id, old_output = row
old_output = old_output or ''
# Case 1: JSON output
if old_output.strip().startswith('{'):
try:
parsed = json.loads(old_output)
ar.output_json = parsed
ar.output_str = ''
except json.JSONDecodeError:
# Not valid JSON, treat as string
ar.output_str = old_output
# Case 2: File path or plain string
else:
ar.output_str = old_output
ar.save(update_fields=['output_str', 'output_json'])
# Case 1: JSON output
if old_output.strip().startswith('{'):
try:
# Validate it's actual JSON
parsed = json.loads(old_output)
# Update with JSON - cast to JSON to satisfy CHECK constraint
json_str = json.dumps(parsed)
cursor.execute("""
UPDATE core_archiveresult
SET output_str = '', output_json = json(?)
WHERE id = ?
""", (json_str, ar_id))
except json.JSONDecodeError:
# Not valid JSON, treat as string
cursor.execute("""
UPDATE core_archiveresult
SET output_str = ?, output_json = NULL
WHERE id = ?
""", (old_output, ar_id))
# Case 2: File path or plain string
else:
cursor.execute("""
UPDATE core_archiveresult
SET output_str = ?, output_json = NULL
WHERE id = ?
""", (old_output, ar_id))
def reverse_migrate(apps, schema_editor):

View File

@@ -16,43 +16,62 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
# Update Django's state only - database already has correct schema from 0029
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AlterField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
),
],
database_operations=[
# No database changes needed - columns already exist with correct types
],
),
migrations.AlterField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AlterField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
# Add unique constraint without table rebuild
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
),
],
database_operations=[
migrations.RunSQL(
sql="CREATE UNIQUE INDEX IF NOT EXISTS unique_timestamp ON core_snapshot (timestamp);",
reverse_sql="DROP INDEX IF EXISTS unique_timestamp;",
),
],
),
]

View File

@@ -10,20 +10,35 @@ class Migration(migrations.Migration):
]
operations = [
migrations.RenameField(
model_name='archiveresult',
old_name='extractor',
new_name='plugin',
),
migrations.AddField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(
blank=True,
default='',
max_length=255,
db_index=True,
help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
),
# Use SeparateDatabaseAndState to avoid table rebuilds that would re-add CHECK constraints
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.RenameField(
model_name='archiveresult',
old_name='extractor',
new_name='plugin',
),
migrations.AddField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(
blank=True,
default='',
max_length=255,
db_index=True,
help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_archiveresult RENAME COLUMN extractor TO plugin;
ALTER TABLE core_archiveresult ADD COLUMN hook_name VARCHAR(255) DEFAULT '' NOT NULL;
CREATE INDEX IF NOT EXISTS core_archiveresult_hook_name_idx ON core_archiveresult (hook_name);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

View File

@@ -11,13 +11,27 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AddField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(
default=0,
db_index=True,
help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
),
# Use SeparateDatabaseAndState to avoid table rebuild that would fail on config NOT NULL constraint
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(
default=0,
db_index=True,
help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_snapshot ADD COLUMN current_step SMALLINT UNSIGNED DEFAULT 0 NOT NULL;
CREATE INDEX IF NOT EXISTS core_snapshot_current_step_idx ON core_snapshot (current_step);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

View File

@@ -54,7 +54,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0034_snapshot_current_step'),
('crawls', '0004_alter_crawl_output_dir'),
('crawls', '0005_drop_seed_id_column'),
]
operations = [
@@ -64,16 +64,24 @@ class Migration(migrations.Migration):
reverse_code=migrations.RunPython.noop,
),
# Step 2: Make crawl non-nullable
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
),
# Step 3: Remove created_by field
migrations.RemoveField(
model_name='snapshot',
name='created_by',
# Step 2 & 3: Update Django's state only - leave created_by_id column in database (unused but harmless)
migrations.SeparateDatabaseAndState(
state_operations=[
# Make crawl non-nullable
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
),
# Remove created_by field from Django's state
migrations.RemoveField(
model_name='snapshot',
name='created_by',
),
],
database_operations=[
# No database changes - crawl_id already exists and NOT NULL constraint will be enforced by model
# created_by_id column remains in database but is unused
],
),
]

View File

@@ -10,10 +10,18 @@ class Migration(migrations.Migration):
]
operations = [
# Remove created_by field from ArchiveResult
# Remove created_by field from ArchiveResult (state only)
# No data migration needed - created_by can be accessed via snapshot.crawl.created_by
migrations.RemoveField(
model_name='archiveresult',
name='created_by',
# Leave created_by_id column in database (unused but harmless, avoids table rebuild)
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.RemoveField(
model_name='archiveresult',
name='created_by',
),
],
database_operations=[
# No database changes - leave created_by_id column in place to avoid table rebuild
],
),
]

View File

@@ -0,0 +1,44 @@
# Generated by Django 6.0 on 2025-12-29 06:45
from django.db import migrations, models
class Migration(migrations.Migration):
    """State-only cleanup: drop output_dir fields and relax config/tags fields."""

    dependencies = [
        ('core', '0036_remove_archiveresult_created_by'),
    ]

    operations = [
        # Update Django's state only - database columns remain for backwards compat
        migrations.SeparateDatabaseAndState(
            state_operations=[
                migrations.RemoveField(
                    model_name='archiveresult',
                    name='output_dir',
                ),
                migrations.RemoveField(
                    model_name='snapshot',
                    name='output_dir',
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='tags',
                    field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
                ),
            ],
            database_operations=[
                # No database changes - columns remain in place to avoid table rebuilds
            ],
        ),
    ]

View File

@@ -0,0 +1,84 @@
# Add missing columns to ArchiveResult and remove created_by_id from Snapshot
from django.db import migrations, models, connection
import django.utils.timezone
def add_columns_if_not_exist(apps, schema_editor):
    """Add missing ArchiveResult columns, skipping any that already exist.

    Uses raw ALTER TABLE ... ADD COLUMN statements (with defaults) so existing
    rows are backfilled without a full SQLite table rebuild. Idempotent: safe
    to run on databases that already have some or all of the columns.

    Args:
        apps: historical app registry (unused; raw SQL is used instead).
        schema_editor: active schema editor for the database being migrated.
    """
    # FIX: use schema_editor.connection instead of the global django.db
    # connection so the migration targets the correct database when run
    # against a non-default alias (multi-db / --database).
    with schema_editor.connection.cursor() as cursor:
        # Get existing columns
        cursor.execute("PRAGMA table_info(core_archiveresult)")
        existing_columns = {row[1] for row in cursor.fetchall()}

        # Add num_uses_failed if it doesn't exist
        if 'num_uses_failed' not in existing_columns:
            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_failed integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_failed >= 0)")
        # Add num_uses_succeeded if it doesn't exist
        if 'num_uses_succeeded' not in existing_columns:
            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_succeeded integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_succeeded >= 0)")
        # Add config if it doesn't exist
        if 'config' not in existing_columns:
            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN config text NULL")
        # Add retry_at (and its index) if it doesn't exist
        if 'retry_at' not in existing_columns:
            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN retry_at datetime NULL")
            cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
class Migration(migrations.Migration):
    """Add missing columns to ArchiveResult and drop created_by_id from Snapshot."""

    dependencies = [
        ('core', '0037_remove_archiveresult_output_dir_and_more'),
    ]

    operations = [
        # Add missing columns to ArchiveResult: state via AddField, database via
        # the idempotent add_columns_if_not_exist helper defined above.
        migrations.SeparateDatabaseAndState(
            state_operations=[
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
                ),
            ],
            database_operations=[
                migrations.RunPython(add_columns_if_not_exist, reverse_code=migrations.RunPython.noop),
            ],
        ),
        # Drop created_by_id from Snapshot (database only, already removed from model in 0035)
        migrations.SeparateDatabaseAndState(
            state_operations=[
                # No state changes - field already removed in 0035
            ],
            database_operations=[
                migrations.RunSQL(
                    sql="""
                        -- Drop index first, then column
                        DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149;
                        ALTER TABLE core_snapshot DROP COLUMN created_by_id;
                    """,
                    reverse_sql=migrations.RunSQL.noop,
                ),
            ],
        ),
    ]

View File

@@ -0,0 +1,30 @@
# Fix num_uses_failed and num_uses_succeeded string values to integers
from django.db import migrations
class Migration(migrations.Migration):
    """Repair counter columns that ended up holding string literals.

    Resets num_uses_failed / num_uses_succeeded / depth to 0 on any Snapshot
    row where the value is text (e.g. the column name itself was inserted as a
    literal by a broken earlier step).
    """

    dependencies = [
        ('core', '0038_fix_missing_columns'),
    ]

    operations = [
        # Fix string values that got inserted as literals instead of integers.
        # The typeof(...) = 'text' test already covers the literal-name case;
        # the explicit equality comparison is kept as a belt-and-braces guard.
        migrations.RunSQL(
            sql="""
                UPDATE core_snapshot
                SET num_uses_failed = 0
                WHERE typeof(num_uses_failed) = 'text' OR num_uses_failed = 'num_uses_failed';

                UPDATE core_snapshot
                SET num_uses_succeeded = 0
                WHERE typeof(num_uses_succeeded) = 'text' OR num_uses_succeeded = 'num_uses_succeeded';

                UPDATE core_snapshot
                SET depth = 0
                WHERE typeof(depth) = 'text' OR depth = 'depth';
            """,
            reverse_sql=migrations.RunSQL.noop,
        ),
    ]

View File

@@ -911,7 +911,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)
merged = 0
for dup in duplicates.iterator():
for dup in duplicates.iterator(chunk_size=500):
snapshots = list(
cls.objects
.filter(url=dup['url'], timestamp=dup['timestamp'])

File diff suppressed because it is too large Load Diff

View File

@@ -91,7 +91,11 @@ def plugin_thumbnail(context, result) -> str:
'output_path': output_path,
'plugin': plugin,
})
return mark_safe(tpl.render(ctx))
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception:
return ''
@@ -119,7 +123,11 @@ def plugin_embed(context, result) -> str:
'output_path': output_path,
'plugin': plugin,
})
return mark_safe(tpl.render(ctx))
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception:
return ''
@@ -147,7 +155,11 @@ def plugin_fullscreen(context, result) -> str:
'output_path': output_path,
'plugin': plugin,
})
return mark_safe(tpl.render(ctx))
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception:
return ''

View File

@@ -539,7 +539,7 @@ from django.http import JsonResponse
def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField