Add hand-written migrations: core 0025 (cleanup schema after raw SQL upgrades), core 0026 (final field adjustments), crawls 0002 (upgrade crawls_crawl to v0.9.0 schema)

This commit is contained in:
Nick Sweeting
2025-12-30 10:30:52 -08:00
parent 96ee1bf686
commit 91375d35a3
3 changed files with 546 additions and 0 deletions

View File

@@ -0,0 +1,380 @@
# Generated by hand on 2025-12-29
# Cleans up extra columns from raw SQL migrations and ensures schema matches models
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
import archivebox.base_models.models
def cleanup_extra_columns(apps, schema_editor):
    """
    Remove extra columns that were needed for v0.7.2/v0.8.6rc0 migration but don't exist in final models.
    The actual models use @property methods to access these values from the process FK.

    SQLite cannot drop columns portably across old versions, so the table is rebuilt:
    create final-schema table -> copy rows -> drop old table -> rename -> recreate indexes.
    No-op if the legacy ``cmd`` column is absent (fresh install / already cleaned up).
    """
    with schema_editor.connection.cursor() as cursor:
        # Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0)
        cursor.execute("SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='cmd'")
        has_cmd = cursor.fetchone()[0] > 0
        if has_cmd:
            print('    Cleaning up temporary columns from core_archiveresult...')
            # Rebuild table without the extra columns (cmd, pwd, cmd_version, extractor, output)
            cursor.execute("""
                CREATE TABLE core_archiveresult_final (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    uuid TEXT,
                    created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                    modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                    snapshot_id TEXT NOT NULL,
                    plugin VARCHAR(32) NOT NULL DEFAULT '',
                    hook_name VARCHAR(255) NOT NULL DEFAULT '',
                    start_ts DATETIME,
                    end_ts DATETIME,
                    status VARCHAR(15) NOT NULL DEFAULT 'queued',
                    retry_at DATETIME,
                    output_files TEXT NOT NULL DEFAULT '{}',
                    output_json TEXT,
                    output_str TEXT NOT NULL DEFAULT '',
                    output_size INTEGER NOT NULL DEFAULT 0,
                    output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
                    config TEXT,
                    notes TEXT NOT NULL DEFAULT '',
                    num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                    num_uses_failed INTEGER NOT NULL DEFAULT 0,
                    process_id TEXT,
                    FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
                    FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
                )
            """)
            # Copy data (cmd, pwd, etc. are now accessed via process FK)
            cursor.execute("""
                INSERT INTO core_archiveresult_final SELECT
                    id, uuid, created_at, modified_at,
                    snapshot_id, plugin, hook_name,
                    start_ts, end_ts, status, retry_at,
                    output_files, output_json, output_str, output_size, output_mimetypes,
                    config, notes, num_uses_succeeded, num_uses_failed,
                    process_id
                FROM core_archiveresult
            """)
            # Replace table
            cursor.execute("DROP TABLE core_archiveresult")
            cursor.execute("ALTER TABLE core_archiveresult_final RENAME TO core_archiveresult")
            # Recreate indexes
            cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
            cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
            cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
            cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
            cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
            # FIX: uuid is declared UUIDField(unique=True) in this migration's state_operations,
            # so the DB index must be UNIQUE to match (a plain index left the constraint unenforced).
            # SQLite unique indexes allow multiple NULLs, so rows without a uuid remain valid.
            cursor.execute("CREATE UNIQUE INDEX core_archiveresult_uuid_idx ON core_archiveresult(uuid)")
            # FIX: process is declared as a OneToOneField in state_operations, which implies a
            # DB-level unique constraint on process_id; enforce it here (NULLs are still allowed).
            cursor.execute("CREATE UNIQUE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
            print('    ✓ Cleaned up core_archiveresult schema')
class Migration(migrations.Migration):
    """
    Sync Django's migration state with the schema produced by earlier raw SQL migrations.

    database_operations: only the cleanup of temporary legacy columns (actual DDL).
    state_operations: declare every field/option change so the in-memory model state
    matches the models — no SQL is emitted for these.

    NOTE(review): the field definitions below must stay in exact agreement with both
    the raw SQL schema built in cleanup_extra_columns() and the current model classes;
    reordering or editing any operation changes the serialized migration state.
    """

    dependencies = [
        ('core', '0024_assign_default_crawl'),
        ('machine', '0001_initial'),
        ('crawls', '0002_upgrade_to_0_9_0'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.SeparateDatabaseAndState(
            database_operations=[
                migrations.RunPython(
                    cleanup_extra_columns,
                    reverse_code=migrations.RunPython.noop,
                ),
            ],
            state_operations=[
                # Tell Django about all the fields that exist after raw SQL migrations
                # ArchiveResult model options
                migrations.AlterModelOptions(
                    name='archiveresult',
                    options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
                ),
                # Remove old fields (values now reachable via the process FK @properties)
                migrations.RemoveField(model_name='archiveresult', name='cmd'),
                migrations.RemoveField(model_name='archiveresult', name='pwd'),
                migrations.RemoveField(model_name='archiveresult', name='cmd_version'),
                migrations.RemoveField(model_name='archiveresult', name='extractor'),
                migrations.RemoveField(model_name='archiveresult', name='output'),
                migrations.RemoveField(model_name='snapshot', name='added'),
                migrations.RemoveField(model_name='snapshot', name='updated'),
                # Add new ArchiveResult fields
                migrations.AddField(
                    model_name='archiveresult',
                    name='plugin',
                    field=models.CharField(blank=True, default='', max_length=32),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='hook_name',
                    field=models.CharField(blank=True, default='', max_length=255),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_str',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_json',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_files',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_size',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_mimetypes',
                    field=models.CharField(blank=True, default='', max_length=512),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='process',
                    field=models.OneToOneField(null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
                ),
                # Update Snapshot model
                migrations.AlterModelOptions(
                    name='snapshot',
                    options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='bookmarked_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='downloaded_at',
                    field=models.DateTimeField(blank=True, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='crawl',
                    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='depth',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='parent_snapshot',
                    field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='status',
                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='current_step',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='fs_version',
                    field=models.CharField(default='0.9.0', max_length=10),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='config',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),
                # Update Tag model
                migrations.AlterModelOptions(
                    name='tag',
                    options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
                ),
                migrations.AddField(
                    model_name='tag',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
                ),
                migrations.AddField(
                    model_name='tag',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='tag',
                    name='created_by',
                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
                ),
                # Alter field types
                migrations.AlterField(
                    model_name='archiveresult',
                    name='id',
                    field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='uuid',
                    field=models.UUIDField(blank=True, db_index=True, editable=False, null=True, unique=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='end_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='start_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='status',
                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=15),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='id',
                    field=models.CharField(editable=False, max_length=32, primary_key=True, serialize=False, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='timestamp',
                    field=models.CharField(db_index=True, max_length=32, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='url',
                    field=models.URLField(max_length=2048),
                ),
                migrations.AlterField(
                    model_name='tag',
                    name='slug',
                    field=models.SlugField(editable=False, max_length=100, unique=True),
                ),
                # Create M2M model for snapshot tags
                migrations.CreateModel(
                    name='SnapshotTag',
                    fields=[
                        ('id', models.AutoField(primary_key=True, serialize=False, verbose_name='ID')),
                        ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
                        ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
                    ],
                    options={
                        'db_table': 'core_snapshot_tags',
                    },
                ),
                migrations.AlterUniqueTogether(
                    name='snapshottag',
                    unique_together={('snapshot', 'tag')},
                ),
                # Update tags field on Snapshot to use the through model
                migrations.AlterField(
                    model_name='snapshot',
                    name='tags',
                    field=models.ManyToManyField(related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
                ),
                # Add constraints
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
                ),
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
                ),
            ],
        ),
    ]

View File

@@ -0,0 +1,76 @@
# Generated by hand on 2025-12-30
# Final field adjustments to match model definitions exactly
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
    """
    Final field adjustments so migration state matches the model definitions exactly
    (follows 0025_cleanup_schema).

    NOTE(review): these are plain AlterField operations (not wrapped in
    SeparateDatabaseAndState), so Django will emit DDL where the DB schema differs —
    presumably intentional; confirm against the raw SQL schema from 0025.
    """

    dependencies = [
        ('core', '0025_cleanup_schema'),
        ('crawls', '0002_upgrade_to_0_9_0'),
    ]

    operations = [
        # Alter Snapshot fields to match model exactly
        migrations.AlterField(
            model_name='snapshot',
            name='id',
            # uuid7 gives time-ordered UUID primary keys
            field=models.UUIDField(default=uuid7, editable=False, primary_key=True, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='url',
            # url uniqueness is now enforced per-crawl (unique_url_per_crawl), not globally
            field=models.URLField(db_index=True, unique=False),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='downloaded_at',
            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='parent_snapshot',
            field=models.ForeignKey(blank=True, db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='tags',
            field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
        ),
        # Alter SnapshotTag fields
        migrations.AlterField(
            model_name='snapshottag',
            name='id',
            field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
        ),
        migrations.AlterField(
            model_name='snapshottag',
            name='snapshot',
            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
        ),
        migrations.AlterField(
            model_name='snapshottag',
            name='tag',
            field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
        ),
    ]

View File

@@ -0,0 +1,90 @@
# Generated by hand on 2025-12-29
# Upgrades crawls_crawl table from v0.8.6rc0 to v0.9.0 schema
from django.db import migrations
def upgrade_crawl_schema_if_needed(apps, schema_editor):
    """
    Upgrade crawls_crawl table if it has the old v0.8.6rc0 schema (no urls column).
    """
    # Table rebuild strategy (SQLite can't add constraints in place):
    # create new-schema table -> copy rows -> drop old -> rename -> recreate indexes.
    with schema_editor.connection.cursor() as cursor:
        # Check if we need to upgrade (missing urls column means v0.8.6rc0)
        cursor.execute("""
            SELECT COUNT(*) FROM pragma_table_info('crawls_crawl') WHERE name='urls'
        """)
        has_urls = cursor.fetchone()[0] > 0
        if not has_urls:
            print('    Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0 schema...')
            # Create new table with v0.9.0 schema
            cursor.execute("""
                CREATE TABLE crawls_crawl_new (
                    id TEXT PRIMARY KEY NOT NULL,
                    created_at DATETIME NOT NULL,
                    modified_at DATETIME NOT NULL,
                    num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                    num_uses_failed INTEGER NOT NULL DEFAULT 0,
                    urls TEXT NOT NULL DEFAULT '[]',
                    config TEXT,
                    max_depth INTEGER NOT NULL DEFAULT 0,
                    tags_str VARCHAR(1024) NOT NULL DEFAULT '',
                    persona_id TEXT,
                    label VARCHAR(64) NOT NULL DEFAULT '',
                    notes TEXT NOT NULL DEFAULT '',
                    output_dir VARCHAR(512) NOT NULL DEFAULT '',
                    status VARCHAR(15) NOT NULL DEFAULT 'queued',
                    retry_at DATETIME,
                    created_by_id INTEGER NOT NULL,
                    schedule_id TEXT,
                    FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
                    FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
                )
            """)
            # Copy data from old table (v0.8.6rc0 schema).
            # New columns not listed here (persona_id, label, notes, output_dir) take their
            # SQL defaults / NULL.
            # NOTE(review): urls is back-filled as an empty JSON list '[]' — if v0.8.6rc0
            # stored seed URLs somewhere else, they are NOT ported here; confirm that is
            # intentional for existing crawls.
            cursor.execute("""
                INSERT INTO crawls_crawl_new (
                    id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
                    urls, config, max_depth, tags_str, status, retry_at, created_by_id, schedule_id
                )
                SELECT
                    id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
                    '[]' as urls, config, max_depth, tags_str, status, retry_at, created_by_id,
                    CAST(schedule_id AS TEXT)
                FROM crawls_crawl
            """)
            # Replace old table
            cursor.execute("DROP TABLE crawls_crawl")
            cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl")
            # Create indexes
            cursor.execute("CREATE INDEX crawls_crawl_status_idx ON crawls_crawl(status)")
            cursor.execute("CREATE INDEX crawls_crawl_retry_at_idx ON crawls_crawl(retry_at)")
            cursor.execute("CREATE INDEX crawls_crawl_created_at_idx ON crawls_crawl(created_at)")
            cursor.execute("CREATE INDEX crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id)")
            cursor.execute("CREATE INDEX crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id)")
            print('    ✓ Upgraded crawls_crawl to v0.9.0 schema')
        else:
            print('    ✓ crawls_crawl already has v0.9.0 schema')
class Migration(migrations.Migration):
    """
    crawls 0002: conditionally rebuild crawls_crawl to the v0.9.0 schema.
    Irreversible by design (reverse is a no-op); the conditional check makes
    re-running it safe on already-upgraded databases.
    """

    dependencies = [
        ('crawls', '0001_initial'),
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        migrations.RunPython(
            upgrade_crawl_schema_if_needed,
            reverse_code=migrations.RunPython.noop,
        ),
    ]