# Generated by hand on 2025-12-29
# Cleans up extra columns from raw SQL migrations and ensures schema matches models

from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
import archivebox.base_models.models


def cleanup_extra_columns(apps, schema_editor):
    """
    Remove extra columns that were needed for v0.7.2/v0.8.6rc0 migration but don't exist in final models.

    The final models expose these values via @property accessors backed by the
    ``process`` FK, so the legacy columns (cmd, pwd, cmd_version, extractor,
    output) can be dropped by rebuilding the table without them.

    No-op when the ``cmd`` column is absent (DB is already on the final schema).
    """
    with schema_editor.connection.cursor() as db:
        # The presence of a `cmd` column marks a DB that came from v0.7.2/v0.8.6rc0.
        db.execute("SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='cmd'")
        came_from_legacy_schema = db.fetchone()[0] > 0
        if not came_from_legacy_schema:
            return

        print(" Cleaning up temporary columns from core_archiveresult...")

        # SQLite can't reliably DROP COLUMN across versions, so rebuild the
        # table without the extra columns and swap it into place.
        db.execute("""
            CREATE TABLE core_archiveresult_final (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                uuid TEXT,
                created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,

                snapshot_id TEXT NOT NULL,
                plugin VARCHAR(32) NOT NULL DEFAULT '',
                hook_name VARCHAR(255) NOT NULL DEFAULT '',

                start_ts DATETIME,
                end_ts DATETIME,
                status VARCHAR(15) NOT NULL DEFAULT 'queued',
                retry_at DATETIME,

                output_files TEXT NOT NULL DEFAULT '{}',
                output_json TEXT,
                output_str TEXT NOT NULL DEFAULT '',
                output_size INTEGER NOT NULL DEFAULT 0,
                output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',

                config TEXT,
                notes TEXT NOT NULL DEFAULT '',
                num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                num_uses_failed INTEGER NOT NULL DEFAULT 0,

                process_id TEXT,

                FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
                FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
            )
        """)

        # Copy every surviving column; cmd, pwd, etc. are intentionally left
        # behind (they are now accessed via the process FK).
        db.execute("""
            INSERT INTO core_archiveresult_final SELECT
                id, uuid, created_at, modified_at,
                snapshot_id, plugin, hook_name,
                start_ts, end_ts, status, retry_at,
                output_files, output_json, output_str, output_size, output_mimetypes,
                config, notes, num_uses_succeeded, num_uses_failed,
                process_id
            FROM core_archiveresult
        """)

        # Swap the rebuilt table into place.
        db.execute("DROP TABLE core_archiveresult")
        db.execute("ALTER TABLE core_archiveresult_final RENAME TO core_archiveresult")

        # Indexes vanished with the old table; recreate them on the new one.
        for create_index_sql in (
            "CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)",
            "CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)",
            "CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)",
            "CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)",
            "CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)",
            "CREATE INDEX core_archiveresult_uuid_idx ON core_archiveresult(uuid)",
        ):
            db.execute(create_index_sql)

        print(" ✓ Cleaned up core_archiveresult schema")
class Migration(migrations.Migration):
    """
    Align Django's migration state with the schema produced by the raw SQL
    migrations, and drop the temporary legacy columns from core_archiveresult.

    All live-DB changes happen in ``cleanup_extra_columns`` (the
    ``database_operations`` half); the ``state_operations`` half only teaches
    Django's migration state about fields that already exist on disk.
    NOTE(review): operation order below is load-bearing (removes before adds,
    CreateModel before the through= AlterField) — do not reorder.
    """

    dependencies = [
        ('core', '0024_assign_default_crawl'),
        ('machine', '0001_initial'),  # machine.process FK target must exist
        ('crawls', '0002_upgrade_to_0_9_0'),  # crawls.crawl FK target must be on the v0.9.0 schema
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.SeparateDatabaseAndState(
            database_operations=[
                # Rebuild core_archiveresult without the legacy cmd/pwd/... columns.
                migrations.RunPython(
                    cleanup_extra_columns,
                    reverse_code=migrations.RunPython.noop,
                ),
            ],
            state_operations=[
                # Tell Django about all the fields that exist after raw SQL migrations
                # ArchiveResult model options
                migrations.AlterModelOptions(
                    name='archiveresult',
                    options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
                ),

                # Remove old fields (values now reachable via the process FK)
                migrations.RemoveField(model_name='archiveresult', name='cmd'),
                migrations.RemoveField(model_name='archiveresult', name='pwd'),
                migrations.RemoveField(model_name='archiveresult', name='cmd_version'),
                migrations.RemoveField(model_name='archiveresult', name='extractor'),
                migrations.RemoveField(model_name='archiveresult', name='output'),
                migrations.RemoveField(model_name='snapshot', name='added'),
                migrations.RemoveField(model_name='snapshot', name='updated'),

                # Add new ArchiveResult fields
                migrations.AddField(
                    model_name='archiveresult',
                    name='plugin',
                    field=models.CharField(blank=True, default='', max_length=32),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='hook_name',
                    field=models.CharField(blank=True, default='', max_length=255),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_str',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_json',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_files',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_size',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_mimetypes',
                    field=models.CharField(blank=True, default='', max_length=512),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='process',
                    field=models.OneToOneField(null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
                ),

                # Update Snapshot model
                migrations.AlterModelOptions(
                    name='snapshot',
                    options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='bookmarked_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='downloaded_at',
                    field=models.DateTimeField(blank=True, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='crawl',
                    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='depth',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='parent_snapshot',
                    field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='status',
                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='current_step',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='fs_version',
                    field=models.CharField(default='0.9.0', max_length=10),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='config',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),

                # Update Tag model
                migrations.AlterModelOptions(
                    name='tag',
                    options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
                ),
                migrations.AddField(
                    model_name='tag',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
                ),
                migrations.AddField(
                    model_name='tag',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='tag',
                    name='created_by',
                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
                ),

                # Alter field types
                migrations.AlterField(
                    model_name='archiveresult',
                    name='id',
                    field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='uuid',
                    field=models.UUIDField(blank=True, db_index=True, editable=False, null=True, unique=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='end_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='start_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='status',
                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=15),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='id',
                    field=models.CharField(editable=False, max_length=32, primary_key=True, serialize=False, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='timestamp',
                    field=models.CharField(db_index=True, max_length=32, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='url',
                    field=models.URLField(max_length=2048),
                ),
                migrations.AlterField(
                    model_name='tag',
                    name='slug',
                    field=models.SlugField(editable=False, max_length=100, unique=True),
                ),

                # Create M2M model for snapshot tags
                migrations.CreateModel(
                    name='SnapshotTag',
                    fields=[
                        ('id', models.AutoField(primary_key=True, serialize=False, verbose_name='ID')),
                        ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
                        ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
                    ],
                    options={
                        'db_table': 'core_snapshot_tags',
                    },
                ),
                migrations.AlterUniqueTogether(
                    name='snapshottag',
                    unique_together={('snapshot', 'tag')},
                ),

                # Update tags field on Snapshot to use the through model
                migrations.AlterField(
                    model_name='snapshot',
                    name='tags',
                    field=models.ManyToManyField(related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
                ),

                # Add constraints
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
                ),
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
                ),
            ],
        ),
    ]
# Generated by hand on 2025-12-30
# Final field adjustments to match model definitions exactly

from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7


class Migration(migrations.Migration):
    """
    Final AlterField pass so field definitions match the models exactly
    (help_text, editable, db_index/db_column tweaks, and the Snapshot pk/tags
    field types) after the 0025 cleanup.
    """

    dependencies = [
        ('core', '0025_cleanup_schema'),
        ('crawls', '0002_upgrade_to_0_9_0'),
    ]

    operations = [
        # Alter Snapshot fields to match model exactly
        migrations.AlterField(
            model_name='snapshot',
            name='id',
            field=models.UUIDField(default=uuid7, editable=False, primary_key=True, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='url',
            field=models.URLField(db_index=True, unique=False),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='downloaded_at',
            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='parent_snapshot',
            field=models.ForeignKey(blank=True, db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='tags',
            field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
        ),

        # Alter SnapshotTag fields
        migrations.AlterField(
            model_name='snapshottag',
            name='id',
            field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
        ),
        migrations.AlterField(
            model_name='snapshottag',
            name='snapshot',
            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
        ),
        migrations.AlterField(
            model_name='snapshottag',
            name='tag',
            field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
        ),
    ]
+ """ + with schema_editor.connection.cursor() as cursor: + # Check if we need to upgrade (missing urls column means v0.8.6rc0) + cursor.execute(""" + SELECT COUNT(*) FROM pragma_table_info('crawls_crawl') WHERE name='urls' + """) + has_urls = cursor.fetchone()[0] > 0 + + if not has_urls: + print(" Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0 schema...") + + # Create new table with v0.9.0 schema + cursor.execute(""" + CREATE TABLE crawls_crawl_new ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + urls TEXT NOT NULL DEFAULT '[]', + config TEXT, + max_depth INTEGER NOT NULL DEFAULT 0, + tags_str VARCHAR(1024) NOT NULL DEFAULT '', + persona_id TEXT, + label VARCHAR(64) NOT NULL DEFAULT '', + notes TEXT NOT NULL DEFAULT '', + output_dir VARCHAR(512) NOT NULL DEFAULT '', + + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + created_by_id INTEGER NOT NULL, + schedule_id TEXT, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE, + FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL + ) + """) + + # Copy data from old table (v0.8.6rc0 schema) + cursor.execute(""" + INSERT INTO crawls_crawl_new ( + id, created_at, modified_at, num_uses_succeeded, num_uses_failed, + urls, config, max_depth, tags_str, status, retry_at, created_by_id, schedule_id + ) + SELECT + id, created_at, modified_at, num_uses_succeeded, num_uses_failed, + '[]' as urls, config, max_depth, tags_str, status, retry_at, created_by_id, + CAST(schedule_id AS TEXT) + FROM crawls_crawl + """) + + # Replace old table + cursor.execute("DROP TABLE crawls_crawl") + cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl") + + # Create indexes + cursor.execute("CREATE INDEX crawls_crawl_status_idx ON crawls_crawl(status)") + cursor.execute("CREATE INDEX crawls_crawl_retry_at_idx ON 
class Migration(migrations.Migration):
    """
    Conditionally upgrade crawls_crawl from the v0.8.6rc0 schema to v0.9.0 via
    raw SQL (see upgrade_crawl_schema_if_needed). Irreversible: reverse is a noop.
    """

    dependencies = [
        ('crawls', '0001_initial'),
        # created_by_id FK references auth_user, so auth must be migrated first
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        migrations.RunPython(
            upgrade_crawl_schema_if_needed,
            reverse_code=migrations.RunPython.noop,
        ),
    ]