Add hand-written migrations: core 0025 (cleanup schema after raw SQL upgrades), core 0026 (final field adjustments), crawls 0002 (upgrade crawls_crawl to v0.9.0 schema)

This commit is contained in:
Nick Sweeting
2025-12-30 10:30:52 -08:00
parent 96ee1bf686
commit 91375d35a3
3 changed files with 546 additions and 0 deletions

View File

@@ -0,0 +1,380 @@
# Generated by hand on 2025-12-29
# Cleans up extra columns from raw SQL migrations and ensures schema matches models
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
import archivebox.base_models.models
def cleanup_extra_columns(apps, schema_editor):
    """
    Remove extra columns that were needed for v0.7.2/v0.8.6rc0 migration but don't exist in final models.
    The actual models use @property methods to access these values from the process FK.

    SQLite cannot drop columns portably across old versions, so the table is rebuilt:
    create final-schema table -> copy rows -> drop old table -> rename -> recreate indexes.
    No-op if the legacy ``cmd`` column is absent (fresh install / already cleaned up).
    """
    with schema_editor.connection.cursor() as cursor:
        # Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0)
        cursor.execute("SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='cmd'")
        has_cmd = cursor.fetchone()[0] > 0
        if has_cmd:
            print('    Cleaning up temporary columns from core_archiveresult...')
            # Rebuild table without the extra columns (cmd, pwd, cmd_version, extractor, output)
            cursor.execute("""
                CREATE TABLE core_archiveresult_final (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    uuid TEXT,
                    created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                    modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                    snapshot_id TEXT NOT NULL,
                    plugin VARCHAR(32) NOT NULL DEFAULT '',
                    hook_name VARCHAR(255) NOT NULL DEFAULT '',
                    start_ts DATETIME,
                    end_ts DATETIME,
                    status VARCHAR(15) NOT NULL DEFAULT 'queued',
                    retry_at DATETIME,
                    output_files TEXT NOT NULL DEFAULT '{}',
                    output_json TEXT,
                    output_str TEXT NOT NULL DEFAULT '',
                    output_size INTEGER NOT NULL DEFAULT 0,
                    output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
                    config TEXT,
                    notes TEXT NOT NULL DEFAULT '',
                    num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                    num_uses_failed INTEGER NOT NULL DEFAULT 0,
                    process_id TEXT,
                    FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
                    FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
                )
            """)
            # Copy data (cmd, pwd, etc. are now accessed via process FK)
            cursor.execute("""
                INSERT INTO core_archiveresult_final SELECT
                    id, uuid, created_at, modified_at,
                    snapshot_id, plugin, hook_name,
                    start_ts, end_ts, status, retry_at,
                    output_files, output_json, output_str, output_size, output_mimetypes,
                    config, notes, num_uses_succeeded, num_uses_failed,
                    process_id
                FROM core_archiveresult
            """)
            # Replace table
            cursor.execute("DROP TABLE core_archiveresult")
            cursor.execute("ALTER TABLE core_archiveresult_final RENAME TO core_archiveresult")
            # Recreate indexes
            cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
            cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
            cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
            cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
            cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
            # FIX: uuid is declared UUIDField(unique=True) in this migration's state_operations,
            # so the DB index must be UNIQUE to match (a plain index left the constraint unenforced).
            # SQLite unique indexes allow multiple NULLs, so rows without a uuid remain valid.
            cursor.execute("CREATE UNIQUE INDEX core_archiveresult_uuid_idx ON core_archiveresult(uuid)")
            # FIX: process is declared as a OneToOneField in state_operations, which implies a
            # DB-level unique constraint on process_id; enforce it here (NULLs are still allowed).
            cursor.execute("CREATE UNIQUE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
            print('    ✓ Cleaned up core_archiveresult schema')
class Migration(migrations.Migration):
    """
    Sync Django's migration state with the schema produced by earlier raw SQL migrations.

    database_operations: only the cleanup of temporary legacy columns (actual DDL).
    state_operations: declare every field/option change so the in-memory model state
    matches the models — no SQL is emitted for these.

    NOTE(review): the field definitions below must stay in exact agreement with both
    the raw SQL schema built in cleanup_extra_columns() and the current model classes;
    reordering or editing any operation changes the serialized migration state.
    """

    dependencies = [
        ('core', '0024_assign_default_crawl'),
        ('machine', '0001_initial'),
        ('crawls', '0002_upgrade_to_0_9_0'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.SeparateDatabaseAndState(
            database_operations=[
                migrations.RunPython(
                    cleanup_extra_columns,
                    reverse_code=migrations.RunPython.noop,
                ),
            ],
            state_operations=[
                # Tell Django about all the fields that exist after raw SQL migrations
                # ArchiveResult model options
                migrations.AlterModelOptions(
                    name='archiveresult',
                    options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
                ),
                # Remove old fields (values now reachable via the process FK @properties)
                migrations.RemoveField(model_name='archiveresult', name='cmd'),
                migrations.RemoveField(model_name='archiveresult', name='pwd'),
                migrations.RemoveField(model_name='archiveresult', name='cmd_version'),
                migrations.RemoveField(model_name='archiveresult', name='extractor'),
                migrations.RemoveField(model_name='archiveresult', name='output'),
                migrations.RemoveField(model_name='snapshot', name='added'),
                migrations.RemoveField(model_name='snapshot', name='updated'),
                # Add new ArchiveResult fields
                migrations.AddField(
                    model_name='archiveresult',
                    name='plugin',
                    field=models.CharField(blank=True, default='', max_length=32),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='hook_name',
                    field=models.CharField(blank=True, default='', max_length=255),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_str',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_json',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_files',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_size',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_mimetypes',
                    field=models.CharField(blank=True, default='', max_length=512),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='process',
                    field=models.OneToOneField(null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
                ),
                # Update Snapshot model
                migrations.AlterModelOptions(
                    name='snapshot',
                    options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='bookmarked_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='downloaded_at',
                    field=models.DateTimeField(blank=True, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='crawl',
                    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='depth',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='parent_snapshot',
                    field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='status',
                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='current_step',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='fs_version',
                    field=models.CharField(default='0.9.0', max_length=10),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='config',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),
                # Update Tag model
                migrations.AlterModelOptions(
                    name='tag',
                    options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
                ),
                migrations.AddField(
                    model_name='tag',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
                ),
                migrations.AddField(
                    model_name='tag',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='tag',
                    name='created_by',
                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
                ),
                # Alter field types
                migrations.AlterField(
                    model_name='archiveresult',
                    name='id',
                    field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='uuid',
                    field=models.UUIDField(blank=True, db_index=True, editable=False, null=True, unique=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='end_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='start_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='status',
                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=15),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='id',
                    field=models.CharField(editable=False, max_length=32, primary_key=True, serialize=False, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='timestamp',
                    field=models.CharField(db_index=True, max_length=32, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='url',
                    field=models.URLField(max_length=2048),
                ),
                migrations.AlterField(
                    model_name='tag',
                    name='slug',
                    field=models.SlugField(editable=False, max_length=100, unique=True),
                ),
                # Create M2M model for snapshot tags
                migrations.CreateModel(
                    name='SnapshotTag',
                    fields=[
                        ('id', models.AutoField(primary_key=True, serialize=False, verbose_name='ID')),
                        ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
                        ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
                    ],
                    options={
                        'db_table': 'core_snapshot_tags',
                    },
                ),
                migrations.AlterUniqueTogether(
                    name='snapshottag',
                    unique_together={('snapshot', 'tag')},
                ),
                # Update tags field on Snapshot to use the through model
                migrations.AlterField(
                    model_name='snapshot',
                    name='tags',
                    field=models.ManyToManyField(related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
                ),
                # Add constraints
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
                ),
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
                ),
            ],
        ),
    ]

View File

@@ -0,0 +1,76 @@
# Generated by hand on 2025-12-30
# Final field adjustments to match model definitions exactly
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
    """
    Final field adjustments so migration state matches the model definitions exactly
    (follows 0025_cleanup_schema).

    NOTE(review): these are plain AlterField operations (not wrapped in
    SeparateDatabaseAndState), so Django will emit DDL where the DB schema differs —
    presumably intentional; confirm against the raw SQL schema from 0025.
    """

    dependencies = [
        ('core', '0025_cleanup_schema'),
        ('crawls', '0002_upgrade_to_0_9_0'),
    ]

    operations = [
        # Alter Snapshot fields to match model exactly
        migrations.AlterField(
            model_name='snapshot',
            name='id',
            # uuid7 gives time-ordered UUID primary keys
            field=models.UUIDField(default=uuid7, editable=False, primary_key=True, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='url',
            # url uniqueness is now enforced per-crawl (unique_url_per_crawl), not globally
            field=models.URLField(db_index=True, unique=False),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='downloaded_at',
            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='parent_snapshot',
            field=models.ForeignKey(blank=True, db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='tags',
            field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
        ),
        # Alter SnapshotTag fields
        migrations.AlterField(
            model_name='snapshottag',
            name='id',
            field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
        ),
        migrations.AlterField(
            model_name='snapshottag',
            name='snapshot',
            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
        ),
        migrations.AlterField(
            model_name='snapshottag',
            name='tag',
            field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
        ),
    ]

View File

@@ -0,0 +1,90 @@
# Generated by hand on 2025-12-29
# Upgrades crawls_crawl table from v0.8.6rc0 to v0.9.0 schema
from django.db import migrations
def upgrade_crawl_schema_if_needed(apps, schema_editor):
    """
    Upgrade crawls_crawl table if it has the old v0.8.6rc0 schema (no urls column).
    """
    # Table rebuild strategy (SQLite can't add constraints in place):
    # create new-schema table -> copy rows -> drop old -> rename -> recreate indexes.
    with schema_editor.connection.cursor() as cursor:
        # Check if we need to upgrade (missing urls column means v0.8.6rc0)
        cursor.execute("""
            SELECT COUNT(*) FROM pragma_table_info('crawls_crawl') WHERE name='urls'
        """)
        has_urls = cursor.fetchone()[0] > 0
        if not has_urls:
            print('    Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0 schema...')
            # Create new table with v0.9.0 schema
            cursor.execute("""
                CREATE TABLE crawls_crawl_new (
                    id TEXT PRIMARY KEY NOT NULL,
                    created_at DATETIME NOT NULL,
                    modified_at DATETIME NOT NULL,
                    num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                    num_uses_failed INTEGER NOT NULL DEFAULT 0,
                    urls TEXT NOT NULL DEFAULT '[]',
                    config TEXT,
                    max_depth INTEGER NOT NULL DEFAULT 0,
                    tags_str VARCHAR(1024) NOT NULL DEFAULT '',
                    persona_id TEXT,
                    label VARCHAR(64) NOT NULL DEFAULT '',
                    notes TEXT NOT NULL DEFAULT '',
                    output_dir VARCHAR(512) NOT NULL DEFAULT '',
                    status VARCHAR(15) NOT NULL DEFAULT 'queued',
                    retry_at DATETIME,
                    created_by_id INTEGER NOT NULL,
                    schedule_id TEXT,
                    FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
                    FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
                )
            """)
            # Copy data from old table (v0.8.6rc0 schema).
            # New columns not listed here (persona_id, label, notes, output_dir) take their
            # SQL defaults / NULL.
            # NOTE(review): urls is back-filled as an empty JSON list '[]' — if v0.8.6rc0
            # stored seed URLs somewhere else, they are NOT ported here; confirm that is
            # intentional for existing crawls.
            cursor.execute("""
                INSERT INTO crawls_crawl_new (
                    id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
                    urls, config, max_depth, tags_str, status, retry_at, created_by_id, schedule_id
                )
                SELECT
                    id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
                    '[]' as urls, config, max_depth, tags_str, status, retry_at, created_by_id,
                    CAST(schedule_id AS TEXT)
                FROM crawls_crawl
            """)
            # Replace old table
            cursor.execute("DROP TABLE crawls_crawl")
            cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl")
            # Create indexes
            cursor.execute("CREATE INDEX crawls_crawl_status_idx ON crawls_crawl(status)")
            cursor.execute("CREATE INDEX crawls_crawl_retry_at_idx ON crawls_crawl(retry_at)")
            cursor.execute("CREATE INDEX crawls_crawl_created_at_idx ON crawls_crawl(created_at)")
            cursor.execute("CREATE INDEX crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id)")
            cursor.execute("CREATE INDEX crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id)")
            print('    ✓ Upgraded crawls_crawl to v0.9.0 schema')
        else:
            print('    ✓ crawls_crawl already has v0.9.0 schema')
class Migration(migrations.Migration):
    """
    crawls 0002: conditionally rebuild crawls_crawl to the v0.9.0 schema.
    Irreversible by design (reverse is a no-op); the conditional check makes
    re-running it safe on already-upgraded databases.
    """

    dependencies = [
        ('crawls', '0001_initial'),
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        migrations.RunPython(
            upgrade_crawl_schema_if_needed,
            reverse_code=migrations.RunPython.noop,
        ),
    ]