more migration id/uuid and config propagation fixes

This commit is contained in:
Nick Sweeting
2026-01-04 16:16:26 -08:00
parent 839ae744cf
commit 456aaee287
16 changed files with 789 additions and 94 deletions

View File

@@ -15,6 +15,7 @@ def get_table_columns(table_name):
def upgrade_core_tables(apps, schema_editor):
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
from archivebox.uuid_compat import uuid7
cursor = connection.cursor()
# Check if core_archiveresult table exists
@@ -60,8 +61,8 @@ def upgrade_core_tables(apps, schema_editor):
if has_data:
if has_uuid and not has_abid:
# Migrating from v0.7.2 (has uuid, minimal fields)
print('Migrating ArchiveResult from v0.7.2 schema...')
# Migrating from v0.7.2+ (has uuid column)
print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...')
cursor.execute("""
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, snapshot_id, cmd, pwd, cmd_version,
@@ -86,7 +87,18 @@ def upgrade_core_tables(apps, schema_editor):
FROM core_archiveresult;
""")
else:
print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
# Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...')
cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult")
old_records = cursor.fetchall()
for record in old_records:
new_uuid = uuid7().hex
cursor.execute("""
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, snapshot_id, cmd, pwd, cmd_version,
start_ts, end_ts, status, extractor, output
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9]))
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")

View File

@@ -33,6 +33,7 @@ def copy_old_fields_to_new(apps, schema_editor):
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
# transformed by migration 0023, so we don't need to copy them here.
# NOTE: UUIDs are already populated by migration 0023 for all migration paths
# Debug: Check Snapshot timestamps at end of RunPython
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")

View File

@@ -8,12 +8,20 @@ from archivebox.uuid_compat import uuid7
def migrate_archiveresult_id_to_uuid(apps, schema_editor):
"""
Migrate ArchiveResult from integer PK to UUID PK.
Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration).
Handles both migration paths:
- 0.7.x: ArchiveResult has integer id, NO uuid field → generate new UUIDs
- 0.8.x: ArchiveResult has integer id + optional uuid field → reuse existing UUIDs
Strategy:
1. Add old_id field to store current integer IDs
2. Generate UUIDs for any records missing them
3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
1. Create new table with UUID as primary key (no temporary columns)
2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x)
3. Copy all data with UUID as new id
4. Drop old table, rename new table
5. Recreate indexes
Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid)
"""
cursor = connection.cursor()
@@ -26,11 +34,13 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
row_count = cursor.fetchone()[0]
if row_count == 0:
print('No ArchiveResult records to migrate')
return
# Don't skip if table is empty - we still need to recreate to remove uuid column
# (fresh installs create table with uuid from 0025, but model expects no uuid after 0029)
print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
if row_count == 0:
print('[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...')
else:
print(f'[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
# Step 0: Check whether the machine_process table exists; if it does not, NULL out process_id values
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
@@ -40,12 +50,10 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
print('machine_process table does not exist yet, setting process_id to NULL')
cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")
# Step 1: Create new table with UUID as primary key
# Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns)
cursor.execute("""
CREATE TABLE core_archiveresult_new (
id TEXT PRIMARY KEY NOT NULL,
old_id INTEGER,
uuid TEXT UNIQUE,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
@@ -78,28 +86,36 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
""")
# Step 2: Generate UUIDs for records that don't have them
cursor.execute("SELECT id, uuid FROM core_archiveresult")
records = cursor.fetchall()
# Check if uuid column exists (0.8.x has it, 0.7.x doesn't)
cursor.execute("PRAGMA table_info(core_archiveresult)")
columns = cursor.fetchall()
col_names = [col[1] for col in columns]
has_uuid_column = 'uuid' in col_names
id_to_uuid = {}
for old_id, existing_uuid in records:
if existing_uuid:
# Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
# (existing UUIDs might be stored with or without dashes in old schema)
id_to_uuid[old_id] = UUID(existing_uuid).hex
else:
# Generate new UUIDv7 (time-ordered) as 32-char hex
id_to_uuid[old_id] = uuid7().hex
if has_uuid_column:
cursor.execute("SELECT id, uuid FROM core_archiveresult")
records = cursor.fetchall()
id_to_uuid = {}
for old_id, existing_uuid in records:
if existing_uuid:
# Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
# (existing UUIDs might be stored with or without dashes in old schema)
id_to_uuid[old_id] = UUID(existing_uuid).hex
else:
# Generate new UUIDv7 (time-ordered) as 32-char hex
id_to_uuid[old_id] = uuid7().hex
else:
# 0.7.x path: no uuid column, generate new UUIDs for all records
cursor.execute("SELECT id FROM core_archiveresult")
records = cursor.fetchall()
id_to_uuid = {old_id: uuid7().hex for (old_id,) in records}
# Step 3: Copy data with UUIDs as new primary key
cursor.execute("SELECT * FROM core_archiveresult")
old_records = cursor.fetchall()
# Get column names
cursor.execute("PRAGMA table_info(core_archiveresult)")
columns = cursor.fetchall()
col_names = [col[1] for col in columns]
# col_names already fetched in Step 2
inserted_count = 0
for i, record in enumerate(old_records):
old_id = record[col_names.index('id')]
new_uuid = id_to_uuid[old_id]
@@ -107,7 +123,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
# Build insert with new structure
values = {col_names[i]: record[i] for i in range(len(col_names))}
# Check which fields exist in new table
# List of fields to copy (all fields from the new schema except id, which is set to the new UUID)
fields_to_copy = [
'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
'status', 'retry_at', 'start_ts', 'end_ts',
@@ -115,17 +131,31 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
]
# Build INSERT statement
# Build INSERT statement (only copy fields that exist in source)
existing_fields = [f for f in fields_to_copy if f in values]
placeholders = ', '.join(['?'] * (len(existing_fields) + 3)) # +3 for id, old_id, uuid
field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
if i == 0:
print(f'[0029] Source columns: {col_names}')
print(f'[0029] Copying fields: {existing_fields}')
cursor.execute(
f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
insert_values
)
placeholders = ', '.join(['?'] * (len(existing_fields) + 1)) # +1 for id
field_list = 'id, ' + ', '.join(existing_fields)
insert_values = [new_uuid] + [values.get(f) for f in existing_fields]
try:
cursor.execute(
f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
insert_values
)
inserted_count += 1
except Exception as e:
print(f'[0029] ERROR inserting record {old_id}: {e}')
if i == 0:
print(f'[0029] First record values: {insert_values[:5]}...')
raise
print(f'[0029] Inserted {inserted_count}/{len(old_records)} records')
# Step 4: Replace old table with new table
cursor.execute("DROP TABLE core_archiveresult")
@@ -139,7 +169,6 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")
print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
@@ -159,23 +188,17 @@ class Migration(migrations.Migration):
),
],
state_operations=[
# Remove old uuid field
# Remove uuid field (was added in 0025, we're merging it into id)
migrations.RemoveField(
model_name='archiveresult',
name='uuid',
),
# Change id from AutoField to UUIDField
# Change id from AutoField to UUIDField (absorbing the uuid field)
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
),
# Add old_id field to preserve legacy integer IDs
migrations.AddField(
model_name='archiveresult',
name='old_id',
field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
),
],
),
]

View File

@@ -1354,7 +1354,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def domain(self) -> str:
return url_domain(self.url)
@cached_property
@property
def output_dir(self):
"""The filesystem path to the snapshot's output directory."""
import os
@@ -1435,8 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]')
# Clean up .pid files from output directory
if self.OUTPUT_DIR.exists():
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
if Path(self.output_dir).exists():
for pid_file in Path(self.output_dir).glob('**/*.pid'):
pid_file.unlink(missing_ok=True)
# Update all STARTED ArchiveResults from filesystem
@@ -2263,9 +2263,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# UUID primary key (migrated from integer in 0029)
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
# old_id preserves the legacy integer ID for backward compatibility
old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
# Note: uuid field was removed in migration 0029 when id became UUID
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -2494,7 +2491,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
@property
def output_dir_parent(self) -> str:
return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR))
# Properties that delegate to Process model (for backwards compatibility)
# These properties will replace the direct fields after migration is complete