more migration id/uuid and config propagation fixes

2026-04-06 07:47:53 +10:00 · 2026-01-04 16:16:26 -08:00
parent 839ae744cf
commit 456aaee287
16 changed files with 789 additions and 94 deletions
--- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
+++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
@@ -15,6 +15,7 @@ def get_table_columns(table_name):

 def upgrade_core_tables(apps, schema_editor):
    """Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
+    from archivebox.uuid_compat import uuid7
    cursor = connection.cursor()

    # Check if core_archiveresult table exists
@@ -60,8 +61,8 @@ def upgrade_core_tables(apps, schema_editor):

    if has_data:
        if has_uuid and not has_abid:
-            # Migrating from v0.7.2 (has uuid, minimal fields)
-            print('Migrating ArchiveResult from v0.7.2 schema...')
+            # Migrating from v0.7.2+ (has uuid column)
+            print('Migrating ArchiveResult from v0.7.2+ schema (with uuid)...')
            cursor.execute("""
                INSERT OR IGNORE INTO core_archiveresult_new (
                    id, uuid, snapshot_id, cmd, pwd, cmd_version,
@@ -86,7 +87,18 @@ def upgrade_core_tables(apps, schema_editor):
                FROM core_archiveresult;
            """)
        else:
-            print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
+            # Migrating from v0.7.2 (no uuid or abid column - generate fresh UUIDs)
+            print('Migrating ArchiveResult from v0.7.2 schema (no uuid - generating UUIDs)...')
+            cursor.execute("SELECT id, snapshot_id, cmd, pwd, cmd_version, start_ts, end_ts, status, extractor, output FROM core_archiveresult")
+            old_records = cursor.fetchall()
+            for record in old_records:
+                new_uuid = uuid7().hex
+                cursor.execute("""
+                    INSERT OR IGNORE INTO core_archiveresult_new (
+                        id, uuid, snapshot_id, cmd, pwd, cmd_version,
+                        start_ts, end_ts, status, extractor, output
+                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """, (record[0], new_uuid, record[1], record[2], record[3], record[4], record[5], record[6], record[7], record[8], record[9]))

    cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
    cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
--- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
+++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
@@ -33,6 +33,7 @@ def copy_old_fields_to_new(apps, schema_editor):

    # NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
    # transformed by migration 0023, so we don't need to copy them here.
+    # NOTE: UUIDs are already populated by migration 0023 for all migration paths

    # Debug: Check Snapshot timestamps at end of RunPython
    cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
--- a/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py
+++ b/archivebox/core/migrations/0029_migrate_archiveresult_to_uuid_pk.py
@@ -8,12 +8,20 @@ from archivebox.uuid_compat import uuid7

 def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    """
-    Migrate ArchiveResult from integer PK to UUID PK.
+    Migrate ArchiveResult from integer PK to UUID PK (clean one-step migration).
+
+    Handles both migration paths:
+    - 0.7.x: ArchiveResult has integer id, NO uuid field → generate new UUIDs
+    - 0.8.x: ArchiveResult has integer id + optional uuid field → reuse existing UUIDs

    Strategy:
-    1. Add old_id field to store current integer IDs
-    2. Generate UUIDs for any records missing them
-    3. Swap id and uuid fields (uuid becomes PK, old integer id becomes old_id)
+    1. Create new table with UUID as primary key (no temporary columns)
+    2. Generate UUIDs for records missing them (0.7.x) or reuse existing (0.8.x)
+    3. Copy all data with UUID as new id
+    4. Drop old table, rename new table
+    5. Recreate indexes
+
+    Result: Clean schema with ONLY id as UUIDField (no old_id, no uuid)
    """
    cursor = connection.cursor()

@@ -26,11 +34,13 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
    row_count = cursor.fetchone()[0]

-    if row_count == 0:
-        print('No ArchiveResult records to migrate')
-        return
+    # Don't return early when the table is empty: it must still be recreated to drop the uuid column
+    # (fresh installs create the table with a uuid column in 0025, but the model has no uuid field after 0029)

-    print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')
+    if row_count == 0:
+        print('[0029] Recreating ArchiveResult table schema (integer→UUID PK, removing uuid column)...')
+    else:
+        print(f'[0029] Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')

    # Step 0: Check if machine_process table exists, if not NULL out process_id values
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
@@ -40,12 +50,10 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
        print('machine_process table does not exist yet, setting process_id to NULL')
        cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")

-    # Step 1: Create new table with UUID as primary key
+    # Step 1: Create new table with UUID as primary key (clean - no old_id or uuid columns)
    cursor.execute("""
        CREATE TABLE core_archiveresult_new (
            id TEXT PRIMARY KEY NOT NULL,
-            old_id INTEGER,
-            uuid TEXT UNIQUE,
            created_at DATETIME NOT NULL,
            modified_at DATETIME NOT NULL,

@@ -78,28 +86,36 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    """)

    # Step 2: Generate UUIDs for records that don't have them
-    cursor.execute("SELECT id, uuid FROM core_archiveresult")
-    records = cursor.fetchall()
+    # Check if uuid column exists (0.8.x has it, 0.7.x doesn't)
+    cursor.execute("PRAGMA table_info(core_archiveresult)")
+    columns = cursor.fetchall()
+    col_names = [col[1] for col in columns]
+    has_uuid_column = 'uuid' in col_names

-    id_to_uuid = {}
-    for old_id, existing_uuid in records:
-        if existing_uuid:
-            # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
-            # (existing UUIDs might be stored with or without dashes in old schema)
-            id_to_uuid[old_id] = UUID(existing_uuid).hex
-        else:
-            # Generate new UUIDv7 (time-ordered) as 32-char hex
-            id_to_uuid[old_id] = uuid7().hex
+    if has_uuid_column:
+        cursor.execute("SELECT id, uuid FROM core_archiveresult")
+        records = cursor.fetchall()
+        id_to_uuid = {}
+        for old_id, existing_uuid in records:
+            if existing_uuid:
+                # Normalize existing UUID to 32-char hex format (Django SQLite UUIDField format)
+                # (existing UUIDs might be stored with or without dashes in old schema)
+                id_to_uuid[old_id] = UUID(existing_uuid).hex
+            else:
+                # Generate new UUIDv7 (time-ordered) as 32-char hex
+                id_to_uuid[old_id] = uuid7().hex
+    else:
+        # 0.7.x path: no uuid column, generate new UUIDs for all records
+        cursor.execute("SELECT id FROM core_archiveresult")
+        records = cursor.fetchall()
+        id_to_uuid = {old_id: uuid7().hex for (old_id,) in records}

    # Step 3: Copy data with UUIDs as new primary key
    cursor.execute("SELECT * FROM core_archiveresult")
    old_records = cursor.fetchall()

-    # Get column names
-    cursor.execute("PRAGMA table_info(core_archiveresult)")
-    columns = cursor.fetchall()
-    col_names = [col[1] for col in columns]
-
+    # col_names already fetched in Step 2
+    inserted_count = 0
    for i, record in enumerate(old_records):
        old_id = record[col_names.index('id')]
        new_uuid = id_to_uuid[old_id]
@@ -107,7 +123,7 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
        # Build insert with new structure
        values = {col_names[i]: record[i] for i in range(len(col_names))}

-        # Check which fields exist in new table
+        # Fields to copy: every new-schema column except id (set from the UUID); the source's integer id and uuid are not copied
        fields_to_copy = [
            'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
            'status', 'retry_at', 'start_ts', 'end_ts',
@@ -115,17 +131,31 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
            'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
        ]

-        # Build INSERT statement
+        # Build INSERT statement (only copy fields that exist in source)
        existing_fields = [f for f in fields_to_copy if f in values]
-        placeholders = ', '.join(['?'] * (len(existing_fields) + 3))  # +3 for id, old_id, uuid
-        field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)

-        insert_values = [new_uuid, old_id, new_uuid] + [values.get(f) for f in existing_fields]
+        if i == 0:
+            print(f'[0029] Source columns: {col_names}')
+            print(f'[0029] Copying fields: {existing_fields}')

-        cursor.execute(
-            f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
-            insert_values
-        )
+        placeholders = ', '.join(['?'] * (len(existing_fields) + 1))  # +1 for id
+        field_list = 'id, ' + ', '.join(existing_fields)
+
+        insert_values = [new_uuid] + [values.get(f) for f in existing_fields]
+
+        try:
+            cursor.execute(
+                f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})",
+                insert_values
+            )
+            inserted_count += 1
+        except Exception as e:
+            print(f'[0029] ERROR inserting record {old_id}: {e}')
+            if i == 0:
+                print(f'[0029] First record values: {insert_values[:5]}...')
+                raise
+
+    print(f'[0029] Inserted {inserted_count}/{len(old_records)} records')

    # Step 4: Replace old table with new table
    cursor.execute("DROP TABLE core_archiveresult")
@@ -139,7 +169,6 @@ def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
    cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
    cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
-    cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")

    print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')

@@ -159,23 +188,17 @@ class Migration(migrations.Migration):
                ),
            ],
            state_operations=[
-                # Remove old uuid field
+                # Remove uuid field (was added in 0025, we're merging it into id)
                migrations.RemoveField(
                    model_name='archiveresult',
                    name='uuid',
                ),
-                # Change id from AutoField to UUIDField
+                # Change id from AutoField to UUIDField (absorbing the uuid field)
                migrations.AlterField(
                    model_name='archiveresult',
                    name='id',
                    field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
                ),
-                # Add old_id field to preserve legacy integer IDs
-                migrations.AddField(
-                    model_name='archiveresult',
-                    name='old_id',
-                    field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
-                ),
            ],
        ),
    ]
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1354,7 +1354,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
    def domain(self) -> str:
        return url_domain(self.url)

-    @cached_property
+    @property
    def output_dir(self):
        """The filesystem path to the snapshot's output directory."""
        import os
@@ -1435,8 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                print(f'[yellow]🔪 Killed {killed_count} process(es) for hook {process.pid}[/yellow]')

        # Clean up .pid files from output directory
-        if self.OUTPUT_DIR.exists():
-            for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
+        if Path(self.output_dir).exists():
+            for pid_file in Path(self.output_dir).glob('**/*.pid'):
                pid_file.unlink(missing_ok=True)

        # Update all STARTED ArchiveResults from filesystem
@@ -2263,9 +2263,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi

    # UUID primary key (migrated from integer in 0029)
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
-    # old_id preserves the legacy integer ID for backward compatibility
-    old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
-    # Note: uuid field was removed in migration 0029 when id became UUID
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

@@ -2494,7 +2491,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi

    @property
    def output_dir_parent(self) -> str:
-        return str(self.snapshot.OUTPUT_DIR.relative_to(CONSTANTS.DATA_DIR))
+        return str(Path(self.snapshot.output_dir).relative_to(CONSTANTS.DATA_DIR))

    # Properties that delegate to Process model (for backwards compatibility)
    # These properties will replace the direct fields after migration is complete