From 96ee1bf686fee908272b21ccce9c7b64b333cdd5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Dec 2025 09:57:33 -0800 Subject: [PATCH] more migration fixes --- archivebox/cli/archivebox_update.py | 94 ++++++++++++++----- .../core/migrations/0023_upgrade_to_0_9_0.py | 86 ++++++++++++----- .../migrations/0024_assign_default_crawl.py | 16 +++- 3 files changed, 144 insertions(+), 52 deletions(-) diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 01e5bfde..996f1820 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -38,6 +38,14 @@ def update(filter_patterns: Iterable[str] = (), from archivebox.core.models import Snapshot from django.utils import timezone + from django.core.management import call_command + + # Run migrations first to ensure DB schema is up-to-date + print('[*] Checking for pending migrations...') + try: + call_command('migrate', '--no-input', verbosity=0) + except Exception as e: + print(f'[!] Warning: Migration check failed: {e}') while True: if filter_patterns or before or after: @@ -136,9 +144,17 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di # Check if needs migration (0.8.x → 0.9.x) if snapshot.fs_migration_needed: - snapshot.save() # Triggers migration + creates symlink - stats['migrated'] += 1 - print(f" [{stats['processed']}] Migrated: {entry_path.name}") + try: + snapshot.save() # Triggers migration + creates symlink + stats['migrated'] += 1 + print(f" [{stats['processed']}] Migrated: {entry_path.name}") + except Exception as e: + # Snapshot already exists in DB with different crawl - skip it + if 'UNIQUE constraint failed' in str(e): + stats['skipped'] += 1 + print(f" [{stats['processed']}] Skipped (already in DB): {entry_path.name}") + else: + raise else: stats['skipped'] += 1 @@ -170,19 +186,33 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict: print(f'[*] Processing {total} snapshots from database (most recent first)...') # Process from most recent to least recent - for snapshot in Snapshot.objects.order_by('-bookmarked_at').iterator(chunk_size=batch_size): - # Reconcile index.json with DB - snapshot.reconcile_with_index_json() - - # Queue for archiving (state machine will handle it) - snapshot.status = Snapshot.StatusChoices.QUEUED - snapshot.retry_at = timezone.now() - snapshot.save() - - stats['reconciled'] += 1 - stats['queued'] += 1 + for snapshot in Snapshot.objects.select_related('crawl').order_by('-bookmarked_at').iterator(chunk_size=batch_size): stats['processed'] += 1 + # Skip snapshots with missing crawl references (orphaned by migration errors) + if not snapshot.crawl_id: + continue + + try: + # Reconcile index.json with DB + snapshot.reconcile_with_index_json() + + # Clean up invalid field values from old migrations + if not isinstance(snapshot.current_step, int): + snapshot.current_step = 0 + + # Queue for archiving (state machine will handle it) + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = timezone.now() + snapshot.save() + + stats['reconciled'] += 1 + stats['queued'] += 1 + except Exception as e: + # Skip snapshots that can't be processed (e.g., missing crawl) + print(f" [!] 
Skipping snapshot {snapshot.id}: {e}") + continue + if stats['processed'] % batch_size == 0: transaction.commit() print(f" [{stats['processed']}/{total}] Processed...") @@ -219,19 +249,33 @@ def process_filtered_snapshots( total = snapshots.count() print(f'[*] Found {total} matching snapshots') - for snapshot in snapshots.iterator(chunk_size=batch_size): - # Reconcile index.json with DB - snapshot.reconcile_with_index_json() - - # Queue for archiving - snapshot.status = Snapshot.StatusChoices.QUEUED - snapshot.retry_at = timezone.now() - snapshot.save() - - stats['reconciled'] += 1 - stats['queued'] += 1 + for snapshot in snapshots.select_related('crawl').iterator(chunk_size=batch_size): stats['processed'] += 1 + # Skip snapshots with missing crawl references + if not snapshot.crawl_id: + continue + + try: + # Reconcile index.json with DB + snapshot.reconcile_with_index_json() + + # Clean up invalid field values from old migrations + if not isinstance(snapshot.current_step, int): + snapshot.current_step = 0 + + # Queue for archiving + snapshot.status = Snapshot.StatusChoices.QUEUED + snapshot.retry_at = timezone.now() + snapshot.save() + + stats['reconciled'] += 1 + stats['queued'] += 1 + except Exception as e: + # Skip snapshots that can't be processed + print(f" [!] Skipping snapshot {snapshot.id}: {e}") + continue + if stats['processed'] % batch_size == 0: transaction.commit() print(f" [{stats['processed']}/{total}] Processed...") diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py index a652bc99..ca7e9b0b 100644 --- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py +++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py @@ -143,50 +143,50 @@ def upgrade_from_v072_or_v086(apps, schema_editor): url TEXT NOT NULL, timestamp TEXT NOT NULL, - tags TEXT, title TEXT, - crawl_id TEXT NOT NULL, + crawl_id TEXT, depth INTEGER NOT NULL DEFAULT 0, parent_snapshot_id TEXT, status VARCHAR(15) NOT NULL DEFAULT 'queued', retry_at DATETIME, - current_step VARCHAR(50) NOT NULL DEFAULT '', + current_step INTEGER NOT NULL DEFAULT 0, fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0', config TEXT, notes TEXT NOT NULL DEFAULT '', num_uses_succeeded INTEGER NOT NULL DEFAULT 0, - num_uses_failed INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0 - FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE, - FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL + -- Note: crawl_id foreign key will be added in 0024 after assigning crawl_ids + -- FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE, + -- FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL ) """) # Copy snapshot data if has_crawl_id: - # v0.8.6rc0 schema + # v0.8.6rc0 schema - already has created_at, modified_at, bookmarked_at cursor.execute(""" INSERT OR IGNORE INTO core_snapshot_new ( - id, created_at, modified_at, bookmarked_at, url, timestamp, - crawl_id, depth, status, retry_at, config + id, created_at, modified_at, bookmarked_at, downloaded_at, url, timestamp, + crawl_id, status, retry_at ) SELECT id, - COALESCE(added, CURRENT_TIMESTAMP), - COALESCE(updated, added, CURRENT_TIMESTAMP), - COALESCE(added, CURRENT_TIMESTAMP), + created_at, + modified_at, + bookmarked_at, + downloaded_at, url, timestamp, - crawl_id, COALESCE(depth, 0), + NULLIF(crawl_id, ''), COALESCE(status, 'queued'), - retry_at, - config + retry_at FROM core_snapshot """) else: - # v0.7.2 schema - will 
get crawl_id assigned by later migration
+        # v0.7.2 schema - will get crawl_id assigned by later migration (0024)
         cursor.execute("""
             INSERT OR IGNORE INTO core_snapshot_new (
                 id, created_at, modified_at, bookmarked_at, url, timestamp, crawl_id
@@ -197,7 +197,7 @@ def upgrade_from_v072_or_v086(apps, schema_editor):
                 COALESCE(updated, added, CURRENT_TIMESTAMP),
                 COALESCE(added, CURRENT_TIMESTAMP),
                 url, timestamp,
-                '' as crawl_id
+                NULL as crawl_id
             FROM core_snapshot
         """)
 
@@ -217,6 +217,14 @@ def upgrade_from_v072_or_v086(apps, schema_editor):
     # PART 3: Upgrade core_tag table
     # ============================================================================
 
+    # Check if tag id is INTEGER (v0.7.2) or TEXT (v0.8.6rc0)
+    cursor.execute("""
+        SELECT type FROM pragma_table_info('core_tag') WHERE name='id'
+    """)
+    tag_id_row = cursor.fetchone()  # cursor.rowcount is always -1 for SELECTs in sqlite3
+    tag_id_type = tag_id_row[0] if tag_id_row else 'INTEGER'
+    tag_id_is_int = 'INT' in tag_id_type.upper()
+
     cursor.execute("""
         CREATE TABLE IF NOT EXISTS core_tag_new (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -231,10 +239,26 @@ def upgrade_from_v072_or_v086(apps, schema_editor):
         )
     """)
 
-    cursor.execute("""
-        INSERT OR IGNORE INTO core_tag_new (id, name, slug)
-        SELECT id, name, slug FROM core_tag
-    """)
+    if tag_id_is_int:
+        # v0.7.2: Direct copy (INTEGER to INTEGER)
+        cursor.execute("""
+            INSERT OR IGNORE INTO core_tag_new (id, name, slug)
+            SELECT id, name, slug FROM core_tag
+        """)
+    else:
+        # v0.8.6rc0: Need to remap TEXT ids to new INTEGER ids
+        cursor.execute("SELECT id, name, slug FROM core_tag")
+        old_tags = cursor.fetchall()
+        tag_id_mapping = {}  # old_text_id -> new_int_id
+
+        for old_id, name, slug in old_tags:
+            cursor.execute("""
+                INSERT OR IGNORE INTO core_tag_new (name, slug)
+                VALUES (?, ?)
+            """, [name, slug])
+            cursor.execute("SELECT id FROM core_tag_new WHERE slug = ?", [slug])
+            new_id = cursor.fetchone()[0]
+            tag_id_mapping[old_id] = new_id
 
     cursor.execute("DROP TABLE IF EXISTS core_tag")
     cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag")
@@ -251,10 +275,25 @@ def upgrade_from_v072_or_v086(apps, schema_editor):
         )
     """)
 
-    cursor.execute("""
-        INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
-        SELECT snapshot_id, tag_id FROM core_snapshot_tags
-    """)
+    if tag_id_is_int:
+        # Direct copy for v0.7.2
+        cursor.execute("""
+            INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
+            SELECT snapshot_id, tag_id FROM core_snapshot_tags
+        """)
+    else:
+        # v0.8.6rc0: Use mapping to convert old TEXT ids to new INTEGER ids
+        cursor.execute("SELECT snapshot_id, tag_id FROM core_snapshot_tags")
+        m2m_entries = cursor.fetchall()
+        for snapshot_id, old_tag_id in m2m_entries:
+            new_tag_id = tag_id_mapping.get(old_tag_id)
+            if new_tag_id:
+                cursor.execute("""
+                    INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
+                    VALUES (?, ?)
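+                    -- NOTE: two old TEXT ids can collapse into one new INTEGER id
+                    -- (slug dedupe above), so OR IGNORE drops the duplicate pairs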
+ """, [snapshot_id, new_tag_id]) cursor.execute("DROP TABLE IF EXISTS core_snapshot_tags") cursor.execute("ALTER TABLE core_snapshot_tags_new RENAME TO core_snapshot_tags") diff --git a/archivebox/core/migrations/0024_assign_default_crawl.py b/archivebox/core/migrations/0024_assign_default_crawl.py index 5658f408..02cf2bdb 100644 --- a/archivebox/core/migrations/0024_assign_default_crawl.py +++ b/archivebox/core/migrations/0024_assign_default_crawl.py @@ -56,7 +56,8 @@ class Migration(migrations.Migration): dependencies = [ ('core', '0023_upgrade_to_0_9_0'), - ('crawls', '0001_initial'), + ('crawls', '0002_upgrade_to_0_9_0'), + ('machine', '0001_initial'), ('auth', '0012_alter_user_first_name_max_length'), ] @@ -99,7 +100,18 @@ class Migration(migrations.Migration): FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL ); - INSERT INTO core_snapshot_final SELECT * FROM core_snapshot; + INSERT INTO core_snapshot_final ( + id, created_at, modified_at, url, timestamp, bookmarked_at, + crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version, + config, notes, num_uses_succeeded, num_uses_failed, + status, retry_at, current_step + ) + SELECT + id, created_at, modified_at, url, timestamp, bookmarked_at, + crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version, + COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed, + status, retry_at, current_step + FROM core_snapshot; DROP TABLE core_snapshot; ALTER TABLE core_snapshot_final RENAME TO core_snapshot;