From f7457b13adf5be71e4def0fe73fb20c400d8ff80 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 31 Dec 2025 17:44:55 -0800
Subject: [PATCH] more migrations fixes attempts

---
 TODO_fix_migration_path.md                    | 427 ++++++++++++++++++
 archivebox/base_models/models.py              |  29 +-
 .../core/migrations/0023_upgrade_to_0_9_0.py  |  62 +--
 ...options_alter_snapshot_options_and_more.py |  49 +-
 .../migrations/0002_upgrade_from_0_8_6.py     |  99 ++++
 5 files changed, 593 insertions(+), 73 deletions(-)
 create mode 100644 TODO_fix_migration_path.md
 create mode 100644 archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py

diff --git a/TODO_fix_migration_path.md b/TODO_fix_migration_path.md
new file mode 100644
index 00000000..4bd25e5e
--- /dev/null
+++ b/TODO_fix_migration_path.md
@@ -0,0 +1,427 @@
+# TODO: Fix Migration Path for v0.7.2/v0.8.6rc0 → v0.9.0
+
+## Critical Issue
+
+The migrations currently **LOSE DATA** during the v0.7.2 → v0.9.0 upgrade:
+- `extractor` field data is not being copied to `plugin` field
+- `output` field data is not being copied to `output_str` field
+- Timestamp fields (`added`, `updated`) may not be properly transformed
+- Tag UUID → INTEGER conversion may lose FK relationships
+
+## Test Database Locations
+
+Sample databases for testing are available at:
+```
+/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data/index.sqlite3
+/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.8.6rc0/data/index.sqlite3
+```
+
+Schema comparison reports:
+```
+/tmp/schema_comparison_report.md
+/tmp/table_presence_matrix.md
+```
+
+## How to Test Migrations
+
+### 1. Fresh Install Test
+```bash
+rm -rf /tmp/test_fresh && mkdir -p /tmp/test_fresh
+DATA_DIR=/tmp/test_fresh python -m archivebox init
+DATA_DIR=/tmp/test_fresh python -m archivebox status
+```
+
+### 2. v0.7.2 Migration Test
+```bash
+rm -rf /tmp/test_v072 && mkdir -p /tmp/test_v072
+cp /Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data/index.sqlite3 /tmp/test_v072/
+DATA_DIR=/tmp/test_v072 python -m archivebox init
+DATA_DIR=/tmp/test_v072 python -m archivebox status
+```
+
+### 3. v0.8.6rc0 Migration Test
+```bash
+rm -rf /tmp/test_v086 && mkdir -p /tmp/test_v086
+cp /Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.8.6rc0/data/index.sqlite3 /tmp/test_v086/
+DATA_DIR=/tmp/test_v086 python -m archivebox init
+DATA_DIR=/tmp/test_v086 python -m archivebox status
+```
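+
+The two upgrade recipes above can be scripted in one go; a minimal sketch
+(assuming only the sample database paths listed above - the `test_upgrade`
+helper name is illustrative, not existing tooling):
+
+```python
+import os, shutil, subprocess
+from pathlib import Path
+
+SAMPLES = Path('/Users/squash/Local/Code/archiveboxes/archivebox-migration-path')
+
+def test_upgrade(name: str, sample: str) -> None:
+    # recreate a scratch data dir seeded with the old index.sqlite3
+    data_dir = Path(f'/tmp/test_{name}')
+    shutil.rmtree(data_dir, ignore_errors=True)
+    data_dir.mkdir(parents=True)
+    shutil.copy(SAMPLES / sample / 'data' / 'index.sqlite3', data_dir)
+    # run init (applies migrations) then status, exactly like the recipes above
+    env = {**os.environ, 'DATA_DIR': str(data_dir)}
+    for cmd in ('init', 'status'):
+        subprocess.run(['python', '-m', 'archivebox', cmd], env=env, check=True)
+
+test_upgrade('v072', 'archivebox-v0.7.2')
+test_upgrade('v086', 'archivebox-v0.8.6rc0')
+```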
+### 4. Verify Data Integrity
+
+After each test, compare the original vs the migrated data:
+
+```bash
+# Check ArchiveResult data preservation
+echo "=== ORIGINAL ==="
+sqlite3 /path/to/original.db "SELECT id, extractor, output, status FROM core_archiveresult LIMIT 5;"
+
+echo "=== MIGRATED ==="
+sqlite3 /tmp/test_vXXX/index.sqlite3 "SELECT id, plugin, output_str, status FROM core_archiveresult LIMIT 5;"
+
+# Check Snapshot data preservation
+echo "=== ORIGINAL SNAPSHOTS ==="
+sqlite3 /path/to/original.db "SELECT id, url, title, added, updated FROM core_snapshot LIMIT 5;"
+
+echo "=== MIGRATED SNAPSHOTS ==="
+sqlite3 /tmp/test_vXXX/index.sqlite3 "SELECT id, url, title, bookmarked_at, created_at, modified_at FROM core_snapshot LIMIT 5;"
+
+# Check Tag data preservation
+echo "=== ORIGINAL TAGS ==="
+sqlite3 /path/to/original.db "SELECT * FROM core_tag;"
+
+echo "=== MIGRATED TAGS ==="
+sqlite3 /tmp/test_vXXX/index.sqlite3 "SELECT * FROM core_tag;"
+
+# Check snapshot-tag relationships
+sqlite3 /tmp/test_vXXX/index.sqlite3 "SELECT COUNT(*) FROM core_snapshot_tags;"
+```
+
+**CRITICAL**: Verify that:
+- Row counts match
+- All URLs, titles, and timestamps are preserved
+- All `extractor` values are copied to the `plugin` field
+- All `output` values are copied to the `output_str` field
+- All tag relationships are maintained (for v0.8.6, tag IDs must be converted from UUID to INTEGER)
+
+## Migration Philosophy
+
+### Principle: Minimal Manual SQL
+
+Use this approach for complex migrations:
+
+1. **Python**: Detect the existing schema version
+   ```python
+   def get_table_columns(table_name):
+       cursor = connection.cursor()
+       cursor.execute(f"PRAGMA table_info({table_name})")
+       return {row[1] for row in cursor.fetchall()}
+
+   cols = get_table_columns('core_archiveresult')
+   has_extractor = 'extractor' in cols
+   has_plugin = 'plugin' in cols
+   ```
+
+2. **SQL**: Modify the database structure during the migration
+   ```sql
+   CREATE TABLE core_archiveresult_new (...);
+   INSERT INTO core_archiveresult_new SELECT ... FROM core_archiveresult;
+   DROP TABLE core_archiveresult;
+   ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;
+   ```
+
+3. **Python**: Copy data between old and new field names
+   ```python
+   if 'extractor' in cols and 'plugin' in cols:
+       cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '')")
+   ```
+
+4. **SQL**: Drop old columns/tables
+   ```sql
+   -- Django's RemoveField (in migration 0025) will handle this
+   ```
+
+5. **Django**: Register the end state so Django knows what the schema should be
+   ```python
+   migrations.SeparateDatabaseAndState(
+       database_operations=[...],  # Your SQL/Python migrations
+       state_operations=[...],     # Tell Django what the final schema looks like
+   )
+   ```
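+
+Putting steps 1-4 together, the general shape of a schema-detecting table
+rebuild looks roughly like this (an illustrative sketch with a reduced column
+list; the real logic lives in `core/migrations/0023_upgrade_to_0_9_0.py` and
+`0025_...py`):
+
+```python
+from django.db import connection
+
+def rebuild_archiveresult_table(apps, schema_editor):
+    cursor = connection.cursor()
+
+    # 1. Python: detect which schema generation we are looking at
+    cursor.execute("PRAGMA table_info(core_archiveresult)")
+    cols = {row[1] for row in cursor.fetchall()}
+    if 'extractor' not in cols:
+        return  # already upgraded, nothing to do
+
+    # 2. SQL: rebuild the table, keeping the OLD column names
+    cursor.execute("""
+        CREATE TABLE core_archiveresult_new (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            extractor VARCHAR(32),
+            output VARCHAR(1024),
+            status VARCHAR(15) NOT NULL DEFAULT 'queued'
+        );
+    """)
+    cursor.execute("""
+        INSERT INTO core_archiveresult_new (id, extractor, output, status)
+        SELECT id, extractor, output, status FROM core_archiveresult;
+    """)
+    cursor.execute("DROP TABLE core_archiveresult;")
+    cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
+
+    # 3. happens later, in 0025, AFTER AddField has created the new columns:
+    # cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '')")
+```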
+### Key Files
+
+- **core/migrations/0023_upgrade_to_0_9_0.py**: Raw SQL migration that upgrades tables from the v0.7.2/v0.8.6 schema
+  - Should create NEW tables with the OLD field names (extractor, output, added, updated)
+  - Should preserve ALL data during the table rebuild
+  - Should NOT add new fields yet (let the Django migrations handle that)
+
+- **core/migrations/0025_alter_archiveresult_options_...py**: Django-generated migration
+  - Adds the new fields (plugin, output_str, bookmarked_at, created_at, etc.)
+  - Should include a RunPython operation that copies data from old fields to new fields AFTER the AddField operations
+  - Should include RemoveField operations to remove the old columns
+
+- **crawls/migrations/0002_upgrade_from_0_8_6.py**: Handles the crawls_crawl table upgrade
+  - v0.8.6 has `seed_id` + `persona` (VARCHAR)
+  - v0.9.0 has `urls` + `persona_id` (UUID FK)
+
+## How to Make vs Apply Migrations
+
+### Making Migrations (Creating New Migrations)
+
+**Always run from the archivebox/ subdirectory** (NOT from a data dir):
+
+```bash
+cd archivebox/
+./manage.py makemigrations
+./manage.py makemigrations --check  # Verify no unreflected changes
+```
+
+This works because `archivebox/manage.py` has:
+```python
+os.environ.setdefault('ARCHIVEBOX_DATA_DIR', '.')
+```
+
+### Applying Migrations (Testing Migrations)
+
+**Always run from inside a data directory** using `archivebox init`:
+
+```bash
+# WRONG - Don't do this:
+cd /some/data/dir
+../path/to/archivebox/manage.py migrate
+
+# RIGHT - Do this:
+DATA_DIR=/some/data/dir python -m archivebox init
+```
+
+Why? Because `archivebox init`:
+- Sets up the data directory structure
+- Runs migrations with the proper DATA_DIR context
+- Creates the necessary files and folders
+- Validates the installation
+
+## Schema Version Differences
+
+### v0.7.2 Schema (Migration 0022)
+- **ArchiveResult**: `id` (INTEGER), `uuid`, `extractor`, `output`, `cmd`, `pwd`, `cmd_version`, `start_ts`, `end_ts`, `status`, `snapshot_id`
+- **Snapshot**: `id`, `url`, `timestamp`, `title`, `added`, `updated`, `crawl_id`
+- **Tag**: `id` (INTEGER), `name`, `slug`
+- **Crawl**: Doesn't exist in v0.7.2
+
+### v0.8.6rc0 Schema
+- **ArchiveResult**: `id`, `abid` (not uuid!), `extractor`, `output`, `created_at`, `modified_at`, `retry_at`, `status`, ...
+- **Snapshot**: `id`, `url`, `bookmarked_at`, `created_at`, `modified_at`, `crawl_id`, `status`, `retry_at`, ...
+- **Tag**: `id` (UUID/CHAR!), `name`, `slug`, `abid`, `created_at`, `modified_at`, `created_by_id`
+- **Crawl**: `id`, `seed_id`, `persona` (VARCHAR), `max_depth`, `tags_str`, `status`, `retry_at`, ...
+
+### v0.9.0 Target Schema
+- **ArchiveResult**: `id` (INTEGER), `uuid`, `plugin` (not extractor!), `output_str` (not output!), `hook_name`, `created_at`, `modified_at`, `output_files`, `output_json`, `output_size`, `output_mimetypes`, `retry_at`, ...
+- **Snapshot**: `id`, `url`, `bookmarked_at` (not added!), `created_at`, `modified_at` (not updated!), `crawl_id`, `parent_snapshot_id`, `status`, `retry_at`, `current_step`, `depth`, `fs_version`, ...
+- **Tag**: `id` (INTEGER!), `name`, `slug`, `created_at`, `modified_at`, `created_by_id`
+- **Crawl**: `id`, `urls` (not seed_id!), `persona_id` (not persona!), `label`, `notes`, `output_dir`, ...
+
+## Critical Gotchas and Mistakes to Avoid
+
+### 1. ❌ DON'T Create New Fields in SQL Migration (0023)
+
+**WRONG**:
+```python
+# In core/migrations/0023_upgrade_to_0_9_0.py
+cursor.execute("""
+    CREATE TABLE core_archiveresult_new (
+        id INTEGER PRIMARY KEY,
+        plugin VARCHAR(32),   -- ❌ New field!
+        output_str TEXT,      -- ❌ New field!
+        ...
+    )
+""")
+```
+
+**RIGHT**:
+```python
+# In core/migrations/0023_upgrade_to_0_9_0.py - keep the OLD field names!
+cursor.execute("""
+    CREATE TABLE core_archiveresult_new (
+        id INTEGER PRIMARY KEY,
+        extractor VARCHAR(32),  -- ✓ OLD field name
+        output VARCHAR(1024),   -- ✓ OLD field name
+        ...
+    )
+""")
+```
+
+**Why**: If you create the new fields in SQL, Django's AddField operations in migration 0025 will overwrite them with default values, losing your data!
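+
+The column checks used throughout these migrations can be collected into one
+helper; an illustrative sketch (the `detect_schema_version` name is an
+assumption, not code that exists in the repo - it just encodes the schema
+differences listed above):
+
+```python
+from django.db import connection
+
+def get_table_columns(table_name):
+    cursor = connection.cursor()
+    cursor.execute(f"PRAGMA table_info({table_name})")
+    return {row[1] for row in cursor.fetchall()}
+
+def detect_schema_version():
+    cols = get_table_columns('core_archiveresult')
+    if 'plugin' in cols:
+        return '0.9.0'     # already upgraded
+    if 'abid' in cols:
+        return '0.8.6rc0'  # abid instead of uuid
+    if 'uuid' in cols:
+        return '0.7.2'
+    raise RuntimeError(f'Unrecognized core_archiveresult schema: {sorted(cols)}')
+```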
+### 2. ❌ DON'T Copy Data in SQL Migration
+
+**WRONG**:
+```python
+# In core/migrations/0023
+cursor.execute("""
+    INSERT INTO core_archiveresult_new (plugin, output_str, ...)
+    SELECT COALESCE(extractor, ''), COALESCE(output, ''), ...
+    FROM core_archiveresult
+""")
+```
+
+**RIGHT**: Keep the old field names in SQL, let Django's AddField create the new columns, then copy:
+```python
+# In core/migrations/0025 (AFTER the AddField operations)
+def copy_old_to_new(apps, schema_editor):
+    cursor = connection.cursor()
+    cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '')")
+    cursor.execute("UPDATE core_archiveresult SET output_str = COALESCE(output, '')")
+```
+
+### 3. ❌ DON'T Assume Empty Tables Mean Fresh Install
+
+**WRONG**:
+```python
+cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
+if cursor.fetchone()[0] == 0:
+    return  # Skip migration
+```
+
+**Why**: Fresh installs run migrations 0001-0022, which CREATE empty tables with the old schema. Migration 0023 must still upgrade the schema even if the tables are empty!
+
+**RIGHT**: Detect the schema version by checking column names:
+```python
+cols = get_table_columns('core_archiveresult')
+has_extractor = 'extractor' in cols
+if has_extractor:
+    ...  # old schema - needs upgrade
+```
+
+### 4. ❌ DON'T Run Migrations from Data Directories
+
+**WRONG**:
+```bash
+cd /path/to/data/dir
+python manage.py makemigrations
+```
+
+**RIGHT**:
+```bash
+cd archivebox/  # The archivebox package directory
+./manage.py makemigrations
+```
+
+### 5. ❌ DON'T Use WHERE Clauses to Skip SQL Selects
+
+**WRONG**:
+```sql
+INSERT INTO new_table SELECT uuid FROM old_table
+WHERE EXISTS (SELECT 1 FROM pragma_table_info('old_table') WHERE name='uuid');
+```
+
+**Why**: SQLite resolves the `uuid` column reference when it prepares the statement, even if the WHERE clause is false, so this still fails with "no such column".
+
+**RIGHT**: Use Python to detect the schema, then run the appropriate SQL:
+```python
+if 'uuid' in get_table_columns('old_table'):
+    cursor.execute("INSERT INTO new_table SELECT uuid FROM old_table")
+else:
+    cursor.execute("INSERT INTO new_table SELECT abid as uuid FROM old_table")
+```
+
+### 6. ❌ DON'T Mix UUID and INTEGER for Tag IDs
+
+v0.8.6rc0 has Tag.id as a UUID, but v0.9.0 needs an INTEGER. The conversion must:
+1. Create a mapping of old UUID → new INTEGER
+2. Update core_tag with the new IDs
+3. Update core_snapshot_tags with the new tag_id values
+
+See `core/migrations/0023_upgrade_to_0_9_0.py` PART 3 for the correct approach, and the sketch after item 9 below.
+
+### 7. ❌ DON'T Forget SeparateDatabaseAndState
+
+When you manually change the database with SQL, you MUST tell Django what the final state is:
+
+```python
+migrations.SeparateDatabaseAndState(
+    database_operations=[
+        migrations.RunPython(my_sql_function),
+    ],
+    state_operations=[
+        migrations.RemoveField('archiveresult', 'extractor'),
+        migrations.RemoveField('archiveresult', 'output'),
+    ],
+)
+```
+
+Without `state_operations`, Django won't know the old fields are gone, and `makemigrations --check` will report unreflected changes.
+
+### 8. ✅ DO Print Debug Messages
+
+```python
+print(f'Migrating ArchiveResult from v0.7.2 schema...')
+print(f'DEBUG: has_uuid={has_uuid}, has_abid={has_abid}, row_count={row_count}')
+```
+
+This helps diagnose which migration path is being taken.
+
+### 9. ✅ DO Test All Three Scenarios
+
+Always test:
+1. Fresh install (empty database)
+2. v0.7.2 upgrade (12 snapshots, 44 archiveresults, 2 tags)
+3. v0.8.6rc0 upgrade (14 snapshots, 0 archiveresults, multiple tags with UUIDs)
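+
+Putting gotcha 6 into code, a minimal sketch of the UUID → INTEGER tag-id
+conversion (illustrative only - the `convert_tag_ids_to_integer` name and the
+reduced column list are assumptions; the real version is PART 3 of
+`core/migrations/0023_upgrade_to_0_9_0.py`):
+
+```python
+from django.db import connection
+
+def convert_tag_ids_to_integer(apps, schema_editor):
+    cursor = connection.cursor()
+
+    # 1. Map each old UUID id to a new INTEGER id (rowid order keeps it stable)
+    cursor.execute("SELECT id FROM core_tag ORDER BY rowid")
+    mapping = {old_id: n for n, (old_id,) in enumerate(cursor.fetchall(), start=1)}
+
+    # 2. Rebuild core_tag with INTEGER ids (reduced column list for brevity)
+    cursor.execute("""
+        CREATE TABLE core_tag_new (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name VARCHAR(100) UNIQUE,
+            slug VARCHAR(100) UNIQUE
+        );
+    """)
+    for old_id, new_id in mapping.items():
+        cursor.execute(
+            "INSERT INTO core_tag_new (id, name, slug) "
+            "SELECT %s, name, slug FROM core_tag WHERE id = %s",
+            [new_id, old_id],
+        )
+
+    # 3. Re-point the M2M rows at the new INTEGER ids
+    for old_id, new_id in mapping.items():
+        cursor.execute(
+            "UPDATE core_snapshot_tags SET tag_id = %s WHERE tag_id = %s",
+            [new_id, old_id],
+        )
+
+    # assumes FK enforcement is off during the swap, as with the other raw-SQL steps
+    cursor.execute("DROP TABLE core_tag;")
+    cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag;")
+```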
+### 10. ✅ DO Verify No Unreflected Migrations
+
+After all changes:
+```bash
+cd archivebox/
+./manage.py makemigrations --check
+# Should output: No changes detected
+```
+
+## Current Status
+
+As of 2025-01-01, the migrations have these issues:
+
+1. ✅ Fresh install works
+2. ✅ v0.7.2 → v0.9.0 migration runs without errors
+3. ✅ v0.8.6rc0 → v0.9.0 migration runs without errors
+4. ❌ **DATA IS LOST**: `extractor` → `plugin` field data not copied
+5. ❌ **DATA IS LOST**: `output` → `output_str` field data not copied
+6. ❌ Timestamps (added/updated → bookmarked_at/created_at/modified_at) may have wrong values
+7. ❌ Tag relationships may be broken after the UUID → INTEGER conversion
+
+## Files That Need Fixing
+
+1. **core/migrations/0023_upgrade_to_0_9_0.py**
+   - Lines 42-58: CREATE TABLE should use the OLD field names (extractor, output, added, updated)
+   - Lines 64-88: INSERT SELECT should just copy data as-is, with no field renaming yet
+   - Remove all references to plugin, output_str, bookmarked_at, created_at - these are added by 0025
+
+2. **core/migrations/0025_...py**
+   - Add a RunPython operation AFTER all AddField operations (see the sketch below)
+   - This RunPython should copy: extractor→plugin, output→output_str, added→bookmarked_at/created_at, updated→modified_at
+   - Fix syntax error on line 28: `{extractor" in cols}` → `{"extractor" in cols}`
+
+3. **crawls/migrations/0002_upgrade_from_0_8_6.py**
+   - Already correctly handles the conditional upgrade based on schema detection
+   - No changes needed if the crawls table data isn't critical
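+
+The operation ordering that item 2 calls for looks roughly like this
+(illustrative sketch; the actual field definitions are elided with `...`):
+
+```python
+operations = [
+    # 1. AddField creates the new columns (filled with defaults)
+    migrations.AddField(model_name='archiveresult', name='plugin', field=...),
+    migrations.AddField(model_name='archiveresult', name='output_str', field=...),
+
+    # 2. RunPython copies the old data into them
+    migrations.RunPython(copy_old_fields_to_new, migrations.RunPython.noop),
+
+    # 3. Only then may RemoveField drop the old columns
+    migrations.RemoveField(model_name='archiveresult', name='extractor'),
+    migrations.RemoveField(model_name='archiveresult', name='output'),
+]
+```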
+## Next Steps
+
+1. Fix core/migrations/0023 to preserve the OLD field names
+2. Fix core/migrations/0025 to copy data from old → new fields after AddField
+3. Remove debug print statements (lines with `print(f'DEBUG:...`)
+4. Test all three scenarios
+5. Verify data integrity with the SQL queries above
+6. Run `./manage.py makemigrations --check` to ensure no unreflected changes
+
+## Reference: Field Mappings
+
+| Old Field (v0.7.2/v0.8.6) | New Field (v0.9.0) | Notes |
+|---------------------------|--------------------|-------|
+| `extractor` | `plugin` | Rename |
+| `output` | `output_str` | Rename |
+| `added` | `bookmarked_at` | Rename + also used for `created_at` |
+| `updated` | `modified_at` | Rename |
+| `abid` | `uuid` | v0.8.6 only, field rename |
+| Tag.id (UUID) | Tag.id (INTEGER) | v0.8.6 only, type conversion |
+| `seed_id` | `urls` | Crawl table, v0.8.6 only |
+| `persona` (VARCHAR) | `persona_id` (UUID FK) | Crawl table, v0.8.6 only |
+
+## Testing Checklist
+
+- [ ] Fresh install creates the correct schema
+- [ ] Fresh install has 0 snapshots, 0 archiveresults
+- [ ] v0.7.2 migration preserves all 12 snapshots
+- [ ] v0.7.2 migration preserves all 44 archiveresults
+- [ ] v0.7.2 migration preserves all 2 tags
+- [ ] v0.7.2 migration copies `extractor` → `plugin` (check first 5 rows)
+- [ ] v0.7.2 migration copies `output` → `output_str` (check first 5 rows)
+- [ ] v0.7.2 migration copies `added` → `bookmarked_at` (compare timestamps)
+- [ ] v0.7.2 migration copies `updated` → `modified_at` (compare timestamps)
+- [ ] v0.8.6 migration preserves all 14 snapshots
+- [ ] v0.8.6 migration converts Tag IDs from UUID → INTEGER
+- [ ] v0.8.6 migration preserves tag relationships in core_snapshot_tags
+- [ ] v0.8.6 migration converts the `abid` → `uuid` field
+- [ ] `./manage.py makemigrations --check` shows no changes
+- [ ] All migrations run without errors
+- [ ] `archivebox status` shows correct snapshot/link counts

diff --git a/archivebox/base_models/models.py b/archivebox/base_models/models.py
index 55f033b0..adfbce35 100755
--- a/archivebox/base_models/models.py
+++ b/archivebox/base_models/models.py
@@ -2,12 +2,9 @@
 __package__ = 'archivebox.base_models'
 
-import io
-import csv
-import json
 from uuid import UUID
 from archivebox.uuid_compat import uuid7
-from typing import Any, Iterable, ClassVar
+from typing import ClassVar
 from pathlib import Path
 
 from django.contrib import admin
@@ -21,7 +18,6 @@ from django.conf import settings
 from django_stubs_ext.db.models import TypedModelMeta
 
 from archivebox import DATA_DIR
-from archivebox.misc.util import to_json
 from archivebox.misc.hashing import get_dir_info
 
 
@@ -72,22 +68,6 @@ class ModelWithUUID(models.Model):
     def api_docs_url(self) -> str:
         return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}'
 
-    def as_json(self, keys: Iterable[str] = ()) -> dict:
-        default_keys = ('id', 'created_at', 'modified_at')
-        return {key: getattr(self, key) for key in (keys or default_keys) if hasattr(self, key)}
-
-
-class ModelWithSerializers(ModelWithUUID):
-    class Meta(TypedModelMeta):
-        abstract = True
-
-    def as_csv_row(self, keys: Iterable[str] = (), separator: str = ',') -> str:
-        buffer = io.StringIO()
-        csv.writer(buffer, delimiter=separator).writerow(str(getattr(self, key, '')) for key in (keys or self.as_json().keys()))
-        return buffer.getvalue()
-
-    def as_jsonl_row(self, keys: Iterable[str] = (), **json_kwargs) -> str:
-        return json.dumps({key: getattr(self, key, '') for key in (keys or self.as_json().keys())}, sort_keys=True, indent=None, **json_kwargs)
 
 
 class ModelWithNotes(models.Model):
@@ -125,14 +105,14 @@ class ModelWithConfig(models.Model):
         abstract = True
 
 
-class ModelWithOutputDir(ModelWithSerializers):
+class ModelWithOutputDir(ModelWithUUID):
     class Meta:
         abstract = True
 
     def save(self, *args, **kwargs):
         super().save(*args, **kwargs)
         self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-        self.save_json_index()
+        # Note: index.json is deprecated, models should use write_index_jsonl() for full data
 
     @property
     def output_dir_parent(self) -> str:
@@ -149,6 +129,3 @@ class ModelWithOutputDir(ModelWithSerializers):
     @property
     def OUTPUT_DIR(self) -> Path:
         return DATA_DIR / self.output_dir_str
-
-    def save_json_index(self):
-        (self.OUTPUT_DIR / 'index.json').write_text(to_json(self.as_json()))
diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
index 7d6d9c99..bc338eae 100644
--- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
+++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
@@ -32,39 +32,26 @@ def upgrade_core_tables(apps, schema_editor):
     has_uuid = 'uuid' in archiveresult_cols
     has_abid = 'abid' in archiveresult_cols
 
+    print(f'DEBUG: ArchiveResult row_count={row_count}, has_data={has_data}, has_uuid={has_uuid}, has_abid={has_abid}')
+
     # ============================================================================
     # PART 1: Upgrade core_archiveresult table
     # ============================================================================
 
+    # Create a minimal table with only the OLD fields that exist in v0.7.2/v0.8.6rc0.
+    # Migration 0025 will add the NEW fields (plugin, hook_name, output_files, etc.)
     cursor.execute("""
         CREATE TABLE IF NOT EXISTS core_archiveresult_new (
             id INTEGER PRIMARY KEY AUTOINCREMENT,
             uuid TEXT,
-            created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
-            modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
             snapshot_id TEXT NOT NULL,
-            plugin VARCHAR(32) NOT NULL DEFAULT '',
-            hook_name VARCHAR(255) NOT NULL DEFAULT '',
-
             cmd TEXT, pwd VARCHAR(256), cmd_version VARCHAR(128),
             start_ts DATETIME, end_ts DATETIME, status VARCHAR(15) NOT NULL DEFAULT 'queued',
-            retry_at DATETIME,
-
-            output_files TEXT NOT NULL DEFAULT '{}',
-            output_json TEXT,
-            output_str TEXT NOT NULL DEFAULT '',
-            output_size INTEGER NOT NULL DEFAULT 0,
-            output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
-
-            config TEXT,
-            notes TEXT NOT NULL DEFAULT '',
-            num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
-            num_uses_failed INTEGER NOT NULL DEFAULT 0,
+            extractor VARCHAR(32),
+            output VARCHAR(1024),
 
             FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE
         );
@@ -76,36 +63,25 @@ def upgrade_core_tables(apps, schema_editor):
         print('Migrating ArchiveResult from v0.7.2 schema...')
         cursor.execute("""
             INSERT OR IGNORE INTO core_archiveresult_new (
-                id, uuid, created_at, modified_at, snapshot_id, plugin,
-                cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
+                id, uuid, snapshot_id, cmd, pwd, cmd_version,
+                start_ts, end_ts, status, extractor, output
             )
             SELECT
-                id, uuid,
-                COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
-                COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
-                snapshot_id,
-                COALESCE(extractor, '') as plugin,
-                cmd, pwd, cmd_version,
-                start_ts, end_ts, status,
-                COALESCE(output, '') as output_str
+                id, uuid, snapshot_id, cmd, pwd, cmd_version,
+                start_ts, end_ts, status, extractor, output
             FROM core_archiveresult;
         """)
     elif has_abid and not has_uuid:
-        # Migrating from v0.8.6rc0 (has abid, full fields)
+        # Migrating from v0.8.6rc0 (has abid instead of uuid)
         print('Migrating ArchiveResult from v0.8.6rc0 schema...')
         cursor.execute("""
             INSERT OR IGNORE INTO core_archiveresult_new (
-                id, uuid, created_at, modified_at, snapshot_id, plugin,
-                cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str
+                id, uuid, snapshot_id, cmd, pwd, cmd_version,
+                start_ts, end_ts, status, extractor, output
             )
             SELECT
-                id, abid as uuid,
-                created_at, modified_at,
-                snapshot_id,
-                COALESCE(extractor, '') as plugin,
-                cmd, pwd, cmd_version,
-                start_ts, end_ts, status, retry_at,
-                COALESCE(output, '') as output_str
+                id, abid as uuid, snapshot_id, cmd, pwd, cmd_version,
+                start_ts, end_ts, status, extractor, output
             FROM core_archiveresult;
         """)
     else:
@@ -114,13 +90,7 @@ def upgrade_core_tables(apps, schema_editor):
 
     cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
     cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
 
-    # Create indexes
-    cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id);")
-    cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin);")
-    cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status);")
-    cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at);")
-    cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at);")
-    cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid);")
+    # Don't create indexes here - migration 0025 will handle them
 
     # ============================================================================
     # PART 2: Upgrade core_snapshot table
diff --git a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
index 49533fa8..04097cc7 100644
--- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
+++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
@@ -5,7 +5,49 @@
 import django.db.models.deletion
 import django.utils.timezone
 import uuid
 from django.conf import settings
-from django.db import migrations, models
+from django.db import migrations, models, connection
+
+
+def copy_old_fields_to_new(apps, schema_editor):
+    """Copy data from the old field names to the new field names (runs AFTER the AddField operations)."""
+    cursor = connection.cursor()
+
+    # Check whether the old fields still exist
+    cursor.execute("PRAGMA table_info(core_archiveresult)")
+    cols = {row[1] for row in cursor.fetchall()}
+    print(f'DEBUG 0025: ArchiveResult columns: {sorted(cols)}')
+
+    if 'extractor' in cols and 'plugin' in cols:
+        # Copy extractor -> plugin
+        print('DEBUG 0025: Copying extractor -> plugin')
+        cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL")
+        cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE plugin != ''")
+        count = cursor.fetchone()[0]
+        print(f'DEBUG 0025: Updated {count} rows with plugin data')
+    else:
+        print(f'DEBUG 0025: NOT copying - extractor in cols: {"extractor" in cols}, plugin in cols: {"plugin" in cols}')
+
+    if 'output' in cols and 'output_str' in cols:
+        # Copy output -> output_str
+        cursor.execute("UPDATE core_archiveresult SET output_str = COALESCE(output, '') WHERE output_str = '' OR output_str IS NULL")
+
+    # Copy timestamps into the new timestamp fields if they don't have values yet
+    if 'start_ts' in cols and 'created_at' in cols:
+        cursor.execute("UPDATE core_archiveresult SET created_at = COALESCE(start_ts, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''")
+
+    if 'end_ts' in cols and 'modified_at' in cols:
+        cursor.execute("UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
+
+    # Same for the Snapshot table
+    cursor.execute("PRAGMA table_info(core_snapshot)")
+    snap_cols = {row[1] for row in cursor.fetchall()}
+
+    if 'added' in snap_cols and 'bookmarked_at' in snap_cols:
+        cursor.execute("UPDATE core_snapshot SET bookmarked_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE bookmarked_at IS NULL OR bookmarked_at = ''")
+        cursor.execute("UPDATE core_snapshot SET created_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''")
+
+    if 'updated' in snap_cols and 'modified_at' in snap_cols:
+        cursor.execute("UPDATE core_snapshot SET modified_at = COALESCE(updated, added, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
 
 
 class Migration(migrations.Migration):
@@ -192,6 +234,11 @@ class Migration(migrations.Migration):
             name='modified_at',
             field=models.DateTimeField(auto_now=True),
         ),
+        # Copy data from the old field names to the new field names, AFTER the AddField operations
+        migrations.RunPython(
+            copy_old_fields_to_new,
+            reverse_code=migrations.RunPython.noop,
+        ),
         migrations.AlterField(
             model_name='archiveresult',
             name='end_ts',
diff --git a/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py b/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py
new file mode 100644
index 00000000..cb49fb57
--- /dev/null
+++ b/archivebox/crawls/migrations/0002_upgrade_from_0_8_6.py
@@ -0,0 +1,99 @@
+# Generated by hand on 2025-12-31
+# Upgrades the crawls_crawl table from v0.8.6rc0 to v0.9.0
+
+from django.db import migrations, connection
+
+
+def upgrade_crawl_table_from_v086(apps, schema_editor):
+    """Upgrade the crawls_crawl table from the v0.8.6rc0 schema to the v0.9.0 schema."""
+    cursor = connection.cursor()
+
+    # Check whether the crawls_crawl table exists
+    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'")
+    if not cursor.fetchone():
+        return
+
+    # Detect the schema version
+    cursor.execute("PRAGMA table_info(crawls_crawl)")
+    crawl_cols = {row[1] for row in cursor.fetchall()}
+    has_seed_id = 'seed_id' in crawl_cols
+    has_urls = 'urls' in crawl_cols
+
+    # Only upgrade if we have the v0.8.6rc0 schema
+    if not (has_seed_id and not has_urls):
+        return
+
+    # Check whether the table has any rows
+    cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
+    has_data = cursor.fetchone()[0] > 0
+
+    # v0.8.6rc0 schema - upgrade to v0.9.0 (the rebuild runs even when empty, see gotcha 3)
+    if has_data:
+        print('Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0...')
+
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS crawls_crawl_new (
+            id TEXT PRIMARY KEY NOT NULL,
+            created_at DATETIME NOT NULL,
+            modified_at DATETIME NOT NULL,
+            num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
+            num_uses_failed INTEGER NOT NULL DEFAULT 0,
+
+            urls TEXT NOT NULL,
+            config TEXT,
+            max_depth INTEGER NOT NULL DEFAULT 0,
+            tags_str VARCHAR(1024) NOT NULL DEFAULT '',
+            persona_id TEXT,
+            label VARCHAR(64) NOT NULL DEFAULT '',
+            notes TEXT NOT NULL DEFAULT '',
+            output_dir VARCHAR(512) NOT NULL DEFAULT '',
+
+            status VARCHAR(15) NOT NULL DEFAULT 'queued',
+            retry_at DATETIME,
+            created_by_id INTEGER NOT NULL,
+            schedule_id TEXT,
+
+            FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
+            FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
+        );
+    """)
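+
+    # NOTE: seed_id and persona (VARCHAR) have no direct v0.9.0 equivalents here:
+    # urls is left empty and persona_id NULL below, which is assumed acceptable
+    # (see TODO_fix_migration_path.md) but does drop the old seed/persona values.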
+    if has_data:
+        cursor.execute("""
+            INSERT OR IGNORE INTO crawls_crawl_new (
+                id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
+                urls, config, max_depth, tags_str, persona_id, label, notes, output_dir,
+                status, retry_at, created_by_id, schedule_id
+            )
+            SELECT
+                id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
+                '', config, max_depth, tags_str, NULL, '', '', '',
+                status, retry_at, created_by_id, schedule_id
+            FROM crawls_crawl;
+        """)
+
+    cursor.execute("DROP TABLE crawls_crawl;")
+    cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl;")
+
+    cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_status_idx ON crawls_crawl(status);")
+    cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_retry_at_idx ON crawls_crawl(retry_at);")
+    cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_created_at_idx ON crawls_crawl(created_at);")
+    cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id);")
+    cursor.execute("CREATE INDEX IF NOT EXISTS crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id);")
+
+    if has_data:
+        print('✓ crawls_crawl upgraded to v0.9.0')
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('crawls', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.RunPython(
+            upgrade_crawl_table_from_v086,
+            reverse_code=migrations.RunPython.noop,
+        ),
+    ]