actually working migration path from 0.7.2 and 0.8.6 + renames and test coverage

This commit is contained in:
Nick Sweeting
2026-01-01 15:49:56 -08:00
parent 6fadcf5168
commit 876feac522
33 changed files with 825 additions and 333 deletions

View File

@@ -117,7 +117,7 @@ class SnapshotAdminForm(forms.ModelForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
form = SnapshotAdminForm
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'url_str')
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'health_display', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
@@ -488,6 +488,12 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.url[:128],
)
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render ``obj.health`` as a color-coded ``<span>`` for the admin list view.

    A score of 80 or more is shown in green, a score of at least 50 (but
    below 80) in orange, and anything lower in red.
    """
    score = obj.health
    if score >= 80:
        shade = 'green'
    elif score >= 50:
        shade = 'orange'
    else:
        shade = 'red'
    # format_html escapes both interpolated values before building the markup.
    return format_html('<span style="color: {};">{}</span>', shade, score)
def grid_view(self, request, extra_context=None):
# cl = self.get_changelist_instance(request)

View File

@@ -3,6 +3,7 @@
# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0
from django.db import migrations, models, connection
import django.utils.timezone
def get_table_columns(table_name):
@@ -95,31 +96,31 @@ def upgrade_core_tables(apps, schema_editor):
# ============================================================================
# PART 2: Upgrade core_snapshot table
# ============================================================================
# Create table with NEW field names for timestamps (bookmarked_at, created_at, modified_at)
# and all other fields needed by later migrations
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_snapshot_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
title VARCHAR(512),
crawl_id TEXT,
parent_snapshot_id TEXT,
title VARCHAR(512),
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
downloaded_at DATETIME,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
@@ -141,29 +142,23 @@ def upgrade_core_tables(apps, schema_editor):
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
if has_added and not has_bookmarked_at:
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
# Migrating from v0.7.2 (has added/updated fields)
print('Migrating Snapshot from v0.7.2 schema...')
# Debug: Check what data we're about to copy
cursor.execute("SELECT id, added, updated FROM core_snapshot LIMIT 3")
sample_data = cursor.fetchall()
print(f'DEBUG 0023: Sample Snapshot data before migration: {sample_data}')
# Transform added→bookmarked_at/created_at and updated→modified_at
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_new (
id, url, timestamp, title, bookmarked_at, created_at, modified_at
id, url, timestamp, title,
bookmarked_at, created_at, modified_at,
status
)
SELECT
id, url, timestamp, title,
COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
COALESCE(added, CURRENT_TIMESTAMP) as created_at,
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at,
'queued' as status
FROM core_snapshot;
""")
# Debug: Check what was inserted
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot_new LIMIT 3")
inserted_data = cursor.fetchall()
print(f'DEBUG 0023: Sample Snapshot data after INSERT: {inserted_data}')
elif has_bookmarked_at and not has_added:
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.8.6rc0 schema...')
@@ -308,14 +303,29 @@ class Migration(migrations.Migration):
),
],
state_operations=[
# NOTE: We do NOT remove extractor/output here for ArchiveResult!
# NOTE: We do NOT remove extractor/output for ArchiveResult!
# They are still in the database and will be removed by migration 0025
# after copying their data to the new field names (plugin, output_str).
# after copying their data to plugin/output_str.
# However, for Snapshot, we DO remove added/updated here because
# the database operations above already renamed them to bookmarked_at/created_at/modified_at.
# However, for Snapshot, we DO remove added/updated and ADD the new timestamp fields
# because the SQL above already transformed them.
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# SnapshotTag table already exists from v0.7.2, just declare it in state
migrations.CreateModel(

View File

@@ -103,15 +103,21 @@ class Migration(migrations.Migration):
);
INSERT INTO core_snapshot_final (
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
config, notes, num_uses_succeeded, num_uses_failed,
id, url, timestamp, title,
bookmarked_at, created_at, modified_at,
crawl_id, parent_snapshot_id,
downloaded_at, depth, fs_version,
config, notes,
num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
)
SELECT
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
id, url, timestamp, title,
bookmarked_at, created_at, modified_at,
crawl_id, parent_snapshot_id,
downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''),
num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
FROM core_snapshot;

View File

@@ -9,23 +9,16 @@ from django.db import migrations, models, connection
def copy_old_fields_to_new(apps, schema_editor):
"""Copy data from old field names to new field names before AddField operations."""
"""Copy data from old field names to new field names after AddField operations."""
cursor = connection.cursor()
# Check if old fields still exist
cursor.execute("PRAGMA table_info(core_archiveresult)")
cols = {row[1] for row in cursor.fetchall()}
print(f'DEBUG 0025: ArchiveResult columns: {sorted(cols)}')
if 'extractor' in cols and 'plugin' in cols:
# Copy extractor -> plugin
print('DEBUG 0025: Copying extractor -> plugin')
cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL")
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE plugin != ''")
count = cursor.fetchone()[0]
print(f'DEBUG 0025: Updated {count} rows with plugin data')
else:
print(f'DEBUG 0025: NOT copying - extractor in cols: {"extractor" in cols}, plugin in cols: {"plugin" in cols}')
if 'output' in cols and 'output_str' in cols:
# Copy output -> output_str
@@ -38,16 +31,13 @@ def copy_old_fields_to_new(apps, schema_editor):
if 'end_ts' in cols and 'modified_at' in cols:
cursor.execute("UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
# Same for Snapshot table
cursor.execute("PRAGMA table_info(core_snapshot)")
snap_cols = {row[1] for row in cursor.fetchall()}
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
# transformed by migration 0023, so we don't need to copy them here.
if 'added' in snap_cols and 'bookmarked_at' in snap_cols:
cursor.execute("UPDATE core_snapshot SET bookmarked_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE bookmarked_at IS NULL OR bookmarked_at = ''")
cursor.execute("UPDATE core_snapshot SET created_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''")
if 'updated' in snap_cols and 'modified_at' in snap_cols:
cursor.execute("UPDATE core_snapshot SET modified_at = COALESCE(updated, added, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
# Debug: Check Snapshot timestamps at end of RunPython
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
snap_after = cursor.fetchall()
print(f'DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}')
class Migration(migrations.Migration):
@@ -149,21 +139,12 @@ class Migration(migrations.Migration):
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
# NOTE: bookmarked_at and created_at already added by migration 0023
migrations.AddField(
model_name='snapshot',
name='config',
field=models.JSONField(default=dict),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='current_step',
@@ -184,11 +165,7 @@ class Migration(migrations.Migration):
name='fs_version',
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# NOTE: modified_at already added by migration 0023
migrations.AddField(
model_name='snapshot',
name='notes',
@@ -248,7 +225,7 @@ class Migration(migrations.Migration):
model_name='archiveresult',
name='output',
),
# NOTE: Snapshot's added/updated fields were already removed by migration 0023
# NOTE: Snapshot's added/updated were already removed by migration 0023
migrations.AlterField(
model_name='archiveresult',
name='end_ts',

View File

@@ -0,0 +1,28 @@
# Generated by Django 6.0 on 2026-01-01 23:28
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    """Drop ArchiveResult's usage counters and add its optional ``process`` link."""

    # Must be applied after core 0025 and after the machine app's 0003 migration,
    # since the new field points at machine.process.
    dependencies = [
        ('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
        ('machine', '0003_add_process_type_and_parent'),
    ]

    operations = [
        # The per-result usage counters are removed from ArchiveResult.
        migrations.RemoveField(model_name='archiveresult', name='num_uses_failed'),
        migrations.RemoveField(model_name='archiveresult', name='num_uses_succeeded'),
        # Nullable one-to-one link to machine.process; PROTECT blocks deleting
        # a Process row while an ArchiveResult still references it.
        migrations.AddField(
            model_name='archiveresult',
            name='process',
            field=models.OneToOneField(
                blank=True,
                help_text='Process execution details for this archive result',
                null=True,
                on_delete=django.db.models.deletion.PROTECT,
                related_name='archiveresult',
                to='machine.process',
            ),
        ),
    ]

View File

@@ -2285,13 +2285,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
# Added POST-v0.9.0 via a separate migration; nullable (null=True, blank=True)
# process = models.OneToOneField(
# 'machine.Process',
# on_delete=models.PROTECT,
# null=False,
# related_name='archiveresult',
# help_text='Process execution details for this archive result'
# )
process = models.OneToOneField(
'machine.Process',
on_delete=models.PROTECT,
null=True,
blank=True,
related_name='archiveresult',
help_text='Process execution details for this archive result'
)
# New output fields (replacing old 'output' field)
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')