much better tests and add page ui

Nick Sweeting
2025-12-29 04:02:11 -08:00
parent 9487f8a0de
commit 30c60eef76
93 changed files with 2998 additions and 2712 deletions

@@ -1,494 +0,0 @@
# Generated by Django 5.0.6 on 2024-12-25
# Transforms schema from 0022 to new simplified schema (ABID system removed)
from uuid import uuid4
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
def get_or_create_system_user_pk(apps, schema_editor):
"""Get or create system user for migrations."""
User = apps.get_model('auth', 'User')
user, _ = User.objects.get_or_create(
username='system',
defaults={'is_active': False, 'password': '!'}
)
return user.pk
def populate_created_by_snapshot(apps, schema_editor):
"""Populate created_by for existing snapshots."""
User = apps.get_model('auth', 'User')
Snapshot = apps.get_model('core', 'Snapshot')
system_user, _ = User.objects.get_or_create(
username='system',
defaults={'is_active': False, 'password': '!'}
)
Snapshot.objects.filter(created_by__isnull=True).update(created_by=system_user)
def populate_created_by_archiveresult(apps, schema_editor):
"""Populate created_by for existing archive results."""
User = apps.get_model('auth', 'User')
ArchiveResult = apps.get_model('core', 'ArchiveResult')
system_user, _ = User.objects.get_or_create(
username='system',
defaults={'is_active': False, 'password': '!'}
)
ArchiveResult.objects.filter(created_by__isnull=True).update(created_by=system_user)
def populate_created_by_tag(apps, schema_editor):
"""Populate created_by for existing tags."""
User = apps.get_model('auth', 'User')
Tag = apps.get_model('core', 'Tag')
system_user, _ = User.objects.get_or_create(
username='system',
defaults={'is_active': False, 'password': '!'}
)
Tag.objects.filter(created_by__isnull=True).update(created_by=system_user)
def generate_uuid_for_archiveresults(apps, schema_editor):
"""Generate UUIDs for archive results that don't have them."""
ArchiveResult = apps.get_model('core', 'ArchiveResult')
for ar in ArchiveResult.objects.filter(uuid__isnull=True).iterator(chunk_size=500):
ar.uuid = uuid4()
ar.save(update_fields=['uuid'])
def generate_uuid_for_tags(apps, schema_editor):
"""Generate UUIDs for tags that don't have them."""
Tag = apps.get_model('core', 'Tag')
for tag in Tag.objects.filter(uuid__isnull=True).iterator(chunk_size=500):
tag.uuid = uuid4()
tag.save(update_fields=['uuid'])
def copy_bookmarked_at_from_added(apps, schema_editor):
"""Copy added timestamp to bookmarked_at."""
Snapshot = apps.get_model('core', 'Snapshot')
Snapshot.objects.filter(bookmarked_at__isnull=True).update(
bookmarked_at=models.F('added')
)
def copy_created_at_from_added(apps, schema_editor):
"""Copy added timestamp to created_at for snapshots."""
Snapshot = apps.get_model('core', 'Snapshot')
Snapshot.objects.filter(created_at__isnull=True).update(
created_at=models.F('added')
)
def copy_created_at_from_start_ts(apps, schema_editor):
"""Copy start_ts to created_at for archive results."""
ArchiveResult = apps.get_model('core', 'ArchiveResult')
ArchiveResult.objects.filter(created_at__isnull=True).update(
created_at=models.F('start_ts')
)
class Migration(migrations.Migration):
"""
This migration transforms the schema from the main branch (0022) to the new
simplified schema without the ABID system.
For dev branch users who had ABID migrations (0023-0074), this replaces them
with a clean transformation.
"""
replaces = [
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
('core', '0024_auto_20240513_1143'),
('core', '0025_alter_archiveresult_uuid'),
('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
('core', '0027_update_snapshot_ids'),
('core', '0028_alter_archiveresult_uuid'),
('core', '0029_alter_archiveresult_id'),
('core', '0030_alter_archiveresult_uuid'),
('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
('core', '0032_alter_archiveresult_id'),
('core', '0033_rename_id_archiveresult_old_id'),
('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
('core', '0037_rename_id_snapshot_old_id'),
('core', '0038_rename_uuid_snapshot_id'),
('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
('core', '0040_archiveresult_snapshot'),
('core', '0041_alter_archiveresult_snapshot_and_more'),
('core', '0042_remove_archiveresult_snapshot_old'),
('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
('core', '0045_alter_snapshot_old_id'),
('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0047_alter_snapshottag_unique_together_and_more'),
('core', '0048_alter_archiveresult_snapshot_and_more'),
('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
('core', '0050_alter_snapshottag_snapshot_old'),
('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
('core', '0052_alter_snapshottag_unique_together_and_more'),
('core', '0053_remove_snapshottag_snapshot_old'),
('core', '0054_alter_snapshot_timestamp'),
('core', '0055_alter_tag_slug'),
('core', '0056_remove_tag_uuid'),
('core', '0057_rename_id_tag_old_id'),
('core', '0058_alter_tag_old_id'),
('core', '0059_tag_id'),
('core', '0060_alter_tag_id'),
('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
('core', '0062_alter_snapshottag_old_tag'),
('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
('core', '0064_alter_snapshottag_unique_together_and_more'),
('core', '0065_remove_snapshottag_old_tag'),
('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
('core', '0067_alter_snapshottag_tag'),
('core', '0068_alter_archiveresult_options'),
('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
('core', '0073_rename_created_archiveresult_created_at_and_more'),
('core', '0074_alter_snapshot_downloaded_at'),
]
dependencies = [
('core', '0022_auto_20231023_2008'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
# === SNAPSHOT CHANGES ===
# Add health stats fields to Snapshot
migrations.AddField(
model_name='snapshot',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
# Add new fields to Snapshot
migrations.AddField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(
default=None, null=True, blank=True,
on_delete=django.db.models.deletion.CASCADE,
related_name='snapshot_set',
to=settings.AUTH_USER_MODEL,
),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
),
migrations.AddField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(default=None, null=True, blank=True, db_index=True),
),
migrations.AddField(
model_name='snapshot',
name='depth',
field=models.PositiveSmallIntegerField(default=0, db_index=True),
),
migrations.AddField(
model_name='snapshot',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], default='queued', max_length=15, db_index=True),
),
migrations.AddField(
model_name='snapshot',
name='retry_at',
field=models.DateTimeField(default=django.utils.timezone.now, null=True, blank=True, db_index=True),
),
migrations.AddField(
model_name='snapshot',
name='config',
field=models.JSONField(default=dict, blank=False),
),
migrations.AddField(
model_name='snapshot',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='snapshot',
name='output_dir',
field=models.CharField(max_length=256, default=None, null=True, blank=True),
),
# Copy data from old fields to new
migrations.RunPython(copy_bookmarked_at_from_added, migrations.RunPython.noop),
migrations.RunPython(copy_created_at_from_added, migrations.RunPython.noop),
migrations.RunPython(populate_created_by_snapshot, migrations.RunPython.noop),
# Make created_by non-nullable after population
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='snapshot_set',
to=settings.AUTH_USER_MODEL,
db_index=True,
),
),
# Update timestamp field constraints
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(max_length=32, unique=True, db_index=True, editable=False),
),
# Update title field size
migrations.AlterField(
model_name='snapshot',
name='title',
field=models.CharField(max_length=512, null=True, blank=True, db_index=True),
),
# Remove old 'added' and 'updated' fields
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
# Register SnapshotTag through model (table already exists from 0006's ManyToManyField)
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.CreateModel(
name='SnapshotTag',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('snapshot', models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
('tag', models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
],
options={
'db_table': 'core_snapshot_tags',
},
),
],
database_operations=[], # Table already exists from 0006
),
# === TAG CHANGES ===
# Tag keeps AutoField (integer) id for migration compatibility
# Add tracking fields to Tag
migrations.AddField(
model_name='tag',
name='created_by',
field=models.ForeignKey(
default=None, null=True, blank=True,
on_delete=django.db.models.deletion.CASCADE,
related_name='tag_set',
to=settings.AUTH_USER_MODEL,
),
),
migrations.AddField(
model_name='tag',
name='created_at',
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
),
migrations.AddField(
model_name='tag',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# Populate created_by for tags
migrations.RunPython(populate_created_by_tag, migrations.RunPython.noop),
# Update slug field
migrations.AlterField(
model_name='tag',
name='slug',
field=models.SlugField(unique=True, max_length=100, editable=False),
),
# === ARCHIVERESULT CHANGES ===
# Add health stats fields to ArchiveResult
migrations.AddField(
model_name='archiveresult',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
# Add uuid field for new ID
migrations.AddField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(default=uuid4, null=True, blank=True),
),
migrations.AddField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(
default=None, null=True, blank=True,
on_delete=django.db.models.deletion.CASCADE,
related_name='archiveresult_set',
to=settings.AUTH_USER_MODEL,
),
),
migrations.AddField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='archiveresult',
name='retry_at',
field=models.DateTimeField(default=django.utils.timezone.now, null=True, blank=True, db_index=True),
),
migrations.AddField(
model_name='archiveresult',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='archiveresult',
name='output_dir',
field=models.CharField(max_length=256, default=None, null=True, blank=True),
),
migrations.AddField(
model_name='archiveresult',
name='config',
field=models.JSONField(default=dict, blank=False),
),
# Populate UUIDs and data for archive results
migrations.RunPython(generate_uuid_for_archiveresults, migrations.RunPython.noop),
migrations.RunPython(copy_created_at_from_start_ts, migrations.RunPython.noop),
migrations.RunPython(populate_created_by_archiveresult, migrations.RunPython.noop),
# Make created_by non-nullable
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='archiveresult_set',
to=settings.AUTH_USER_MODEL,
db_index=True,
),
),
# Update extractor choices
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(
choices=[
('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'),
('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'),
('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'),
('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'),
('title', 'title'), ('wget', 'wget'),
],
max_length=32, db_index=True,
),
),
# Update status field
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(
choices=[
('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'),
('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped'),
],
max_length=16, default='queued', db_index=True,
),
),
# Update output field size
migrations.AlterField(
model_name='archiveresult',
name='output',
field=models.CharField(max_length=1024, default=None, null=True, blank=True),
),
# Update cmd_version field size
migrations.AlterField(
model_name='archiveresult',
name='cmd_version',
field=models.CharField(max_length=128, default=None, null=True, blank=True),
),
# Make start_ts and end_ts nullable
migrations.AlterField(
model_name='archiveresult',
name='start_ts',
field=models.DateTimeField(default=None, null=True, blank=True),
),
migrations.AlterField(
model_name='archiveresult',
name='end_ts',
field=models.DateTimeField(default=None, null=True, blank=True),
),
# Make pwd nullable
migrations.AlterField(
model_name='archiveresult',
name='pwd',
field=models.CharField(max_length=256, default=None, null=True, blank=True),
),
# Make cmd nullable
migrations.AlterField(
model_name='archiveresult',
name='cmd',
field=models.JSONField(default=None, null=True, blank=True),
),
# Update model options
migrations.AlterModelOptions(
name='archiveresult',
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
),
migrations.AlterModelOptions(
name='snapshot',
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
),
migrations.AlterModelOptions(
name='tag',
options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
),
]
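
Because this migration declares a `replaces` list, Django treats it as a squash: on databases where all of 0023-0074 are already recorded as applied, it is marked applied without executing, while fresh databases run only this file. A minimal sketch of how to confirm what Django has recorded, assuming a `manage.py shell` session for this project (`MigrationRecorder` is Django's internal bookkeeping model over the django_migrations table):

# Sketch: list which 'core' migrations Django has recorded as applied.
from django.db.migrations.recorder import MigrationRecorder

applied = MigrationRecorder.Migration.objects.filter(app='core').values_list('name', flat=True)
print(sorted(applied))  # the squash shows up here once Django considers it applied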

@@ -0,0 +1,190 @@
# Generated by hand on 2025-12-29
# Upgrades core app from v0.7.2 (migration 0022) to v0.9.0 using raw SQL
# Handles both fresh installs and upgrades from v0.7.2
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0022_auto_20231023_2008'),
('crawls', '0001_initial'),
('machine', '0001_initial'),
('auth', '0012_alter_user_first_name_max_length'),
]
operations = [
migrations.RunSQL(
# Forward SQL
sql="""
-- ============================================================================
-- PART 1: Rename extractor → plugin in core_archiveresult
-- ============================================================================
-- ALTER TABLE ... RENAME COLUMN is only available in SQLite 3.25+, and the rename
-- coincides with many new v0.9.0 columns anyway, so we rebuild the table in one pass
CREATE TABLE IF NOT EXISTS core_archiveresult_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
snapshot_id TEXT NOT NULL,
plugin VARCHAR(32) NOT NULL DEFAULT '',
hook_name VARCHAR(255) NOT NULL DEFAULT '',
cmd TEXT,
pwd VARCHAR(256),
cmd_version VARCHAR(128),
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
output_files TEXT NOT NULL DEFAULT '{}',
output_json TEXT,
output_str TEXT NOT NULL DEFAULT '',
output_size INTEGER NOT NULL DEFAULT 0,
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
config TEXT,
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
binary_id TEXT,
iface_id TEXT,
process_id TEXT,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
FOREIGN KEY (binary_id) REFERENCES machine_binary(id) ON DELETE SET NULL,
FOREIGN KEY (iface_id) REFERENCES machine_networkinterface(id) ON DELETE SET NULL,
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
);
-- Copy data from the old table (the 0022 dependency guarantees core_archiveresult exists)
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, created_at, modified_at, snapshot_id, plugin,
cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
)
SELECT
id, uuid,
COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
snapshot_id,
COALESCE(extractor, '') as plugin,
cmd, pwd, cmd_version,
start_ts, end_ts, status,
COALESCE(output, '') as output_str
FROM core_archiveresult
WHERE EXISTS (SELECT 1 FROM sqlite_master WHERE type='table' AND name='core_archiveresult');
DROP TABLE IF EXISTS core_archiveresult;
ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;
CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id);
CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin);
CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status);
CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at);
CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at);
CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid);
-- ============================================================================
-- PART 2: Upgrade core_snapshot table
-- ============================================================================
CREATE TABLE IF NOT EXISTS core_snapshot_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
crawl_id TEXT,
parent_snapshot_id TEXT,
title VARCHAR(512),
downloaded_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
);
-- Copy data from old table if it exists
-- Map v0.7.2 fields: added → bookmarked_at/created_at, updated → modified_at
INSERT OR IGNORE INTO core_snapshot_new (
id, url, timestamp, title, bookmarked_at, created_at, modified_at
)
SELECT
id, url, timestamp, title,
COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
COALESCE(added, CURRENT_TIMESTAMP) as created_at,
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
FROM core_snapshot
WHERE EXISTS (SELECT 1 FROM sqlite_master WHERE type='table' AND name='core_snapshot');
DROP TABLE IF EXISTS core_snapshot;
ALTER TABLE core_snapshot_new RENAME TO core_snapshot;
CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp_idx ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status);
CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at);
CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at);
CREATE UNIQUE INDEX IF NOT EXISTS core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
-- ============================================================================
-- PART 3: Upgrade core_tag table
-- ============================================================================
CREATE TABLE IF NOT EXISTS core_tag_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE,
created_by_id INTEGER,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE
);
-- Copy data from old table if it exists
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
SELECT id, name, slug
FROM core_tag
WHERE EXISTS (SELECT 1 FROM sqlite_master WHERE type='table' AND name='core_tag');
DROP TABLE IF EXISTS core_tag;
ALTER TABLE core_tag_new RENAME TO core_tag;
CREATE INDEX IF NOT EXISTS core_tag_created_at_idx ON core_tag(created_at);
CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);
-- core_snapshot_tags table already exists in v0.7.2, no changes needed
""",
# Reverse SQL (best effort - data loss may occur)
reverse_sql="""
-- This is a best-effort rollback - data in new fields will be lost
SELECT 'Migration 0023 cannot be fully reversed - new fields will be lost';
"""
),
]
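
Since this file rebuilds three tables with hand-written SQL, a cheap follow-up is to let SQLite verify the result. A minimal sketch; the `data/index.sqlite3` path is an assumption about where the index lives, not something this commit defines:

# Sketch: post-rebuild sanity checks on the SQLite index.
import sqlite3

con = sqlite3.connect('data/index.sqlite3')  # path is an assumption
print(con.execute('PRAGMA foreign_key_check').fetchall())  # [] means no FK violations
print(con.execute('PRAGMA integrity_check').fetchone())    # ('ok',) means the file is healthy
con.close()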

@@ -0,0 +1,118 @@
# Generated by hand on 2025-12-29
# Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL
from django.db import migrations
def create_default_crawl_and_assign_snapshots(apps, schema_editor):
"""
Create a default crawl for migrated snapshots and assign all snapshots without a crawl to it.
Uses raw SQL because the app registry isn't fully populated during migrations.
"""
from django.db import connection
import uuid as uuid_lib
from datetime import datetime
cursor = connection.cursor()
# Check if there are any snapshots without a crawl
cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE crawl_id IS NULL")
snapshots_without_crawl = cursor.fetchone()[0]
if snapshots_without_crawl == 0:
print('✓ Fresh install or all snapshots already have crawls')
return
# Get or create system user (pk=1)
cursor.execute("SELECT id FROM auth_user WHERE id = 1")
if not cursor.fetchone():
cursor.execute("""
INSERT INTO auth_user (id, password, is_superuser, username, first_name, last_name, email, is_staff, is_active, date_joined)
VALUES (1, '!', 1, 'system', '', '', '', 1, 1, ?)
""", [datetime.now().isoformat()])
# Create a default crawl for migrated snapshots
crawl_id = str(uuid_lib.uuid4())
now = datetime.now().isoformat()
cursor.execute("""
INSERT INTO crawls_crawl (
id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
urls, max_depth, tags_str, label, notes, output_dir,
status, retry_at, created_by_id, schedule_id, config, persona_id
) VALUES (?, ?, ?, 0, 0, '', 0, '', 'Migrated from v0.7.2',
'Auto-created crawl for snapshots migrated from v0.7.2', '',
'sealed', ?, 1, NULL, '{}', NULL)
""", [crawl_id, now, now, now])
# Assign all snapshots without a crawl to the default crawl
cursor.execute("UPDATE core_snapshot SET crawl_id = ? WHERE crawl_id IS NULL", [crawl_id])
print(f'✓ Assigned {snapshots_without_crawl} snapshots to default crawl {crawl_id}')
class Migration(migrations.Migration):
dependencies = [
('core', '0023_upgrade_to_0_9_0'),
('crawls', '0001_initial'),
('auth', '0012_alter_user_first_name_max_length'),
]
operations = [
migrations.RunPython(
create_default_crawl_and_assign_snapshots,
reverse_code=migrations.RunPython.noop,
),
# Now make crawl_id NOT NULL
migrations.RunSQL(
sql="""
-- Rebuild snapshot table with NOT NULL crawl_id
CREATE TABLE core_snapshot_final (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
crawl_id TEXT NOT NULL,
parent_snapshot_id TEXT,
title VARCHAR(512),
downloaded_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
);
INSERT INTO core_snapshot_final SELECT * FROM core_snapshot;
DROP TABLE core_snapshot;
ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
""",
reverse_sql=migrations.RunSQL.noop,
),
]
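
A quick way to confirm the backfill worked is to count orphaned snapshots before the NOT NULL rebuild runs; after create_default_crawl_and_assign_snapshots, this should be zero. A minimal sketch under the same database-path assumption as above:

# Sketch: no snapshot should be left without a crawl once the backfill has run.
import sqlite3

con = sqlite3.connect('data/index.sqlite3')  # path is an assumption
orphans = con.execute('SELECT COUNT(*) FROM core_snapshot WHERE crawl_id IS NULL').fetchone()[0]
assert orphans == 0, f'{orphans} snapshots still have no crawl'
con.close()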

@@ -1,57 +0,0 @@
# Data migration to clear config fields that may contain invalid JSON
# This runs before 0025 to prevent CHECK constraint failures
from django.db import migrations
def clear_config_fields(apps, schema_editor):
"""Clear all config fields in related tables to avoid JSON validation errors."""
# Disable foreign key checks temporarily to allow updates
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=OFF")
tables_to_clear = [
('crawls_seed', 'config'),
('crawls_crawl', 'config'),
('crawls_crawlschedule', 'config'),  # table existence is checked in the loop below
('machine_machine', 'stats'),
('machine_machine', 'config'),
]
for table_name, field_name in tables_to_clear:
try:
with schema_editor.connection.cursor() as cursor:
# Check if table exists first
cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
if not cursor.fetchone():
print(f" Skipping {table_name}.{field_name}: table does not exist")
continue
# Set all to empty JSON object
cursor.execute(f"UPDATE {table_name} SET {field_name} = '{{}}' WHERE {field_name} IS NOT NULL")
print(f" Cleared {field_name} in {table_name}: {cursor.rowcount} rows")
except Exception as e:
print(f" Skipping {table_name}.{field_name}: {e}")
# Re-enable foreign key checks
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=ON")
class Migration(migrations.Migration):
dependencies = [
('core', '0023_new_schema'),
('crawls', '0001_initial'),
('machine', '0001_squashed'),
]
operations = [
migrations.RunPython(clear_config_fields, reverse_code=migrations.RunPython.noop),
]

@@ -1,28 +0,0 @@
# Disable foreign key checks before 0025 to prevent CHECK constraint validation errors
from django.db import migrations
def disable_fk_checks(apps, schema_editor):
"""Temporarily disable foreign key checks."""
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=OFF")
print(" Disabled foreign key checks")
def enable_fk_checks(apps, schema_editor):
"""Re-enable foreign key checks."""
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=ON")
print(" Enabled foreign key checks")
class Migration(migrations.Migration):
dependencies = [
('core', '0024_b_clear_config_fields'),
]
operations = [
migrations.RunPython(disable_fk_checks, reverse_code=enable_fk_checks),
]
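
One caveat: SQLite silently ignores changes to PRAGMA foreign_keys while a transaction is open, and Django wraps each migration in a transaction on SQLite by default, so the toggle above may be a no-op unless the migration opts out of atomicity. A sketch of the opt-out, reusing the helpers defined above (an assumption about intent, not part of this commit):

# Sketch: PRAGMA foreign_keys cannot change while a transaction is open,
# so a migration toggling it must opt out of Django's per-migration transaction.
from django.db import migrations

class Migration(migrations.Migration):
    atomic = False  # run outside a transaction so the PRAGMA actually applies
    dependencies = [('core', '0024_b_clear_config_fields')]
    operations = [migrations.RunPython(disable_fk_checks, reverse_code=enable_fk_checks)]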

@@ -1,93 +0,0 @@
# Fix crawls_crawl config field to avoid CHECK constraint errors during table rebuilds
from django.db import migrations
def fix_crawls_config(apps, schema_editor):
"""
Rebuild crawls_crawl table to fix CHECK constraints and make seed_id nullable.
Only runs for UPGRADES from 0.8.x (when crawls.0001_initial didn't exist yet).
For fresh installs, crawls.0001_initial creates the correct schema.
"""
with schema_editor.connection.cursor() as cursor:
# Check if this is an upgrade from old 0.8.x or a fresh install
# In fresh installs, crawls.0001_initial was applied, creating seed FK
# In upgrades, the table was created by old migrations before 0001_initial existed
cursor.execute("""
SELECT COUNT(*) FROM django_migrations
WHERE app='crawls' AND name='0001_initial'
""")
has_crawls_0001 = cursor.fetchone()[0] > 0
if has_crawls_0001:
# Fresh install - crawls.0001_initial already created the correct schema
# Just clear config to avoid CHECK constraint issues
print(" Fresh install detected - clearing config field only")
try:
cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
except Exception as e:
print(f" Skipping config clear: {e}")
return
# Upgrade from 0.8.x - rebuild table to make seed_id nullable and remove CHECK constraint
print(" Upgrading from 0.8.x - rebuilding crawls_crawl table")
cursor.execute("PRAGMA foreign_keys=OFF")
# Backup
cursor.execute("CREATE TABLE crawls_crawl_backup AS SELECT * FROM crawls_crawl")
# Recreate without config CHECK constraint, with nullable seed_id
cursor.execute("DROP TABLE crawls_crawl")
cursor.execute("""
CREATE TABLE "crawls_crawl" (
"num_uses_failed" integer unsigned NOT NULL CHECK ("num_uses_failed" >= 0),
"num_uses_succeeded" integer unsigned NOT NULL CHECK ("num_uses_succeeded" >= 0),
"id" char(32) NOT NULL PRIMARY KEY,
"created_at" datetime NOT NULL,
"modified_at" datetime NOT NULL,
"urls" text NOT NULL,
"config" text,
"max_depth" smallint unsigned NOT NULL CHECK ("max_depth" >= 0),
"tags_str" varchar(1024) NOT NULL,
"persona_id" char(32) NULL,
"label" varchar(64) NOT NULL,
"notes" text NOT NULL,
"output_dir" varchar(512) NOT NULL,
"status" varchar(15) NOT NULL,
"retry_at" datetime NULL,
"created_by_id" integer NOT NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED,
"seed_id" char(32) NULL DEFAULT NULL,
"schedule_id" char(32) NULL REFERENCES "crawls_crawlschedule" ("id") DEFERRABLE INITIALLY DEFERRED
)
""")
# Restore data
cursor.execute("""
INSERT INTO "crawls_crawl" (
"num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
"urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
"output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
)
SELECT
"num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
"urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
"output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
FROM crawls_crawl_backup
""")
cursor.execute("DROP TABLE crawls_crawl_backup")
# NULL out config to avoid any invalid JSON
cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
class Migration(migrations.Migration):
dependencies = [
('core', '0024_c_disable_fk_checks'),
('crawls', '0001_initial'),
]
operations = [
migrations.RunPython(fix_crawls_config, reverse_code=migrations.RunPython.noop),
]

@@ -1,38 +0,0 @@
# Generated by Django 5.0.6 on 2024-12-25
# Adds crawl FK and iface FK after crawls and machine apps are created
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('core', '0024_d_fix_crawls_config'),
]
operations = [
# Add crawl FK to Snapshot
migrations.AddField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(
default=None, null=True, blank=True,
on_delete=django.db.models.deletion.CASCADE,
related_name='snapshot_set',
to='crawls.crawl',
db_index=True,
),
),
# Add network interface FK to ArchiveResult
migrations.AddField(
model_name='archiveresult',
name='iface',
field=models.ForeignKey(
null=True, blank=True,
on_delete=django.db.models.deletion.SET_NULL,
to='machine.networkinterface',
),
),
]

@@ -1,22 +0,0 @@
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0024_snapshot_crawl'),
]
operations = [
# Remove the unique constraint on url
migrations.AlterField(
model_name='snapshot',
name='url',
field=models.URLField(db_index=True, unique=False),
),
# Add unique constraint on (url, crawl) combination
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
),
]
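
Worth noting: while crawl remains nullable at this point in the history, the constraint is looser than it looks, because SQL unique indexes treat NULLs as distinct. Two snapshots with the same URL and no crawl would both be accepted, as this throwaway illustration shows (a toy table, not this schema):

# Sketch: duplicate (url, NULL) rows pass a UNIQUE (url, crawl_id) constraint.
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE t (url TEXT, crawl_id TEXT, UNIQUE (url, crawl_id))')
con.execute("INSERT INTO t VALUES ('https://example.com', NULL)")
con.execute("INSERT INTO t VALUES ('https://example.com', NULL)")  # no IntegrityError
print(con.execute('SELECT COUNT(*) FROM t').fetchone())  # (2,)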

@@ -1,145 +0,0 @@
# Generated by Django 6.0 on 2025-12-25 09:34
import archivebox.base_models.models
import django.db.models.deletion
import django.utils.timezone
from archivebox import uuid_compat
from django.conf import settings
from django.db import migrations, models
def populate_archiveresult_uuids(apps, schema_editor):
"""Generate unique UUIDs for ArchiveResults that don't have one."""
# Check if uuid column exists before trying to populate it
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA table_info(core_archiveresult)")
columns = [row[1] for row in cursor.fetchall()]
if 'uuid' not in columns:
return # uuid column doesn't exist, skip this data migration
ArchiveResult = apps.get_model('core', 'ArchiveResult')
for result in ArchiveResult.objects.filter(uuid__isnull=True):
result.uuid = uuid_compat.uuid7()
result.save(update_fields=['uuid'])
def reverse_populate_uuids(apps, schema_editor):
"""Reverse migration - do nothing, UUIDs can stay."""
pass
def remove_output_dir_if_exists(apps, schema_editor):
"""Remove output_dir columns if they exist."""
with schema_editor.connection.cursor() as cursor:
# Check and remove from core_archiveresult
cursor.execute("PRAGMA table_info(core_archiveresult)")
columns = [row[1] for row in cursor.fetchall()]
if 'output_dir' in columns:
cursor.execute("ALTER TABLE core_archiveresult DROP COLUMN output_dir")
# Check and remove from core_snapshot
cursor.execute("PRAGMA table_info(core_snapshot)")
columns = [row[1] for row in cursor.fetchall()]
if 'output_dir' in columns:
cursor.execute("ALTER TABLE core_snapshot DROP COLUMN output_dir")
class Migration(migrations.Migration):
dependencies = [
('core', '0025_allow_duplicate_urls_per_crawl'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
# FIRST: Populate UUIDs for existing NULL rows BEFORE any schema changes
migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids),
# Remove output_dir fields (not needed, computed from snapshot)
migrations.RunPython(remove_output_dir_if_exists, reverse_code=migrations.RunPython.noop),
# Update Django's migration state to match 0.9.x schema
# Database already has correct types from 0.8.x, just update state
migrations.SeparateDatabaseAndState(
state_operations=[
# Archiveresult field alterations
migrations.AlterField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(db_index=True, max_length=32),
),
# Convert id from AutoField to UUIDField (database already has UUID CHAR(32))
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
# Snapshot field alterations
migrations.AlterField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
],
database_operations=[
# No actual database changes needed - schema is already correct from 0.8.x
],
),
# SnapshotTag and Tag alterations - state only, DB already correct
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
),
],
database_operations=[],
),
]
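
Because every schema operation above is state-only, a useful invariant afterwards is that the state Django derives from the migration history still matches the models. A minimal sketch of the check, using the documented makemigrations flags (exits non-zero if new migrations would be generated):

# Sketch: verify the state-only operations above left no model/state drift.
import subprocess

subprocess.run(['python', 'manage.py', 'makemigrations', 'core', '--check', '--dry-run'], check=True)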

@@ -1,29 +0,0 @@
# Generated by Django 6.0 on 2025-12-27 01:40
import archivebox.base_models.models
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0026_remove_archiveresult_output_dir_and_more'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
# Note: Cannot alter M2M tags field via migration (Django limitation)
# The related_name change is handled by the model definition itself
]

@@ -1,47 +0,0 @@
# Generated by Claude Code on 2025-12-27
from django.db import migrations, models
def set_existing_snapshots_to_old_version(apps, schema_editor):
"""Set existing snapshots to 0.8.0 since they use the old filesystem layout."""
Snapshot = apps.get_model('core', 'Snapshot')
# Set all existing snapshots to 0.8.0 (the previous version's layout)
Snapshot.objects.all().update(fs_version='0.8.0')
def reverse_migration(apps, schema_editor):
"""Reverse migration - do nothing."""
pass
class Migration(migrations.Migration):
dependencies = [
('core', '0027_alter_archiveresult_created_by_and_more'),
]
operations = [
# Add field with temporary default to allow NULL initially
migrations.AddField(
model_name='snapshot',
name='fs_version',
field=models.CharField(
max_length=10,
default='0.8.0', # Temporary default for adding the column
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
),
),
# Set existing snapshots to old version
migrations.RunPython(set_existing_snapshots_to_old_version, reverse_migration),
# Update default to current version for new snapshots going forward
migrations.AlterField(
model_name='snapshot',
name='fs_version',
field=models.CharField(
max_length=10,
default='0.9.0', # Hardcoded for this migration - new migration when version bumps
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
),
),
]

@@ -1,91 +0,0 @@
# Generated by Django for hook architecture support
# Phase 1: Add new ArchiveResult fields for hook output
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('core', '0028_snapshot_fs_version'),
('machine', '0002_rename_custom_cmds_to_overrides'),
]
operations = [
# Add new output fields using SeparateDatabaseAndState to avoid table rebuilds
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(
blank=True,
default='',
help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(
null=True,
blank=True,
default=None,
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(
default=dict,
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(
default=0,
help_text='Total recursive size in bytes of all output files'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(
max_length=512,
blank=True,
default='',
help_text='CSV of mimetypes sorted by size descending'
),
),
migrations.AddField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(
'machine.Binary',
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name='archiveresults',
help_text='Primary binary used by this hook (optional)'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_archiveresult ADD COLUMN output_str TEXT DEFAULT '';
ALTER TABLE core_archiveresult ADD COLUMN output_json TEXT;
ALTER TABLE core_archiveresult ADD COLUMN output_files TEXT DEFAULT '{}';
ALTER TABLE core_archiveresult ADD COLUMN output_size BIGINT DEFAULT 0;
ALTER TABLE core_archiveresult ADD COLUMN output_mimetypes VARCHAR(512) DEFAULT '';
ALTER TABLE core_archiveresult ADD COLUMN binary_id CHAR(32) REFERENCES machine_binary(id);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]
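
The database_operations half above uses plain ADD COLUMN statements because SQLite implements those as metadata-only changes, unlike most other ALTERs, which would force exactly the table rebuild this commit works to avoid. A small sketch to confirm the columns landed, under the same database-path assumption as earlier:

# Sketch: confirm the new hook-output columns exist after migrating.
import sqlite3

con = sqlite3.connect('data/index.sqlite3')  # path is an assumption
cols = {row[1] for row in con.execute('PRAGMA table_info(core_archiveresult)')}
print({'output_str', 'output_json', 'output_files'} <= cols)  # True after migrating
con.close()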

@@ -1,83 +0,0 @@
# Generated by Django for hook architecture support
# Phase 1: Migrate existing 'output' field to new split fields
from django.db import migrations
import json
def migrate_output_field(apps, schema_editor):
"""
Migrate existing 'output' field to new split fields.
Logic:
- If output contains JSON {...}, move to output_json
- Otherwise, move to output_str
Use raw SQL to avoid CHECK constraint issues during migration.
"""
# Use raw SQL to migrate data without triggering CHECK constraints
with schema_editor.connection.cursor() as cursor:
# Get all archive results
cursor.execute("""
SELECT id, output FROM core_archiveresult
""")
for row in cursor.fetchall():
ar_id, old_output = row
old_output = old_output or ''
# Case 1: JSON output
if old_output.strip().startswith('{'):
try:
# Validate it's actual JSON
parsed = json.loads(old_output)
# Update with JSON - cast to JSON to satisfy CHECK constraint
json_str = json.dumps(parsed)
cursor.execute("""
UPDATE core_archiveresult
SET output_str = '', output_json = json(?)
WHERE id = ?
""", (json_str, ar_id))
except json.JSONDecodeError:
# Not valid JSON, treat as string
cursor.execute("""
UPDATE core_archiveresult
SET output_str = ?, output_json = NULL
WHERE id = ?
""", (old_output, ar_id))
# Case 2: File path or plain string
else:
cursor.execute("""
UPDATE core_archiveresult
SET output_str = ?, output_json = NULL
WHERE id = ?
""", (old_output, ar_id))
def reverse_migrate(apps, schema_editor):
"""Reverse migration - copy output_str back to output."""
ArchiveResult = apps.get_model('core', 'ArchiveResult')
for ar in ArchiveResult.objects.all().iterator():
if ar.output_json:
ar.output = json.dumps(ar.output_json)
else:
ar.output = ar.output_str or ''
ar.save(update_fields=['output'])
class Migration(migrations.Migration):
dependencies = [
('core', '0029_archiveresult_hook_fields'),
]
operations = [
migrations.RunPython(migrate_output_field, reverse_migrate),
# Now safe to remove old 'output' field
migrations.RemoveField(
model_name='archiveresult',
name='output',
),
]
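
The split heuristic above only routes values that parse as JSON objects into output_json; everything else, including bare file paths, stays in output_str. A pure-Python illustration of the same decision rule (a standalone mirror of migrate_output_field, for clarity only):

# Sketch: the same JSON-vs-string routing used by migrate_output_field.
import json

def route_output(old_output):
    old_output = old_output or ''
    if old_output.strip().startswith('{'):
        try:
            return ('output_json', json.loads(old_output))
        except json.JSONDecodeError:
            pass  # not valid JSON, fall through to plain string
    return ('output_str', old_output)

print(route_output('{"status": 200}'))    # ('output_json', {'status': 200})
print(route_output('archive/warc/x.gz'))  # ('output_str', 'archive/warc/x.gz')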

@@ -1,27 +0,0 @@
# Generated by Django 6.0 on 2025-12-27
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0030_migrate_output_field'),
]
operations = [
migrations.AddField(
model_name='snapshot',
name='parent_snapshot',
field=models.ForeignKey(
blank=True,
db_index=True,
help_text='Parent snapshot that discovered this URL (for recursive crawling)',
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name='child_snapshots',
to='core.snapshot'
),
),
]

@@ -1,77 +0,0 @@
# Generated by Django 6.0 on 2025-12-28 05:12
import django.db.models.deletion
from archivebox import uuid_compat
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0031_snapshot_parent_snapshot'),
('crawls', '0004_alter_crawl_output_dir'),
('machine', '0004_drop_dependency_table'), # Changed from 0003 - wait until Dependency is dropped
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
# Update Django's state only - database already has correct schema from 0029
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AlterField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
),
],
database_operations=[
# No database changes needed - columns already exist with correct types
],
),
# Add unique constraint without table rebuild
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
),
],
database_operations=[
migrations.RunSQL(
sql="CREATE UNIQUE INDEX IF NOT EXISTS unique_timestamp ON core_snapshot (timestamp);",
reverse_sql="DROP INDEX IF EXISTS unique_timestamp;",
),
],
),
]

@@ -1,44 +0,0 @@
# Generated by Django 6.0 on 2025-12-28
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0032_alter_archiveresult_binary_and_more'),
]
operations = [
# Use SeparateDatabaseAndState to avoid table rebuilds that would re-add CHECK constraints
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.RenameField(
model_name='archiveresult',
old_name='extractor',
new_name='plugin',
),
migrations.AddField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(
blank=True,
default='',
max_length=255,
db_index=True,
help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_archiveresult RENAME COLUMN extractor TO plugin;
ALTER TABLE core_archiveresult ADD COLUMN hook_name VARCHAR(255) DEFAULT '' NOT NULL;
CREATE INDEX IF NOT EXISTS core_archiveresult_hook_name_idx ON core_archiveresult (hook_name);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

@@ -1,37 +0,0 @@
# Generated by Django 6.0 on 2025-12-28
# Add Snapshot.current_step field for hook step-based execution
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0033_rename_extractor_add_hook_name'),
]
operations = [
# Use SeparateDatabaseAndState to avoid table rebuild that would fail on config NOT NULL constraint
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(
default=0,
db_index=True,
help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_snapshot ADD COLUMN current_step SMALLINT UNSIGNED DEFAULT 0 NOT NULL;
CREATE INDEX IF NOT EXISTS core_snapshot_current_step_idx ON core_snapshot (current_step);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

@@ -1,87 +0,0 @@
# Generated migration
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
def create_catchall_crawls_and_assign_snapshots(apps, schema_editor):
"""
Create one catchall Crawl per user for all snapshots without a crawl.
Assign those snapshots to their user's catchall crawl.
"""
Snapshot = apps.get_model('core', 'Snapshot')
Crawl = apps.get_model('crawls', 'Crawl')
User = apps.get_model(settings.AUTH_USER_MODEL)
# Get all snapshots without a crawl
snapshots_without_crawl = Snapshot.objects.filter(crawl__isnull=True)
if not snapshots_without_crawl.exists():
return
# Group by created_by_id
snapshots_by_user = {}
for snapshot in snapshots_without_crawl:
user_id = snapshot.created_by_id
if user_id not in snapshots_by_user:
snapshots_by_user[user_id] = []
snapshots_by_user[user_id].append(snapshot)
# Create one catchall crawl per user and assign snapshots
for user_id, snapshots in snapshots_by_user.items():
try:
user = User.objects.get(pk=user_id)
username = user.username
except User.DoesNotExist:
username = 'unknown'
# Create catchall crawl for this user
crawl = Crawl.objects.create(
urls=f'# Catchall crawl for {len(snapshots)} snapshots without a crawl',
max_depth=0,
label=f'[migration] catchall for user {username}',
created_by_id=user_id,
)
# Assign all snapshots to this crawl
for snapshot in snapshots:
snapshot.crawl = crawl
snapshot.save(update_fields=['crawl'])
class Migration(migrations.Migration):
dependencies = [
('core', '0034_snapshot_current_step'),
('crawls', '0005_drop_seed_id_column'),
]
operations = [
# Step 1: Assign all snapshots without a crawl to catchall crawls
migrations.RunPython(
create_catchall_crawls_and_assign_snapshots,
reverse_code=migrations.RunPython.noop,
),
# Step 2 & 3: Update Django's state only - leave created_by_id column in database (unused but harmless)
migrations.SeparateDatabaseAndState(
state_operations=[
# Make crawl non-nullable
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
),
# Remove created_by field from Django's state
migrations.RemoveField(
model_name='snapshot',
name='created_by',
),
],
database_operations=[
# No database changes - crawl_id already exists and NOT NULL constraint will be enforced by model
# created_by_id column remains in database but is unused
],
),
]

@@ -1,27 +0,0 @@
# Generated migration
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
]
operations = [
# Remove created_by field from ArchiveResult (state only)
# No data migration needed - created_by can be accessed via snapshot.crawl.created_by
# Leave created_by_id column in database (unused but harmless, avoids table rebuild)
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.RemoveField(
model_name='archiveresult',
name='created_by',
),
],
database_operations=[
# No database changes - leave created_by_id column in place to avoid table rebuild
],
),
]

@@ -1,44 +0,0 @@
# Generated by Django 6.0 on 2025-12-29 06:45
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0036_remove_archiveresult_created_by'),
]
operations = [
# Update Django's state only - database columns remain for backwards compat
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.RemoveField(
model_name='archiveresult',
name='output_dir',
),
migrations.RemoveField(
model_name='snapshot',
name='output_dir',
),
migrations.AlterField(
model_name='archiveresult',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
),
],
database_operations=[
# No database changes - columns remain in place to avoid table rebuilds
],
),
]

@@ -1,84 +0,0 @@
# Add missing columns to ArchiveResult and remove created_by_id from Snapshot
from django.db import migrations, models, connection
import django.utils.timezone
def add_columns_if_not_exist(apps, schema_editor):
"""Add columns to ArchiveResult only if they don't already exist."""
with connection.cursor() as cursor:
# Get existing columns
cursor.execute("PRAGMA table_info(core_archiveresult)")
existing_columns = {row[1] for row in cursor.fetchall()}
# Add num_uses_failed if it doesn't exist
if 'num_uses_failed' not in existing_columns:
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_failed integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_failed >= 0)")
# Add num_uses_succeeded if it doesn't exist
if 'num_uses_succeeded' not in existing_columns:
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_succeeded integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_succeeded >= 0)")
# Add config if it doesn't exist
if 'config' not in existing_columns:
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN config text NULL")
# Add retry_at if it doesn't exist
if 'retry_at' not in existing_columns:
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN retry_at datetime NULL")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
class Migration(migrations.Migration):
dependencies = [
('core', '0037_remove_archiveresult_output_dir_and_more'),
]
operations = [
# Add missing columns to ArchiveResult
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddField(
model_name='archiveresult',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
],
database_operations=[
migrations.RunPython(add_columns_if_not_exist, reverse_code=migrations.RunPython.noop),
],
),
# Drop created_by_id from Snapshot (database only, already removed from model in 0035)
migrations.SeparateDatabaseAndState(
state_operations=[
# No state changes - field already removed in 0035
],
database_operations=[
migrations.RunSQL(
sql="""
-- Drop index first, then column
DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149;
ALTER TABLE core_snapshot DROP COLUMN created_by_id;
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]
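
Note that ALTER TABLE ... DROP COLUMN only exists in SQLite 3.35+ (and the RENAME COLUMN used earlier needs 3.25+), so this migration implicitly assumes a reasonably recent runtime. A minimal guard sketch:

# Sketch: fail loudly on SQLite builds too old for ALTER TABLE ... DROP COLUMN.
import sqlite3

assert sqlite3.sqlite_version_info >= (3, 35, 0), (
    f'SQLite {sqlite3.sqlite_version} is too old for DROP COLUMN')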

@@ -1,30 +0,0 @@
# Reset num_uses_failed, num_uses_succeeded, and depth values that were stored as text instead of integers
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0038_fix_missing_columns'),
]
operations = [
# Fix string values that got inserted as literals instead of integers
migrations.RunSQL(
sql="""
UPDATE core_snapshot
SET num_uses_failed = 0
WHERE typeof(num_uses_failed) = 'text' OR num_uses_failed = 'num_uses_failed';
UPDATE core_snapshot
SET num_uses_succeeded = 0
WHERE typeof(num_uses_succeeded) = 'text' OR num_uses_succeeded = 'num_uses_succeeded';
UPDATE core_snapshot
SET depth = 0
WHERE typeof(depth) = 'text' OR depth = 'depth';
""",
reverse_sql=migrations.RunSQL.noop,
),
]
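
To check whether any mistyped values survive this cleanup, SQLite's typeof() can group rows by storage class; after the migration only 'integer' should remain. A minimal sketch, under the same database-path assumption as earlier:

# Sketch: group a counter column by SQLite storage class to spot stragglers.
import sqlite3

con = sqlite3.connect('data/index.sqlite3')  # path is an assumption
rows = con.execute(
    'SELECT typeof(num_uses_failed), COUNT(*) FROM core_snapshot GROUP BY 1'
).fetchall()
print(rows)  # expect only ('integer', N) after this migration
con.close()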