cleanup migrations, json, jsonl

This commit is contained in:
Nick Sweeting
2025-12-31 15:36:13 -08:00
parent 0930911a15
commit a04e4a7345
21 changed files with 993 additions and 1418 deletions

View File

@@ -252,8 +252,8 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp', 'process__cmd')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
fieldsets = (
@@ -270,7 +270,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
'classes': ('card',),
}),
('Command', {
'fields': ('process', 'cmd', 'cmd_str', 'cmd_version', 'pwd'),
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
'classes': ('card',),
}),
('Output', {

View File

@@ -1,299 +1,250 @@
# Generated by hand on 2025-12-29
# Upgrades core app from v0.7.2 (migration 0022) or v0.8.6rc0 (migration 0076) to v0.9.0 using raw SQL
# Upgrades core app from v0.7.2/v0.8.6rc0 (migration 0022) to v0.9.0 using raw SQL
# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0
from django.db import migrations
from django.db import migrations, models, connection
def upgrade_from_v072_or_v086(apps, schema_editor):
"""
Upgrade core tables from either v0.7.2 or v0.8.6rc0 to v0.9.0.
Handles differences in schema between versions.
"""
with schema_editor.connection.cursor() as cursor:
# Check if uuid column exists (v0.7.2 has it, v0.8.6rc0 doesn't)
def get_table_columns(table_name):
"""Get list of column names for a table."""
cursor = connection.cursor()
cursor.execute(f"PRAGMA table_info({table_name})")
return {row[1] for row in cursor.fetchall()}
def upgrade_core_tables(apps, schema_editor):
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
cursor = connection.cursor()
# Check if core_archiveresult table exists
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'")
if not cursor.fetchone():
# Fresh install - no migration needed, tables will be created by later migrations
return
# Detect which version we're migrating from
archiveresult_cols = get_table_columns('core_archiveresult')
has_uuid = 'uuid' in archiveresult_cols
has_abid = 'abid' in archiveresult_cols
# ============================================================================
# PART 1: Upgrade core_archiveresult table
# ============================================================================
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_archiveresult_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
snapshot_id TEXT NOT NULL,
plugin VARCHAR(32) NOT NULL DEFAULT '',
hook_name VARCHAR(255) NOT NULL DEFAULT '',
cmd TEXT,
pwd VARCHAR(256),
cmd_version VARCHAR(128),
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
output_files TEXT NOT NULL DEFAULT '{}',
output_json TEXT,
output_str TEXT NOT NULL DEFAULT '',
output_size INTEGER NOT NULL DEFAULT 0,
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
config TEXT,
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE
);
""")
if has_uuid and not has_abid:
# Migrating from v0.7.2 (has uuid, minimal fields)
print('Migrating ArchiveResult from v0.7.2 schema...')
cursor.execute("""
SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='uuid'
""")
has_uuid = cursor.fetchone()[0] > 0
# Check if id is INTEGER (v0.7.2) or TEXT/char (v0.8.6rc0)
cursor.execute("""
SELECT type FROM pragma_table_info('core_archiveresult') WHERE name='id'
""")
id_type = cursor.fetchone()[0] if cursor.rowcount else 'INTEGER'
is_v072 = 'INT' in id_type.upper()
# ============================================================================
# PART 1: Upgrade core_archiveresult table
# ============================================================================
# Create new table with v0.9.0 schema
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_archiveresult_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
snapshot_id TEXT NOT NULL,
plugin VARCHAR(32) NOT NULL DEFAULT '',
hook_name VARCHAR(255) NOT NULL DEFAULT '',
cmd TEXT,
pwd VARCHAR(256),
cmd_version VARCHAR(128),
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
output_files TEXT NOT NULL DEFAULT '{}',
output_json TEXT,
output_str TEXT NOT NULL DEFAULT '',
output_size INTEGER NOT NULL DEFAULT 0,
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
config TEXT,
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
binary_id TEXT,
iface_id TEXT,
process_id TEXT,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
FOREIGN KEY (binary_id) REFERENCES machine_binary(id) ON DELETE SET NULL,
FOREIGN KEY (iface_id) REFERENCES machine_networkinterface(id) ON DELETE SET NULL,
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, created_at, modified_at, snapshot_id, plugin,
cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
)
SELECT
id, uuid,
COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
snapshot_id,
COALESCE(extractor, '') as plugin,
cmd, pwd, cmd_version,
start_ts, end_ts, status,
COALESCE(output, '') as output_str
FROM core_archiveresult;
""")
# Copy data based on source version
if is_v072:
# Coming from v0.7.2: has INTEGER id, has uuid column, has extractor
print(" Migrating from v0.7.2 schema...")
cursor.execute("""
INSERT OR IGNORE INTO core_archiveresult_new (
uuid, created_at, modified_at, snapshot_id, plugin,
cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
)
SELECT
uuid,
COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
snapshot_id,
COALESCE(extractor, '') as plugin,
cmd, pwd, cmd_version,
start_ts, end_ts, status,
COALESCE(output, '') as output_str
FROM core_archiveresult
""")
else:
# Coming from v0.8.6rc0: has TEXT id, no uuid column, has abid
print(" Migrating from v0.8.6rc0 schema...")
cursor.execute("""
INSERT OR IGNORE INTO core_archiveresult_new (
uuid, created_at, modified_at, snapshot_id, plugin,
cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str
)
SELECT
id as uuid,
created_at,
modified_at,
snapshot_id,
COALESCE(extractor, '') as plugin,
cmd, pwd, cmd_version,
start_ts, end_ts, status, retry_at,
COALESCE(output, '') as output_str
FROM core_archiveresult
""")
# Replace old table
cursor.execute("DROP TABLE IF EXISTS core_archiveresult")
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult")
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid)")
# ============================================================================
# PART 2: Upgrade core_snapshot table
# ============================================================================
# Check snapshot schema version
elif has_abid and not has_uuid:
# Migrating from v0.8.6rc0 (has abid, full fields)
print('Migrating ArchiveResult from v0.8.6rc0 schema...')
cursor.execute("""
SELECT COUNT(*) FROM pragma_table_info('core_snapshot') WHERE name='crawl_id'
""")
has_crawl_id = cursor.fetchone()[0] > 0
# Create new table
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_snapshot_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
downloaded_at DATETIME,
url TEXT NOT NULL,
timestamp TEXT NOT NULL,
title TEXT,
crawl_id TEXT,
depth INTEGER NOT NULL DEFAULT 0,
parent_snapshot_id TEXT,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT,
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0
-- Note: crawl_id foreign key will be added in 0024 after assigning crawl_ids
-- FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
-- FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, created_at, modified_at, snapshot_id, plugin,
cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str
)
SELECT
id, abid as uuid,
created_at, modified_at,
snapshot_id,
COALESCE(extractor, '') as plugin,
cmd, pwd, cmd_version,
start_ts, end_ts, status, retry_at,
COALESCE(output, '') as output_str
FROM core_archiveresult;
""")
else:
print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
# Copy snapshot data
if has_crawl_id:
# v0.8.6rc0 schema - already has created_at, modified_at, bookmarked_at
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid);")
# ============================================================================
# PART 2: Upgrade core_snapshot table
# ============================================================================
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_snapshot_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
crawl_id TEXT,
parent_snapshot_id TEXT,
title VARCHAR(512),
downloaded_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
);
""")
# Check if core_snapshot exists (it should)
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'")
if cursor.fetchone():
# Detect which version we're migrating from
snapshot_cols = get_table_columns('core_snapshot')
has_added = 'added' in snapshot_cols
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
if has_added and not has_bookmarked_at:
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.7.2 schema...')
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_new (
id, created_at, modified_at, bookmarked_at, downloaded_at, url, timestamp,
crawl_id, status, retry_at
id, url, timestamp, title, bookmarked_at, created_at, modified_at
)
SELECT
id,
created_at,
modified_at,
bookmarked_at,
downloaded_at,
url, timestamp,
NULLIF(crawl_id, ''),
COALESCE(status, 'queued'),
retry_at
FROM core_snapshot
id, url, timestamp, title,
COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
COALESCE(added, CURRENT_TIMESTAMP) as created_at,
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
FROM core_snapshot;
""")
elif has_bookmarked_at and not has_added:
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.8.6rc0 schema...')
# Check what fields exist
has_status = 'status' in snapshot_cols
has_retry_at = 'retry_at' in snapshot_cols
has_crawl_id = 'crawl_id' in snapshot_cols
# Build column list based on what exists
cols = ['id', 'url', 'timestamp', 'title', 'bookmarked_at', 'created_at', 'modified_at', 'downloaded_at']
if has_crawl_id:
cols.append('crawl_id')
if has_status:
cols.append('status')
if has_retry_at:
cols.append('retry_at')
cursor.execute(f"""
INSERT OR IGNORE INTO core_snapshot_new ({', '.join(cols)})
SELECT {', '.join(cols)}
FROM core_snapshot;
""")
else:
# v0.7.2 schema - will get crawl_id assigned by later migration (0024)
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_new (
id, created_at, modified_at, bookmarked_at, url, timestamp, crawl_id
)
SELECT
id,
COALESCE(added, CURRENT_TIMESTAMP),
COALESCE(updated, added, CURRENT_TIMESTAMP),
COALESCE(added, CURRENT_TIMESTAMP),
url, timestamp,
NULL as crawl_id
FROM core_snapshot
""")
print(f'Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}')
# Replace old table
cursor.execute("DROP TABLE IF EXISTS core_snapshot")
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot")
cursor.execute("DROP TABLE IF EXISTS core_snapshot;")
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;")
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at)")
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_timestamp_idx ON core_snapshot(timestamp);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at);")
cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);")
# ============================================================================
# PART 3: Upgrade core_tag table
# ============================================================================
# ============================================================================
# PART 3: Upgrade core_tag table
# ============================================================================
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_tag_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
# Check if tag id is INTEGER (v0.7.2) or TEXT (v0.8.6rc0)
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE,
created_by_id INTEGER,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE
);
""")
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_tag'")
if cursor.fetchone():
cursor.execute("""
SELECT type FROM pragma_table_info('core_tag') WHERE name='id'
""")
tag_id_type = cursor.fetchone()[0] if cursor.rowcount else 'INTEGER'
tag_id_is_int = 'INT' in tag_id_type.upper()
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_tag_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
created_by_id INTEGER,
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE SET NULL
)
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
SELECT id, name, slug
FROM core_tag;
""")
if tag_id_is_int:
# v0.7.2: Direct copy (INTEGER to INTEGER)
cursor.execute("""
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
SELECT id, name, slug FROM core_tag
""")
else:
# v0.8.6rc0: Need to remap TEXT ids to new INTEGER ids
cursor.execute("SELECT id, name, slug FROM core_tag")
old_tags = cursor.fetchall()
tag_id_mapping = {} # old_text_id -> new_int_id
cursor.execute("DROP TABLE IF EXISTS core_tag;")
cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag;")
for old_id, name, slug in old_tags:
cursor.execute("""
INSERT OR IGNORE INTO core_tag_new (name, slug)
VALUES (?, ?)
""", [name, slug])
cursor.execute("SELECT id FROM core_tag_new WHERE slug = ?", [slug])
new_id = cursor.fetchone()[0]
tag_id_mapping[old_id] = new_id
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_at_idx ON core_tag(created_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);")
cursor.execute("DROP TABLE IF EXISTS core_tag")
cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag")
# Recreate M2M table
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_snapshot_tags_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id TEXT NOT NULL,
tag_id INTEGER NOT NULL,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES core_tag(id) ON DELETE CASCADE,
UNIQUE(snapshot_id, tag_id)
)
""")
if tag_id_is_int:
# Direct copy for v0.7.2
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
SELECT snapshot_id, tag_id FROM core_snapshot_tags
""")
else:
# v0.8.6rc0: Use mapping to convert old TEXT ids to new INTEGER ids
cursor.execute("SELECT snapshot_id, tag_id FROM core_snapshot_tags")
m2m_entries = cursor.fetchall()
for snapshot_id, old_tag_id in m2m_entries:
new_tag_id = tag_id_mapping.get(old_tag_id)
if new_tag_id:
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
VALUES (?, ?)
""", [snapshot_id, new_tag_id])
cursor.execute("DROP TABLE IF EXISTS core_snapshot_tags")
cursor.execute("ALTER TABLE core_snapshot_tags_new RENAME TO core_snapshot_tags")
print('✓ Core tables upgraded to v0.9.0')
class Migration(migrations.Migration):
@@ -301,10 +252,49 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0022_auto_20231023_2008'),
('crawls', '0001_initial'),
('machine', '0001_initial'),
('auth', '0012_alter_user_first_name_max_length'),
]
operations = [
migrations.RunPython(upgrade_from_v072_or_v086, reverse_code=migrations.RunPython.noop),
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunPython(
upgrade_core_tables,
reverse_code=migrations.RunPython.noop,
),
],
state_operations=[
# Remove old ArchiveResult fields
migrations.RemoveField(model_name='archiveresult', name='extractor'),
migrations.RemoveField(model_name='archiveresult', name='output'),
# Remove old Snapshot fields
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
# SnapshotTag table already exists from v0.7.2, just declare it in state
migrations.CreateModel(
name='SnapshotTag',
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('snapshot', models.ForeignKey(to='core.Snapshot', db_column='snapshot_id', on_delete=models.CASCADE)),
('tag', models.ForeignKey(to='core.Tag', db_column='tag_id', on_delete=models.CASCADE)),
],
options={
'db_table': 'core_snapshot_tags',
'unique_together': {('snapshot', 'tag')},
},
),
# Declare that Snapshot.tags M2M already uses through=SnapshotTag (from v0.7.2)
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(
'Tag',
blank=True,
related_name='snapshot_set',
through='SnapshotTag',
through_fields=('snapshot', 'tag'),
),
),
],
),
]

View File

@@ -1,7 +1,7 @@
# Generated by hand on 2025-12-29
# Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL
from django.db import migrations
from django.db import migrations, models
import uuid
@@ -56,8 +56,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0023_upgrade_to_0_9_0'),
('crawls', '0002_upgrade_to_0_9_0'),
('machine', '0001_initial'),
('crawls', '0001_initial'),
('auth', '0012_alter_user_first_name_max_length'),
]
@@ -66,65 +65,80 @@ class Migration(migrations.Migration):
create_default_crawl_and_assign_snapshots,
reverse_code=migrations.RunPython.noop,
),
# Now make crawl_id NOT NULL
migrations.RunSQL(
sql="""
-- Rebuild snapshot table with NOT NULL crawl_id
CREATE TABLE core_snapshot_final (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
migrations.SeparateDatabaseAndState(
database_operations=[
# Now make crawl_id NOT NULL
migrations.RunSQL(
sql="""
-- Rebuild snapshot table with NOT NULL crawl_id
CREATE TABLE core_snapshot_final (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
crawl_id TEXT NOT NULL,
parent_snapshot_id TEXT,
crawl_id TEXT NOT NULL,
parent_snapshot_id TEXT,
title VARCHAR(512),
downloaded_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
title VARCHAR(512),
downloaded_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
);
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
);
INSERT INTO core_snapshot_final (
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
config, notes, num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
)
SELECT
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
FROM core_snapshot;
INSERT INTO core_snapshot_final (
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
config, notes, num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
)
SELECT
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
FROM core_snapshot;
DROP TABLE core_snapshot;
ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
DROP TABLE core_snapshot;
ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
""",
reverse_sql=migrations.RunSQL.noop,
CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
state_operations=[
migrations.AddField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(
on_delete=models.deletion.CASCADE,
to='crawls.crawl',
help_text='Crawl that created this snapshot'
),
),
],
),
]

View File

@@ -0,0 +1,258 @@
# Generated by Django 6.0 on 2025-12-31 23:09
import archivebox.base_models.models
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0024_assign_default_crawl'),
('crawls', '0001_initial'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.AlterModelOptions(
name='archiveresult',
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
),
migrations.AlterModelOptions(
name='snapshot',
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
),
migrations.RemoveField(
model_name='archiveresult',
name='cmd',
),
migrations.RemoveField(
model_name='archiveresult',
name='cmd_version',
),
migrations.RemoveField(
model_name='archiveresult',
name='pwd',
),
migrations.AddField(
model_name='archiveresult',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
),
migrations.AddField(
model_name='archiveresult',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='archiveresult',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AddField(
model_name='archiveresult',
name='plugin',
field=models.CharField(db_index=True, default='', max_length=32),
),
migrations.AddField(
model_name='archiveresult',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='config',
field=models.JSONField(default=dict),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
),
migrations.AddField(
model_name='snapshot',
name='depth',
field=models.PositiveSmallIntegerField(db_index=True, default=0),
),
migrations.AddField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AddField(
model_name='snapshot',
name='fs_version',
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='snapshot',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='snapshot',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='parent_snapshot',
field=models.ForeignKey(blank=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
),
migrations.AddField(
model_name='snapshot',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='snapshot',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
),
migrations.AddField(
model_name='tag',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='tag',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AlterField(
model_name='archiveresult',
name='end_ts',
field=models.DateTimeField(blank=True, default=None, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='start_ts',
field=models.DateTimeField(blank=True, default=None, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
),
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='url',
field=models.URLField(db_index=True),
),
migrations.AlterField(
model_name='tag',
name='slug',
field=models.SlugField(editable=False, max_length=100, unique=True),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('url', 'crawl'), name='unique_url_per_crawl'),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
),
]

View File

@@ -1,484 +0,0 @@
# Generated by hand on 2025-12-29
# Cleans up extra columns from raw SQL migrations and ensures schema matches models
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
import archivebox.base_models.models
def cleanup_extra_columns(apps, schema_editor):
    """
    Create Process records from old cmd/pwd/cmd_version columns and remove those columns.
    This preserves the execution details by moving them to the Process model.

    Steps (all raw SQL — historical models are not usable mid-migration, and the
    pragma_table_info / INSERT OR IGNORE / rename-rebuild idioms are SQLite-specific):
      1. Detect the legacy ``cmd`` column on core_archiveresult (present when
         upgrading from v0.7.2 / v0.8.6rc0; absent on fresh installs).
      2. Ensure one machine_machine row exists, creating a minimal one whose
         column list matches whichever schema version is on disk.
      3. Insert one machine_process row per ArchiveResult and point
         core_archiveresult.process_id at it.
      4. Rebuild core_archiveresult without the legacy columns and recreate
         its indexes.
    """
    with schema_editor.connection.cursor() as cursor:
        # Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0)
        cursor.execute("SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='cmd'")
        has_cmd = cursor.fetchone()[0] > 0

        if has_cmd:
            print(" Migrating cmd/pwd/cmd_version data to Process records...")

            # For each ArchiveResult, create a Process record with cmd/pwd data
            # Note: cmd_version from old schema is not preserved (it's now derived from Binary)
            cursor.execute("""
                SELECT id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status
                FROM core_archiveresult
            """)
            archive_results = cursor.fetchall()

            from archivebox.uuid_compat import uuid7
            from archivebox.base_models.models import get_or_create_system_user_pk

            # Get or create a Machine record
            # NOTE: relies on Django's SQLite cursor returning the underlying
            # cursor from execute(), allowing the chained .fetchone().
            result = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()
            if result:
                machine_id = result[0]
                print(f" Using existing Machine: {machine_id}")
            else:
                # Create a minimal Machine record with raw SQL (can't use model during migration)
                print(" Creating Machine record for Process migration...")
                import platform
                import socket

                # Generate minimal machine data without using the model
                machine_id = str(uuid7())
                guid = f"{socket.gethostname()}-{platform.machine()}"
                hostname = socket.gethostname()

                # Check schema version: which optional columns does machine_machine have?
                cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='config'")
                has_config = cursor.fetchone()[0] > 0
                cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='abid'")
                has_abid = cursor.fetchone()[0] > 0
                cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='num_uses_succeeded'")
                has_num_uses = cursor.fetchone()[0] > 0

                # Insert directly with SQL (use INSERT OR IGNORE in case it already exists)
                if has_config:
                    # v0.9.0+ schema
                    cursor.execute("""
                        INSERT OR IGNORE INTO machine_machine (
                            id, created_at, modified_at,
                            guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
                            os_arch, os_family, os_platform, os_release, os_kernel,
                            stats, config
                        ) VALUES (?, datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}', '{}')
                    """, (
                        machine_id, guid, hostname,
                        platform.machine(), platform.system(), platform.platform(), platform.release()
                    ))
                elif has_abid and has_num_uses:
                    # v0.8.6rc0 schema (has abid and num_uses columns)
                    cursor.execute("""
                        INSERT OR IGNORE INTO machine_machine (
                            id, abid, created_at, modified_at,
                            guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
                            os_arch, os_family, os_platform, os_release, os_kernel,
                            stats, num_uses_failed, num_uses_succeeded
                        ) VALUES (?, '', datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}', 0, 0)
                    """, (
                        machine_id, guid, hostname,
                        platform.machine(), platform.system(), platform.platform(), platform.release()
                    ))
                else:
                    # v0.7.2 or other schema
                    cursor.execute("""
                        INSERT OR IGNORE INTO machine_machine (
                            id, created_at, modified_at,
                            guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
                            os_arch, os_family, os_platform, os_release, os_kernel,
                            stats
                        ) VALUES (?, datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}')
                    """, (
                        machine_id, guid, hostname,
                        platform.machine(), platform.system(), platform.platform(), platform.release()
                    ))

                # Re-query to get the actual id (in case INSERT OR IGNORE skipped it)
                result = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()
                if result:
                    machine_id = result[0]
                    print(f" ✓ Using/Created Machine: {machine_id}")
                else:
                    # INSERT OR IGNORE failed - try again without IGNORE to see the error
                    raise Exception("Failed to create Machine record - machine_machine table is empty after INSERT")

            for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
                # Create Process record mirroring the legacy execution metadata.
                # stdout/stderr/pid/exit_code were never stored pre-0.9.0, so
                # they are left empty/NULL here.
                process_id = str(uuid7())
                cursor.execute("""
                    INSERT INTO machine_process (
                        id, created_at, modified_at,
                        machine_id, binary_id, iface_id,
                        pwd, cmd, env, timeout,
                        pid, exit_code, stdout, stderr,
                        started_at, ended_at, url, status, retry_at
                    ) VALUES (?, datetime('now'), datetime('now'), ?, ?, ?, ?, ?, '{}', 120, NULL, NULL, '', '', ?, ?, '', ?, NULL)
                """, (process_id, machine_id, binary_id, iface_id, pwd or '', cmd or '[]', start_ts, end_ts, status or 'queued'))

                # Update ArchiveResult to point to new Process
                cursor.execute("UPDATE core_archiveresult SET process_id = ? WHERE id = ?", (process_id, ar_id))

            print(f" ✓ Created {len(archive_results)} Process records from ArchiveResult data")

            # Now rebuild table without the extra columns (SQLite cannot DROP
            # multiple columns in place, so: create final table, copy, swap).
            print(" Rebuilding core_archiveresult table...")
            cursor.execute("""
                CREATE TABLE core_archiveresult_final (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    uuid TEXT,
                    created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                    modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                    snapshot_id TEXT NOT NULL,
                    plugin VARCHAR(32) NOT NULL DEFAULT '',
                    hook_name VARCHAR(255) NOT NULL DEFAULT '',
                    start_ts DATETIME,
                    end_ts DATETIME,
                    status VARCHAR(15) NOT NULL DEFAULT 'queued',
                    retry_at DATETIME,
                    output_files TEXT NOT NULL DEFAULT '{}',
                    output_json TEXT,
                    output_str TEXT NOT NULL DEFAULT '',
                    output_size INTEGER NOT NULL DEFAULT 0,
                    output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
                    config TEXT,
                    notes TEXT NOT NULL DEFAULT '',
                    num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                    num_uses_failed INTEGER NOT NULL DEFAULT 0,
                    process_id TEXT NOT NULL,
                    FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
                    FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
                )
            """)

            # Copy data (cmd, pwd, etc. are now in Process records)
            cursor.execute("""
                INSERT INTO core_archiveresult_final SELECT
                    id, uuid, created_at, modified_at,
                    snapshot_id, plugin, hook_name,
                    start_ts, end_ts, status, retry_at,
                    output_files, output_json, output_str, output_size, output_mimetypes,
                    config, notes, num_uses_succeeded, num_uses_failed,
                    process_id
                FROM core_archiveresult
            """)

            # Replace table
            cursor.execute("DROP TABLE core_archiveresult")
            cursor.execute("ALTER TABLE core_archiveresult_final RENAME TO core_archiveresult")

            # Recreate indexes (dropped along with the old table)
            cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
            cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
            cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
            cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
            cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
            cursor.execute("CREATE INDEX core_archiveresult_uuid_idx ON core_archiveresult(uuid)")

            print(" ✓ Cleaned up core_archiveresult schema")
class Migration(migrations.Migration):
    """
    core 0025: clean up the schema left behind by the raw-SQL upgrade migrations.

    Uses SeparateDatabaseAndState because the raw SQL already changed the
    database: ``database_operations`` runs only the data move + table rebuild
    (cleanup_extra_columns), while ``state_operations`` declares every field
    add/remove/alter so Django's in-memory migration state matches what is
    actually on disk — no DDL is emitted for the state side.
    """

    dependencies = [
        ('core', '0024_assign_default_crawl'),
        ('machine', '0005_add_process_table'),
        ('crawls', '0002_upgrade_to_0_9_0'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.SeparateDatabaseAndState(
            database_operations=[
                migrations.RunPython(
                    cleanup_extra_columns,
                    reverse_code=migrations.RunPython.noop,
                ),
            ],
            state_operations=[
                # Tell Django about all the fields that exist after raw SQL migrations

                # ArchiveResult model options
                migrations.AlterModelOptions(
                    name='archiveresult',
                    options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
                ),

                # Remove old fields (data already copied to machine.Process)
                migrations.RemoveField(model_name='archiveresult', name='cmd'),
                migrations.RemoveField(model_name='archiveresult', name='pwd'),
                migrations.RemoveField(model_name='archiveresult', name='cmd_version'),
                migrations.RemoveField(model_name='archiveresult', name='extractor'),
                migrations.RemoveField(model_name='archiveresult', name='output'),
                migrations.RemoveField(model_name='snapshot', name='added'),
                migrations.RemoveField(model_name='snapshot', name='updated'),

                # Add new ArchiveResult fields
                migrations.AddField(
                    model_name='archiveresult',
                    name='plugin',
                    field=models.CharField(blank=True, default='', max_length=32),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='hook_name',
                    field=models.CharField(blank=True, default='', max_length=255),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_str',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_json',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_files',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_size',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='output_mimetypes',
                    field=models.CharField(blank=True, default='', max_length=512),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                # null=True here; tightened to required in a later migration
                migrations.AddField(
                    model_name='archiveresult',
                    name='process',
                    field=models.OneToOneField(null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
                ),

                # Update Snapshot model
                migrations.AlterModelOptions(
                    name='snapshot',
                    options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='bookmarked_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='downloaded_at',
                    field=models.DateTimeField(blank=True, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='crawl',
                    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='depth',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='parent_snapshot',
                    field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='status',
                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='current_step',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='fs_version',
                    field=models.CharField(default='0.9.0', max_length=10),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='config',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),

                # Update Tag model
                migrations.AlterModelOptions(
                    name='tag',
                    options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
                ),
                migrations.AddField(
                    model_name='tag',
                    name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
                ),
                migrations.AddField(
                    model_name='tag',
                    name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='tag',
                    name='created_by',
                    field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
                ),

                # Alter field types
                migrations.AlterField(
                    model_name='archiveresult',
                    name='id',
                    field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='uuid',
                    field=models.UUIDField(blank=True, db_index=True, editable=False, null=True, unique=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='end_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='start_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='status',
                    field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=15),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='id',
                    field=models.CharField(editable=False, max_length=32, primary_key=True, serialize=False, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='timestamp',
                    field=models.CharField(db_index=True, max_length=32, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='url',
                    field=models.URLField(max_length=2048),
                ),
                migrations.AlterField(
                    model_name='tag',
                    name='slug',
                    field=models.SlugField(editable=False, max_length=100, unique=True),
                ),

                # Create M2M model for snapshot tags
                migrations.CreateModel(
                    name='SnapshotTag',
                    fields=[
                        ('id', models.AutoField(primary_key=True, serialize=False, verbose_name='ID')),
                        ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
                        ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
                    ],
                    options={
                        'db_table': 'core_snapshot_tags',
                    },
                ),
                migrations.AlterUniqueTogether(
                    name='snapshottag',
                    unique_together={('snapshot', 'tag')},
                ),

                # Update tags field on Snapshot to use the through model
                migrations.AlterField(
                    model_name='snapshot',
                    name='tags',
                    field=models.ManyToManyField(related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
                ),

                # Add constraints
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
                ),
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
                ),
            ],
        ),
    ]

View File

@@ -1,76 +0,0 @@
# Generated by hand on 2025-12-30
# Final field adjustments to match model definitions exactly
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
    """
    core 0026: final field adjustments so the Django migration state matches
    the model definitions exactly (index/editable/help_text/default tweaks on
    Snapshot, plus explicit db_column declarations on SnapshotTag).
    """

    dependencies = [
        ('core', '0025_cleanup_schema'),
        ('crawls', '0002_upgrade_to_0_9_0'),
    ]

    operations = [
        # Alter Snapshot fields to match model exactly
        migrations.AlterField(
            model_name='snapshot',
            name='id',
            field=models.UUIDField(default=uuid7, editable=False, primary_key=True, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='url',
            field=models.URLField(db_index=True, unique=False),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='downloaded_at',
            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='parent_snapshot',
            field=models.ForeignKey(blank=True, db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='tags',
            field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
        ),
        # Alter SnapshotTag fields
        migrations.AlterField(
            model_name='snapshottag',
            name='id',
            field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
        ),
        migrations.AlterField(
            model_name='snapshottag',
            name='snapshot',
            field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
        ),
        migrations.AlterField(
            model_name='snapshottag',
            name='tag',
            field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
        ),
    ]

View File

@@ -1,108 +0,0 @@
# Generated by Django 6.0 on 2025-12-31 09:04
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
    """
    core 0027 (auto-generated by Django 6.0): sync remaining field options with
    the models — help_texts, db_index flags, widened output_size to BigInteger,
    expanded status choices, uuid7 defaults, and makes ArchiveResult.process
    a required OneToOneField.
    """

    dependencies = [
        ('core', '0026_final_field_adjustments'),
        ('crawls', '0002_upgrade_to_0_9_0'),
        ('machine', '0001_initial'),
    ]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='hook_name',
            field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.AutoField(editable=False, primary_key=True, serialize=False),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_files',
            field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_json',
            field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_mimetypes',
            field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_size',
            field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_str',
            field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='plugin',
            field=models.CharField(db_index=True, default='', max_length=32),
        ),
        # process becomes non-nullable here (every row got one in 0025)
        migrations.AlterField(
            model_name='archiveresult',
            name='process',
            field=models.OneToOneField(help_text='Process execution details for this archive result', on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='status',
            field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='config',
            field=models.JSONField(default=dict),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='crawl',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='current_step',
            field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='depth',
            field=models.PositiveSmallIntegerField(db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='id',
            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshottag',
            name='id',
            field=models.AutoField(primary_key=True, serialize=False),
        ),
    ]

View File

@@ -91,9 +91,9 @@ class Tag(ModelWithSerializers):
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Tag model instance to a JSONL record.
Convert Tag model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
@@ -105,12 +105,12 @@ class Tag(ModelWithSerializers):
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
"""
Create/update Tag from JSONL record.
Create/update Tag from JSON dict.
Args:
record: JSONL record with 'name' field
record: JSON dict with 'name' field
overrides: Optional dict with 'snapshot' to auto-attach tag
Returns:
@@ -982,8 +982,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
processes_seen = set()
with open(index_path, 'w') as f:
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
f.write(json.dumps(self.to_jsonl()) + '\n')
# Write Snapshot record first (to_json includes crawl_id, fs_version)
f.write(json.dumps(self.to_json()) + '\n')
# Write ArchiveResult records with their associated Binary and Process
# Use select_related to optimize queries
@@ -991,15 +991,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Write Binary record if not already written
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
binaries_seen.add(ar.process.binary_id)
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
f.write(json.dumps(ar.process.binary.to_json()) + '\n')
# Write Process record if not already written
if ar.process and ar.process_id not in processes_seen:
processes_seen.add(ar.process_id)
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
f.write(json.dumps(ar.process.to_json()) + '\n')
# Write ArchiveResult record
f.write(json.dumps(ar.to_jsonl()) + '\n')
f.write(json.dumps(ar.to_json()) + '\n')
def read_index_jsonl(self) -> dict:
"""
@@ -1422,9 +1422,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return False
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Snapshot model instance to a JSONL record.
Convert Snapshot model instance to a JSON-serializable dict.
Includes all fields needed to fully reconstruct/identify this snapshot.
"""
from archivebox.config import VERSION
@@ -1445,9 +1445,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
"""
Create/update Snapshot from JSONL record or dict.
Create/update Snapshot from JSON dict.
Unified method that handles:
- ID-based patching: {"id": "...", "title": "new title"}
@@ -2106,8 +2106,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
result['canonical'] = self.canonical_outputs()
return result
def to_json(self, indent: int = 4) -> str:
"""Convert to JSON string"""
def to_json_str(self, indent: int = 4) -> str:
"""Convert to JSON string (legacy method, use to_json() for dict)"""
return to_json(self.to_dict(extended=True), indent=indent)
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
@@ -2284,14 +2284,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
# Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
# Required - every ArchiveResult must have a Process
process = models.OneToOneField(
'machine.Process',
on_delete=models.PROTECT,
null=False, # Required after migration 4
related_name='archiveresult',
help_text='Process execution details for this archive result'
)
# Added POST-v0.9.0, will be added in a separate migration
# process = models.OneToOneField(
# 'machine.Process',
# on_delete=models.PROTECT,
# null=False,
# related_name='archiveresult',
# help_text='Process execution details for this archive result'
# )
# New output fields (replacing old 'output' field)
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
@@ -2326,9 +2326,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
return self.snapshot.crawl.created_by
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert ArchiveResult model instance to a JSONL record.
Convert ArchiveResult model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
record = {
@@ -2360,6 +2360,50 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
record['process_id'] = str(self.process_id)
return record
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
    """
    Resolve an ArchiveResult from a JSON dict, creating one if necessary.

    Lookup order:
      1. By explicit 'id' in the record, if present and it exists.
      2. By (snapshot_id, plugin) via get_or_create, seeding hook_name,
         status, and output_str from the record on creation.

    Args:
        record: JSON dict with 'snapshot_id', 'plugin', etc.
        overrides: Optional dict of field overrides

    Returns:
        ArchiveResult instance, or None when the record lacks the required
        keys or references a Snapshot that does not exist.
    """
    snap_pk = record.get('snapshot_id')
    plugin_name = record.get('plugin')
    if not (snap_pk and plugin_name):
        return None

    # Fast path: an explicit id that already exists wins outright.
    existing_pk = record.get('id')
    if existing_pk:
        try:
            return ArchiveResult.objects.get(id=existing_pk)
        except ArchiveResult.DoesNotExist:
            pass

    # Otherwise fall back to (snapshot, plugin) identity.
    from archivebox.core.models import Snapshot
    try:
        parent_snapshot = Snapshot.objects.get(id=snap_pk)
    except Snapshot.DoesNotExist:
        return None

    creation_defaults = {
        'hook_name': record.get('hook_name', ''),
        'status': record.get('status', 'queued'),
        'output_str': record.get('output_str', ''),
    }
    archiveresult, _created = ArchiveResult.objects.get_or_create(
        snapshot=parent_snapshot,
        plugin=plugin_name,
        defaults=creation_defaults,
    )
    return archiveresult
def save(self, *args, **kwargs):
is_new = self._state.adding