mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
cleanup migrations, json, jsonl
This commit is contained in:
@@ -252,8 +252,8 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp', 'process__cmd')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
autocomplete_fields = ['snapshot']
|
||||
|
||||
fieldsets = (
|
||||
@@ -270,7 +270,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Command', {
|
||||
'fields': ('process', 'cmd', 'cmd_str', 'cmd_version', 'pwd'),
|
||||
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Output', {
|
||||
|
||||
@@ -1,299 +1,250 @@
|
||||
# Generated by hand on 2025-12-29
|
||||
# Upgrades core app from v0.7.2 (migration 0022) or v0.8.6rc0 (migration 0076) to v0.9.0 using raw SQL
|
||||
# Upgrades core app from v0.7.2/v0.8.6rc0 (migration 0022) to v0.9.0 using raw SQL
|
||||
# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import migrations, models, connection
|
||||
|
||||
|
||||
def upgrade_from_v072_or_v086(apps, schema_editor):
|
||||
"""
|
||||
Upgrade core tables from either v0.7.2 or v0.8.6rc0 to v0.9.0.
|
||||
Handles differences in schema between versions.
|
||||
"""
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
# Check if uuid column exists (v0.7.2 has it, v0.8.6rc0 doesn't)
|
||||
def get_table_columns(table_name):
    """Return the set of column names defined on a SQLite table.

    Args:
        table_name: Name of the table to inspect. It is interpolated
            directly into the PRAGMA statement, so it must be a trusted,
            hard-coded identifier and never user-supplied input.

    Returns:
        A set of column-name strings; empty when the PRAGMA yields no rows.
    """
    # Use the cursor as a context manager so it is closed even if the
    # PRAGMA raises, instead of leaving it open for the rest of the migration.
    with connection.cursor() as cursor:
        cursor.execute(f"PRAGMA table_info({table_name})")
        # The column name is the second field (index 1) of each PRAGMA row.
        return {row[1] for row in cursor.fetchall()}
|
||||
|
||||
|
||||
def upgrade_core_tables(apps, schema_editor):
|
||||
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Check if core_archiveresult table exists
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'")
|
||||
if not cursor.fetchone():
|
||||
# Fresh install - no migration needed, tables will be created by later migrations
|
||||
return
|
||||
|
||||
# Detect which version we're migrating from
|
||||
archiveresult_cols = get_table_columns('core_archiveresult')
|
||||
has_uuid = 'uuid' in archiveresult_cols
|
||||
has_abid = 'abid' in archiveresult_cols
|
||||
|
||||
# ============================================================================
|
||||
# PART 1: Upgrade core_archiveresult table
|
||||
# ============================================================================
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_archiveresult_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
uuid TEXT,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
snapshot_id TEXT NOT NULL,
|
||||
plugin VARCHAR(32) NOT NULL DEFAULT '',
|
||||
hook_name VARCHAR(255) NOT NULL DEFAULT '',
|
||||
|
||||
cmd TEXT,
|
||||
pwd VARCHAR(256),
|
||||
cmd_version VARCHAR(128),
|
||||
|
||||
start_ts DATETIME,
|
||||
end_ts DATETIME,
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
|
||||
output_files TEXT NOT NULL DEFAULT '{}',
|
||||
output_json TEXT,
|
||||
output_str TEXT NOT NULL DEFAULT '',
|
||||
output_size INTEGER NOT NULL DEFAULT 0,
|
||||
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
|
||||
|
||||
config TEXT,
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE
|
||||
);
|
||||
""")
|
||||
|
||||
if has_uuid and not has_abid:
|
||||
# Migrating from v0.7.2 (has uuid, minimal fields)
|
||||
print('Migrating ArchiveResult from v0.7.2 schema...')
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='uuid'
|
||||
""")
|
||||
has_uuid = cursor.fetchone()[0] > 0
|
||||
|
||||
# Check if id is INTEGER (v0.7.2) or TEXT/char (v0.8.6rc0)
|
||||
cursor.execute("""
|
||||
SELECT type FROM pragma_table_info('core_archiveresult') WHERE name='id'
|
||||
""")
|
||||
id_type = cursor.fetchone()[0] if cursor.rowcount else 'INTEGER'
|
||||
is_v072 = 'INT' in id_type.upper()
|
||||
|
||||
# ============================================================================
|
||||
# PART 1: Upgrade core_archiveresult table
|
||||
# ============================================================================
|
||||
|
||||
# Create new table with v0.9.0 schema
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_archiveresult_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
uuid TEXT,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
snapshot_id TEXT NOT NULL,
|
||||
plugin VARCHAR(32) NOT NULL DEFAULT '',
|
||||
hook_name VARCHAR(255) NOT NULL DEFAULT '',
|
||||
|
||||
cmd TEXT,
|
||||
pwd VARCHAR(256),
|
||||
cmd_version VARCHAR(128),
|
||||
|
||||
start_ts DATETIME,
|
||||
end_ts DATETIME,
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
|
||||
output_files TEXT NOT NULL DEFAULT '{}',
|
||||
output_json TEXT,
|
||||
output_str TEXT NOT NULL DEFAULT '',
|
||||
output_size INTEGER NOT NULL DEFAULT 0,
|
||||
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
|
||||
|
||||
config TEXT,
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
binary_id TEXT,
|
||||
iface_id TEXT,
|
||||
process_id TEXT,
|
||||
|
||||
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (binary_id) REFERENCES machine_binary(id) ON DELETE SET NULL,
|
||||
FOREIGN KEY (iface_id) REFERENCES machine_networkinterface(id) ON DELETE SET NULL,
|
||||
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
id, uuid, created_at, modified_at, snapshot_id, plugin,
|
||||
cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
|
||||
)
|
||||
SELECT
|
||||
id, uuid,
|
||||
COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
|
||||
COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
|
||||
snapshot_id,
|
||||
COALESCE(extractor, '') as plugin,
|
||||
cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status,
|
||||
COALESCE(output, '') as output_str
|
||||
FROM core_archiveresult;
|
||||
""")
|
||||
|
||||
# Copy data based on source version
|
||||
if is_v072:
|
||||
# Coming from v0.7.2: has INTEGER id, has uuid column, has extractor
|
||||
print(" Migrating from v0.7.2 schema...")
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
uuid, created_at, modified_at, snapshot_id, plugin,
|
||||
cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
|
||||
)
|
||||
SELECT
|
||||
uuid,
|
||||
COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
|
||||
COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
|
||||
snapshot_id,
|
||||
COALESCE(extractor, '') as plugin,
|
||||
cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status,
|
||||
COALESCE(output, '') as output_str
|
||||
FROM core_archiveresult
|
||||
""")
|
||||
else:
|
||||
# Coming from v0.8.6rc0: has TEXT id, no uuid column, has abid
|
||||
print(" Migrating from v0.8.6rc0 schema...")
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
uuid, created_at, modified_at, snapshot_id, plugin,
|
||||
cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str
|
||||
)
|
||||
SELECT
|
||||
id as uuid,
|
||||
created_at,
|
||||
modified_at,
|
||||
snapshot_id,
|
||||
COALESCE(extractor, '') as plugin,
|
||||
cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status, retry_at,
|
||||
COALESCE(output, '') as output_str
|
||||
FROM core_archiveresult
|
||||
""")
|
||||
|
||||
# Replace old table
|
||||
cursor.execute("DROP TABLE IF EXISTS core_archiveresult")
|
||||
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult")
|
||||
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid)")
|
||||
|
||||
# ============================================================================
|
||||
# PART 2: Upgrade core_snapshot table
|
||||
# ============================================================================
|
||||
|
||||
# Check snapshot schema version
|
||||
elif has_abid and not has_uuid:
|
||||
# Migrating from v0.8.6rc0 (has abid, full fields)
|
||||
print('Migrating ArchiveResult from v0.8.6rc0 schema...')
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM pragma_table_info('core_snapshot') WHERE name='crawl_id'
|
||||
""")
|
||||
has_crawl_id = cursor.fetchone()[0] > 0
|
||||
|
||||
# Create new table
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_snapshot_new (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
downloaded_at DATETIME,
|
||||
|
||||
url TEXT NOT NULL,
|
||||
timestamp TEXT NOT NULL,
|
||||
title TEXT,
|
||||
|
||||
crawl_id TEXT,
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
parent_snapshot_id TEXT,
|
||||
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
current_step INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
config TEXT,
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0
|
||||
|
||||
-- Note: crawl_id foreign key will be added in 0024 after assigning crawl_ids
|
||||
-- FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
|
||||
-- FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
id, uuid, created_at, modified_at, snapshot_id, plugin,
|
||||
cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str
|
||||
)
|
||||
SELECT
|
||||
id, abid as uuid,
|
||||
created_at, modified_at,
|
||||
snapshot_id,
|
||||
COALESCE(extractor, '') as plugin,
|
||||
cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status, retry_at,
|
||||
COALESCE(output, '') as output_str
|
||||
FROM core_archiveresult;
|
||||
""")
|
||||
else:
|
||||
print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
|
||||
|
||||
# Copy snapshot data
|
||||
if has_crawl_id:
|
||||
# v0.8.6rc0 schema - already has created_at, modified_at, bookmarked_at
|
||||
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
|
||||
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
|
||||
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid);")
|
||||
|
||||
# ============================================================================
|
||||
# PART 2: Upgrade core_snapshot table
|
||||
# ============================================================================
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_snapshot_new (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
url TEXT NOT NULL,
|
||||
timestamp VARCHAR(32) NOT NULL UNIQUE,
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
crawl_id TEXT,
|
||||
parent_snapshot_id TEXT,
|
||||
|
||||
title VARCHAR(512),
|
||||
downloaded_at DATETIME,
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
current_step INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
|
||||
);
|
||||
""")
|
||||
|
||||
# Check if core_snapshot exists (it should)
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'")
|
||||
if cursor.fetchone():
|
||||
# Detect which version we're migrating from
|
||||
snapshot_cols = get_table_columns('core_snapshot')
|
||||
has_added = 'added' in snapshot_cols
|
||||
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
|
||||
|
||||
if has_added and not has_bookmarked_at:
|
||||
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
|
||||
print('Migrating Snapshot from v0.7.2 schema...')
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_new (
|
||||
id, created_at, modified_at, bookmarked_at, downloaded_at, url, timestamp,
|
||||
crawl_id, status, retry_at
|
||||
id, url, timestamp, title, bookmarked_at, created_at, modified_at
|
||||
)
|
||||
SELECT
|
||||
id,
|
||||
created_at,
|
||||
modified_at,
|
||||
bookmarked_at,
|
||||
downloaded_at,
|
||||
url, timestamp,
|
||||
NULLIF(crawl_id, ''),
|
||||
COALESCE(status, 'queued'),
|
||||
retry_at
|
||||
FROM core_snapshot
|
||||
id, url, timestamp, title,
|
||||
COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
|
||||
COALESCE(added, CURRENT_TIMESTAMP) as created_at,
|
||||
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
|
||||
FROM core_snapshot;
|
||||
""")
|
||||
elif has_bookmarked_at and not has_added:
|
||||
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
|
||||
print('Migrating Snapshot from v0.8.6rc0 schema...')
|
||||
# Check what fields exist
|
||||
has_status = 'status' in snapshot_cols
|
||||
has_retry_at = 'retry_at' in snapshot_cols
|
||||
has_crawl_id = 'crawl_id' in snapshot_cols
|
||||
|
||||
# Build column list based on what exists
|
||||
cols = ['id', 'url', 'timestamp', 'title', 'bookmarked_at', 'created_at', 'modified_at', 'downloaded_at']
|
||||
if has_crawl_id:
|
||||
cols.append('crawl_id')
|
||||
if has_status:
|
||||
cols.append('status')
|
||||
if has_retry_at:
|
||||
cols.append('retry_at')
|
||||
|
||||
cursor.execute(f"""
|
||||
INSERT OR IGNORE INTO core_snapshot_new ({', '.join(cols)})
|
||||
SELECT {', '.join(cols)}
|
||||
FROM core_snapshot;
|
||||
""")
|
||||
else:
|
||||
# v0.7.2 schema - will get crawl_id assigned by later migration (0024)
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_new (
|
||||
id, created_at, modified_at, bookmarked_at, url, timestamp, crawl_id
|
||||
)
|
||||
SELECT
|
||||
id,
|
||||
COALESCE(added, CURRENT_TIMESTAMP),
|
||||
COALESCE(updated, added, CURRENT_TIMESTAMP),
|
||||
COALESCE(added, CURRENT_TIMESTAMP),
|
||||
url, timestamp,
|
||||
NULL as crawl_id
|
||||
FROM core_snapshot
|
||||
""")
|
||||
print(f'Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}')
|
||||
|
||||
# Replace old table
|
||||
cursor.execute("DROP TABLE IF EXISTS core_snapshot")
|
||||
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot")
|
||||
cursor.execute("DROP TABLE IF EXISTS core_snapshot;")
|
||||
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;")
|
||||
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at)")
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_timestamp_idx ON core_snapshot(timestamp);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at);")
|
||||
cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);")
|
||||
|
||||
# ============================================================================
|
||||
# PART 3: Upgrade core_tag table
|
||||
# ============================================================================
|
||||
# ============================================================================
|
||||
# PART 3: Upgrade core_tag table
|
||||
# ============================================================================
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_tag_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
# Check if tag id is INTEGER (v0.7.2) or TEXT (v0.8.6rc0)
|
||||
name VARCHAR(100) NOT NULL UNIQUE,
|
||||
slug VARCHAR(100) NOT NULL UNIQUE,
|
||||
|
||||
created_by_id INTEGER,
|
||||
|
||||
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE
|
||||
);
|
||||
""")
|
||||
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_tag'")
|
||||
if cursor.fetchone():
|
||||
cursor.execute("""
|
||||
SELECT type FROM pragma_table_info('core_tag') WHERE name='id'
|
||||
""")
|
||||
tag_id_type = cursor.fetchone()[0] if cursor.rowcount else 'INTEGER'
|
||||
tag_id_is_int = 'INT' in tag_id_type.upper()
|
||||
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_tag_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
created_by_id INTEGER,
|
||||
|
||||
name VARCHAR(100) NOT NULL UNIQUE,
|
||||
slug VARCHAR(100) NOT NULL UNIQUE,
|
||||
|
||||
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE SET NULL
|
||||
)
|
||||
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
|
||||
SELECT id, name, slug
|
||||
FROM core_tag;
|
||||
""")
|
||||
|
||||
if tag_id_is_int:
|
||||
# v0.7.2: Direct copy (INTEGER to INTEGER)
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
|
||||
SELECT id, name, slug FROM core_tag
|
||||
""")
|
||||
else:
|
||||
# v0.8.6rc0: Need to remap TEXT ids to new INTEGER ids
|
||||
cursor.execute("SELECT id, name, slug FROM core_tag")
|
||||
old_tags = cursor.fetchall()
|
||||
tag_id_mapping = {} # old_text_id -> new_int_id
|
||||
cursor.execute("DROP TABLE IF EXISTS core_tag;")
|
||||
cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag;")
|
||||
|
||||
for old_id, name, slug in old_tags:
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_tag_new (name, slug)
|
||||
VALUES (?, ?)
|
||||
""", [name, slug])
|
||||
cursor.execute("SELECT id FROM core_tag_new WHERE slug = ?", [slug])
|
||||
new_id = cursor.fetchone()[0]
|
||||
tag_id_mapping[old_id] = new_id
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_at_idx ON core_tag(created_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);")
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS core_tag")
|
||||
cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag")
|
||||
|
||||
# Recreate M2M table
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_snapshot_tags_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
snapshot_id TEXT NOT NULL,
|
||||
tag_id INTEGER NOT NULL,
|
||||
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (tag_id) REFERENCES core_tag(id) ON DELETE CASCADE,
|
||||
UNIQUE(snapshot_id, tag_id)
|
||||
)
|
||||
""")
|
||||
|
||||
if tag_id_is_int:
|
||||
# Direct copy for v0.7.2
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
|
||||
SELECT snapshot_id, tag_id FROM core_snapshot_tags
|
||||
""")
|
||||
else:
|
||||
# v0.8.6rc0: Use mapping to convert old TEXT ids to new INTEGER ids
|
||||
cursor.execute("SELECT snapshot_id, tag_id FROM core_snapshot_tags")
|
||||
m2m_entries = cursor.fetchall()
|
||||
for snapshot_id, old_tag_id in m2m_entries:
|
||||
new_tag_id = tag_id_mapping.get(old_tag_id)
|
||||
if new_tag_id:
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
|
||||
VALUES (?, ?)
|
||||
""", [snapshot_id, new_tag_id])
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS core_snapshot_tags")
|
||||
cursor.execute("ALTER TABLE core_snapshot_tags_new RENAME TO core_snapshot_tags")
|
||||
print('✓ Core tables upgraded to v0.9.0')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -301,10 +252,49 @@ class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
('core', '0022_auto_20231023_2008'),
|
||||
('crawls', '0001_initial'),
|
||||
('machine', '0001_initial'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunPython(upgrade_from_v072_or_v086, reverse_code=migrations.RunPython.noop),
|
||||
migrations.SeparateDatabaseAndState(
|
||||
database_operations=[
|
||||
migrations.RunPython(
|
||||
upgrade_core_tables,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
# Remove old ArchiveResult fields
|
||||
migrations.RemoveField(model_name='archiveresult', name='extractor'),
|
||||
migrations.RemoveField(model_name='archiveresult', name='output'),
|
||||
# Remove old Snapshot fields
|
||||
migrations.RemoveField(model_name='snapshot', name='added'),
|
||||
migrations.RemoveField(model_name='snapshot', name='updated'),
|
||||
# SnapshotTag table already exists from v0.7.2, just declare it in state
|
||||
migrations.CreateModel(
|
||||
name='SnapshotTag',
|
||||
fields=[
|
||||
('id', models.AutoField(primary_key=True, serialize=False)),
|
||||
('snapshot', models.ForeignKey(to='core.Snapshot', db_column='snapshot_id', on_delete=models.CASCADE)),
|
||||
('tag', models.ForeignKey(to='core.Tag', db_column='tag_id', on_delete=models.CASCADE)),
|
||||
],
|
||||
options={
|
||||
'db_table': 'core_snapshot_tags',
|
||||
'unique_together': {('snapshot', 'tag')},
|
||||
},
|
||||
),
|
||||
# Declare that Snapshot.tags M2M already uses through=SnapshotTag (from v0.7.2)
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(
|
||||
'Tag',
|
||||
blank=True,
|
||||
related_name='snapshot_set',
|
||||
through='SnapshotTag',
|
||||
through_fields=('snapshot', 'tag'),
|
||||
),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Generated by hand on 2025-12-29
|
||||
# Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import migrations, models
|
||||
import uuid
|
||||
|
||||
|
||||
@@ -56,8 +56,7 @@ class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0023_upgrade_to_0_9_0'),
|
||||
('crawls', '0002_upgrade_to_0_9_0'),
|
||||
('machine', '0001_initial'),
|
||||
('crawls', '0001_initial'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
]
|
||||
|
||||
@@ -66,65 +65,80 @@ class Migration(migrations.Migration):
|
||||
create_default_crawl_and_assign_snapshots,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
# Now make crawl_id NOT NULL
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
-- Rebuild snapshot table with NOT NULL crawl_id
|
||||
CREATE TABLE core_snapshot_final (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
migrations.SeparateDatabaseAndState(
|
||||
database_operations=[
|
||||
# Now make crawl_id NOT NULL
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
-- Rebuild snapshot table with NOT NULL crawl_id
|
||||
CREATE TABLE core_snapshot_final (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
url TEXT NOT NULL,
|
||||
timestamp VARCHAR(32) NOT NULL UNIQUE,
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
url TEXT NOT NULL,
|
||||
timestamp VARCHAR(32) NOT NULL UNIQUE,
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
crawl_id TEXT NOT NULL,
|
||||
parent_snapshot_id TEXT,
|
||||
crawl_id TEXT NOT NULL,
|
||||
parent_snapshot_id TEXT,
|
||||
|
||||
title VARCHAR(512),
|
||||
downloaded_at DATETIME,
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
title VARCHAR(512),
|
||||
downloaded_at DATETIME,
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
current_step INTEGER NOT NULL DEFAULT 0,
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
current_step INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
|
||||
);
|
||||
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
|
||||
);
|
||||
|
||||
INSERT INTO core_snapshot_final (
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
config, notes, num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
)
|
||||
SELECT
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
FROM core_snapshot;
|
||||
INSERT INTO core_snapshot_final (
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
config, notes, num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
)
|
||||
SELECT
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
FROM core_snapshot;
|
||||
|
||||
DROP TABLE core_snapshot;
|
||||
ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
|
||||
DROP TABLE core_snapshot;
|
||||
ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
|
||||
|
||||
CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
|
||||
CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
|
||||
CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
|
||||
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
|
||||
CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
|
||||
CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
|
||||
CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
|
||||
CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
|
||||
CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
|
||||
CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
|
||||
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
|
||||
CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
|
||||
CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
|
||||
CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
|
||||
CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(
|
||||
on_delete=models.deletion.CASCADE,
|
||||
to='crawls.crawl',
|
||||
help_text='Crawl that created this snapshot'
|
||||
),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -0,0 +1,258 @@
|
||||
# Generated by Django 6.0 on 2025-12-31 23:09
|
||||
|
||||
import archivebox.base_models.models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
import uuid
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Field-level schema update for the ``core`` app.

    Runs after ``core.0024_assign_default_crawl`` and ``crawls.0001_initial``.
    In order, the operations:

    * set verbose names on ArchiveResult and Snapshot,
    * drop ``cmd`` / ``cmd_version`` / ``pwd`` from ArchiveResult,
    * add the new ArchiveResult, Snapshot and Tag fields,
    * alter existing field definitions,
    * add the two unique constraints on Snapshot.
    """

    dependencies = [
        ('core', '0024_assign_default_crawl'),
        ('crawls', '0001_initial'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # --- model options ---------------------------------------------------
        migrations.AlterModelOptions(
            name='archiveresult',
            options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
        ),
        migrations.AlterModelOptions(
            name='snapshot',
            options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
        ),

        # --- ArchiveResult: removed fields -----------------------------------
        migrations.RemoveField(model_name='archiveresult', name='cmd'),
        migrations.RemoveField(model_name='archiveresult', name='cmd_version'),
        migrations.RemoveField(model_name='archiveresult', name='pwd'),

        # --- ArchiveResult: new fields ---------------------------------------
        migrations.AddField(
            model_name='archiveresult', name='config',
            field=models.JSONField(blank=True, default=dict, null=True),
        ),
        migrations.AddField(
            model_name='archiveresult', name='created_at',
            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
        migrations.AddField(
            model_name='archiveresult', name='hook_name',
            field=models.CharField(
                blank=True,
                db_index=True,
                default='',
                help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)',
                max_length=255,
            ),
        ),
        migrations.AddField(
            model_name='archiveresult', name='modified_at',
            field=models.DateTimeField(auto_now=True),
        ),
        migrations.AddField(
            model_name='archiveresult', name='notes',
            field=models.TextField(blank=True, default=''),
        ),
        migrations.AddField(
            model_name='archiveresult', name='num_uses_failed',
            field=models.PositiveIntegerField(default=0),
        ),
        migrations.AddField(
            model_name='archiveresult', name='num_uses_succeeded',
            field=models.PositiveIntegerField(default=0),
        ),
        migrations.AddField(
            model_name='archiveresult', name='output_files',
            field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
        ),
        migrations.AddField(
            model_name='archiveresult', name='output_json',
            field=models.JSONField(
                blank=True,
                default=None,
                help_text='Structured metadata (headers, redirects, etc.)',
                null=True,
            ),
        ),
        migrations.AddField(
            model_name='archiveresult', name='output_mimetypes',
            field=models.CharField(
                blank=True,
                default='',
                help_text='CSV of mimetypes sorted by size',
                max_length=512,
            ),
        ),
        migrations.AddField(
            model_name='archiveresult', name='output_size',
            field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
        ),
        migrations.AddField(
            model_name='archiveresult', name='output_str',
            field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
        ),
        migrations.AddField(
            model_name='archiveresult', name='plugin',
            field=models.CharField(db_index=True, default='', max_length=32),
        ),
        migrations.AddField(
            model_name='archiveresult', name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),

        # --- Snapshot: new fields --------------------------------------------
        migrations.AddField(
            model_name='snapshot', name='bookmarked_at',
            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
        migrations.AddField(
            model_name='snapshot', name='config',
            field=models.JSONField(default=dict),
        ),
        migrations.AddField(
            model_name='snapshot', name='created_at',
            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
        ),
        migrations.AddField(
            model_name='snapshot', name='current_step',
            field=models.PositiveSmallIntegerField(
                db_index=True,
                default=0,
                help_text='Current hook step being executed (0-9). Used for sequential hook execution.',
            ),
        ),
        migrations.AddField(
            model_name='snapshot', name='depth',
            field=models.PositiveSmallIntegerField(db_index=True, default=0),
        ),
        migrations.AddField(
            model_name='snapshot', name='downloaded_at',
            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
        ),
        migrations.AddField(
            model_name='snapshot', name='fs_version',
            field=models.CharField(
                default='0.9.0',
                help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().',
                max_length=10,
            ),
        ),
        migrations.AddField(
            model_name='snapshot', name='modified_at',
            field=models.DateTimeField(auto_now=True),
        ),
        migrations.AddField(
            model_name='snapshot', name='notes',
            field=models.TextField(blank=True, default=''),
        ),
        migrations.AddField(
            model_name='snapshot', name='num_uses_failed',
            field=models.PositiveIntegerField(default=0),
        ),
        migrations.AddField(
            model_name='snapshot', name='num_uses_succeeded',
            field=models.PositiveIntegerField(default=0),
        ),
        migrations.AddField(
            model_name='snapshot', name='parent_snapshot',
            field=models.ForeignKey(
                blank=True,
                help_text='Parent snapshot that discovered this URL (for recursive crawling)',
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name='child_snapshots',
                to='core.snapshot',
            ),
        ),
        migrations.AddField(
            model_name='snapshot', name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AddField(
            model_name='snapshot', name='status',
            field=models.CharField(
                choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')],
                db_index=True,
                default='queued',
                max_length=15,
            ),
        ),

        # --- Tag: new fields -------------------------------------------------
        migrations.AddField(
            model_name='tag', name='created_at',
            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AddField(
            model_name='tag', name='created_by',
            field=models.ForeignKey(
                default=archivebox.base_models.models.get_or_create_system_user_pk,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name='tag_set',
                to=settings.AUTH_USER_MODEL,
            ),
        ),
        migrations.AddField(
            model_name='tag', name='modified_at',
            field=models.DateTimeField(auto_now=True),
        ),

        # --- ArchiveResult: altered fields -----------------------------------
        migrations.AlterField(
            model_name='archiveresult', name='end_ts',
            field=models.DateTimeField(blank=True, default=None, null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult', name='id',
            field=models.AutoField(editable=False, primary_key=True, serialize=False),
        ),
        migrations.AlterField(
            model_name='archiveresult', name='start_ts',
            field=models.DateTimeField(blank=True, default=None, null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult', name='status',
            field=models.CharField(
                choices=[
                    ('queued', 'Queued'),
                    ('started', 'Started'),
                    ('backoff', 'Waiting to retry'),
                    ('succeeded', 'Succeeded'),
                    ('failed', 'Failed'),
                    ('skipped', 'Skipped'),
                ],
                db_index=True,
                default='queued',
                max_length=15,
            ),
        ),
        migrations.AlterField(
            model_name='archiveresult', name='uuid',
            field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
        ),

        # --- Snapshot: altered fields ----------------------------------------
        migrations.AlterField(
            model_name='snapshot', name='crawl',
            field=models.ForeignKey(
                on_delete=django.db.models.deletion.CASCADE,
                related_name='snapshot_set',
                to='crawls.crawl',
            ),
        ),
        migrations.AlterField(
            model_name='snapshot', name='id',
            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='tags',
            field=models.ManyToManyField(
                blank=True,
                related_name='snapshot_set',
                through='core.SnapshotTag',
                through_fields=('snapshot', 'tag'),
                to='core.tag',
            ),
        ),
        migrations.AlterField(
            model_name='snapshot', name='timestamp',
            field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='url',
            field=models.URLField(db_index=True),
        ),

        # --- Tag: altered fields ---------------------------------------------
        migrations.AlterField(
            model_name='tag', name='slug',
            field=models.SlugField(editable=False, max_length=100, unique=True),
        ),

        # --- Snapshot: uniqueness constraints --------------------------------
        migrations.AddConstraint(
            model_name='snapshot',
            constraint=models.UniqueConstraint(fields=('url', 'crawl'), name='unique_url_per_crawl'),
        ),
        migrations.AddConstraint(
            model_name='snapshot',
            constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
        ),
    ]
|
||||
@@ -1,484 +0,0 @@
|
||||
# Generated by hand on 2025-12-29
|
||||
# Cleans up extra columns from raw SQL migrations and ensures schema matches models
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
from django.conf import settings
|
||||
import archivebox.base_models.models
|
||||
|
||||
|
||||
def _get_or_create_machine_id(cursor):
    """Return the id of a machine_machine row, inserting a minimal one if the table is empty."""
    cursor.execute("SELECT id FROM machine_machine LIMIT 1")
    row = cursor.fetchone()
    if row:
        print(f" Using existing Machine: {row[0]}")
        return row[0]

    # Models can't be used during a migration, so the row is built with raw SQL.
    print(" Creating Machine record for Process migration...")
    import platform
    import socket

    from archivebox.uuid_compat import uuid7

    hostname = socket.gethostname()
    params = (
        str(uuid7()),
        f"{hostname}-{platform.machine()}",  # guid
        hostname,
        platform.machine(),
        platform.system(),
        platform.platform(),
        platform.release(),
    )

    # Detect which generation of the machine_machine schema is present.
    cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='config'")
    has_config = cursor.fetchone()[0] > 0
    cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='abid'")
    has_abid = cursor.fetchone()[0] > 0
    cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='num_uses_succeeded'")
    has_num_uses = cursor.fetchone()[0] > 0

    # Only the trailing columns differ between schema generations.
    if has_config:
        # v0.9.0+ schema
        extra_columns, extra_values = ", config", ", '{}'"
    elif has_abid and has_num_uses:
        # v0.8.6rc0 schema (has abid and num_uses columns)
        extra_columns, extra_values = ", abid, num_uses_failed, num_uses_succeeded", ", '', 0, 0"
    else:
        # v0.7.2 or other schema
        extra_columns, extra_values = "", ""

    insert_sql = "".join([
        "INSERT OR IGNORE INTO machine_machine (",
        " id, created_at, modified_at,",
        " guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,",
        " os_arch, os_family, os_platform, os_release, os_kernel,",
        " stats", extra_columns,
        " ) VALUES (",
        " %s, datetime('now'), datetime('now'),",
        " %s, %s, 0, 0, '', '', '',",
        " %s, %s, %s, %s, '',",
        " '{}'", extra_values,
        " )",
    ])
    cursor.execute(insert_sql, params)

    # Re-query to get the actual id (in case INSERT OR IGNORE skipped the row).
    cursor.execute("SELECT id FROM machine_machine LIMIT 1")
    row = cursor.fetchone()
    if not row:
        raise RuntimeError("Failed to create Machine record - machine_machine table is empty after INSERT")
    print(f" ✓ Using/Created Machine: {row[0]}")
    return row[0]


def _create_process_records(cursor, machine_id, archive_results):
    """Insert one machine_process row per ArchiveResult and point each result at its row."""
    if not archive_results:
        return

    from archivebox.uuid_compat import uuid7

    process_rows = []
    links = []
    for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
        process_id = str(uuid7())
        process_rows.append((
            process_id, machine_id, binary_id, iface_id,
            pwd or '', cmd or '[]', start_ts, end_ts, status or 'queued',
        ))
        links.append((process_id, ar_id))

    # Batch the writes instead of issuing two statements per ArchiveResult.
    cursor.executemany("""
        INSERT INTO machine_process (
            id, created_at, modified_at,
            machine_id, binary_id, iface_id,
            pwd, cmd, env, timeout,
            pid, exit_code, stdout, stderr,
            started_at, ended_at, url, status, retry_at
        ) VALUES (%s, datetime('now'), datetime('now'), %s, %s, %s, %s, %s, '{}', 120, NULL, NULL, '', '', %s, %s, '', %s, NULL)
    """, process_rows)
    cursor.executemany("UPDATE core_archiveresult SET process_id = %s WHERE id = %s", links)


def _rebuild_archiveresult_table(cursor):
    """Recreate core_archiveresult without the legacy columns, copy the rows over, and re-index."""
    print(" Rebuilding core_archiveresult table...")
    cursor.execute("""
        CREATE TABLE core_archiveresult_final (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            uuid TEXT,
            created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
            modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,

            snapshot_id TEXT NOT NULL,
            plugin VARCHAR(32) NOT NULL DEFAULT '',
            hook_name VARCHAR(255) NOT NULL DEFAULT '',

            start_ts DATETIME,
            end_ts DATETIME,
            status VARCHAR(15) NOT NULL DEFAULT 'queued',
            retry_at DATETIME,

            output_files TEXT NOT NULL DEFAULT '{}',
            output_json TEXT,
            output_str TEXT NOT NULL DEFAULT '',
            output_size INTEGER NOT NULL DEFAULT 0,
            output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',

            config TEXT,
            notes TEXT NOT NULL DEFAULT '',
            num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
            num_uses_failed INTEGER NOT NULL DEFAULT 0,

            process_id TEXT NOT NULL,

            FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
            FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
        )
    """)

    # Name the columns on both sides so the copy does not depend on the
    # positional column order of either table (cmd, pwd, etc. now live in
    # the Process records and are deliberately left out).
    kept_columns = ", ".join((
        "id", "uuid", "created_at", "modified_at",
        "snapshot_id", "plugin", "hook_name",
        "start_ts", "end_ts", "status", "retry_at",
        "output_files", "output_json", "output_str", "output_size", "output_mimetypes",
        "config", "notes", "num_uses_succeeded", "num_uses_failed",
        "process_id",
    ))
    cursor.execute(
        f"INSERT INTO core_archiveresult_final ({kept_columns}) "
        f"SELECT {kept_columns} FROM core_archiveresult"
    )

    # Replace the old table with the rebuilt one.
    cursor.execute("DROP TABLE core_archiveresult")
    cursor.execute("ALTER TABLE core_archiveresult_final RENAME TO core_archiveresult")

    # Dropping the table dropped its indexes too; recreate them.
    for column in ("snapshot_id", "plugin", "status", "retry_at", "created_at", "uuid"):
        cursor.execute(f"CREATE INDEX core_archiveresult_{column}_idx ON core_archiveresult({column})")

    print(" ✓ Cleaned up core_archiveresult schema")


def cleanup_extra_columns(apps, schema_editor):
    """
    Create Process records from old cmd/pwd/cmd_version columns and remove those columns.

    If ``core_archiveresult`` still has a ``cmd`` column, one ``machine_process``
    row is created per ArchiveResult (carrying cmd, pwd, binary/iface ids,
    timestamps and status), each ArchiveResult is pointed at its new Process,
    and the table is rebuilt without the legacy columns. If the column is
    absent, nothing is changed.

    The SQL is SQLite-specific (``pragma_table_info``, ``INSERT OR IGNORE``)
    and uses Django's ``%s`` placeholder style.

    Args:
        apps: historical app registry passed by ``RunPython`` (unused).
        schema_editor: schema editor whose ``connection`` the raw SQL runs on.

    Raises:
        RuntimeError: if no Machine row exists after attempting to insert one.
    """
    with schema_editor.connection.cursor() as cursor:
        # The cmd column is only present on databases that still carry the legacy columns.
        cursor.execute("SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='cmd'")
        if cursor.fetchone()[0] == 0:
            return

        print(" Migrating cmd/pwd/cmd_version data to Process records...")

        # Note: cmd_version from old schema is not preserved (it's now derived from Binary)
        cursor.execute("""
            SELECT id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status
            FROM core_archiveresult
        """)
        archive_results = cursor.fetchall()

        machine_id = _get_or_create_machine_id(cursor)
        _create_process_records(cursor, machine_id, archive_results)
        print(f" ✓ Created {len(archive_results)} Process records from ArchiveResult data")

        # NOTE(review): the rebuild is assumed to run only when the legacy
        # columns exist - confirm against the intended control flow.
        _rebuild_archiveresult_table(cursor)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Run ``cleanup_extra_columns`` and record the resulting schema in Django's state.

    The single ``SeparateDatabaseAndState`` operation does all database work
    through the ``RunPython`` step. The ``state_operations`` list only tells
    Django which fields, models and constraints exist afterwards, so none of
    those entries touch the database themselves.
    """

    dependencies = [
        ('core', '0024_assign_default_crawl'),
        ('machine', '0005_add_process_table'),
        ('crawls', '0002_upgrade_to_0_9_0'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.SeparateDatabaseAndState(
            database_operations=[
                migrations.RunPython(cleanup_extra_columns, reverse_code=migrations.RunPython.noop),
            ],
            state_operations=[
                # --- ArchiveResult: options and removed fields ---------------
                migrations.AlterModelOptions(
                    name='archiveresult',
                    options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
                ),
                migrations.RemoveField(model_name='archiveresult', name='cmd'),
                migrations.RemoveField(model_name='archiveresult', name='pwd'),
                migrations.RemoveField(model_name='archiveresult', name='cmd_version'),
                migrations.RemoveField(model_name='archiveresult', name='extractor'),
                migrations.RemoveField(model_name='archiveresult', name='output'),
                migrations.RemoveField(model_name='snapshot', name='added'),
                migrations.RemoveField(model_name='snapshot', name='updated'),

                # --- ArchiveResult: new fields -------------------------------
                migrations.AddField(
                    model_name='archiveresult', name='plugin',
                    field=models.CharField(blank=True, default='', max_length=32),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='hook_name',
                    field=models.CharField(blank=True, default='', max_length=255),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='output_str',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='output_json',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='output_files',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='output_size',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='output_mimetypes',
                    field=models.CharField(blank=True, default='', max_length=512),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='archiveresult', name='process',
                    field=models.OneToOneField(
                        null=True,
                        on_delete=django.db.models.deletion.PROTECT,
                        related_name='archiveresult',
                        to='machine.process',
                    ),
                ),

                # --- Snapshot: options and new fields ------------------------
                migrations.AlterModelOptions(
                    name='snapshot',
                    options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
                ),
                migrations.AddField(
                    model_name='snapshot', name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot', name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='snapshot', name='bookmarked_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                migrations.AddField(
                    model_name='snapshot', name='downloaded_at',
                    field=models.DateTimeField(blank=True, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot', name='crawl',
                    field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'),
                ),
                migrations.AddField(
                    model_name='snapshot', name='depth',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot', name='parent_snapshot',
                    field=models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name='child_snapshots',
                        to='core.snapshot',
                    ),
                ),
                migrations.AddField(
                    model_name='snapshot', name='status',
                    field=models.CharField(
                        choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')],
                        db_index=True,
                        default='queued',
                        max_length=15,
                    ),
                ),
                migrations.AddField(
                    model_name='snapshot', name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
                ),
                migrations.AddField(
                    model_name='snapshot', name='current_step',
                    field=models.PositiveSmallIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot', name='fs_version',
                    field=models.CharField(default='0.9.0', max_length=10),
                ),
                migrations.AddField(
                    model_name='snapshot', name='config',
                    field=models.JSONField(blank=True, default=dict),
                ),
                migrations.AddField(
                    model_name='snapshot', name='notes',
                    field=models.TextField(blank=True, default=''),
                ),
                migrations.AddField(
                    model_name='snapshot', name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='snapshot', name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),

                # --- Tag: options and new fields -----------------------------
                migrations.AlterModelOptions(
                    name='tag',
                    options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
                ),
                migrations.AddField(
                    model_name='tag', name='created_at',
                    field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
                ),
                migrations.AddField(
                    model_name='tag', name='modified_at',
                    field=models.DateTimeField(auto_now=True),
                ),
                migrations.AddField(
                    model_name='tag', name='created_by',
                    field=models.ForeignKey(
                        default=archivebox.base_models.models.get_or_create_system_user_pk,
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name='tag_set',
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),

                # --- Changed definitions of existing fields ------------------
                migrations.AlterField(
                    model_name='archiveresult', name='id',
                    field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
                ),
                migrations.AlterField(
                    model_name='archiveresult', name='uuid',
                    field=models.UUIDField(blank=True, db_index=True, editable=False, null=True, unique=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult', name='end_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult', name='start_ts',
                    field=models.DateTimeField(blank=True, default=None, null=True),
                ),
                migrations.AlterField(
                    model_name='archiveresult', name='status',
                    field=models.CharField(
                        choices=[
                            ('queued', 'Queued'),
                            ('started', 'Started'),
                            ('succeeded', 'Succeeded'),
                            ('failed', 'Failed'),
                        ],
                        db_index=True,
                        default='queued',
                        max_length=15,
                    ),
                ),
                migrations.AlterField(
                    model_name='snapshot', name='id',
                    field=models.CharField(editable=False, max_length=32, primary_key=True, serialize=False, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot', name='timestamp',
                    field=models.CharField(db_index=True, max_length=32, unique=True),
                ),
                migrations.AlterField(
                    model_name='snapshot', name='url',
                    field=models.URLField(max_length=2048),
                ),
                migrations.AlterField(
                    model_name='tag', name='slug',
                    field=models.SlugField(editable=False, max_length=100, unique=True),
                ),

                # --- Explicit through model for Snapshot.tags ----------------
                migrations.CreateModel(
                    name='SnapshotTag',
                    fields=[
                        ('id', models.AutoField(primary_key=True, serialize=False, verbose_name='ID')),
                        ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
                        ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
                    ],
                    options={'db_table': 'core_snapshot_tags'},
                ),
                migrations.AlterUniqueTogether(
                    name='snapshottag',
                    unique_together={('snapshot', 'tag')},
                ),
                migrations.AlterField(
                    model_name='snapshot', name='tags',
                    field=models.ManyToManyField(related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
                ),

                # --- Snapshot: uniqueness constraints ------------------------
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
                ),
                migrations.AddConstraint(
                    model_name='snapshot',
                    constraint=models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
                ),
            ],
        ),
    ]
|
||||
@@ -1,76 +0,0 @@
|
||||
# Generated by hand on 2025-12-30
|
||||
# Final field adjustments to match model definitions exactly
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Final ``AlterField`` pass over Snapshot and SnapshotTag.

    Runs after ``core.0025_cleanup_schema`` and only alters existing fields.
    No field is added or removed.
    """

    dependencies = [
        ('core', '0025_cleanup_schema'),
        ('crawls', '0002_upgrade_to_0_9_0'),
    ]

    operations = [
        # --- Snapshot --------------------------------------------------------
        migrations.AlterField(
            model_name='snapshot', name='id',
            field=models.UUIDField(default=uuid7, editable=False, primary_key=True, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='timestamp',
            field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='url',
            field=models.URLField(db_index=True, unique=False),
        ),
        migrations.AlterField(
            model_name='snapshot', name='downloaded_at',
            field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='parent_snapshot',
            field=models.ForeignKey(
                blank=True,
                db_index=True,
                help_text='Parent snapshot that discovered this URL (for recursive crawling)',
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name='child_snapshots',
                to='core.snapshot',
            ),
        ),
        migrations.AlterField(
            model_name='snapshot', name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='fs_version',
            field=models.CharField(
                default='0.9.0',
                help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().',
                max_length=10,
            ),
        ),
        migrations.AlterField(
            model_name='snapshot', name='tags',
            field=models.ManyToManyField(
                blank=True,
                related_name='snapshot_set',
                through='core.SnapshotTag',
                through_fields=('snapshot', 'tag'),
                to='core.tag',
            ),
        ),

        # --- SnapshotTag (through model) -------------------------------------
        migrations.AlterField(
            model_name='snapshottag', name='id',
            field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
        ),
        migrations.AlterField(
            model_name='snapshottag', name='snapshot',
            field=models.ForeignKey(
                db_column='snapshot_id',
                on_delete=django.db.models.deletion.CASCADE,
                to='core.snapshot',
            ),
        ),
        migrations.AlterField(
            model_name='snapshottag', name='tag',
            field=models.ForeignKey(
                db_column='tag_id',
                on_delete=django.db.models.deletion.CASCADE,
                to='core.tag',
            ),
        ),
    ]
|
||||
@@ -1,108 +0,0 @@
|
||||
# Generated by Django 6.0 on 2025-12-31 09:04
|
||||
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Field-definition adjustments for ArchiveResult, Snapshot and SnapshotTag.

    Every operation in this migration is an ``AlterField``; it does not
    create or delete any model or field.
    """

    # This migration follows core 0026 and also depends on the crawls and
    # machine apps, whose models are referenced by the `crawl` and `process`
    # relations altered below.
    dependencies = [
        ('core', '0026_final_field_adjustments'),
        ('crawls', '0002_upgrade_to_0_9_0'),
        ('machine', '0001_initial'),
    ]

    operations = [
        # ------------------------------------------------------------------
        # ArchiveResult fields
        # ------------------------------------------------------------------
        migrations.AlterField(
            model_name='archiveresult',
            name='hook_name',
            field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
        ),
        # Integer auto-increment primary key; the UUID lives in the separate
        # `uuid` field altered further down.
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.AutoField(editable=False, primary_key=True, serialize=False),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_files',
            field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_json',
            field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_mimetypes',
            field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_size',
            field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_str',
            field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='plugin',
            field=models.CharField(db_index=True, default='', max_length=32),
        ),
        # One-to-one link to machine.process, declared without null=True and
        # with on_delete=PROTECT.
        migrations.AlterField(
            model_name='archiveresult',
            name='process',
            field=models.OneToOneField(help_text='Process execution details for this archive result', on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='status',
            field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
        ),
        # NOTE(review): uuid.uuid7 appears to exist in the standard library only
        # on recent Python versions (3.14+) — confirm the minimum supported version.
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
        ),

        # ------------------------------------------------------------------
        # Snapshot fields
        # ------------------------------------------------------------------
        migrations.AlterField(
            model_name='snapshot',
            name='config',
            field=models.JSONField(default=dict),
        ),
        # Foreign key to crawls.crawl, declared without null=True and with
        # on_delete=CASCADE.
        migrations.AlterField(
            model_name='snapshot',
            name='crawl',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='current_step',
            field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='depth',
            field=models.PositiveSmallIntegerField(db_index=True, default=0),
        ),
        # Snapshot keeps a UUID primary key (unlike ArchiveResult above).
        migrations.AlterField(
            model_name='snapshot',
            name='id',
            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),

        # ------------------------------------------------------------------
        # SnapshotTag fields
        # ------------------------------------------------------------------
        migrations.AlterField(
            model_name='snapshottag',
            name='id',
            field=models.AutoField(primary_key=True, serialize=False),
        ),
    ]
|
||||
@@ -91,9 +91,9 @@ class Tag(ModelWithSerializers):
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_tag', args=[self.id])
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Tag model instance to a JSONL record.
|
||||
Convert Tag model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
@@ -105,12 +105,12 @@ class Tag(ModelWithSerializers):
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
"""
|
||||
Create/update Tag from JSONL record.
|
||||
Create/update Tag from JSON dict.
|
||||
|
||||
Args:
|
||||
record: JSONL record with 'name' field
|
||||
record: JSON dict with 'name' field
|
||||
overrides: Optional dict with 'snapshot' to auto-attach tag
|
||||
|
||||
Returns:
|
||||
@@ -982,8 +982,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
processes_seen = set()
|
||||
|
||||
with open(index_path, 'w') as f:
|
||||
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
|
||||
f.write(json.dumps(self.to_jsonl()) + '\n')
|
||||
# Write Snapshot record first (to_json includes crawl_id, fs_version)
|
||||
f.write(json.dumps(self.to_json()) + '\n')
|
||||
|
||||
# Write ArchiveResult records with their associated Binary and Process
|
||||
# Use select_related to optimize queries
|
||||
@@ -991,15 +991,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# Write Binary record if not already written
|
||||
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
|
||||
binaries_seen.add(ar.process.binary_id)
|
||||
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
|
||||
f.write(json.dumps(ar.process.binary.to_json()) + '\n')
|
||||
|
||||
# Write Process record if not already written
|
||||
if ar.process and ar.process_id not in processes_seen:
|
||||
processes_seen.add(ar.process_id)
|
||||
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
|
||||
f.write(json.dumps(ar.process.to_json()) + '\n')
|
||||
|
||||
# Write ArchiveResult record
|
||||
f.write(json.dumps(ar.to_jsonl()) + '\n')
|
||||
f.write(json.dumps(ar.to_json()) + '\n')
|
||||
|
||||
def read_index_jsonl(self) -> dict:
|
||||
"""
|
||||
@@ -1422,9 +1422,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
return False
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Snapshot model instance to a JSONL record.
|
||||
Convert Snapshot model instance to a JSON-serializable dict.
|
||||
Includes all fields needed to fully reconstruct/identify this snapshot.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
@@ -1445,9 +1445,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
|
||||
"""
|
||||
Create/update Snapshot from JSONL record or dict.
|
||||
Create/update Snapshot from JSON dict.
|
||||
|
||||
Unified method that handles:
|
||||
- ID-based patching: {"id": "...", "title": "new title"}
|
||||
@@ -2106,8 +2106,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
result['canonical'] = self.canonical_outputs()
|
||||
return result
|
||||
|
||||
def to_json(self, indent: int = 4) -> str:
|
||||
"""Convert to JSON string"""
|
||||
def to_json_str(self, indent: int = 4) -> str:
    """Serialize the extended dict form of this object to a JSON string.

    Legacy method: use to_json() when a dict is wanted.
    """
    extended_dict = self.to_dict(extended=True)
    return to_json(extended_dict, indent=indent)
|
||||
|
||||
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
|
||||
@@ -2284,14 +2284,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
|
||||
|
||||
# Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
|
||||
# Required - every ArchiveResult must have a Process
|
||||
process = models.OneToOneField(
|
||||
'machine.Process',
|
||||
on_delete=models.PROTECT,
|
||||
null=False, # Required after migration 4
|
||||
related_name='archiveresult',
|
||||
help_text='Process execution details for this archive result'
|
||||
)
|
||||
# Added POST-v0.9.0, will be added in a separate migration
|
||||
# process = models.OneToOneField(
|
||||
# 'machine.Process',
|
||||
# on_delete=models.PROTECT,
|
||||
# null=False,
|
||||
# related_name='archiveresult',
|
||||
# help_text='Process execution details for this archive result'
|
||||
# )
|
||||
|
||||
# New output fields (replacing old 'output' field)
|
||||
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
|
||||
@@ -2326,9 +2326,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
|
||||
return self.snapshot.crawl.created_by
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert ArchiveResult model instance to a JSONL record.
|
||||
Convert ArchiveResult model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
record = {
|
||||
@@ -2360,6 +2360,50 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
record['process_id'] = str(self.process_id)
|
||||
return record
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
"""
|
||||
Create/update ArchiveResult from JSON dict.
|
||||
|
||||
Args:
|
||||
record: JSON dict with 'snapshot_id', 'plugin', etc.
|
||||
overrides: Optional dict of field overrides
|
||||
|
||||
Returns:
|
||||
ArchiveResult instance or None
|
||||
"""
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
plugin = record.get('plugin')
|
||||
|
||||
if not snapshot_id or not plugin:
|
||||
return None
|
||||
|
||||
# Try to get existing by ID first
|
||||
result_id = record.get('id')
|
||||
if result_id:
|
||||
try:
|
||||
return ArchiveResult.objects.get(id=result_id)
|
||||
except ArchiveResult.DoesNotExist:
|
||||
pass
|
||||
|
||||
# Get or create by snapshot_id + plugin
|
||||
try:
|
||||
from archivebox.core.models import Snapshot
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
|
||||
result, _ = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin,
|
||||
defaults={
|
||||
'hook_name': record.get('hook_name', ''),
|
||||
'status': record.get('status', 'queued'),
|
||||
'output_str': record.get('output_str', ''),
|
||||
}
|
||||
)
|
||||
return result
|
||||
except Snapshot.DoesNotExist:
|
||||
return None
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
|
||||
|
||||
Reference in New Issue
Block a user