From a04e4a73457737124b0abbc0781deceb0eb03eb0 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 31 Dec 2025 15:36:13 -0800 Subject: [PATCH] cleanup migrations, json, jsonl --- archivebox/cli/archivebox_extract.py | 2 +- archivebox/core/admin_archiveresults.py | 6 +- .../core/migrations/0023_upgrade_to_0_9_0.py | 534 +++++++++--------- .../migrations/0024_assign_default_crawl.py | 122 ++-- ...options_alter_snapshot_options_and_more.py | 258 +++++++++ .../core/migrations/0025_cleanup_schema.py | 484 ---------------- .../0026_final_field_adjustments.py | 76 --- ...ok_name_alter_archiveresult_id_and_more.py | 108 ---- archivebox/core/models.py | 96 +++- .../migrations/0002_upgrade_to_0_9_0.py | 90 --- archivebox/crawls/models.py | 8 +- archivebox/hooks.py | 10 +- archivebox/machine/migrations/0001_initial.py | 81 +-- archivebox/machine/migrations/0002_process.py | 45 ++ .../0002_process_parent_and_type.py | 101 ---- archivebox/machine/models.py | 111 ++-- .../machine/tests/test_machine_models.py | 12 +- .../on_Crawl__25_twocaptcha_config.js | 43 +- .../twocaptcha/tests/test_twocaptcha.py | 139 ++++- archivebox/workers/orchestrator.py | 83 +-- bin/test_plugins.sh | 2 +- 21 files changed, 993 insertions(+), 1418 deletions(-) create mode 100644 archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py delete mode 100644 archivebox/core/migrations/0025_cleanup_schema.py delete mode 100644 archivebox/core/migrations/0026_final_field_adjustments.py delete mode 100644 archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py delete mode 100644 archivebox/crawls/migrations/0002_upgrade_to_0_9_0.py create mode 100644 archivebox/machine/migrations/0002_process.py delete mode 100644 archivebox/machine/migrations/0002_process_parent_and_type.py diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py index 7dc043ae..99d84d5c 100644 --- a/archivebox/cli/archivebox_extract.py +++ b/archivebox/cli/archivebox_extract.py @@ -207,7 +207,7 @@ def run_plugins( }.get(result.status, 'dim') rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr) else: - write_record(result.to_jsonl()) + write_record(result.to_json()) except Snapshot.DoesNotExist: continue diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py index 2edfca69..4064d85e 100644 --- a/archivebox/core/admin_archiveresults.py +++ b/archivebox/core/admin_archiveresults.py @@ -252,8 +252,8 @@ class ArchiveResultInline(admin.TabularInline): class ArchiveResultAdmin(BaseModelAdmin): list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str') sort_fields = ('id', 'created_at', 'plugin', 'status') - readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process') - search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp', 'process__cmd') + readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon') + search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp') autocomplete_fields = ['snapshot'] fieldsets = ( @@ -270,7 +270,7 @@ class ArchiveResultAdmin(BaseModelAdmin): 'classes': ('card',), }), ('Command', { - 'fields': ('process', 'cmd', 'cmd_str', 
'cmd_version', 'pwd'), + 'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'), 'classes': ('card',), }), ('Output', { diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py index ca7e9b0b..0f5ac5ac 100644 --- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py +++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py @@ -1,299 +1,250 @@ # Generated by hand on 2025-12-29 -# Upgrades core app from v0.7.2 (migration 0022) or v0.8.6rc0 (migration 0076) to v0.9.0 using raw SQL +# Upgrades core app from v0.7.2/v0.8.6rc0 (migration 0022) to v0.9.0 using raw SQL +# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0 -from django.db import migrations +from django.db import migrations, models, connection -def upgrade_from_v072_or_v086(apps, schema_editor): - """ - Upgrade core tables from either v0.7.2 or v0.8.6rc0 to v0.9.0. - Handles differences in schema between versions. - """ - with schema_editor.connection.cursor() as cursor: - # Check if uuid column exists (v0.7.2 has it, v0.8.6rc0 doesn't) +def get_table_columns(table_name): + """Get list of column names for a table.""" + cursor = connection.cursor() + cursor.execute(f"PRAGMA table_info({table_name})") + return {row[1] for row in cursor.fetchall()} + + +def upgrade_core_tables(apps, schema_editor): + """Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0.""" + cursor = connection.cursor() + + # Check if core_archiveresult table exists + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'") + if not cursor.fetchone(): + # Fresh install - no migration needed, tables will be created by later migrations + return + + # Detect which version we're migrating from + archiveresult_cols = get_table_columns('core_archiveresult') + has_uuid = 'uuid' in archiveresult_cols + has_abid = 'abid' in archiveresult_cols + + # ============================================================================ + # PART 1: Upgrade core_archiveresult table + # ============================================================================ + cursor.execute(""" + CREATE TABLE IF NOT EXISTS core_archiveresult_new ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + uuid TEXT, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + + snapshot_id TEXT NOT NULL, + plugin VARCHAR(32) NOT NULL DEFAULT '', + hook_name VARCHAR(255) NOT NULL DEFAULT '', + + cmd TEXT, + pwd VARCHAR(256), + cmd_version VARCHAR(128), + + start_ts DATETIME, + end_ts DATETIME, + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + + output_files TEXT NOT NULL DEFAULT '{}', + output_json TEXT, + output_str TEXT NOT NULL DEFAULT '', + output_size INTEGER NOT NULL DEFAULT 0, + output_mimetypes VARCHAR(512) NOT NULL DEFAULT '', + + config TEXT, + notes TEXT NOT NULL DEFAULT '', + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE + ); + """) + + if has_uuid and not has_abid: + # Migrating from v0.7.2 (has uuid, minimal fields) + print('Migrating ArchiveResult from v0.7.2 schema...') cursor.execute(""" - SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='uuid' - """) - has_uuid = cursor.fetchone()[0] > 0 - - # Check if id is INTEGER (v0.7.2) or TEXT/char (v0.8.6rc0) - cursor.execute(""" - SELECT type FROM pragma_table_info('core_archiveresult') WHERE name='id' - """) - id_type = 
cursor.fetchone()[0] if cursor.rowcount else 'INTEGER' - is_v072 = 'INT' in id_type.upper() - - # ============================================================================ - # PART 1: Upgrade core_archiveresult table - # ============================================================================ - - # Create new table with v0.9.0 schema - cursor.execute(""" - CREATE TABLE IF NOT EXISTS core_archiveresult_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - uuid TEXT, - created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - - snapshot_id TEXT NOT NULL, - plugin VARCHAR(32) NOT NULL DEFAULT '', - hook_name VARCHAR(255) NOT NULL DEFAULT '', - - cmd TEXT, - pwd VARCHAR(256), - cmd_version VARCHAR(128), - - start_ts DATETIME, - end_ts DATETIME, - status VARCHAR(15) NOT NULL DEFAULT 'queued', - retry_at DATETIME, - - output_files TEXT NOT NULL DEFAULT '{}', - output_json TEXT, - output_str TEXT NOT NULL DEFAULT '', - output_size INTEGER NOT NULL DEFAULT 0, - output_mimetypes VARCHAR(512) NOT NULL DEFAULT '', - - config TEXT, - notes TEXT NOT NULL DEFAULT '', - num_uses_succeeded INTEGER NOT NULL DEFAULT 0, - num_uses_failed INTEGER NOT NULL DEFAULT 0, - - binary_id TEXT, - iface_id TEXT, - process_id TEXT, - - FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE, - FOREIGN KEY (binary_id) REFERENCES machine_binary(id) ON DELETE SET NULL, - FOREIGN KEY (iface_id) REFERENCES machine_networkinterface(id) ON DELETE SET NULL, - FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT + INSERT OR IGNORE INTO core_archiveresult_new ( + id, uuid, created_at, modified_at, snapshot_id, plugin, + cmd, pwd, cmd_version, start_ts, end_ts, status, output_str ) + SELECT + id, uuid, + COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at, + COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at, + snapshot_id, + COALESCE(extractor, '') as plugin, + cmd, pwd, cmd_version, + start_ts, end_ts, status, + COALESCE(output, '') as output_str + FROM core_archiveresult; """) - - # Copy data based on source version - if is_v072: - # Coming from v0.7.2: has INTEGER id, has uuid column, has extractor - print(" Migrating from v0.7.2 schema...") - cursor.execute(""" - INSERT OR IGNORE INTO core_archiveresult_new ( - uuid, created_at, modified_at, snapshot_id, plugin, - cmd, pwd, cmd_version, start_ts, end_ts, status, output_str - ) - SELECT - uuid, - COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at, - COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at, - snapshot_id, - COALESCE(extractor, '') as plugin, - cmd, pwd, cmd_version, - start_ts, end_ts, status, - COALESCE(output, '') as output_str - FROM core_archiveresult - """) - else: - # Coming from v0.8.6rc0: has TEXT id, no uuid column, has abid - print(" Migrating from v0.8.6rc0 schema...") - cursor.execute(""" - INSERT OR IGNORE INTO core_archiveresult_new ( - uuid, created_at, modified_at, snapshot_id, plugin, - cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str - ) - SELECT - id as uuid, - created_at, - modified_at, - snapshot_id, - COALESCE(extractor, '') as plugin, - cmd, pwd, cmd_version, - start_ts, end_ts, status, retry_at, - COALESCE(output, '') as output_str - FROM core_archiveresult - """) - - # Replace old table - cursor.execute("DROP TABLE IF EXISTS core_archiveresult") - cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult") - - # Create indexes - cursor.execute("CREATE INDEX IF NOT EXISTS 
core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid)") - - # ============================================================================ - # PART 2: Upgrade core_snapshot table - # ============================================================================ - - # Check snapshot schema version + elif has_abid and not has_uuid: + # Migrating from v0.8.6rc0 (has abid, full fields) + print('Migrating ArchiveResult from v0.8.6rc0 schema...') cursor.execute(""" - SELECT COUNT(*) FROM pragma_table_info('core_snapshot') WHERE name='crawl_id' - """) - has_crawl_id = cursor.fetchone()[0] > 0 - - # Create new table - cursor.execute(""" - CREATE TABLE IF NOT EXISTS core_snapshot_new ( - id TEXT PRIMARY KEY NOT NULL, - created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - downloaded_at DATETIME, - - url TEXT NOT NULL, - timestamp TEXT NOT NULL, - title TEXT, - - crawl_id TEXT, - depth INTEGER NOT NULL DEFAULT 0, - parent_snapshot_id TEXT, - - status VARCHAR(15) NOT NULL DEFAULT 'queued', - retry_at DATETIME, - current_step INTEGER NOT NULL DEFAULT 0, - - fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0', - config TEXT, - notes TEXT NOT NULL DEFAULT '', - num_uses_succeeded INTEGER NOT NULL DEFAULT 0, - num_uses_failed INTEGER NOT NULL DEFAULT 0 - - -- Note: crawl_id foreign key will be added in 0024 after assigning crawl_ids - -- FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE, - -- FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL + INSERT OR IGNORE INTO core_archiveresult_new ( + id, uuid, created_at, modified_at, snapshot_id, plugin, + cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str ) + SELECT + id, abid as uuid, + created_at, modified_at, + snapshot_id, + COALESCE(extractor, '') as plugin, + cmd, pwd, cmd_version, + start_ts, end_ts, status, retry_at, + COALESCE(output, '') as output_str + FROM core_archiveresult; """) + else: + print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}') - # Copy snapshot data - if has_crawl_id: - # v0.8.6rc0 schema - already has created_at, modified_at, bookmarked_at + cursor.execute("DROP TABLE IF EXISTS core_archiveresult;") + cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;") + + # Create indexes + cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at);") + 
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid);") + + # ============================================================================ + # PART 2: Upgrade core_snapshot table + # ============================================================================ + cursor.execute(""" + CREATE TABLE IF NOT EXISTS core_snapshot_new ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + + url TEXT NOT NULL, + timestamp VARCHAR(32) NOT NULL UNIQUE, + bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + + crawl_id TEXT, + parent_snapshot_id TEXT, + + title VARCHAR(512), + downloaded_at DATETIME, + depth INTEGER NOT NULL DEFAULT 0, + fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0', + + config TEXT NOT NULL DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + current_step INTEGER NOT NULL DEFAULT 0, + + FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE, + FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL + ); + """) + + # Check if core_snapshot exists (it should) + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'") + if cursor.fetchone(): + # Detect which version we're migrating from + snapshot_cols = get_table_columns('core_snapshot') + has_added = 'added' in snapshot_cols + has_bookmarked_at = 'bookmarked_at' in snapshot_cols + + if has_added and not has_bookmarked_at: + # Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at) + print('Migrating Snapshot from v0.7.2 schema...') cursor.execute(""" INSERT OR IGNORE INTO core_snapshot_new ( - id, created_at, modified_at, bookmarked_at, downloaded_at, url, timestamp, - crawl_id, status, retry_at + id, url, timestamp, title, bookmarked_at, created_at, modified_at ) SELECT - id, - created_at, - modified_at, - bookmarked_at, - downloaded_at, - url, timestamp, - NULLIF(crawl_id, ''), - COALESCE(status, 'queued'), - retry_at - FROM core_snapshot + id, url, timestamp, title, + COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at, + COALESCE(added, CURRENT_TIMESTAMP) as created_at, + COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at + FROM core_snapshot; + """) + elif has_bookmarked_at and not has_added: + # Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at) + print('Migrating Snapshot from v0.8.6rc0 schema...') + # Check what fields exist + has_status = 'status' in snapshot_cols + has_retry_at = 'retry_at' in snapshot_cols + has_crawl_id = 'crawl_id' in snapshot_cols + + # Build column list based on what exists + cols = ['id', 'url', 'timestamp', 'title', 'bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'] + if has_crawl_id: + cols.append('crawl_id') + if has_status: + cols.append('status') + if has_retry_at: + cols.append('retry_at') + + cursor.execute(f""" + INSERT OR IGNORE INTO core_snapshot_new ({', '.join(cols)}) + SELECT {', '.join(cols)} + FROM core_snapshot; """) else: - # v0.7.2 schema - will get crawl_id assigned by later migration (0024) - cursor.execute(""" - INSERT OR IGNORE INTO core_snapshot_new ( - id, created_at, modified_at, bookmarked_at, url, timestamp, crawl_id - ) - SELECT - id, - COALESCE(added, CURRENT_TIMESTAMP), - COALESCE(updated, added, CURRENT_TIMESTAMP), - 
COALESCE(added, CURRENT_TIMESTAMP), - url, timestamp, - NULL as crawl_id - FROM core_snapshot - """) + print(f'Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}') - # Replace old table - cursor.execute("DROP TABLE IF EXISTS core_snapshot") - cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot") + cursor.execute("DROP TABLE IF EXISTS core_snapshot;") + cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;") - # Create indexes - cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at)") - cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at)") + # Create indexes + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_timestamp_idx ON core_snapshot(timestamp);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at);") + cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);") - # ============================================================================ - # PART 3: Upgrade core_tag table - # ============================================================================ + # ============================================================================ + # PART 3: Upgrade core_tag table + # ============================================================================ + cursor.execute(""" + CREATE TABLE IF NOT EXISTS core_tag_new ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - # Check if tag id is INTEGER (v0.7.2) or TEXT (v0.8.6rc0) + name VARCHAR(100) NOT NULL UNIQUE, + slug VARCHAR(100) NOT NULL UNIQUE, + + created_by_id INTEGER, + + FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE + ); + """) + + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_tag'") + if cursor.fetchone(): cursor.execute(""" - SELECT type FROM pragma_table_info('core_tag') WHERE name='id' - """) - tag_id_type = cursor.fetchone()[0] if cursor.rowcount else 'INTEGER' - tag_id_is_int = 'INT' in tag_id_type.upper() - - cursor.execute(""" - CREATE TABLE IF NOT EXISTS core_tag_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - created_by_id INTEGER, - - name VARCHAR(100) NOT NULL UNIQUE, - slug VARCHAR(100) NOT NULL UNIQUE, - - FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE 
SET NULL - ) + INSERT OR IGNORE INTO core_tag_new (id, name, slug) + SELECT id, name, slug + FROM core_tag; """) - if tag_id_is_int: - # v0.7.2: Direct copy (INTEGER to INTEGER) - cursor.execute(""" - INSERT OR IGNORE INTO core_tag_new (id, name, slug) - SELECT id, name, slug FROM core_tag - """) - else: - # v0.8.6rc0: Need to remap TEXT ids to new INTEGER ids - cursor.execute("SELECT id, name, slug FROM core_tag") - old_tags = cursor.fetchall() - tag_id_mapping = {} # old_text_id -> new_int_id + cursor.execute("DROP TABLE IF EXISTS core_tag;") + cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag;") - for old_id, name, slug in old_tags: - cursor.execute(""" - INSERT OR IGNORE INTO core_tag_new (name, slug) - VALUES (?, ?) - """, [name, slug]) - cursor.execute("SELECT id FROM core_tag_new WHERE slug = ?", [slug]) - new_id = cursor.fetchone()[0] - tag_id_mapping[old_id] = new_id + # Create indexes + cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_at_idx ON core_tag(created_at);") + cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);") - cursor.execute("DROP TABLE IF EXISTS core_tag") - cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag") - - # Recreate M2M table - cursor.execute(""" - CREATE TABLE IF NOT EXISTS core_snapshot_tags_new ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - snapshot_id TEXT NOT NULL, - tag_id INTEGER NOT NULL, - FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE, - FOREIGN KEY (tag_id) REFERENCES core_tag(id) ON DELETE CASCADE, - UNIQUE(snapshot_id, tag_id) - ) - """) - - if tag_id_is_int: - # Direct copy for v0.7.2 - cursor.execute(""" - INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id) - SELECT snapshot_id, tag_id FROM core_snapshot_tags - """) - else: - # v0.8.6rc0: Use mapping to convert old TEXT ids to new INTEGER ids - cursor.execute("SELECT snapshot_id, tag_id FROM core_snapshot_tags") - m2m_entries = cursor.fetchall() - for snapshot_id, old_tag_id in m2m_entries: - new_tag_id = tag_id_mapping.get(old_tag_id) - if new_tag_id: - cursor.execute(""" - INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id) - VALUES (?, ?) 
- """, [snapshot_id, new_tag_id]) - - cursor.execute("DROP TABLE IF EXISTS core_snapshot_tags") - cursor.execute("ALTER TABLE core_snapshot_tags_new RENAME TO core_snapshot_tags") + print('✓ Core tables upgraded to v0.9.0') class Migration(migrations.Migration): @@ -301,10 +252,49 @@ class Migration(migrations.Migration): dependencies = [ ('core', '0022_auto_20231023_2008'), ('crawls', '0001_initial'), - ('machine', '0001_initial'), ('auth', '0012_alter_user_first_name_max_length'), ] operations = [ - migrations.RunPython(upgrade_from_v072_or_v086, reverse_code=migrations.RunPython.noop), + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunPython( + upgrade_core_tables, + reverse_code=migrations.RunPython.noop, + ), + ], + state_operations=[ + # Remove old ArchiveResult fields + migrations.RemoveField(model_name='archiveresult', name='extractor'), + migrations.RemoveField(model_name='archiveresult', name='output'), + # Remove old Snapshot fields + migrations.RemoveField(model_name='snapshot', name='added'), + migrations.RemoveField(model_name='snapshot', name='updated'), + # SnapshotTag table already exists from v0.7.2, just declare it in state + migrations.CreateModel( + name='SnapshotTag', + fields=[ + ('id', models.AutoField(primary_key=True, serialize=False)), + ('snapshot', models.ForeignKey(to='core.Snapshot', db_column='snapshot_id', on_delete=models.CASCADE)), + ('tag', models.ForeignKey(to='core.Tag', db_column='tag_id', on_delete=models.CASCADE)), + ], + options={ + 'db_table': 'core_snapshot_tags', + 'unique_together': {('snapshot', 'tag')}, + }, + ), + # Declare that Snapshot.tags M2M already uses through=SnapshotTag (from v0.7.2) + migrations.AlterField( + model_name='snapshot', + name='tags', + field=models.ManyToManyField( + 'Tag', + blank=True, + related_name='snapshot_set', + through='SnapshotTag', + through_fields=('snapshot', 'tag'), + ), + ), + ], + ), ] diff --git a/archivebox/core/migrations/0024_assign_default_crawl.py b/archivebox/core/migrations/0024_assign_default_crawl.py index 02cf2bdb..8e985032 100644 --- a/archivebox/core/migrations/0024_assign_default_crawl.py +++ b/archivebox/core/migrations/0024_assign_default_crawl.py @@ -1,7 +1,7 @@ # Generated by hand on 2025-12-29 # Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL -from django.db import migrations +from django.db import migrations, models import uuid @@ -56,8 +56,7 @@ class Migration(migrations.Migration): dependencies = [ ('core', '0023_upgrade_to_0_9_0'), - ('crawls', '0002_upgrade_to_0_9_0'), - ('machine', '0001_initial'), + ('crawls', '0001_initial'), ('auth', '0012_alter_user_first_name_max_length'), ] @@ -66,65 +65,80 @@ class Migration(migrations.Migration): create_default_crawl_and_assign_snapshots, reverse_code=migrations.RunPython.noop, ), - # Now make crawl_id NOT NULL - migrations.RunSQL( - sql=""" - -- Rebuild snapshot table with NOT NULL crawl_id - CREATE TABLE core_snapshot_final ( - id TEXT PRIMARY KEY NOT NULL, - created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + migrations.SeparateDatabaseAndState( + database_operations=[ + # Now make crawl_id NOT NULL + migrations.RunSQL( + sql=""" + -- Rebuild snapshot table with NOT NULL crawl_id + CREATE TABLE core_snapshot_final ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - url TEXT NOT NULL, - timestamp VARCHAR(32) 
NOT NULL UNIQUE, - bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + url TEXT NOT NULL, + timestamp VARCHAR(32) NOT NULL UNIQUE, + bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - crawl_id TEXT NOT NULL, - parent_snapshot_id TEXT, + crawl_id TEXT NOT NULL, + parent_snapshot_id TEXT, - title VARCHAR(512), - downloaded_at DATETIME, - depth INTEGER NOT NULL DEFAULT 0, - fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0', + title VARCHAR(512), + downloaded_at DATETIME, + depth INTEGER NOT NULL DEFAULT 0, + fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0', - config TEXT NOT NULL DEFAULT '{}', - notes TEXT NOT NULL DEFAULT '', - num_uses_succeeded INTEGER NOT NULL DEFAULT 0, - num_uses_failed INTEGER NOT NULL DEFAULT 0, + config TEXT NOT NULL DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, - status VARCHAR(15) NOT NULL DEFAULT 'queued', - retry_at DATETIME, - current_step INTEGER NOT NULL DEFAULT 0, + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + current_step INTEGER NOT NULL DEFAULT 0, - FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE, - FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL - ); + FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE, + FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL + ); - INSERT INTO core_snapshot_final ( - id, created_at, modified_at, url, timestamp, bookmarked_at, - crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version, - config, notes, num_uses_succeeded, num_uses_failed, - status, retry_at, current_step - ) - SELECT - id, created_at, modified_at, url, timestamp, bookmarked_at, - crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version, - COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed, - status, retry_at, current_step - FROM core_snapshot; + INSERT INTO core_snapshot_final ( + id, created_at, modified_at, url, timestamp, bookmarked_at, + crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version, + config, notes, num_uses_succeeded, num_uses_failed, + status, retry_at, current_step + ) + SELECT + id, created_at, modified_at, url, timestamp, bookmarked_at, + crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version, + COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed, + status, retry_at, current_step + FROM core_snapshot; - DROP TABLE core_snapshot; - ALTER TABLE core_snapshot_final RENAME TO core_snapshot; + DROP TABLE core_snapshot; + ALTER TABLE core_snapshot_final RENAME TO core_snapshot; - CREATE INDEX core_snapshot_url_idx ON core_snapshot(url); - CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp); - CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at); - CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id); - CREATE INDEX core_snapshot_status_idx ON core_snapshot(status); - CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at); - CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at); - CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id); - """, - reverse_sql=migrations.RunSQL.noop, + CREATE INDEX core_snapshot_url_idx ON core_snapshot(url); + CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp); + CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at); + 
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id); + CREATE INDEX core_snapshot_status_idx ON core_snapshot(status); + CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at); + CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at); + CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id); + """, + reverse_sql=migrations.RunSQL.noop, + ), + ], + state_operations=[ + migrations.AddField( + model_name='snapshot', + name='crawl', + field=models.ForeignKey( + on_delete=models.deletion.CASCADE, + to='crawls.crawl', + help_text='Crawl that created this snapshot' + ), + ), + ], ), ] diff --git a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py new file mode 100644 index 00000000..49533fa8 --- /dev/null +++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py @@ -0,0 +1,258 @@ +# Generated by Django 6.0 on 2025-12-31 23:09 + +import archivebox.base_models.models +import django.db.models.deletion +import django.utils.timezone +import uuid +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0024_assign_default_crawl'), + ('crawls', '0001_initial'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AlterModelOptions( + name='archiveresult', + options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'}, + ), + migrations.AlterModelOptions( + name='snapshot', + options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'}, + ), + migrations.RemoveField( + model_name='archiveresult', + name='cmd', + ), + migrations.RemoveField( + model_name='archiveresult', + name='cmd_version', + ), + migrations.RemoveField( + model_name='archiveresult', + name='pwd', + ), + migrations.AddField( + model_name='archiveresult', + name='config', + field=models.JSONField(blank=True, default=dict, null=True), + ), + migrations.AddField( + model_name='archiveresult', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name='archiveresult', + name='hook_name', + field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255), + ), + migrations.AddField( + model_name='archiveresult', + name='modified_at', + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name='archiveresult', + name='notes', + field=models.TextField(blank=True, default=''), + ), + migrations.AddField( + model_name='archiveresult', + name='num_uses_failed', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='archiveresult', + name='num_uses_succeeded', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='archiveresult', + name='output_files', + field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'), + ), + migrations.AddField( + model_name='archiveresult', + name='output_json', + field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True), + ), + migrations.AddField( + model_name='archiveresult', + name='output_mimetypes', + 
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512), + ), + migrations.AddField( + model_name='archiveresult', + name='output_size', + field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'), + ), + migrations.AddField( + model_name='archiveresult', + name='output_str', + field=models.TextField(blank=True, default='', help_text='Human-readable output summary'), + ), + migrations.AddField( + model_name='archiveresult', + name='plugin', + field=models.CharField(db_index=True, default='', max_length=32), + ), + migrations.AddField( + model_name='archiveresult', + name='retry_at', + field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), + ), + migrations.AddField( + model_name='snapshot', + name='bookmarked_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name='snapshot', + name='config', + field=models.JSONField(default=dict), + ), + migrations.AddField( + model_name='snapshot', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name='snapshot', + name='current_step', + field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'), + ), + migrations.AddField( + model_name='snapshot', + name='depth', + field=models.PositiveSmallIntegerField(db_index=True, default=0), + ), + migrations.AddField( + model_name='snapshot', + name='downloaded_at', + field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), + ), + migrations.AddField( + model_name='snapshot', + name='fs_version', + field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). 
Used to trigger lazy migration on save().', max_length=10), + ), + migrations.AddField( + model_name='snapshot', + name='modified_at', + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name='snapshot', + name='notes', + field=models.TextField(blank=True, default=''), + ), + migrations.AddField( + model_name='snapshot', + name='num_uses_failed', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='snapshot', + name='num_uses_succeeded', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='snapshot', + name='parent_snapshot', + field=models.ForeignKey(blank=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'), + ), + migrations.AddField( + model_name='snapshot', + name='retry_at', + field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), + ), + migrations.AddField( + model_name='snapshot', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15), + ), + migrations.AddField( + model_name='tag', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True), + ), + migrations.AddField( + model_name='tag', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL), + ), + migrations.AddField( + model_name='tag', + name='modified_at', + field=models.DateTimeField(auto_now=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='end_ts', + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.AutoField(editable=False, primary_key=True, serialize=False), + ), + migrations.AlterField( + model_name='archiveresult', + name='start_ts', + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15), + ), + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True), + ), + migrations.AlterField( + model_name='snapshot', + name='crawl', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'), + ), + migrations.AlterField( + model_name='snapshot', + name='id', + field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='snapshot', + name='tags', + field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'), + ), + migrations.AlterField( + model_name='snapshot', + name='timestamp', + field=models.CharField(db_index=True, editable=False, max_length=32, unique=True), + ), + migrations.AlterField( + model_name='snapshot', + name='url', + 
field=models.URLField(db_index=True), + ), + migrations.AlterField( + model_name='tag', + name='slug', + field=models.SlugField(editable=False, max_length=100, unique=True), + ), + migrations.AddConstraint( + model_name='snapshot', + constraint=models.UniqueConstraint(fields=('url', 'crawl'), name='unique_url_per_crawl'), + ), + migrations.AddConstraint( + model_name='snapshot', + constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'), + ), + ] diff --git a/archivebox/core/migrations/0025_cleanup_schema.py b/archivebox/core/migrations/0025_cleanup_schema.py deleted file mode 100644 index 4ab000c4..00000000 --- a/archivebox/core/migrations/0025_cleanup_schema.py +++ /dev/null @@ -1,484 +0,0 @@ -# Generated by hand on 2025-12-29 -# Cleans up extra columns from raw SQL migrations and ensures schema matches models - -from django.db import migrations, models -import django.db.models.deletion -import django.utils.timezone -from django.conf import settings -import archivebox.base_models.models - - -def cleanup_extra_columns(apps, schema_editor): - """ - Create Process records from old cmd/pwd/cmd_version columns and remove those columns. - This preserves the execution details by moving them to the Process model. - """ - with schema_editor.connection.cursor() as cursor: - # Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0) - cursor.execute("SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='cmd'") - has_cmd = cursor.fetchone()[0] > 0 - - if has_cmd: - print(" Migrating cmd/pwd/cmd_version data to Process records...") - - # For each ArchiveResult, create a Process record with cmd/pwd data - # Note: cmd_version from old schema is not preserved (it's now derived from Binary) - cursor.execute(""" - SELECT id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status - FROM core_archiveresult - """) - archive_results = cursor.fetchall() - - from archivebox.uuid_compat import uuid7 - from archivebox.base_models.models import get_or_create_system_user_pk - - # Get or create a Machine record - result = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone() - if result: - machine_id = result[0] - print(f" Using existing Machine: {machine_id}") - else: - # Create a minimal Machine record with raw SQL (can't use model during migration) - print(" Creating Machine record for Process migration...") - import platform - import socket - - # Generate minimal machine data without using the model - machine_id = str(uuid7()) - guid = f"{socket.gethostname()}-{platform.machine()}" - hostname = socket.gethostname() - - # Check schema version - cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='config'") - has_config = cursor.fetchone()[0] > 0 - cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='abid'") - has_abid = cursor.fetchone()[0] > 0 - cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='num_uses_succeeded'") - has_num_uses = cursor.fetchone()[0] > 0 - - # Insert directly with SQL (use INSERT OR IGNORE in case it already exists) - if has_config: - # v0.9.0+ schema - cursor.execute(""" - INSERT OR IGNORE INTO machine_machine ( - id, created_at, modified_at, - guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, - os_arch, os_family, os_platform, os_release, os_kernel, - stats, config - ) VALUES (?, datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}', '{}') - """, ( - machine_id, guid, 
hostname, - platform.machine(), platform.system(), platform.platform(), platform.release() - )) - elif has_abid and has_num_uses: - # v0.8.6rc0 schema (has abid and num_uses columns) - cursor.execute(""" - INSERT OR IGNORE INTO machine_machine ( - id, abid, created_at, modified_at, - guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, - os_arch, os_family, os_platform, os_release, os_kernel, - stats, num_uses_failed, num_uses_succeeded - ) VALUES (?, '', datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}', 0, 0) - """, ( - machine_id, guid, hostname, - platform.machine(), platform.system(), platform.platform(), platform.release() - )) - else: - # v0.7.2 or other schema - cursor.execute(""" - INSERT OR IGNORE INTO machine_machine ( - id, created_at, modified_at, - guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, - os_arch, os_family, os_platform, os_release, os_kernel, - stats - ) VALUES (?, datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}') - """, ( - machine_id, guid, hostname, - platform.machine(), platform.system(), platform.platform(), platform.release() - )) - # Re-query to get the actual id (in case INSERT OR IGNORE skipped it) - result = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone() - if result: - machine_id = result[0] - print(f" ✓ Using/Created Machine: {machine_id}") - else: - # INSERT OR IGNORE failed - try again without IGNORE to see the error - raise Exception("Failed to create Machine record - machine_machine table is empty after INSERT") - - for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results: - # Create Process record - process_id = str(uuid7()) - cursor.execute(""" - INSERT INTO machine_process ( - id, created_at, modified_at, - machine_id, binary_id, iface_id, - pwd, cmd, env, timeout, - pid, exit_code, stdout, stderr, - started_at, ended_at, url, status, retry_at - ) VALUES (?, datetime('now'), datetime('now'), ?, ?, ?, ?, ?, '{}', 120, NULL, NULL, '', '', ?, ?, '', ?, NULL) - """, (process_id, machine_id, binary_id, iface_id, pwd or '', cmd or '[]', start_ts, end_ts, status or 'queued')) - - # Update ArchiveResult to point to new Process - cursor.execute("UPDATE core_archiveresult SET process_id = ? 
WHERE id = ?", (process_id, ar_id)) - - print(f" ✓ Created {len(archive_results)} Process records from ArchiveResult data") - - # Now rebuild table without the extra columns - print(" Rebuilding core_archiveresult table...") - cursor.execute(""" - CREATE TABLE core_archiveresult_final ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - uuid TEXT, - created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - - snapshot_id TEXT NOT NULL, - plugin VARCHAR(32) NOT NULL DEFAULT '', - hook_name VARCHAR(255) NOT NULL DEFAULT '', - - start_ts DATETIME, - end_ts DATETIME, - status VARCHAR(15) NOT NULL DEFAULT 'queued', - retry_at DATETIME, - - output_files TEXT NOT NULL DEFAULT '{}', - output_json TEXT, - output_str TEXT NOT NULL DEFAULT '', - output_size INTEGER NOT NULL DEFAULT 0, - output_mimetypes VARCHAR(512) NOT NULL DEFAULT '', - - config TEXT, - notes TEXT NOT NULL DEFAULT '', - num_uses_succeeded INTEGER NOT NULL DEFAULT 0, - num_uses_failed INTEGER NOT NULL DEFAULT 0, - - process_id TEXT NOT NULL, - - FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE, - FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT - ) - """) - - # Copy data (cmd, pwd, etc. are now in Process records) - cursor.execute(""" - INSERT INTO core_archiveresult_final SELECT - id, uuid, created_at, modified_at, - snapshot_id, plugin, hook_name, - start_ts, end_ts, status, retry_at, - output_files, output_json, output_str, output_size, output_mimetypes, - config, notes, num_uses_succeeded, num_uses_failed, - process_id - FROM core_archiveresult - """) - - # Replace table - cursor.execute("DROP TABLE core_archiveresult") - cursor.execute("ALTER TABLE core_archiveresult_final RENAME TO core_archiveresult") - - # Recreate indexes - cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)") - cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)") - cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)") - cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)") - cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)") - cursor.execute("CREATE INDEX core_archiveresult_uuid_idx ON core_archiveresult(uuid)") - - print(" ✓ Cleaned up core_archiveresult schema") - - -class Migration(migrations.Migration): - - dependencies = [ - ('core', '0024_assign_default_crawl'), - ('machine', '0005_add_process_table'), - ('crawls', '0002_upgrade_to_0_9_0'), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.SeparateDatabaseAndState( - database_operations=[ - migrations.RunPython( - cleanup_extra_columns, - reverse_code=migrations.RunPython.noop, - ), - ], - state_operations=[ - # Tell Django about all the fields that exist after raw SQL migrations - # ArchiveResult model options - migrations.AlterModelOptions( - name='archiveresult', - options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'}, - ), - - # Remove old fields - migrations.RemoveField(model_name='archiveresult', name='cmd'), - migrations.RemoveField(model_name='archiveresult', name='pwd'), - migrations.RemoveField(model_name='archiveresult', name='cmd_version'), - migrations.RemoveField(model_name='archiveresult', name='extractor'), - migrations.RemoveField(model_name='archiveresult', name='output'), - 
migrations.RemoveField(model_name='snapshot', name='added'), - migrations.RemoveField(model_name='snapshot', name='updated'), - - # Add new ArchiveResult fields - migrations.AddField( - model_name='archiveresult', - name='plugin', - field=models.CharField(blank=True, default='', max_length=32), - ), - migrations.AddField( - model_name='archiveresult', - name='hook_name', - field=models.CharField(blank=True, default='', max_length=255), - ), - migrations.AddField( - model_name='archiveresult', - name='output_str', - field=models.TextField(blank=True, default=''), - ), - migrations.AddField( - model_name='archiveresult', - name='output_json', - field=models.JSONField(blank=True, default=dict, null=True), - ), - migrations.AddField( - model_name='archiveresult', - name='output_files', - field=models.JSONField(blank=True, default=dict), - ), - migrations.AddField( - model_name='archiveresult', - name='output_size', - field=models.PositiveIntegerField(default=0), - ), - migrations.AddField( - model_name='archiveresult', - name='output_mimetypes', - field=models.CharField(blank=True, default='', max_length=512), - ), - migrations.AddField( - model_name='archiveresult', - name='config', - field=models.JSONField(blank=True, default=dict, null=True), - ), - migrations.AddField( - model_name='archiveresult', - name='notes', - field=models.TextField(blank=True, default=''), - ), - migrations.AddField( - model_name='archiveresult', - name='num_uses_succeeded', - field=models.PositiveIntegerField(default=0), - ), - migrations.AddField( - model_name='archiveresult', - name='num_uses_failed', - field=models.PositiveIntegerField(default=0), - ), - migrations.AddField( - model_name='archiveresult', - name='retry_at', - field=models.DateTimeField(blank=True, db_index=True, default=None, null=True), - ), - migrations.AddField( - model_name='archiveresult', - name='created_at', - field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), - ), - migrations.AddField( - model_name='archiveresult', - name='modified_at', - field=models.DateTimeField(auto_now=True), - ), - migrations.AddField( - model_name='archiveresult', - name='process', - field=models.OneToOneField(null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'), - ), - - # Update Snapshot model - migrations.AlterModelOptions( - name='snapshot', - options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'}, - ), - migrations.AddField( - model_name='snapshot', - name='created_at', - field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), - ), - migrations.AddField( - model_name='snapshot', - name='modified_at', - field=models.DateTimeField(auto_now=True), - ), - migrations.AddField( - model_name='snapshot', - name='bookmarked_at', - field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), - ), - migrations.AddField( - model_name='snapshot', - name='downloaded_at', - field=models.DateTimeField(blank=True, null=True), - ), - migrations.AddField( - model_name='snapshot', - name='crawl', - field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'), - ), - migrations.AddField( - model_name='snapshot', - name='depth', - field=models.PositiveSmallIntegerField(default=0), - ), - migrations.AddField( - model_name='snapshot', - name='parent_snapshot', - field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'), - ), - 
migrations.AddField( - model_name='snapshot', - name='status', - field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15), - ), - migrations.AddField( - model_name='snapshot', - name='retry_at', - field=models.DateTimeField(blank=True, db_index=True, default=None, null=True), - ), - migrations.AddField( - model_name='snapshot', - name='current_step', - field=models.PositiveSmallIntegerField(default=0), - ), - migrations.AddField( - model_name='snapshot', - name='fs_version', - field=models.CharField(default='0.9.0', max_length=10), - ), - migrations.AddField( - model_name='snapshot', - name='config', - field=models.JSONField(blank=True, default=dict), - ), - migrations.AddField( - model_name='snapshot', - name='notes', - field=models.TextField(blank=True, default=''), - ), - migrations.AddField( - model_name='snapshot', - name='num_uses_succeeded', - field=models.PositiveIntegerField(default=0), - ), - migrations.AddField( - model_name='snapshot', - name='num_uses_failed', - field=models.PositiveIntegerField(default=0), - ), - - # Update Tag model - migrations.AlterModelOptions( - name='tag', - options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'}, - ), - migrations.AddField( - model_name='tag', - name='created_at', - field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True), - ), - migrations.AddField( - model_name='tag', - name='modified_at', - field=models.DateTimeField(auto_now=True), - ), - migrations.AddField( - model_name='tag', - name='created_by', - field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL), - ), - - # Alter field types - migrations.AlterField( - model_name='archiveresult', - name='id', - field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), - ), - migrations.AlterField( - model_name='archiveresult', - name='uuid', - field=models.UUIDField(blank=True, db_index=True, editable=False, null=True, unique=True), - ), - migrations.AlterField( - model_name='archiveresult', - name='end_ts', - field=models.DateTimeField(blank=True, default=None, null=True), - ), - migrations.AlterField( - model_name='archiveresult', - name='start_ts', - field=models.DateTimeField(blank=True, default=None, null=True), - ), - migrations.AlterField( - model_name='archiveresult', - name='status', - field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=15), - ), - migrations.AlterField( - model_name='snapshot', - name='id', - field=models.CharField(editable=False, max_length=32, primary_key=True, serialize=False, unique=True), - ), - migrations.AlterField( - model_name='snapshot', - name='timestamp', - field=models.CharField(db_index=True, max_length=32, unique=True), - ), - migrations.AlterField( - model_name='snapshot', - name='url', - field=models.URLField(max_length=2048), - ), - migrations.AlterField( - model_name='tag', - name='slug', - field=models.SlugField(editable=False, max_length=100, unique=True), - ), - - # Create M2M model for snapshot tags - migrations.CreateModel( - name='SnapshotTag', - fields=[ - ('id', models.AutoField(primary_key=True, serialize=False, verbose_name='ID')), - ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, 
to='core.snapshot')), - ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')), - ], - options={ - 'db_table': 'core_snapshot_tags', - }, - ), - migrations.AlterUniqueTogether( - name='snapshottag', - unique_together={('snapshot', 'tag')}, - ), - - # Update tags field on Snapshot to use the through model - migrations.AlterField( - model_name='snapshot', - name='tags', - field=models.ManyToManyField(related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'), - ), - - # Add constraints - migrations.AddConstraint( - model_name='snapshot', - constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'), - ), - migrations.AddConstraint( - model_name='snapshot', - constraint=models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'), - ), - ], - ), - ] diff --git a/archivebox/core/migrations/0026_final_field_adjustments.py b/archivebox/core/migrations/0026_final_field_adjustments.py deleted file mode 100644 index a7d16774..00000000 --- a/archivebox/core/migrations/0026_final_field_adjustments.py +++ /dev/null @@ -1,76 +0,0 @@ -# Generated by hand on 2025-12-30 -# Final field adjustments to match model definitions exactly - -from django.db import migrations, models -import django.db.models.deletion -import django.utils.timezone -from archivebox.uuid_compat import uuid7 - - -class Migration(migrations.Migration): - - dependencies = [ - ('core', '0025_cleanup_schema'), - ('crawls', '0002_upgrade_to_0_9_0'), - ] - - operations = [ - # Alter Snapshot fields to match model exactly - migrations.AlterField( - model_name='snapshot', - name='id', - field=models.UUIDField(default=uuid7, editable=False, primary_key=True, unique=True), - ), - migrations.AlterField( - model_name='snapshot', - name='timestamp', - field=models.CharField(db_index=True, editable=False, max_length=32, unique=True), - ), - migrations.AlterField( - model_name='snapshot', - name='url', - field=models.URLField(db_index=True, unique=False), - ), - migrations.AlterField( - model_name='snapshot', - name='downloaded_at', - field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), - ), - migrations.AlterField( - model_name='snapshot', - name='parent_snapshot', - field=models.ForeignKey(blank=True, db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'), - ), - migrations.AlterField( - model_name='snapshot', - name='retry_at', - field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), - ), - migrations.AlterField( - model_name='snapshot', - name='fs_version', - field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). 
Used to trigger lazy migration on save().', max_length=10), - ), - migrations.AlterField( - model_name='snapshot', - name='tags', - field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'), - ), - - # Alter SnapshotTag fields - migrations.AlterField( - model_name='snapshottag', - name='id', - field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), - ), - migrations.AlterField( - model_name='snapshottag', - name='snapshot', - field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'), - ), - migrations.AlterField( - model_name='snapshottag', - name='tag', - field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'), - ), - ] diff --git a/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py b/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py deleted file mode 100644 index 4f4ed92b..00000000 --- a/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py +++ /dev/null @@ -1,108 +0,0 @@ -# Generated by Django 6.0 on 2025-12-31 09:04 - -import django.db.models.deletion -import django.utils.timezone -import uuid -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('core', '0026_final_field_adjustments'), - ('crawls', '0002_upgrade_to_0_9_0'), - ('machine', '0001_initial'), - ] - - operations = [ - migrations.AlterField( - model_name='archiveresult', - name='hook_name', - field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255), - ), - migrations.AlterField( - model_name='archiveresult', - name='id', - field=models.AutoField(editable=False, primary_key=True, serialize=False), - ), - migrations.AlterField( - model_name='archiveresult', - name='output_files', - field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'), - ), - migrations.AlterField( - model_name='archiveresult', - name='output_json', - field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True), - ), - migrations.AlterField( - model_name='archiveresult', - name='output_mimetypes', - field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512), - ), - migrations.AlterField( - model_name='archiveresult', - name='output_size', - field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'), - ), - migrations.AlterField( - model_name='archiveresult', - name='output_str', - field=models.TextField(blank=True, default='', help_text='Human-readable output summary'), - ), - migrations.AlterField( - model_name='archiveresult', - name='plugin', - field=models.CharField(db_index=True, default='', max_length=32), - ), - migrations.AlterField( - model_name='archiveresult', - name='process', - field=models.OneToOneField(help_text='Process execution details for this archive result', on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'), - ), - migrations.AlterField( - model_name='archiveresult', - name='retry_at', - field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), - ), - migrations.AlterField( - 
model_name='archiveresult', - name='status', - field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15), - ), - migrations.AlterField( - model_name='archiveresult', - name='uuid', - field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True), - ), - migrations.AlterField( - model_name='snapshot', - name='config', - field=models.JSONField(default=dict), - ), - migrations.AlterField( - model_name='snapshot', - name='crawl', - field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'), - ), - migrations.AlterField( - model_name='snapshot', - name='current_step', - field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'), - ), - migrations.AlterField( - model_name='snapshot', - name='depth', - field=models.PositiveSmallIntegerField(db_index=True, default=0), - ), - migrations.AlterField( - model_name='snapshot', - name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), - ), - migrations.AlterField( - model_name='snapshottag', - name='id', - field=models.AutoField(primary_key=True, serialize=False), - ), - ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index b4cf9045..b8aa660c 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -91,9 +91,9 @@ class Tag(ModelWithSerializers): def api_url(self) -> str: return reverse_lazy('api-1:get_tag', args=[self.id]) - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Tag model instance to a JSONL record. + Convert Tag model instance to a JSON-serializable dict. """ from archivebox.config import VERSION return { @@ -105,12 +105,12 @@ class Tag(ModelWithSerializers): } @staticmethod - def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None): + def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None): """ - Create/update Tag from JSONL record. + Create/update Tag from JSON dict. 
Args: - record: JSONL record with 'name' field + record: JSON dict with 'name' field overrides: Optional dict with 'snapshot' to auto-attach tag Returns: @@ -982,8 +982,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea processes_seen = set() with open(index_path, 'w') as f: - # Write Snapshot record first (to_jsonl includes crawl_id, fs_version) - f.write(json.dumps(self.to_jsonl()) + '\n') + # Write Snapshot record first (to_json includes crawl_id, fs_version) + f.write(json.dumps(self.to_json()) + '\n') # Write ArchiveResult records with their associated Binary and Process # Use select_related to optimize queries @@ -991,15 +991,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Write Binary record if not already written if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen: binaries_seen.add(ar.process.binary_id) - f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n') + f.write(json.dumps(ar.process.binary.to_json()) + '\n') # Write Process record if not already written if ar.process and ar.process_id not in processes_seen: processes_seen.add(ar.process_id) - f.write(json.dumps(ar.process.to_jsonl()) + '\n') + f.write(json.dumps(ar.process.to_json()) + '\n') # Write ArchiveResult record - f.write(json.dumps(ar.to_jsonl()) + '\n') + f.write(json.dumps(ar.to_json()) + '\n') def read_index_jsonl(self) -> dict: """ @@ -1422,9 +1422,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return False - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Snapshot model instance to a JSONL record. + Convert Snapshot model instance to a JSON-serializable dict. Includes all fields needed to fully reconstruct/identify this snapshot. """ from archivebox.config import VERSION @@ -1445,9 +1445,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea } @staticmethod - def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True): + def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True): """ - Create/update Snapshot from JSONL record or dict. + Create/update Snapshot from JSON dict. Unified method that handles: - ID-based patching: {"id": "...", "title": "new title"} @@ -2106,8 +2106,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea result['canonical'] = self.canonical_outputs() return result - def to_json(self, indent: int = 4) -> str: - """Convert to JSON string""" + def to_json_str(self, indent: int = 4) -> str: + """Convert to JSON string (legacy method, use to_json() for dict)""" return to_json(self.to_dict(extended=True), indent=indent) def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str: @@ -2284,14 +2284,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)') # Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.) 
- # Required - every ArchiveResult must have a Process - process = models.OneToOneField( - 'machine.Process', - on_delete=models.PROTECT, - null=False, # Required after migration 4 - related_name='archiveresult', - help_text='Process execution details for this archive result' - ) + # Added POST-v0.9.0, will be added in a separate migration + # process = models.OneToOneField( + # 'machine.Process', + # on_delete=models.PROTECT, + # null=False, + # related_name='archiveresult', + # help_text='Process execution details for this archive result' + # ) # New output fields (replacing old 'output' field) output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary') @@ -2326,9 +2326,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi """Convenience property to access the user who created this archive result via its snapshot's crawl.""" return self.snapshot.crawl.created_by - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert ArchiveResult model instance to a JSONL record. + Convert ArchiveResult model instance to a JSON-serializable dict. """ from archivebox.config import VERSION record = { @@ -2360,6 +2360,50 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi record['process_id'] = str(self.process_id) return record + @staticmethod + def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None): + """ + Create/update ArchiveResult from JSON dict. + + Args: + record: JSON dict with 'snapshot_id', 'plugin', etc. + overrides: Optional dict of field overrides + + Returns: + ArchiveResult instance or None + """ + snapshot_id = record.get('snapshot_id') + plugin = record.get('plugin') + + if not snapshot_id or not plugin: + return None + + # Try to get existing by ID first + result_id = record.get('id') + if result_id: + try: + return ArchiveResult.objects.get(id=result_id) + except ArchiveResult.DoesNotExist: + pass + + # Get or create by snapshot_id + plugin + try: + from archivebox.core.models import Snapshot + snapshot = Snapshot.objects.get(id=snapshot_id) + + result, _ = ArchiveResult.objects.get_or_create( + snapshot=snapshot, + plugin=plugin, + defaults={ + 'hook_name': record.get('hook_name', ''), + 'status': record.get('status', 'queued'), + 'output_str': record.get('output_str', ''), + } + ) + return result + except Snapshot.DoesNotExist: + return None + def save(self, *args, **kwargs): is_new = self._state.adding diff --git a/archivebox/crawls/migrations/0002_upgrade_to_0_9_0.py b/archivebox/crawls/migrations/0002_upgrade_to_0_9_0.py deleted file mode 100644 index 7afca909..00000000 --- a/archivebox/crawls/migrations/0002_upgrade_to_0_9_0.py +++ /dev/null @@ -1,90 +0,0 @@ -# Generated by hand on 2025-12-29 -# Upgrades crawls_crawl table from v0.8.6rc0 to v0.9.0 schema - -from django.db import migrations - - -def upgrade_crawl_schema_if_needed(apps, schema_editor): - """ - Upgrade crawls_crawl table if it has the old v0.8.6rc0 schema (no urls column). 
- """ - with schema_editor.connection.cursor() as cursor: - # Check if we need to upgrade (missing urls column means v0.8.6rc0) - cursor.execute(""" - SELECT COUNT(*) FROM pragma_table_info('crawls_crawl') WHERE name='urls' - """) - has_urls = cursor.fetchone()[0] > 0 - - if not has_urls: - print(" Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0 schema...") - - # Create new table with v0.9.0 schema - cursor.execute(""" - CREATE TABLE crawls_crawl_new ( - id TEXT PRIMARY KEY NOT NULL, - created_at DATETIME NOT NULL, - modified_at DATETIME NOT NULL, - num_uses_succeeded INTEGER NOT NULL DEFAULT 0, - num_uses_failed INTEGER NOT NULL DEFAULT 0, - - urls TEXT NOT NULL DEFAULT '[]', - config TEXT, - max_depth INTEGER NOT NULL DEFAULT 0, - tags_str VARCHAR(1024) NOT NULL DEFAULT '', - persona_id TEXT, - label VARCHAR(64) NOT NULL DEFAULT '', - notes TEXT NOT NULL DEFAULT '', - output_dir VARCHAR(512) NOT NULL DEFAULT '', - - status VARCHAR(15) NOT NULL DEFAULT 'queued', - retry_at DATETIME, - created_by_id INTEGER NOT NULL, - schedule_id TEXT, - - FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE, - FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL - ) - """) - - # Copy data from old table (v0.8.6rc0 schema) - cursor.execute(""" - INSERT INTO crawls_crawl_new ( - id, created_at, modified_at, num_uses_succeeded, num_uses_failed, - urls, config, max_depth, tags_str, status, retry_at, created_by_id, schedule_id - ) - SELECT - id, created_at, modified_at, num_uses_succeeded, num_uses_failed, - '[]' as urls, config, max_depth, tags_str, status, retry_at, created_by_id, - CAST(schedule_id AS TEXT) - FROM crawls_crawl - """) - - # Replace old table - cursor.execute("DROP TABLE crawls_crawl") - cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl") - - # Create indexes - cursor.execute("CREATE INDEX crawls_crawl_status_idx ON crawls_crawl(status)") - cursor.execute("CREATE INDEX crawls_crawl_retry_at_idx ON crawls_crawl(retry_at)") - cursor.execute("CREATE INDEX crawls_crawl_created_at_idx ON crawls_crawl(created_at)") - cursor.execute("CREATE INDEX crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id)") - cursor.execute("CREATE INDEX crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id)") - - print(" ✓ Upgraded crawls_crawl to v0.9.0 schema") - else: - print(" ✓ crawls_crawl already has v0.9.0 schema") - - -class Migration(migrations.Migration): - - dependencies = [ - ('crawls', '0001_initial'), - ('auth', '0012_alter_user_first_name_max_length'), - ] - - operations = [ - migrations.RunPython( - upgrade_crawl_schema_if_needed, - reverse_code=migrations.RunPython.noop, - ), - ] diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index c3b588c4..276d02f8 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -134,9 +134,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith def api_url(self) -> str: return reverse_lazy('api-1:get_crawl', args=[self.id]) - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Crawl model instance to a JSONL record. + Convert Crawl model instance to a JSON-serializable dict. """ from archivebox.config import VERSION return { @@ -152,9 +152,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith } @staticmethod - def from_jsonl(record: dict, overrides: dict = None): + def from_json(record: dict, overrides: dict = None): """ - Create or get a Crawl from a JSONL record. 
+        Create or get a Crawl from a JSON dict.
 
         Args:
             record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index 73febfa0..a9bb671f 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -1176,7 +1176,7 @@ def create_model_record(record: Dict[str, Any]) -> Any:
 def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
     """
     Process JSONL records from hook output.
-    Dispatches to Model.from_jsonl() for each record type.
+    Dispatches to Model.from_json() for each record type.
 
     Args:
         records: List of JSONL record dicts from result['records']
@@ -1201,25 +1201,25 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
         # Dispatch to appropriate model's from_jsonl() method
         if record_type == 'Snapshot':
             from archivebox.core.models import Snapshot
-            obj = Snapshot.from_jsonl(record.copy(), overrides)
+            obj = Snapshot.from_json(record.copy(), overrides)
             if obj:
                 stats['Snapshot'] = stats.get('Snapshot', 0) + 1
 
         elif record_type == 'Tag':
             from archivebox.core.models import Tag
-            obj = Tag.from_jsonl(record.copy(), overrides)
+            obj = Tag.from_json(record.copy(), overrides)
             if obj:
                 stats['Tag'] = stats.get('Tag', 0) + 1
 
         elif record_type == 'Binary':
             from archivebox.machine.models import Binary
-            obj = Binary.from_jsonl(record.copy(), overrides)
+            obj = Binary.from_json(record.copy(), overrides)
             if obj:
                 stats['Binary'] = stats.get('Binary', 0) + 1
 
         elif record_type == 'Machine':
             from archivebox.machine.models import Machine
-            obj = Machine.from_jsonl(record.copy(), overrides)
+            obj = Machine.from_json(record.copy(), overrides)
             if obj:
                 stats['Machine'] = stats.get('Machine', 0) + 1
 
diff --git a/archivebox/machine/migrations/0001_initial.py b/archivebox/machine/migrations/0001_initial.py
index 37c42fd0..cd9c4291 100644
--- a/archivebox/machine/migrations/0001_initial.py
+++ b/archivebox/machine/migrations/0001_initial.py
@@ -100,46 +100,8 @@ class Migration(migrations.Migration):
                 CREATE INDEX IF NOT EXISTS machine_binary_status_idx ON machine_binary(status);
                 CREATE INDEX IF NOT EXISTS machine_binary_retry_at_idx ON machine_binary(retry_at);
 
-                -- Create machine_process table
-                CREATE TABLE IF NOT EXISTS machine_process (
-                    id TEXT PRIMARY KEY NOT NULL,
-                    created_at DATETIME NOT NULL,
-                    modified_at DATETIME NOT NULL,
-
-                    machine_id TEXT NOT NULL,
-                    binary_id TEXT,
-                    iface_id TEXT,
-
-                    pwd VARCHAR(512) NOT NULL DEFAULT '',
-                    cmd TEXT NOT NULL DEFAULT '[]',
-                    env TEXT NOT NULL DEFAULT '{}',
-                    timeout INTEGER NOT NULL DEFAULT 120,
-
-                    pid INTEGER,
-                    exit_code INTEGER,
-                    stdout TEXT NOT NULL DEFAULT '',
-                    stderr TEXT NOT NULL DEFAULT '',
-
-                    started_at DATETIME,
-                    ended_at DATETIME,
-
-                    url VARCHAR(2048),
-
-                    status VARCHAR(16) NOT NULL DEFAULT 'queued',
-                    retry_at DATETIME,
-
-                    FOREIGN KEY (machine_id) REFERENCES machine_machine(id) ON DELETE CASCADE,
-                    FOREIGN KEY (binary_id) REFERENCES machine_binary(id) ON DELETE SET NULL,
-                    FOREIGN KEY (iface_id) REFERENCES machine_networkinterface(id) ON DELETE SET NULL
-                );
-                CREATE INDEX IF NOT EXISTS machine_process_status_idx ON machine_process(status);
-                CREATE INDEX IF NOT EXISTS machine_process_retry_at_idx ON machine_process(retry_at);
-                CREATE INDEX IF NOT EXISTS machine_process_machine_id_idx ON machine_process(machine_id);
-                CREATE INDEX IF NOT EXISTS machine_process_binary_id_idx ON machine_process(binary_id);
-                CREATE INDEX IF NOT EXISTS machine_process_machine_status_retry_idx ON 
machine_process(machine_id, status, retry_at); """, reverse_sql=""" - DROP TABLE IF EXISTS machine_process; DROP TABLE IF EXISTS machine_binary; DROP TABLE IF EXISTS machine_networkinterface; DROP TABLE IF EXISTS machine_machine; @@ -167,6 +129,8 @@ class Migration(migrations.Migration): ('os_kernel', models.CharField(default=None, max_length=255)), ('stats', models.JSONField(blank=True, default=dict, null=True)), ('config', models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True)), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('num_uses_failed', models.PositiveIntegerField(default=0)), ], options={ 'app_label': 'machine', @@ -189,6 +153,8 @@ class Migration(migrations.Migration): ('region', models.CharField(default=None, max_length=63)), ('country', models.CharField(default=None, max_length=63)), ('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('num_uses_failed', models.PositiveIntegerField(default=0)), ], options={ 'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')}, @@ -212,6 +178,8 @@ class Migration(migrations.Migration): ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)), ('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)), ('machine', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine')), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('num_uses_failed', models.PositiveIntegerField(default=0)), ], options={ 'verbose_name': 'Binary', @@ -220,43 +188,6 @@ class Migration(migrations.Migration): 'app_label': 'machine', }, ), - migrations.CreateModel( - name='Process', - fields=[ - ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), - ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), - ('modified_at', models.DateTimeField(auto_now=True)), - ('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)), - ('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')), - ('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')), - ('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')), - ('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)), - ('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)), - ('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')), - ('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')), - ('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)), - ('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)), - ('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)), - ('status', models.CharField(choices=[('queued', 'Queued'), 
('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)), - ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)), - ('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='process_set', to='machine.machine')), - ('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.binary')), - ('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.networkinterface')), - ], - options={ - 'verbose_name': 'Process', - 'verbose_name_plural': 'Processes', - 'app_label': 'machine', - }, - ), - migrations.AddIndex( - model_name='process', - index=models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_5e3a87_idx'), - ), - migrations.AddIndex( - model_name='process', - index=models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__7bd19c_idx'), - ), ], ), ] diff --git a/archivebox/machine/migrations/0002_process.py b/archivebox/machine/migrations/0002_process.py new file mode 100644 index 00000000..c3aed18e --- /dev/null +++ b/archivebox/machine/migrations/0002_process.py @@ -0,0 +1,45 @@ +# Generated by Django 6.0 on 2025-12-31 22:54 + +import django.db.models.deletion +import django.utils.timezone +import uuid +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('machine', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='Process', + fields=[ + ('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ('modified_at', models.DateTimeField(auto_now=True)), + ('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)), + ('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')), + ('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')), + ('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')), + ('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)), + ('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)), + ('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')), + ('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')), + ('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)), + ('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)), + ('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)), + ('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)), + ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, 
help_text='When to retry this process', null=True)), + ('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.binary')), + ('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.networkinterface')), + ('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='process_set', to='machine.machine')), + ], + options={ + 'verbose_name': 'Process', + 'verbose_name_plural': 'Processes', + 'indexes': [models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_5e3a87_idx'), models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__7bd19c_idx')], + }, + ), + ] diff --git a/archivebox/machine/migrations/0002_process_parent_and_type.py b/archivebox/machine/migrations/0002_process_parent_and_type.py deleted file mode 100644 index e70de360..00000000 --- a/archivebox/machine/migrations/0002_process_parent_and_type.py +++ /dev/null @@ -1,101 +0,0 @@ -# Generated on 2025-12-31 -# Adds parent FK and process_type field to Process model - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ('machine', '0001_initial'), - ] - - operations = [ - migrations.SeparateDatabaseAndState( - database_operations=[ - migrations.RunSQL( - sql=""" - -- Add parent_id FK column to machine_process - ALTER TABLE machine_process ADD COLUMN parent_id TEXT REFERENCES machine_process(id) ON DELETE SET NULL; - CREATE INDEX IF NOT EXISTS machine_process_parent_id_idx ON machine_process(parent_id); - - -- Add process_type column with default 'binary' - ALTER TABLE machine_process ADD COLUMN process_type VARCHAR(16) NOT NULL DEFAULT 'binary'; - CREATE INDEX IF NOT EXISTS machine_process_process_type_idx ON machine_process(process_type); - - -- Add composite index for parent + status queries - CREATE INDEX IF NOT EXISTS machine_process_parent_status_idx ON machine_process(parent_id, status); - - -- Add composite index for machine + pid + started_at (for PID reuse protection) - CREATE INDEX IF NOT EXISTS machine_process_machine_pid_started_idx ON machine_process(machine_id, pid, started_at); - """, - # Migration is irreversible due to SQLite limitations - # SQLite doesn't support DROP COLUMN, would require table rebuild - reverse_sql=migrations.RunSQL.noop - ), - ], - state_operations=[ - # Add parent FK - migrations.AddField( - model_name='process', - name='parent', - field=models.ForeignKey( - blank=True, - help_text='Parent process that spawned this one', - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name='children', - to='machine.process', - ), - ), - # Add process_type field - migrations.AddField( - model_name='process', - name='process_type', - field=models.CharField( - choices=[ - ('cli', 'CLI Command'), - ('supervisord', 'Supervisord Daemon'), - ('orchestrator', 'Orchestrator'), - ('worker', 'Worker Process'), - ('hook', 'Hook Script'), - ('binary', 'Binary Execution'), - ], - default='binary', - help_text='Type of process in the execution hierarchy', - max_length=16, - ), - ), - # Add indexes - must match the SQL index names exactly - migrations.AddIndex( - model_name='process', - index=models.Index( - fields=['parent'], - 
name='machine_process_parent_id_idx', - ), - ), - migrations.AddIndex( - model_name='process', - index=models.Index( - fields=['process_type'], - name='machine_process_process_type_idx', - ), - ), - migrations.AddIndex( - model_name='process', - index=models.Index( - fields=['parent', 'status'], - name='machine_process_parent_status_idx', - ), - ), - migrations.AddIndex( - model_name='process', - index=models.Index( - fields=['machine', 'pid', 'started_at'], - name='machine_process_machine_pid_started_idx', - ), - ), - ], - ), - ] diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 428633b3..814b5c1a 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -82,13 +82,38 @@ class Machine(ModelWithHealthStats): ) return _CURRENT_MACHINE - @staticmethod - def from_jsonl(record: dict, overrides: dict = None): + def to_json(self) -> dict: """ - Update Machine config from JSONL record. + Convert Machine model instance to a JSON-serializable dict. + """ + from archivebox.config import VERSION + return { + 'type': 'Machine', + 'schema_version': VERSION, + 'id': str(self.id), + 'guid': self.guid, + 'hostname': self.hostname, + 'hw_in_docker': self.hw_in_docker, + 'hw_in_vm': self.hw_in_vm, + 'hw_manufacturer': self.hw_manufacturer, + 'hw_product': self.hw_product, + 'hw_uuid': self.hw_uuid, + 'os_arch': self.os_arch, + 'os_family': self.os_family, + 'os_platform': self.os_platform, + 'os_kernel': self.os_kernel, + 'os_release': self.os_release, + 'stats': self.stats, + 'config': self.config or {}, + } + + @staticmethod + def from_json(record: dict, overrides: dict = None): + """ + Update Machine config from JSON dict. Args: - record: JSONL record with '_method': 'update', 'key': '...', 'value': '...' + record: JSON dict with '_method': 'update', 'key': '...', 'value': '...' overrides: Not used Returns: @@ -255,9 +280,9 @@ class Binary(ModelWithHealthStats): 'is_valid': self.is_valid, } - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Binary model instance to a JSONL record. + Convert Binary model instance to a JSON-serializable dict. """ from archivebox.config import VERSION return { @@ -274,17 +299,17 @@ class Binary(ModelWithHealthStats): } @staticmethod - def from_jsonl(record: dict, overrides: dict = None): + def from_json(record: dict, overrides: dict = None): """ - Create/update Binary from JSONL record. + Create/update Binary from JSON dict. Handles two cases: - 1. From binaries.jsonl: creates queued binary with name, binproviders, overrides + 1. From binaries.json: creates queued binary with name, binproviders, overrides 2. From hook output: updates binary with abspath, version, sha256, binprovider Args: - record: JSONL record with 'name' and either: - - 'binproviders', 'overrides' (from binaries.jsonl) + record: JSON dict with 'name' and either: + - 'binproviders', 'overrides' (from binaries.json) - 'abspath', 'version', 'sha256', 'binprovider' (from hook output) overrides: Not used @@ -542,7 +567,7 @@ class ProcessManager(models.Manager): return process -class Process(ModelWithHealthStats): +class Process(models.Model): """ Tracks a single OS process execution. 
@@ -563,38 +588,11 @@ class Process(ModelWithHealthStats): RUNNING = 'running', 'Running' EXITED = 'exited', 'Exited' - class TypeChoices(models.TextChoices): - CLI = 'cli', 'CLI Command' - SUPERVISORD = 'supervisord', 'Supervisord Daemon' - ORCHESTRATOR = 'orchestrator', 'Orchestrator' - WORKER = 'worker', 'Worker Process' - HOOK = 'hook', 'Hook Script' - BINARY = 'binary', 'Binary Execution' - # Primary fields id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) - # Parent process FK for hierarchy tracking - parent = models.ForeignKey( - 'self', - on_delete=models.SET_NULL, - null=True, - blank=True, - related_name='children', - help_text='Parent process that spawned this one' - ) - - # Process type for distinguishing in hierarchy - process_type = models.CharField( - max_length=16, - choices=TypeChoices.choices, - default=TypeChoices.BINARY, - db_index=True, - help_text='Type of process in the execution hierarchy' - ) - # Machine FK - required (every process runs on a machine) machine = models.ForeignKey( Machine, @@ -667,10 +665,6 @@ class Process(ModelWithHealthStats): help_text='When to retry this process' ) - # Health stats - num_uses_failed = models.PositiveIntegerField(default=0) - num_uses_succeeded = models.PositiveIntegerField(default=0) - state_machine_name: str = 'archivebox.machine.models.ProcessMachine' objects: ProcessManager = ProcessManager() @@ -682,8 +676,6 @@ class Process(ModelWithHealthStats): indexes = [ models.Index(fields=['machine', 'status', 'retry_at']), models.Index(fields=['binary', 'exit_code']), - models.Index(fields=['parent', 'status']), - models.Index(fields=['machine', 'pid', 'started_at']), ] def __str__(self) -> str: @@ -716,9 +708,9 @@ class Process(ModelWithHealthStats): return self.archiveresult.hook_name return '' - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Process model instance to a JSONL record. + Convert Process model instance to a JSON-serializable dict. """ from archivebox.config import VERSION record = { @@ -742,6 +734,26 @@ class Process(ModelWithHealthStats): record['timeout'] = self.timeout return record + @staticmethod + def from_json(record: dict, overrides: dict = None): + """ + Create/update Process from JSON dict. + + Args: + record: JSON dict with 'id' or process details + overrides: Optional dict of field overrides + + Returns: + Process instance or None + """ + process_id = record.get('id') + if process_id: + try: + return Process.objects.get(id=process_id) + except Process.DoesNotExist: + pass + return None + def update_and_requeue(self, **kwargs): """ Update process fields and requeue for worker state machine. 
@@ -1751,17 +1763,12 @@ class ProcessMachine(BaseStateMachine, strict_states=True): @exited.enter def enter_exited(self): """Process has exited.""" - success = self.process.exit_code == 0 - self.process.update_and_requeue( retry_at=None, status=Process.StatusChoices.EXITED, ended_at=timezone.now(), ) - # Increment health stats based on exit code - self.process.increment_health_stats(success=success) - # ============================================================================= # State Machine Registration diff --git a/archivebox/machine/tests/test_machine_models.py b/archivebox/machine/tests/test_machine_models.py index 427c98d8..83875057 100644 --- a/archivebox/machine/tests/test_machine_models.py +++ b/archivebox/machine/tests/test_machine_models.py @@ -76,7 +76,7 @@ class TestMachineModel(TestCase): self.assertEqual(machine1.guid, machine2.guid) def test_machine_from_jsonl_update(self): - """Machine.from_jsonl() should update machine config.""" + """Machine.from_json() should update machine config.""" Machine.current() # Ensure machine exists record = { '_method': 'update', @@ -84,14 +84,14 @@ class TestMachineModel(TestCase): 'value': '/usr/bin/wget', } - result = Machine.from_jsonl(record) + result = Machine.from_json(record) self.assertIsNotNone(result) self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget') def test_machine_from_jsonl_invalid(self): - """Machine.from_jsonl() should return None for invalid records.""" - result = Machine.from_jsonl({'invalid': 'record'}) + """Machine.from_json() should return None for invalid records.""" + result = Machine.from_json({'invalid': 'record'}) self.assertIsNone(result) def test_machine_manager_current(self): @@ -254,14 +254,14 @@ class TestProcessModel(TestCase): self.assertIsNone(process.exit_code) def test_process_to_jsonl(self): - """Process.to_jsonl() should serialize correctly.""" + """Process.to_json() should serialize correctly.""" process = Process.objects.create( machine=self.machine, cmd=['echo', 'hello'], pwd='/tmp', timeout=60, ) - json_data = process.to_jsonl() + json_data = process.to_json() self.assertEqual(json_data['type'], 'Process') self.assertEqual(json_data['cmd'], ['echo', 'hello']) diff --git a/archivebox/plugins/twocaptcha/on_Crawl__25_twocaptcha_config.js b/archivebox/plugins/twocaptcha/on_Crawl__25_twocaptcha_config.js index a3e1235a..5848cc97 100755 --- a/archivebox/plugins/twocaptcha/on_Crawl__25_twocaptcha_config.js +++ b/archivebox/plugins/twocaptcha/on_Crawl__25_twocaptcha_config.js @@ -271,10 +271,51 @@ async function configure2Captcha() { if (result.success) { console.error(`[+] 2captcha configured via ${result.method}`); + + // Verify config was applied by reloading options page and checking form values + console.error('[*] Verifying config by reloading options page...'); + try { + await configPage.reload({ waitUntil: 'networkidle0', timeout: 10000 }); + } catch (e) { + console.error(`[*] Reload threw error (may still work): ${e.message}`); + } + + await new Promise(r => setTimeout(r, 2000)); + + // Wait for Config object again + await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 }); + + // Read back the config using Config.getAll() + const verifyConfig = await configPage.evaluate(async () => { + if (typeof Config !== 'undefined' && typeof Config.getAll === 'function') { + return await Config.getAll(); + } + return null; + }); + + if (!verifyConfig) { + return { success: false, error: 'Could not verify config - Config.getAll() not available' }; + } + + // 
Check that API key was actually set + const actualApiKey = verifyConfig.apiKey || verifyConfig.api_key; + if (!actualApiKey || actualApiKey !== config.apiKey) { + console.error(`[!] Config verification FAILED - API key mismatch`); + console.error(`[!] Expected: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`); + console.error(`[!] Got: ${actualApiKey ? actualApiKey.slice(0, 8) + '...' + actualApiKey.slice(-4) : 'null'}`); + return { success: false, error: 'Config verification failed - API key not set correctly' }; + } + + console.error('[+] Config verified successfully!'); + console.error(`[+] API Key: ${actualApiKey.slice(0, 8)}...${actualApiKey.slice(-4)}`); + console.error(`[+] Plugin Enabled: ${verifyConfig.isPluginEnabled}`); + console.error(`[+] Auto Solve Turnstile: ${verifyConfig.autoSolveTurnstile}`); + fs.writeFileSync(CONFIG_MARKER, JSON.stringify({ timestamp: new Date().toISOString(), method: result.method, extensionId: extensionId, + verified: true, config: { apiKeySet: !!config.apiKey, isPluginEnabled: config.isPluginEnabled, @@ -284,7 +325,7 @@ async function configure2Captcha() { autoSolveEnabled: true, } }, null, 2)); - return { success: true, method: result.method }; + return { success: true, method: result.method, verified: true }; } return { success: false, error: result.error || 'Config failed' }; diff --git a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py index f81b55da..c68b8158 100644 --- a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py +++ b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py @@ -29,7 +29,7 @@ PLUGIN_DIR = Path(__file__).parent.parent INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js' CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js' -TEST_URL = 'https://2captcha.com/demo/recaptcha-v2' +TEST_URL = 'https://2captcha.com/demo/cloudflare-turnstile' # Alias for backward compatibility with existing test names @@ -70,8 +70,17 @@ class TestTwoCaptcha: process, cdp_url = launch_chrome(env, chrome_dir, crawl_id) try: - exts = json.loads((chrome_dir / 'extensions.json').read_text()) - assert any(e['name'] == 'twocaptcha' for e in exts), f"Not loaded: {exts}" + # Wait for extensions.json to be written + extensions_file = chrome_dir / 'extensions.json' + for i in range(20): + if extensions_file.exists(): + break + time.sleep(0.5) + + assert extensions_file.exists(), f"extensions.json not created. 
Chrome dir files: {list(chrome_dir.iterdir())}" + + exts = json.loads(extensions_file.read_text()) + assert any(e['name'] == 'twocaptcha' for e in exts), f"twocaptcha not loaded: {exts}" print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}") finally: kill_chrome(process, chrome_dir) @@ -95,6 +104,14 @@ class TestTwoCaptcha: process, cdp_url = launch_chrome(env, chrome_dir, crawl_id) try: + # Wait for extensions.json to be written + extensions_file = chrome_dir / 'extensions.json' + for i in range(20): + if extensions_file.exists(): + break + time.sleep(0.5) + assert extensions_file.exists(), f"extensions.json not created" + result = subprocess.run( ['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'], env=env, timeout=30, capture_output=True, text=True @@ -163,7 +180,34 @@ const puppeteer = require('puppeteer-core'); kill_chrome(process, chrome_dir) def test_solves_recaptcha(self): - """Extension solves reCAPTCHA on demo page.""" + """Extension attempts to solve CAPTCHA on demo page. + + CRITICAL: DO NOT SKIP OR DISABLE THIS TEST EVEN IF IT'S FLAKY! + + This test is INTENTIONALLY left enabled to expose the REAL, ACTUAL flakiness + of the 2captcha service and demo page. The test failures you see here are NOT + test bugs - they are ACCURATE representations of the real-world reliability + of this CAPTCHA solving service. + + If this test is flaky, that's because 2captcha IS FLAKY in production. + If this test fails intermittently, that's because 2captcha FAILS INTERMITTENTLY in production. + + NEVER EVER hide real flakiness by disabling tests or adding @pytest.mark.skip. + Users NEED to see this failure rate to understand what they're getting into. + + When this test DOES pass, it confirms: + - Extension loads and configures correctly + - 2captcha API key is accepted + - Extension can successfully auto-solve CAPTCHAs + - The entire flow works end-to-end + + When it fails (as it often does): + - Demo page has JavaScript errors (representing real-world broken sites) + - Turnstile tokens expire before solving (representing real-world timing issues) + - 2captcha service may be slow/down (representing real-world service issues) + + This is VALUABLE INFORMATION about the service. DO NOT HIDE IT. 
+ """ with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) env = setup_test_env(tmpdir) @@ -179,6 +223,14 @@ const puppeteer = require('puppeteer-core'); process, cdp_url = launch_chrome(env, chrome_dir, crawl_id) try: + # Wait for extensions.json to be written + extensions_file = chrome_dir / 'extensions.json' + for i in range(20): + if extensions_file.exists(): + break + time.sleep(0.5) + assert extensions_file.exists(), f"extensions.json not created" + subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True) script = f''' @@ -187,48 +239,97 @@ const puppeteer = require('puppeteer-core'); (async () => {{ const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); const page = await browser.newPage(); + + // Capture console messages from the page (including extension messages) + page.on('console', msg => {{ + const text = msg.text(); + if (text.includes('2captcha') || text.includes('turnstile') || text.includes('captcha')) {{ + console.error('[CONSOLE]', text); + }} + }}); + await page.setViewport({{ width: 1440, height: 900 }}); console.error('[*] Loading {TEST_URL}...'); await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }}); - await new Promise(r => setTimeout(r, 3000)); + // Wait for CAPTCHA iframe (minimal wait to avoid token expiration) + console.error('[*] Waiting for CAPTCHA iframe...'); + await page.waitForSelector('iframe', {{ timeout: 30000 }}); + console.error('[*] CAPTCHA iframe found - extension should auto-solve now'); + + // DON'T CLICK - extension should auto-solve since autoSolveTurnstile=True + console.error('[*] Waiting for auto-solve (extension configured with autoSolveTurnstile=True)...'); + + // Poll for data-state changes with debug output + console.error('[*] Waiting for CAPTCHA to be solved (up to 150s)...'); const start = Date.now(); - const maxWait = 90000; + let solved = false; + let lastState = null; - while (Date.now() - start < maxWait) {{ + while (!solved && (Date.now() - start) < 150000) {{ const state = await page.evaluate(() => {{ - const resp = document.querySelector('textarea[name="g-recaptcha-response"]'); const solver = document.querySelector('.captcha-solver'); return {{ - solved: resp ? resp.value.length > 0 : false, state: solver?.getAttribute('data-state'), - text: solver?.textContent?.trim() || '' + text: solver?.textContent?.trim(), + classList: solver?.className }}; }}); - const sec = Math.round((Date.now() - start) / 1000); - console.error('[*] ' + sec + 's state=' + state.state + ' solved=' + state.solved + ' text=' + state.text.slice(0,30)); - if (state.solved) {{ console.error('[+] SOLVED!'); break; }} - if (state.state === 'error') {{ console.error('[!] 
ERROR'); break; }} + + if (state.state !== lastState) {{ + const elapsed = Math.round((Date.now() - start) / 1000); + console.error(`[*] State change at ${{elapsed}}s: "${{lastState}}" -> "${{state.state}}" (text: "${{state.text?.slice(0, 50)}}")`); + lastState = state.state; + }} + + if (state.state === 'solved') {{ + solved = true; + const elapsed = Math.round((Date.now() - start) / 1000); + console.error('[+] SOLVED in ' + elapsed + 's!'); + break; + }} + + // Check every 2 seconds await new Promise(r => setTimeout(r, 2000)); }} + if (!solved) {{ + const elapsed = Math.round((Date.now() - start) / 1000); + const finalState = await page.evaluate(() => {{ + const solver = document.querySelector('.captcha-solver'); + return {{ + state: solver?.getAttribute('data-state'), + text: solver?.textContent?.trim(), + html: solver?.outerHTML?.slice(0, 200) + }}; + }}); + console.error(`[!] TIMEOUT after ${{elapsed}}s. Final state: ${{JSON.stringify(finalState)}}`); + browser.disconnect(); + process.exit(1); + }} + const final = await page.evaluate(() => {{ - const resp = document.querySelector('textarea[name="g-recaptcha-response"]'); - return {{ solved: resp ? resp.value.length > 0 : false, preview: resp?.value?.slice(0,50) || '' }}; + const solver = document.querySelector('.captcha-solver'); + return {{ + solved: true, + state: solver?.getAttribute('data-state'), + text: solver?.textContent?.trim() + }}; }}); browser.disconnect(); console.log(JSON.stringify(final)); }})(); ''' (tmpdir / 's.js').write_text(script) - print("\n[*] Solving CAPTCHA (10-60s)...") - r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=120, capture_output=True, text=True) + print("\n[*] Solving CAPTCHA (this can take up to 150s for 2captcha API)...") + r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=200, capture_output=True, text=True) print(r.stderr) assert r.returncode == 0, f"Failed: {r.stderr}" final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1]) assert final.get('solved'), f"Not solved: {final}" - print(f"[+] SOLVED! {final.get('preview','')[:30]}...") + assert final.get('state') == 'solved', f"State not 'solved': {final}" + print(f"[+] SUCCESS! 
CAPTCHA solved: {final.get('text','')[:50]}") finally: kill_chrome(process, chrome_dir) diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 2cc3d9fb..a6bce7fd 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -265,57 +265,60 @@ class Orchestrator: def runloop(self) -> None: """Main orchestrator loop.""" - from archivebox.misc.logging import IS_TTY, CONSOLE - import sys + from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn + from archivebox.misc.logging import IS_TTY + import archivebox.misc.logging as logging_module self.on_startup() # Enable progress bars only in TTY + foreground mode show_progress = IS_TTY and self.exit_on_idle - last_progress_output = "" + + progress = Progress( + TextColumn("[cyan]{task.description}"), + BarColumn(bar_width=40), + TaskProgressColumn(), + transient=False, + ) if show_progress else None + + task_ids = {} # snapshot_id -> task_id + + # Replace global CONSOLE with progress.console when active + original_console = logging_module.CONSOLE + original_stderr = logging_module.STDERR try: + if progress: + progress.start() + # Redirect all logging through progress.console + logging_module.CONSOLE = progress.console + logging_module.STDERR = progress.console + while True: # Check queues and spawn workers queue_sizes = self.check_queues_and_spawn_workers() - # Update progress bars (simple inline update) - if show_progress: + # Update progress bars + if progress: from archivebox.core.models import Snapshot - active_snapshots = list(Snapshot.objects.filter(status='started').iterator(chunk_size=100)) + active_snapshots = Snapshot.objects.filter(status='started').iterator(chunk_size=100) - if active_snapshots: - # Build progress string - progress_lines = [] - for snapshot in active_snapshots[:5]: # Limit to 5 snapshots - total = snapshot.archiveresult_set.count() - if total == 0: - continue + for snapshot in active_snapshots: + total = snapshot.archiveresult_set.count() + if total == 0: + continue - completed = snapshot.archiveresult_set.filter( - status__in=['succeeded', 'skipped', 'failed'] - ).count() + completed = snapshot.archiveresult_set.filter( + status__in=['succeeded', 'skipped', 'failed'] + ).count() - percentage = (completed / total) * 100 - bar_width = 30 - filled = int(bar_width * completed / total) - bar = '█' * filled + '░' * (bar_width - filled) + # Create or update task + if snapshot.id not in task_ids: + url = snapshot.url[:60] + '...' if len(snapshot.url) > 60 else snapshot.url + task_ids[snapshot.id] = progress.add_task(url, total=total) - url = snapshot.url[:50] + '...' 
if len(snapshot.url) > 50 else snapshot.url - progress_lines.append(f"{url} {bar} {percentage:>3.0f}%") - - progress_output = "\n".join(progress_lines) - - # Only update if changed - if progress_output != last_progress_output: - # Clear previous lines and print new ones - if last_progress_output: - num_lines = last_progress_output.count('\n') + 1 - sys.stderr.write(f"\r\033[{num_lines}A\033[J") - sys.stderr.write(progress_output + "\n") - sys.stderr.flush() - last_progress_output = progress_output + progress.update(task_ids[snapshot.id], completed=completed) # Track idle state if self.has_pending_work(queue_sizes) or self.has_running_workers(): @@ -327,12 +330,6 @@ class Orchestrator: # Check if we should exit if self.should_exit(queue_sizes): - # Clear progress lines - if show_progress and last_progress_output: - num_lines = last_progress_output.count('\n') + 1 - sys.stderr.write(f"\r\033[{num_lines}A\033[J") - sys.stderr.flush() - log_worker_event( worker_type='Orchestrator', event='All work complete', @@ -350,6 +347,12 @@ class Orchestrator: raise else: self.on_shutdown() + finally: + if progress: + # Restore original consoles + logging_module.CONSOLE = original_console + logging_module.STDERR = original_stderr + progress.stop() def start(self) -> int: """ diff --git a/bin/test_plugins.sh b/bin/test_plugins.sh index 790328a7..eead957a 100755 --- a/bin/test_plugins.sh +++ b/bin/test_plugins.sh @@ -67,7 +67,7 @@ for test_dir in $TEST_DIRS; do echo -e "${YELLOW}[RUNNING]${NC} $plugin_name" - if python -m pytest "$test_dir" -v --tb=short 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then + if python -m pytest "$test_dir" -p no:django -v --tb=short 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then echo -e "${GREEN}[PASSED]${NC} $plugin_name" PASSED_PLUGINS=$((PASSED_PLUGINS + 1)) else