cleanup migrations, json, jsonl

Nick Sweeting
2025-12-31 15:36:13 -08:00
parent 0930911a15
commit a04e4a7345
21 changed files with 993 additions and 1418 deletions

View File

@@ -207,7 +207,7 @@ def run_plugins(
}.get(result.status, 'dim')
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin}{result.output_str or ""}', file=sys.stderr)
else:
write_record(result.to_jsonl())
write_record(result.to_json())
except Snapshot.DoesNotExist:
continue
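The rename from to_jsonl() to to_json() reflects that these methods return plain dicts; producing JSONL is the writer's job, one JSON object per line. A minimal sketch of what a write_record helper along those lines could look like (the project's actual write_record is not shown in this diff, so this is an assumption):

    import json
    import sys

    def write_record(record: dict, file=sys.stdout) -> None:
        # hypothetical helper: serialize one dict per line, i.e. JSONL output
        file.write(json.dumps(record, default=str) + '\n')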

View File

@@ -252,8 +252,8 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp', 'process__cmd')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
fieldsets = (
@@ -270,7 +270,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
'classes': ('card',),
}),
('Command', {
'fields': ('process', 'cmd', 'cmd_str', 'cmd_version', 'pwd'),
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
'classes': ('card',),
}),
('Output', {

View File

@@ -1,299 +1,250 @@
# Generated by hand on 2025-12-29
# Upgrades core app from v0.7.2 (migration 0022) or v0.8.6rc0 (migration 0076) to v0.9.0 using raw SQL
# Upgrades core app from v0.7.2/v0.8.6rc0 (migration 0022) to v0.9.0 using raw SQL
# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0
from django.db import migrations
from django.db import migrations, models, connection
def upgrade_from_v072_or_v086(apps, schema_editor):
"""
Upgrade core tables from either v0.7.2 or v0.8.6rc0 to v0.9.0.
Handles differences in schema between versions.
"""
with schema_editor.connection.cursor() as cursor:
# Check if uuid column exists (v0.7.2 has it, v0.8.6rc0 doesn't)
def get_table_columns(table_name):
"""Get list of column names for a table."""
cursor = connection.cursor()
cursor.execute(f"PRAGMA table_info({table_name})")
return {row[1] for row in cursor.fetchall()}
def upgrade_core_tables(apps, schema_editor):
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
cursor = connection.cursor()
# Check if core_archiveresult table exists
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'")
if not cursor.fetchone():
# Fresh install - no migration needed, tables will be created by later migrations
return
# Detect which version we're migrating from
archiveresult_cols = get_table_columns('core_archiveresult')
has_uuid = 'uuid' in archiveresult_cols
has_abid = 'abid' in archiveresult_cols
# ============================================================================
# PART 1: Upgrade core_archiveresult table
# ============================================================================
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_archiveresult_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
snapshot_id TEXT NOT NULL,
plugin VARCHAR(32) NOT NULL DEFAULT '',
hook_name VARCHAR(255) NOT NULL DEFAULT '',
cmd TEXT,
pwd VARCHAR(256),
cmd_version VARCHAR(128),
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
output_files TEXT NOT NULL DEFAULT '{}',
output_json TEXT,
output_str TEXT NOT NULL DEFAULT '',
output_size INTEGER NOT NULL DEFAULT 0,
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
config TEXT,
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE
);
""")
if has_uuid and not has_abid:
# Migrating from v0.7.2 (has uuid, minimal fields)
print('Migrating ArchiveResult from v0.7.2 schema...')
cursor.execute("""
SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='uuid'
""")
has_uuid = cursor.fetchone()[0] > 0
# Check if id is INTEGER (v0.7.2) or TEXT/char (v0.8.6rc0)
cursor.execute("""
SELECT type FROM pragma_table_info('core_archiveresult') WHERE name='id'
""")
id_type = cursor.fetchone()[0] if cursor.rowcount else 'INTEGER'
is_v072 = 'INT' in id_type.upper()
# ============================================================================
# PART 1: Upgrade core_archiveresult table
# ============================================================================
# Create new table with v0.9.0 schema
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_archiveresult_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
snapshot_id TEXT NOT NULL,
plugin VARCHAR(32) NOT NULL DEFAULT '',
hook_name VARCHAR(255) NOT NULL DEFAULT '',
cmd TEXT,
pwd VARCHAR(256),
cmd_version VARCHAR(128),
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
output_files TEXT NOT NULL DEFAULT '{}',
output_json TEXT,
output_str TEXT NOT NULL DEFAULT '',
output_size INTEGER NOT NULL DEFAULT 0,
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
config TEXT,
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
binary_id TEXT,
iface_id TEXT,
process_id TEXT,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
FOREIGN KEY (binary_id) REFERENCES machine_binary(id) ON DELETE SET NULL,
FOREIGN KEY (iface_id) REFERENCES machine_networkinterface(id) ON DELETE SET NULL,
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, created_at, modified_at, snapshot_id, plugin,
cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
)
SELECT
id, uuid,
COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
snapshot_id,
COALESCE(extractor, '') as plugin,
cmd, pwd, cmd_version,
start_ts, end_ts, status,
COALESCE(output, '') as output_str
FROM core_archiveresult;
""")
# Copy data based on source version
if is_v072:
# Coming from v0.7.2: has INTEGER id, has uuid column, has extractor
print(" Migrating from v0.7.2 schema...")
cursor.execute("""
INSERT OR IGNORE INTO core_archiveresult_new (
uuid, created_at, modified_at, snapshot_id, plugin,
cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
)
SELECT
uuid,
COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
snapshot_id,
COALESCE(extractor, '') as plugin,
cmd, pwd, cmd_version,
start_ts, end_ts, status,
COALESCE(output, '') as output_str
FROM core_archiveresult
""")
else:
# Coming from v0.8.6rc0: has TEXT id, no uuid column, has abid
print(" Migrating from v0.8.6rc0 schema...")
cursor.execute("""
INSERT OR IGNORE INTO core_archiveresult_new (
uuid, created_at, modified_at, snapshot_id, plugin,
cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str
)
SELECT
id as uuid,
created_at,
modified_at,
snapshot_id,
COALESCE(extractor, '') as plugin,
cmd, pwd, cmd_version,
start_ts, end_ts, status, retry_at,
COALESCE(output, '') as output_str
FROM core_archiveresult
""")
# Replace old table
cursor.execute("DROP TABLE IF EXISTS core_archiveresult")
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult")
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid)")
# ============================================================================
# PART 2: Upgrade core_snapshot table
# ============================================================================
# Check snapshot schema version
elif has_abid and not has_uuid:
# Migrating from v0.8.6rc0 (has abid, full fields)
print('Migrating ArchiveResult from v0.8.6rc0 schema...')
cursor.execute("""
SELECT COUNT(*) FROM pragma_table_info('core_snapshot') WHERE name='crawl_id'
""")
has_crawl_id = cursor.fetchone()[0] > 0
# Create new table
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_snapshot_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
downloaded_at DATETIME,
url TEXT NOT NULL,
timestamp TEXT NOT NULL,
title TEXT,
crawl_id TEXT,
depth INTEGER NOT NULL DEFAULT 0,
parent_snapshot_id TEXT,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT,
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0
-- Note: crawl_id foreign key will be added in 0024 after assigning crawl_ids
-- FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
-- FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
INSERT OR IGNORE INTO core_archiveresult_new (
id, uuid, created_at, modified_at, snapshot_id, plugin,
cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str
)
SELECT
id, abid as uuid,
created_at, modified_at,
snapshot_id,
COALESCE(extractor, '') as plugin,
cmd, pwd, cmd_version,
start_ts, end_ts, status, retry_at,
COALESCE(output, '') as output_str
FROM core_archiveresult;
""")
else:
print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
# Copy snapshot data
if has_crawl_id:
# v0.8.6rc0 schema - already has created_at, modified_at, bookmarked_at
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid);")
# ============================================================================
# PART 2: Upgrade core_snapshot table
# ============================================================================
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_snapshot_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
crawl_id TEXT,
parent_snapshot_id TEXT,
title VARCHAR(512),
downloaded_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
);
""")
# Check if core_snapshot exists (it should)
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'")
if cursor.fetchone():
# Detect which version we're migrating from
snapshot_cols = get_table_columns('core_snapshot')
has_added = 'added' in snapshot_cols
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
if has_added and not has_bookmarked_at:
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.7.2 schema...')
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_new (
id, created_at, modified_at, bookmarked_at, downloaded_at, url, timestamp,
crawl_id, status, retry_at
id, url, timestamp, title, bookmarked_at, created_at, modified_at
)
SELECT
id,
created_at,
modified_at,
bookmarked_at,
downloaded_at,
url, timestamp,
NULLIF(crawl_id, ''),
COALESCE(status, 'queued'),
retry_at
FROM core_snapshot
id, url, timestamp, title,
COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
COALESCE(added, CURRENT_TIMESTAMP) as created_at,
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
FROM core_snapshot;
""")
elif has_bookmarked_at and not has_added:
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.8.6rc0 schema...')
# Check what fields exist
has_status = 'status' in snapshot_cols
has_retry_at = 'retry_at' in snapshot_cols
has_crawl_id = 'crawl_id' in snapshot_cols
# Build column list based on what exists
cols = ['id', 'url', 'timestamp', 'title', 'bookmarked_at', 'created_at', 'modified_at', 'downloaded_at']
if has_crawl_id:
cols.append('crawl_id')
if has_status:
cols.append('status')
if has_retry_at:
cols.append('retry_at')
cursor.execute(f"""
INSERT OR IGNORE INTO core_snapshot_new ({', '.join(cols)})
SELECT {', '.join(cols)}
FROM core_snapshot;
""")
else:
# v0.7.2 schema - will get crawl_id assigned by later migration (0024)
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_new (
id, created_at, modified_at, bookmarked_at, url, timestamp, crawl_id
)
SELECT
id,
COALESCE(added, CURRENT_TIMESTAMP),
COALESCE(updated, added, CURRENT_TIMESTAMP),
COALESCE(added, CURRENT_TIMESTAMP),
url, timestamp,
NULL as crawl_id
FROM core_snapshot
""")
print(f'Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}')
# Replace old table
cursor.execute("DROP TABLE IF EXISTS core_snapshot")
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot")
cursor.execute("DROP TABLE IF EXISTS core_snapshot;")
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;")
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at)")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at)")
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_timestamp_idx ON core_snapshot(timestamp);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at);")
cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);")
# ============================================================================
# PART 3: Upgrade core_tag table
# ============================================================================
# ============================================================================
# PART 3: Upgrade core_tag table
# ============================================================================
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_tag_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
# Check if tag id is INTEGER (v0.7.2) or TEXT (v0.8.6rc0)
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE,
created_by_id INTEGER,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE
);
""")
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_tag'")
if cursor.fetchone():
cursor.execute("""
SELECT type FROM pragma_table_info('core_tag') WHERE name='id'
""")
tag_id_type = cursor.fetchone()[0] if cursor.rowcount else 'INTEGER'
tag_id_is_int = 'INT' in tag_id_type.upper()
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_tag_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
created_by_id INTEGER,
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE SET NULL
)
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
SELECT id, name, slug
FROM core_tag;
""")
if tag_id_is_int:
# v0.7.2: Direct copy (INTEGER to INTEGER)
cursor.execute("""
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
SELECT id, name, slug FROM core_tag
""")
else:
# v0.8.6rc0: Need to remap TEXT ids to new INTEGER ids
cursor.execute("SELECT id, name, slug FROM core_tag")
old_tags = cursor.fetchall()
tag_id_mapping = {} # old_text_id -> new_int_id
cursor.execute("DROP TABLE IF EXISTS core_tag;")
cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag;")
for old_id, name, slug in old_tags:
cursor.execute("""
INSERT OR IGNORE INTO core_tag_new (name, slug)
VALUES (?, ?)
""", [name, slug])
cursor.execute("SELECT id FROM core_tag_new WHERE slug = ?", [slug])
new_id = cursor.fetchone()[0]
tag_id_mapping[old_id] = new_id
# Create indexes
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_at_idx ON core_tag(created_at);")
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);")
cursor.execute("DROP TABLE IF EXISTS core_tag")
cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag")
# Recreate M2M table
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_snapshot_tags_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id TEXT NOT NULL,
tag_id INTEGER NOT NULL,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES core_tag(id) ON DELETE CASCADE,
UNIQUE(snapshot_id, tag_id)
)
""")
if tag_id_is_int:
# Direct copy for v0.7.2
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
SELECT snapshot_id, tag_id FROM core_snapshot_tags
""")
else:
# v0.8.6rc0: Use mapping to convert old TEXT ids to new INTEGER ids
cursor.execute("SELECT snapshot_id, tag_id FROM core_snapshot_tags")
m2m_entries = cursor.fetchall()
for snapshot_id, old_tag_id in m2m_entries:
new_tag_id = tag_id_mapping.get(old_tag_id)
if new_tag_id:
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
VALUES (?, ?)
""", [snapshot_id, new_tag_id])
cursor.execute("DROP TABLE IF EXISTS core_snapshot_tags")
cursor.execute("ALTER TABLE core_snapshot_tags_new RENAME TO core_snapshot_tags")
print('✓ Core tables upgraded to v0.9.0')
class Migration(migrations.Migration):
@@ -301,10 +252,49 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0022_auto_20231023_2008'),
('crawls', '0001_initial'),
('machine', '0001_initial'),
('auth', '0012_alter_user_first_name_max_length'),
]
operations = [
migrations.RunPython(upgrade_from_v072_or_v086, reverse_code=migrations.RunPython.noop),
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunPython(
upgrade_core_tables,
reverse_code=migrations.RunPython.noop,
),
],
state_operations=[
# Remove old ArchiveResult fields
migrations.RemoveField(model_name='archiveresult', name='extractor'),
migrations.RemoveField(model_name='archiveresult', name='output'),
# Remove old Snapshot fields
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
# SnapshotTag table already exists from v0.7.2, just declare it in state
migrations.CreateModel(
name='SnapshotTag',
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('snapshot', models.ForeignKey(to='core.Snapshot', db_column='snapshot_id', on_delete=models.CASCADE)),
('tag', models.ForeignKey(to='core.Tag', db_column='tag_id', on_delete=models.CASCADE)),
],
options={
'db_table': 'core_snapshot_tags',
'unique_together': {('snapshot', 'tag')},
},
),
# Declare that Snapshot.tags M2M already uses through=SnapshotTag (from v0.7.2)
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(
'Tag',
blank=True,
related_name='snapshot_set',
through='SnapshotTag',
through_fields=('snapshot', 'tag'),
),
),
],
),
]
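Because SQLite supports only a limited ALTER TABLE, this migration upgrades schemas by rebuilding each table: create a *_new table with the target schema, copy rows across with INSERT ... SELECT, drop the old table, rename the new one, then recreate indexes. A minimal standalone sketch of that pattern, with hypothetical table and column names (the real migration applies it to core_archiveresult, core_snapshot, and core_tag):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    cur = conn.cursor()
    cur.execute("CREATE TABLE demo_old (id INTEGER PRIMARY KEY, name TEXT)")
    cur.execute("INSERT INTO demo_old (name) VALUES ('example')")

    # 1. create the replacement table with the new schema
    cur.execute("""
        CREATE TABLE demo_new (
            id INTEGER PRIMARY KEY,
            name TEXT,
            status TEXT NOT NULL DEFAULT 'queued'
        )
    """)
    # 2. copy rows across; new columns fall back to their defaults
    cur.execute("INSERT INTO demo_new (id, name) SELECT id, name FROM demo_old")
    # 3. swap the tables and recreate any indexes
    cur.execute("DROP TABLE demo_old")
    cur.execute("ALTER TABLE demo_new RENAME TO demo_old")
    cur.execute("CREATE INDEX IF NOT EXISTS demo_status_idx ON demo_old(status)")
    conn.commit()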

View File

@@ -1,7 +1,7 @@
# Generated by hand on 2025-12-29
# Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL
from django.db import migrations
from django.db import migrations, models
import uuid
@@ -56,8 +56,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0023_upgrade_to_0_9_0'),
('crawls', '0002_upgrade_to_0_9_0'),
('machine', '0001_initial'),
('crawls', '0001_initial'),
('auth', '0012_alter_user_first_name_max_length'),
]
@@ -66,65 +65,80 @@ class Migration(migrations.Migration):
create_default_crawl_and_assign_snapshots,
reverse_code=migrations.RunPython.noop,
),
# Now make crawl_id NOT NULL
migrations.RunSQL(
sql="""
-- Rebuild snapshot table with NOT NULL crawl_id
CREATE TABLE core_snapshot_final (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
migrations.SeparateDatabaseAndState(
database_operations=[
# Now make crawl_id NOT NULL
migrations.RunSQL(
sql="""
-- Rebuild snapshot table with NOT NULL crawl_id
CREATE TABLE core_snapshot_final (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
crawl_id TEXT NOT NULL,
parent_snapshot_id TEXT,
crawl_id TEXT NOT NULL,
parent_snapshot_id TEXT,
title VARCHAR(512),
downloaded_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
title VARCHAR(512),
downloaded_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
);
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
);
INSERT INTO core_snapshot_final (
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
config, notes, num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
)
SELECT
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
FROM core_snapshot;
INSERT INTO core_snapshot_final (
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
config, notes, num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
)
SELECT
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
FROM core_snapshot;
DROP TABLE core_snapshot;
ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
DROP TABLE core_snapshot;
ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
""",
reverse_sql=migrations.RunSQL.noop,
CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
state_operations=[
migrations.AddField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(
on_delete=models.deletion.CASCADE,
to='crawls.crawl',
help_text='Crawl that created this snapshot'
),
),
],
),
]
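The SeparateDatabaseAndState wrapper used here splits the work in two: database_operations run the raw SQL table rebuild on disk, while state_operations only update Django's in-memory migration state so that later autodetected migrations agree with what the SQL produced. A minimal sketch of the pattern with hypothetical model and field names (not this project's schema):

    from django.db import migrations, models

    class Migration(migrations.Migration):
        dependencies = [('core', '0001_initial')]

        operations = [
            migrations.SeparateDatabaseAndState(
                database_operations=[
                    # change the actual table with raw SQL
                    migrations.RunSQL(
                        sql="ALTER TABLE core_example ADD COLUMN status TEXT NOT NULL DEFAULT 'queued';",
                        reverse_sql=migrations.RunSQL.noop,
                    ),
                ],
                state_operations=[
                    # tell Django's model state about the same change, without touching the DB
                    migrations.AddField(
                        model_name='example',
                        name='status',
                        field=models.CharField(max_length=15, default='queued'),
                    ),
                ],
            ),
        ]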

View File

@@ -0,0 +1,258 @@
# Generated by Django 6.0 on 2025-12-31 23:09
import archivebox.base_models.models
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0024_assign_default_crawl'),
('crawls', '0001_initial'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.AlterModelOptions(
name='archiveresult',
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
),
migrations.AlterModelOptions(
name='snapshot',
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
),
migrations.RemoveField(
model_name='archiveresult',
name='cmd',
),
migrations.RemoveField(
model_name='archiveresult',
name='cmd_version',
),
migrations.RemoveField(
model_name='archiveresult',
name='pwd',
),
migrations.AddField(
model_name='archiveresult',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
),
migrations.AddField(
model_name='archiveresult',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='archiveresult',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AddField(
model_name='archiveresult',
name='plugin',
field=models.CharField(db_index=True, default='', max_length=32),
),
migrations.AddField(
model_name='archiveresult',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='config',
field=models.JSONField(default=dict),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
),
migrations.AddField(
model_name='snapshot',
name='depth',
field=models.PositiveSmallIntegerField(db_index=True, default=0),
),
migrations.AddField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AddField(
model_name='snapshot',
name='fs_version',
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='snapshot',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='snapshot',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='parent_snapshot',
field=models.ForeignKey(blank=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
),
migrations.AddField(
model_name='snapshot',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='snapshot',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
),
migrations.AddField(
model_name='tag',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='tag',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AlterField(
model_name='archiveresult',
name='end_ts',
field=models.DateTimeField(blank=True, default=None, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='start_ts',
field=models.DateTimeField(blank=True, default=None, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
),
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='url',
field=models.URLField(db_index=True),
),
migrations.AlterField(
model_name='tag',
name='slug',
field=models.SlugField(editable=False, max_length=100, unique=True),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('url', 'crawl'), name='unique_url_per_crawl'),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
),
]

View File

@@ -1,484 +0,0 @@
# Generated by hand on 2025-12-29
# Cleans up extra columns from raw SQL migrations and ensures schema matches models
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
import archivebox.base_models.models
def cleanup_extra_columns(apps, schema_editor):
"""
Create Process records from old cmd/pwd/cmd_version columns and remove those columns.
This preserves the execution details by moving them to the Process model.
"""
with schema_editor.connection.cursor() as cursor:
# Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0)
cursor.execute("SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='cmd'")
has_cmd = cursor.fetchone()[0] > 0
if has_cmd:
print(" Migrating cmd/pwd/cmd_version data to Process records...")
# For each ArchiveResult, create a Process record with cmd/pwd data
# Note: cmd_version from old schema is not preserved (it's now derived from Binary)
cursor.execute("""
SELECT id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status
FROM core_archiveresult
""")
archive_results = cursor.fetchall()
from archivebox.uuid_compat import uuid7
from archivebox.base_models.models import get_or_create_system_user_pk
# Get or create a Machine record
result = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()
if result:
machine_id = result[0]
print(f" Using existing Machine: {machine_id}")
else:
# Create a minimal Machine record with raw SQL (can't use model during migration)
print(" Creating Machine record for Process migration...")
import platform
import socket
# Generate minimal machine data without using the model
machine_id = str(uuid7())
guid = f"{socket.gethostname()}-{platform.machine()}"
hostname = socket.gethostname()
# Check schema version
cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='config'")
has_config = cursor.fetchone()[0] > 0
cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='abid'")
has_abid = cursor.fetchone()[0] > 0
cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='num_uses_succeeded'")
has_num_uses = cursor.fetchone()[0] > 0
# Insert directly with SQL (use INSERT OR IGNORE in case it already exists)
if has_config:
# v0.9.0+ schema
cursor.execute("""
INSERT OR IGNORE INTO machine_machine (
id, created_at, modified_at,
guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
os_arch, os_family, os_platform, os_release, os_kernel,
stats, config
) VALUES (?, datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}', '{}')
""", (
machine_id, guid, hostname,
platform.machine(), platform.system(), platform.platform(), platform.release()
))
elif has_abid and has_num_uses:
# v0.8.6rc0 schema (has abid and num_uses columns)
cursor.execute("""
INSERT OR IGNORE INTO machine_machine (
id, abid, created_at, modified_at,
guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
os_arch, os_family, os_platform, os_release, os_kernel,
stats, num_uses_failed, num_uses_succeeded
) VALUES (?, '', datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}', 0, 0)
""", (
machine_id, guid, hostname,
platform.machine(), platform.system(), platform.platform(), platform.release()
))
else:
# v0.7.2 or other schema
cursor.execute("""
INSERT OR IGNORE INTO machine_machine (
id, created_at, modified_at,
guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
os_arch, os_family, os_platform, os_release, os_kernel,
stats
) VALUES (?, datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}')
""", (
machine_id, guid, hostname,
platform.machine(), platform.system(), platform.platform(), platform.release()
))
# Re-query to get the actual id (in case INSERT OR IGNORE skipped it)
result = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()
if result:
machine_id = result[0]
print(f" ✓ Using/Created Machine: {machine_id}")
else:
# INSERT OR IGNORE failed - try again without IGNORE to see the error
raise Exception("Failed to create Machine record - machine_machine table is empty after INSERT")
for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
# Create Process record
process_id = str(uuid7())
cursor.execute("""
INSERT INTO machine_process (
id, created_at, modified_at,
machine_id, binary_id, iface_id,
pwd, cmd, env, timeout,
pid, exit_code, stdout, stderr,
started_at, ended_at, url, status, retry_at
) VALUES (?, datetime('now'), datetime('now'), ?, ?, ?, ?, ?, '{}', 120, NULL, NULL, '', '', ?, ?, '', ?, NULL)
""", (process_id, machine_id, binary_id, iface_id, pwd or '', cmd or '[]', start_ts, end_ts, status or 'queued'))
# Update ArchiveResult to point to new Process
cursor.execute("UPDATE core_archiveresult SET process_id = ? WHERE id = ?", (process_id, ar_id))
print(f" ✓ Created {len(archive_results)} Process records from ArchiveResult data")
# Now rebuild table without the extra columns
print(" Rebuilding core_archiveresult table...")
cursor.execute("""
CREATE TABLE core_archiveresult_final (
id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid TEXT,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
snapshot_id TEXT NOT NULL,
plugin VARCHAR(32) NOT NULL DEFAULT '',
hook_name VARCHAR(255) NOT NULL DEFAULT '',
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
output_files TEXT NOT NULL DEFAULT '{}',
output_json TEXT,
output_str TEXT NOT NULL DEFAULT '',
output_size INTEGER NOT NULL DEFAULT 0,
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
config TEXT,
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
process_id TEXT NOT NULL,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
)
""")
# Copy data (cmd, pwd, etc. are now in Process records)
cursor.execute("""
INSERT INTO core_archiveresult_final SELECT
id, uuid, created_at, modified_at,
snapshot_id, plugin, hook_name,
start_ts, end_ts, status, retry_at,
output_files, output_json, output_str, output_size, output_mimetypes,
config, notes, num_uses_succeeded, num_uses_failed,
process_id
FROM core_archiveresult
""")
# Replace table
cursor.execute("DROP TABLE core_archiveresult")
cursor.execute("ALTER TABLE core_archiveresult_final RENAME TO core_archiveresult")
# Recreate indexes
cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
cursor.execute("CREATE INDEX core_archiveresult_uuid_idx ON core_archiveresult(uuid)")
print(" ✓ Cleaned up core_archiveresult schema")
class Migration(migrations.Migration):
dependencies = [
('core', '0024_assign_default_crawl'),
('machine', '0005_add_process_table'),
('crawls', '0002_upgrade_to_0_9_0'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunPython(
cleanup_extra_columns,
reverse_code=migrations.RunPython.noop,
),
],
state_operations=[
# Tell Django about all the fields that exist after raw SQL migrations
# ArchiveResult model options
migrations.AlterModelOptions(
name='archiveresult',
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
),
# Remove old fields
migrations.RemoveField(model_name='archiveresult', name='cmd'),
migrations.RemoveField(model_name='archiveresult', name='pwd'),
migrations.RemoveField(model_name='archiveresult', name='cmd_version'),
migrations.RemoveField(model_name='archiveresult', name='extractor'),
migrations.RemoveField(model_name='archiveresult', name='output'),
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
# Add new ArchiveResult fields
migrations.AddField(
model_name='archiveresult',
name='plugin',
field=models.CharField(blank=True, default='', max_length=32),
),
migrations.AddField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(blank=True, default='', max_length=255),
),
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(blank=True, default=dict),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', max_length=512),
),
migrations.AddField(
model_name='archiveresult',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='archiveresult',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='archiveresult',
name='process',
field=models.OneToOneField(null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
),
# Update Snapshot model
migrations.AlterModelOptions(
name='snapshot',
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, null=True),
),
migrations.AddField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'),
),
migrations.AddField(
model_name='snapshot',
name='depth',
field=models.PositiveSmallIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='parent_snapshot',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
),
migrations.AddField(
model_name='snapshot',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
),
migrations.AddField(
model_name='snapshot',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
),
migrations.AddField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='fs_version',
field=models.CharField(default='0.9.0', max_length=10),
),
migrations.AddField(
model_name='snapshot',
name='config',
field=models.JSONField(blank=True, default=dict),
),
migrations.AddField(
model_name='snapshot',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='snapshot',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='snapshot',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
# Update Tag model
migrations.AlterModelOptions(
name='tag',
options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
),
migrations.AddField(
model_name='tag',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='tag',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
# Alter field types
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, editable=False, null=True, unique=True),
),
migrations.AlterField(
model_name='archiveresult',
name='end_ts',
field=models.DateTimeField(blank=True, default=None, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='start_ts',
field=models.DateTimeField(blank=True, default=None, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=15),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.CharField(editable=False, max_length=32, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(db_index=True, max_length=32, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='url',
field=models.URLField(max_length=2048),
),
migrations.AlterField(
model_name='tag',
name='slug',
field=models.SlugField(editable=False, max_length=100, unique=True),
),
# Create M2M model for snapshot tags
migrations.CreateModel(
name='SnapshotTag',
fields=[
('id', models.AutoField(primary_key=True, serialize=False, verbose_name='ID')),
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
],
options={
'db_table': 'core_snapshot_tags',
},
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
),
# Update tags field on Snapshot to use the through model
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
),
# Add constraints
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
),
],
),
]

View File

@@ -1,76 +0,0 @@
# Generated by hand on 2025-12-30
# Final field adjustments to match model definitions exactly
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
dependencies = [
('core', '0025_cleanup_schema'),
('crawls', '0002_upgrade_to_0_9_0'),
]
operations = [
# Alter Snapshot fields to match model exactly
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid7, editable=False, primary_key=True, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='url',
field=models.URLField(db_index=True, unique=False),
),
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='parent_snapshot',
field=models.ForeignKey(blank=True, db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
),
migrations.AlterField(
model_name='snapshot',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='fs_version',
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
),
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
),
# Alter SnapshotTag fields
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
),
migrations.AlterField(
model_name='snapshottag',
name='snapshot',
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
),
migrations.AlterField(
model_name='snapshottag',
name='tag',
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
),
]

View File

@@ -1,108 +0,0 @@
# Generated by Django 6.0 on 2025-12-31 09:04
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0026_final_field_adjustments'),
('crawls', '0002_upgrade_to_0_9_0'),
('machine', '0001_initial'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
),
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AlterField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AlterField(
model_name='archiveresult',
name='plugin',
field=models.CharField(db_index=True, default='', max_length=32),
),
migrations.AlterField(
model_name='archiveresult',
name='process',
field=models.OneToOneField(help_text='Process execution details for this archive result', on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
),
migrations.AlterField(
model_name='archiveresult',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='config',
field=models.JSONField(default=dict),
),
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
),
migrations.AlterField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
),
migrations.AlterField(
model_name='snapshot',
name='depth',
field=models.PositiveSmallIntegerField(db_index=True, default=0),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
]

View File

@@ -91,9 +91,9 @@ class Tag(ModelWithSerializers):
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Tag model instance to a JSONL record.
Convert Tag model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
@@ -105,12 +105,12 @@ class Tag(ModelWithSerializers):
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
"""
Create/update Tag from JSONL record.
Create/update Tag from JSON dict.
Args:
record: JSONL record with 'name' field
record: JSON dict with 'name' field
overrides: Optional dict with 'snapshot' to auto-attach tag
Returns:
@@ -982,8 +982,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
processes_seen = set()
with open(index_path, 'w') as f:
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
f.write(json.dumps(self.to_jsonl()) + '\n')
# Write Snapshot record first (to_json includes crawl_id, fs_version)
f.write(json.dumps(self.to_json()) + '\n')
# Write ArchiveResult records with their associated Binary and Process
# Use select_related to optimize queries
@@ -991,15 +991,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Write Binary record if not already written
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
binaries_seen.add(ar.process.binary_id)
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
f.write(json.dumps(ar.process.binary.to_json()) + '\n')
# Write Process record if not already written
if ar.process and ar.process_id not in processes_seen:
processes_seen.add(ar.process_id)
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
f.write(json.dumps(ar.process.to_json()) + '\n')
# Write ArchiveResult record
f.write(json.dumps(ar.to_jsonl()) + '\n')
f.write(json.dumps(ar.to_json()) + '\n')
def read_index_jsonl(self) -> dict:
"""
@@ -1422,9 +1422,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return False
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Snapshot model instance to a JSONL record.
Convert Snapshot model instance to a JSON-serializable dict.
Includes all fields needed to fully reconstruct/identify this snapshot.
"""
from archivebox.config import VERSION
@@ -1445,9 +1445,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
"""
Create/update Snapshot from JSONL record or dict.
Create/update Snapshot from JSON dict.
Unified method that handles:
- ID-based patching: {"id": "...", "title": "new title"}
@@ -2106,8 +2106,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
result['canonical'] = self.canonical_outputs()
return result
def to_json(self, indent: int = 4) -> str:
"""Convert to JSON string"""
def to_json_str(self, indent: int = 4) -> str:
"""Convert to JSON string (legacy method, use to_json() for dict)"""
return to_json(self.to_dict(extended=True), indent=indent)
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
@@ -2284,14 +2284,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
# Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
# Required - every ArchiveResult must have a Process
process = models.OneToOneField(
'machine.Process',
on_delete=models.PROTECT,
null=False, # Required after migration 4
related_name='archiveresult',
help_text='Process execution details for this archive result'
)
# Process FK will be added post-v0.9.0 in a separate migration
# process = models.OneToOneField(
# 'machine.Process',
# on_delete=models.PROTECT,
# null=False,
# related_name='archiveresult',
# help_text='Process execution details for this archive result'
# )
# New output fields (replacing old 'output' field)
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
@@ -2326,9 +2326,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
return self.snapshot.crawl.created_by
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert ArchiveResult model instance to a JSONL record.
Convert ArchiveResult model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
record = {
@@ -2360,6 +2360,50 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
record['process_id'] = str(self.process_id)
return record
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
"""
Create/update ArchiveResult from JSON dict.
Args:
record: JSON dict with 'snapshot_id', 'plugin', etc.
overrides: Optional dict of field overrides
Returns:
ArchiveResult instance or None
"""
snapshot_id = record.get('snapshot_id')
plugin = record.get('plugin')
if not snapshot_id or not plugin:
return None
# Try to get existing by ID first
result_id = record.get('id')
if result_id:
try:
return ArchiveResult.objects.get(id=result_id)
except ArchiveResult.DoesNotExist:
pass
# Get or create by snapshot_id + plugin
try:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.get(id=snapshot_id)
result, _ = ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin=plugin,
defaults={
'hook_name': record.get('hook_name', ''),
'status': record.get('status', 'queued'),
'output_str': record.get('output_str', ''),
}
)
return result
except Snapshot.DoesNotExist:
return None
def save(self, *args, **kwargs):
is_new = self._state.adding

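For readers skimming this diff, a minimal sketch of how the renamed per-model from_json() methods compose when reading a snapshot's index.jsonl back in (the helper name, importing every model up front, and the exact 'type' string used for ArchiveResult records are assumptions, not part of this commit):

import json
from pathlib import Path

def load_index_jsonl(index_path: Path) -> dict:
    # Hypothetical reader: restores records from index.jsonl and counts how many of each type were handled
    from archivebox.core.models import Snapshot, Tag, ArchiveResult
    from archivebox.machine.models import Binary, Process

    dispatch = {
        'Snapshot': Snapshot, 'Tag': Tag, 'ArchiveResult': ArchiveResult,
        'Binary': Binary, 'Process': Process,
    }
    stats: dict = {}
    for line in index_path.read_text().splitlines():
        if not line.strip():
            continue
        record = json.loads(line)
        model = dispatch.get(record.get('type'))
        if model and model.from_json(record):
            stats[record['type']] = stats.get(record['type'], 0) + 1
    return stats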
View File

@@ -1,90 +0,0 @@
# Generated by hand on 2025-12-29
# Upgrades crawls_crawl table from v0.8.6rc0 to v0.9.0 schema
from django.db import migrations
def upgrade_crawl_schema_if_needed(apps, schema_editor):
"""
Upgrade crawls_crawl table if it has the old v0.8.6rc0 schema (no urls column).
"""
with schema_editor.connection.cursor() as cursor:
# Check if we need to upgrade (missing urls column means v0.8.6rc0)
cursor.execute("""
SELECT COUNT(*) FROM pragma_table_info('crawls_crawl') WHERE name='urls'
""")
has_urls = cursor.fetchone()[0] > 0
if not has_urls:
print(" Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0 schema...")
# Create new table with v0.9.0 schema
cursor.execute("""
CREATE TABLE crawls_crawl_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
urls TEXT NOT NULL DEFAULT '[]',
config TEXT,
max_depth INTEGER NOT NULL DEFAULT 0,
tags_str VARCHAR(1024) NOT NULL DEFAULT '',
persona_id TEXT,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
output_dir VARCHAR(512) NOT NULL DEFAULT '',
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
created_by_id INTEGER NOT NULL,
schedule_id TEXT,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
)
""")
# Copy data from old table (v0.8.6rc0 schema)
cursor.execute("""
INSERT INTO crawls_crawl_new (
id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
urls, config, max_depth, tags_str, status, retry_at, created_by_id, schedule_id
)
SELECT
id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
'[]' as urls, config, max_depth, tags_str, status, retry_at, created_by_id,
CAST(schedule_id AS TEXT)
FROM crawls_crawl
""")
# Replace old table
cursor.execute("DROP TABLE crawls_crawl")
cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl")
# Create indexes
cursor.execute("CREATE INDEX crawls_crawl_status_idx ON crawls_crawl(status)")
cursor.execute("CREATE INDEX crawls_crawl_retry_at_idx ON crawls_crawl(retry_at)")
cursor.execute("CREATE INDEX crawls_crawl_created_at_idx ON crawls_crawl(created_at)")
cursor.execute("CREATE INDEX crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id)")
cursor.execute("CREATE INDEX crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id)")
print(" ✓ Upgraded crawls_crawl to v0.9.0 schema")
else:
print(" ✓ crawls_crawl already has v0.9.0 schema")
class Migration(migrations.Migration):
dependencies = [
('crawls', '0001_initial'),
('auth', '0012_alter_user_first_name_max_length'),
]
operations = [
migrations.RunPython(
upgrade_crawl_schema_if_needed,
reverse_code=migrations.RunPython.noop,
),
]

View File

@@ -134,9 +134,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def api_url(self) -> str:
return reverse_lazy('api-1:get_crawl', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Crawl model instance to a JSONL record.
Convert Crawl model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
@@ -152,9 +152,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
}
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
def from_json(record: dict, overrides: dict = None):
"""
Create or get a Crawl from a JSONL record.
Create or get a Crawl from a JSON dict.
Args:
record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'

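A hedged usage sketch of the renamed Crawl serializers (field values and the exact shape of 'urls' are illustrative; per the docstring above only 'urls' is required, and defaults such as created_by are assumed to be filled in the same way from_jsonl() did):

from archivebox.crawls.models import Crawl

record = {
    'type': 'Crawl',
    'urls': ['https://example.com'],   # assumed to accept a list of URLs
    'max_depth': 1,
    'tags_str': 'docs,example',
}
crawl = Crawl.from_json(record)        # creates or returns an existing matching Crawl
print(crawl.status, crawl.max_depth)   # to_json() round-trips it back to a plain dict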
View File

@@ -1176,7 +1176,7 @@ def create_model_record(record: Dict[str, Any]) -> Any:
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
"""
Process JSONL records from hook output.
Dispatches to Model.from_jsonl() for each record type.
Dispatches to Model.from_json() for each record type.
Args:
records: List of JSONL record dicts from result['records']
@@ -1201,25 +1201,25 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
# Dispatch to appropriate model's from_jsonl() method
if record_type == 'Snapshot':
from archivebox.core.models import Snapshot
obj = Snapshot.from_jsonl(record.copy(), overrides)
obj = Snapshot.from_json(record.copy(), overrides)
if obj:
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
elif record_type == 'Tag':
from archivebox.core.models import Tag
obj = Tag.from_jsonl(record.copy(), overrides)
obj = Tag.from_json(record.copy(), overrides)
if obj:
stats['Tag'] = stats.get('Tag', 0) + 1
elif record_type == 'Binary':
from archivebox.machine.models import Binary
obj = Binary.from_jsonl(record.copy(), overrides)
obj = Binary.from_json(record.copy(), overrides)
if obj:
stats['Binary'] = stats.get('Binary', 0) + 1
elif record_type == 'Machine':
from archivebox.machine.models import Machine
obj = Machine.from_jsonl(record.copy(), overrides)
obj = Machine.from_json(record.copy(), overrides)
if obj:
stats['Machine'] = stats.get('Machine', 0) + 1

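For illustration, feeding hook output through the dispatcher above might look like this (the record payloads and the Snapshot lookup are made up; the overrides key 'snapshot' mirrors the Tag.from_json() docstring earlier in this diff, and the import path of process_hook_records is assumed):

from archivebox.core.models import Snapshot

snapshot = Snapshot.objects.first()
records = [
    {'type': 'Tag', 'name': 'news'},
    {'type': 'Binary', 'name': 'wget', 'abspath': '/usr/bin/wget', 'version': '1.24.5'},
]
stats = process_hook_records(records, overrides={'snapshot': snapshot})
print(stats)   # e.g. {'Tag': 1, 'Binary': 1}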
View File

@@ -100,46 +100,8 @@ class Migration(migrations.Migration):
CREATE INDEX IF NOT EXISTS machine_binary_status_idx ON machine_binary(status);
CREATE INDEX IF NOT EXISTS machine_binary_retry_at_idx ON machine_binary(retry_at);
-- Create machine_process table
CREATE TABLE IF NOT EXISTS machine_process (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
machine_id TEXT NOT NULL,
binary_id TEXT,
iface_id TEXT,
pwd VARCHAR(512) NOT NULL DEFAULT '',
cmd TEXT NOT NULL DEFAULT '[]',
env TEXT NOT NULL DEFAULT '{}',
timeout INTEGER NOT NULL DEFAULT 120,
pid INTEGER,
exit_code INTEGER,
stdout TEXT NOT NULL DEFAULT '',
stderr TEXT NOT NULL DEFAULT '',
started_at DATETIME,
ended_at DATETIME,
url VARCHAR(2048),
status VARCHAR(16) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
FOREIGN KEY (machine_id) REFERENCES machine_machine(id) ON DELETE CASCADE,
FOREIGN KEY (binary_id) REFERENCES machine_binary(id) ON DELETE SET NULL,
FOREIGN KEY (iface_id) REFERENCES machine_networkinterface(id) ON DELETE SET NULL
);
CREATE INDEX IF NOT EXISTS machine_process_status_idx ON machine_process(status);
CREATE INDEX IF NOT EXISTS machine_process_retry_at_idx ON machine_process(retry_at);
CREATE INDEX IF NOT EXISTS machine_process_machine_id_idx ON machine_process(machine_id);
CREATE INDEX IF NOT EXISTS machine_process_binary_id_idx ON machine_process(binary_id);
CREATE INDEX IF NOT EXISTS machine_process_machine_status_retry_idx ON machine_process(machine_id, status, retry_at);
""",
reverse_sql="""
DROP TABLE IF EXISTS machine_process;
DROP TABLE IF EXISTS machine_binary;
DROP TABLE IF EXISTS machine_networkinterface;
DROP TABLE IF EXISTS machine_machine;
@@ -167,6 +129,8 @@ class Migration(migrations.Migration):
('os_kernel', models.CharField(default=None, max_length=255)),
('stats', models.JSONField(blank=True, default=dict, null=True)),
('config', models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
],
options={
'app_label': 'machine',
@@ -189,6 +153,8 @@ class Migration(migrations.Migration):
('region', models.CharField(default=None, max_length=63)),
('country', models.CharField(default=None, max_length=63)),
('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
],
options={
'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
@@ -212,6 +178,8 @@ class Migration(migrations.Migration):
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
('machine', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
],
options={
'verbose_name': 'Binary',
@@ -220,43 +188,6 @@ class Migration(migrations.Migration):
'app_label': 'machine',
},
),
migrations.CreateModel(
name='Process',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')),
('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)),
('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)),
('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')),
('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')),
('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)),
('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)),
('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)),
('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)),
('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='process_set', to='machine.machine')),
('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.binary')),
('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.networkinterface')),
],
options={
'verbose_name': 'Process',
'verbose_name_plural': 'Processes',
'app_label': 'machine',
},
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_5e3a87_idx'),
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__7bd19c_idx'),
),
],
),
]

View File

@@ -0,0 +1,45 @@
# Generated by Django 6.0 on 2025-12-31 22:54
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='Process',
fields=[
('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')),
('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)),
('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)),
('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')),
('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')),
('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)),
('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)),
('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)),
('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)),
('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.binary')),
('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.networkinterface')),
('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='process_set', to='machine.machine')),
],
options={
'verbose_name': 'Process',
'verbose_name_plural': 'Processes',
'indexes': [models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_5e3a87_idx'), models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__7bd19c_idx')],
},
),
]

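If you want to sanity-check the result of this migration, a quick look at the new table from a Django shell works (illustrative only, not part of the migration):

from django.db import connection

with connection.cursor() as cursor:
    cursor.execute("PRAGMA table_info(machine_process)")
    print([row[1] for row in cursor.fetchall()])    # column names
    cursor.execute("PRAGMA index_list(machine_process)")
    print([row[1] for row in cursor.fetchall()])    # index names, should include the two declared above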
View File

@@ -1,101 +0,0 @@
# Generated on 2025-12-31
# Adds parent FK and process_type field to Process model
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('machine', '0001_initial'),
]
operations = [
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunSQL(
sql="""
-- Add parent_id FK column to machine_process
ALTER TABLE machine_process ADD COLUMN parent_id TEXT REFERENCES machine_process(id) ON DELETE SET NULL;
CREATE INDEX IF NOT EXISTS machine_process_parent_id_idx ON machine_process(parent_id);
-- Add process_type column with default 'binary'
ALTER TABLE machine_process ADD COLUMN process_type VARCHAR(16) NOT NULL DEFAULT 'binary';
CREATE INDEX IF NOT EXISTS machine_process_process_type_idx ON machine_process(process_type);
-- Add composite index for parent + status queries
CREATE INDEX IF NOT EXISTS machine_process_parent_status_idx ON machine_process(parent_id, status);
-- Add composite index for machine + pid + started_at (for PID reuse protection)
CREATE INDEX IF NOT EXISTS machine_process_machine_pid_started_idx ON machine_process(machine_id, pid, started_at);
""",
# Migration is irreversible due to SQLite limitations
# SQLite doesn't support DROP COLUMN, would require table rebuild
reverse_sql=migrations.RunSQL.noop
),
],
state_operations=[
# Add parent FK
migrations.AddField(
model_name='process',
name='parent',
field=models.ForeignKey(
blank=True,
help_text='Parent process that spawned this one',
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name='children',
to='machine.process',
),
),
# Add process_type field
migrations.AddField(
model_name='process',
name='process_type',
field=models.CharField(
choices=[
('cli', 'CLI Command'),
('supervisord', 'Supervisord Daemon'),
('orchestrator', 'Orchestrator'),
('worker', 'Worker Process'),
('hook', 'Hook Script'),
('binary', 'Binary Execution'),
],
default='binary',
help_text='Type of process in the execution hierarchy',
max_length=16,
),
),
# Add indexes - must match the SQL index names exactly
migrations.AddIndex(
model_name='process',
index=models.Index(
fields=['parent'],
name='machine_process_parent_id_idx',
),
),
migrations.AddIndex(
model_name='process',
index=models.Index(
fields=['process_type'],
name='machine_process_process_type_idx',
),
),
migrations.AddIndex(
model_name='process',
index=models.Index(
fields=['parent', 'status'],
name='machine_process_parent_status_idx',
),
),
migrations.AddIndex(
model_name='process',
index=models.Index(
fields=['machine', 'pid', 'started_at'],
name='machine_process_machine_pid_started_idx',
),
),
],
),
]

View File

@@ -82,13 +82,38 @@ class Machine(ModelWithHealthStats):
)
return _CURRENT_MACHINE
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
def to_json(self) -> dict:
"""
Update Machine config from JSONL record.
Convert Machine model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': 'Machine',
'schema_version': VERSION,
'id': str(self.id),
'guid': self.guid,
'hostname': self.hostname,
'hw_in_docker': self.hw_in_docker,
'hw_in_vm': self.hw_in_vm,
'hw_manufacturer': self.hw_manufacturer,
'hw_product': self.hw_product,
'hw_uuid': self.hw_uuid,
'os_arch': self.os_arch,
'os_family': self.os_family,
'os_platform': self.os_platform,
'os_kernel': self.os_kernel,
'os_release': self.os_release,
'stats': self.stats,
'config': self.config or {},
}
@staticmethod
def from_json(record: dict, overrides: dict = None):
"""
Update Machine config from JSON dict.
Args:
record: JSONL record with '_method': 'update', 'key': '...', 'value': '...'
record: JSON dict with '_method': 'update', 'key': '...', 'value': '...'
overrides: Not used
Returns:
@@ -255,9 +280,9 @@ class Binary(ModelWithHealthStats):
'is_valid': self.is_valid,
}
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Binary model instance to a JSONL record.
Convert Binary model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
@@ -274,17 +299,17 @@ class Binary(ModelWithHealthStats):
}
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
def from_json(record: dict, overrides: dict = None):
"""
Create/update Binary from JSONL record.
Create/update Binary from JSON dict.
Handles two cases:
1. From binaries.jsonl: creates queued binary with name, binproviders, overrides
1. From binaries.json: creates queued binary with name, binproviders, overrides
2. From hook output: updates binary with abspath, version, sha256, binprovider
Args:
record: JSONL record with 'name' and either:
- 'binproviders', 'overrides' (from binaries.jsonl)
record: JSON dict with 'name' and either:
- 'binproviders', 'overrides' (from binaries.json)
- 'abspath', 'version', 'sha256', 'binprovider' (from hook output)
overrides: Not used
@@ -542,7 +567,7 @@ class ProcessManager(models.Manager):
return process
class Process(ModelWithHealthStats):
class Process(models.Model):
"""
Tracks a single OS process execution.
@@ -563,38 +588,11 @@ class Process(ModelWithHealthStats):
RUNNING = 'running', 'Running'
EXITED = 'exited', 'Exited'
class TypeChoices(models.TextChoices):
CLI = 'cli', 'CLI Command'
SUPERVISORD = 'supervisord', 'Supervisord Daemon'
ORCHESTRATOR = 'orchestrator', 'Orchestrator'
WORKER = 'worker', 'Worker Process'
HOOK = 'hook', 'Hook Script'
BINARY = 'binary', 'Binary Execution'
# Primary fields
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
# Parent process FK for hierarchy tracking
parent = models.ForeignKey(
'self',
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name='children',
help_text='Parent process that spawned this one'
)
# Process type for distinguishing in hierarchy
process_type = models.CharField(
max_length=16,
choices=TypeChoices.choices,
default=TypeChoices.BINARY,
db_index=True,
help_text='Type of process in the execution hierarchy'
)
# Machine FK - required (every process runs on a machine)
machine = models.ForeignKey(
Machine,
@@ -667,10 +665,6 @@ class Process(ModelWithHealthStats):
help_text='When to retry this process'
)
# Health stats
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
state_machine_name: str = 'archivebox.machine.models.ProcessMachine'
objects: ProcessManager = ProcessManager()
@@ -682,8 +676,6 @@ class Process(ModelWithHealthStats):
indexes = [
models.Index(fields=['machine', 'status', 'retry_at']),
models.Index(fields=['binary', 'exit_code']),
models.Index(fields=['parent', 'status']),
models.Index(fields=['machine', 'pid', 'started_at']),
]
def __str__(self) -> str:
@@ -716,9 +708,9 @@ class Process(ModelWithHealthStats):
return self.archiveresult.hook_name
return ''
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Process model instance to a JSONL record.
Convert Process model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
record = {
@@ -742,6 +734,26 @@ class Process(ModelWithHealthStats):
record['timeout'] = self.timeout
return record
@staticmethod
def from_json(record: dict, overrides: dict = None):
"""
Create/update Process from JSON dict.
Args:
record: JSON dict with 'id' or process details
overrides: Optional dict of field overrides
Returns:
Process instance or None
"""
process_id = record.get('id')
if process_id:
try:
return Process.objects.get(id=process_id)
except Process.DoesNotExist:
pass
return None
def update_and_requeue(self, **kwargs):
"""
Update process fields and requeue for worker state machine.
@@ -1751,17 +1763,12 @@ class ProcessMachine(BaseStateMachine, strict_states=True):
@exited.enter
def enter_exited(self):
"""Process has exited."""
success = self.process.exit_code == 0
self.process.update_and_requeue(
retry_at=None,
status=Process.StatusChoices.EXITED,
ended_at=timezone.now(),
)
# Increment health stats based on exit code
self.process.increment_health_stats(success=success)
# =============================================================================
# State Machine Registration

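As a quick sketch of the renamed Process serializers (values are illustrative; assumes a Machine row already exists via Machine.current(), and note that from_json() only resolves an existing Process by id per its docstring above):

from archivebox.machine.models import Machine, Process

proc = Process.objects.create(machine=Machine.current(), cmd=['wget', '-V'], pwd='/tmp', timeout=60)
record = proc.to_json()                         # {'type': 'Process', 'cmd': ['wget', '-V'], ...}
same = Process.from_json({'id': str(proc.id)})  # looks up the existing row, returns None if missing
assert same == proc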
View File

@@ -76,7 +76,7 @@ class TestMachineModel(TestCase):
self.assertEqual(machine1.guid, machine2.guid)
def test_machine_from_jsonl_update(self):
"""Machine.from_jsonl() should update machine config."""
"""Machine.from_json() should update machine config."""
Machine.current() # Ensure machine exists
record = {
'_method': 'update',
@@ -84,14 +84,14 @@ class TestMachineModel(TestCase):
'value': '/usr/bin/wget',
}
result = Machine.from_jsonl(record)
result = Machine.from_json(record)
self.assertIsNotNone(result)
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
def test_machine_from_jsonl_invalid(self):
"""Machine.from_jsonl() should return None for invalid records."""
result = Machine.from_jsonl({'invalid': 'record'})
"""Machine.from_json() should return None for invalid records."""
result = Machine.from_json({'invalid': 'record'})
self.assertIsNone(result)
def test_machine_manager_current(self):
@@ -254,14 +254,14 @@ class TestProcessModel(TestCase):
self.assertIsNone(process.exit_code)
def test_process_to_jsonl(self):
"""Process.to_jsonl() should serialize correctly."""
"""Process.to_json() should serialize correctly."""
process = Process.objects.create(
machine=self.machine,
cmd=['echo', 'hello'],
pwd='/tmp',
timeout=60,
)
json_data = process.to_jsonl()
json_data = process.to_json()
self.assertEqual(json_data['type'], 'Process')
self.assertEqual(json_data['cmd'], ['echo', 'hello'])

View File

@@ -271,10 +271,51 @@ async function configure2Captcha() {
if (result.success) {
console.error(`[+] 2captcha configured via ${result.method}`);
// Verify config was applied by reloading options page and checking form values
console.error('[*] Verifying config by reloading options page...');
try {
await configPage.reload({ waitUntil: 'networkidle0', timeout: 10000 });
} catch (e) {
console.error(`[*] Reload threw error (may still work): ${e.message}`);
}
await new Promise(r => setTimeout(r, 2000));
// Wait for Config object again
await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 });
// Read back the config using Config.getAll()
const verifyConfig = await configPage.evaluate(async () => {
if (typeof Config !== 'undefined' && typeof Config.getAll === 'function') {
return await Config.getAll();
}
return null;
});
if (!verifyConfig) {
return { success: false, error: 'Could not verify config - Config.getAll() not available' };
}
// Check that API key was actually set
const actualApiKey = verifyConfig.apiKey || verifyConfig.api_key;
if (!actualApiKey || actualApiKey !== config.apiKey) {
console.error(`[!] Config verification FAILED - API key mismatch`);
console.error(`[!] Expected: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`);
console.error(`[!] Got: ${actualApiKey ? actualApiKey.slice(0, 8) + '...' + actualApiKey.slice(-4) : 'null'}`);
return { success: false, error: 'Config verification failed - API key not set correctly' };
}
console.error('[+] Config verified successfully!');
console.error(`[+] API Key: ${actualApiKey.slice(0, 8)}...${actualApiKey.slice(-4)}`);
console.error(`[+] Plugin Enabled: ${verifyConfig.isPluginEnabled}`);
console.error(`[+] Auto Solve Turnstile: ${verifyConfig.autoSolveTurnstile}`);
fs.writeFileSync(CONFIG_MARKER, JSON.stringify({
timestamp: new Date().toISOString(),
method: result.method,
extensionId: extensionId,
verified: true,
config: {
apiKeySet: !!config.apiKey,
isPluginEnabled: config.isPluginEnabled,
@@ -284,7 +325,7 @@ async function configure2Captcha() {
autoSolveEnabled: true,
}
}, null, 2));
return { success: true, method: result.method };
return { success: true, method: result.method, verified: true };
}
return { success: false, error: result.error || 'Config failed' };

View File

@@ -29,7 +29,7 @@ PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
TEST_URL = 'https://2captcha.com/demo/cloudflare-turnstile'
# Alias for backward compatibility with existing test names
@@ -70,8 +70,17 @@ class TestTwoCaptcha:
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
try:
exts = json.loads((chrome_dir / 'extensions.json').read_text())
assert any(e['name'] == 'twocaptcha' for e in exts), f"Not loaded: {exts}"
# Wait for extensions.json to be written
extensions_file = chrome_dir / 'extensions.json'
for i in range(20):
if extensions_file.exists():
break
time.sleep(0.5)
assert extensions_file.exists(), f"extensions.json not created. Chrome dir files: {list(chrome_dir.iterdir())}"
exts = json.loads(extensions_file.read_text())
assert any(e['name'] == 'twocaptcha' for e in exts), f"twocaptcha not loaded: {exts}"
print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}")
finally:
kill_chrome(process, chrome_dir)
@@ -95,6 +104,14 @@ class TestTwoCaptcha:
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
try:
# Wait for extensions.json to be written
extensions_file = chrome_dir / 'extensions.json'
for i in range(20):
if extensions_file.exists():
break
time.sleep(0.5)
assert extensions_file.exists(), f"extensions.json not created"
result = subprocess.run(
['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'],
env=env, timeout=30, capture_output=True, text=True
@@ -163,7 +180,34 @@ const puppeteer = require('puppeteer-core');
kill_chrome(process, chrome_dir)
def test_solves_recaptcha(self):
"""Extension solves reCAPTCHA on demo page."""
"""Extension attempts to solve CAPTCHA on demo page.
CRITICAL: DO NOT SKIP OR DISABLE THIS TEST EVEN IF IT'S FLAKY!
This test is INTENTIONALLY left enabled to expose the REAL, ACTUAL flakiness
of the 2captcha service and demo page. The test failures you see here are NOT
test bugs - they are ACCURATE representations of the real-world reliability
of this CAPTCHA solving service.
If this test is flaky, that's because 2captcha IS FLAKY in production.
If this test fails intermittently, that's because 2captcha FAILS INTERMITTENTLY in production.
NEVER EVER hide real flakiness by disabling tests or adding @pytest.mark.skip.
Users NEED to see this failure rate to understand what they're getting into.
When this test DOES pass, it confirms:
- Extension loads and configures correctly
- 2captcha API key is accepted
- Extension can successfully auto-solve CAPTCHAs
- The entire flow works end-to-end
When it fails (as it often does):
- Demo page has JavaScript errors (representing real-world broken sites)
- Turnstile tokens expire before solving (representing real-world timing issues)
- 2captcha service may be slow/down (representing real-world service issues)
This is VALUABLE INFORMATION about the service. DO NOT HIDE IT.
"""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = setup_test_env(tmpdir)
@@ -179,6 +223,14 @@ const puppeteer = require('puppeteer-core');
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
try:
# Wait for extensions.json to be written
extensions_file = chrome_dir / 'extensions.json'
for i in range(20):
if extensions_file.exists():
break
time.sleep(0.5)
assert extensions_file.exists(), f"extensions.json not created"
subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True)
script = f'''
@@ -187,48 +239,97 @@ const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
const page = await browser.newPage();
// Capture console messages from the page (including extension messages)
page.on('console', msg => {{
const text = msg.text();
if (text.includes('2captcha') || text.includes('turnstile') || text.includes('captcha')) {{
console.error('[CONSOLE]', text);
}}
}});
await page.setViewport({{ width: 1440, height: 900 }});
console.error('[*] Loading {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
await new Promise(r => setTimeout(r, 3000));
// Wait for CAPTCHA iframe (minimal wait to avoid token expiration)
console.error('[*] Waiting for CAPTCHA iframe...');
await page.waitForSelector('iframe', {{ timeout: 30000 }});
console.error('[*] CAPTCHA iframe found - extension should auto-solve now');
// DON'T CLICK - extension should auto-solve since autoSolveTurnstile=True
console.error('[*] Waiting for auto-solve (extension configured with autoSolveTurnstile=True)...');
// Poll for data-state changes with debug output
console.error('[*] Waiting for CAPTCHA to be solved (up to 150s)...');
const start = Date.now();
const maxWait = 90000;
let solved = false;
let lastState = null;
while (Date.now() - start < maxWait) {{
while (!solved && (Date.now() - start) < 150000) {{
const state = await page.evaluate(() => {{
const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
const solver = document.querySelector('.captcha-solver');
return {{
solved: resp ? resp.value.length > 0 : false,
state: solver?.getAttribute('data-state'),
text: solver?.textContent?.trim() || ''
text: solver?.textContent?.trim(),
classList: solver?.className
}};
}});
const sec = Math.round((Date.now() - start) / 1000);
console.error('[*] ' + sec + 's state=' + state.state + ' solved=' + state.solved + ' text=' + state.text.slice(0,30));
if (state.solved) {{ console.error('[+] SOLVED!'); break; }}
if (state.state === 'error') {{ console.error('[!] ERROR'); break; }}
if (state.state !== lastState) {{
const elapsed = Math.round((Date.now() - start) / 1000);
console.error(`[*] State change at ${{elapsed}}s: "${{lastState}}" -> "${{state.state}}" (text: "${{state.text?.slice(0, 50)}}")`);
lastState = state.state;
}}
if (state.state === 'solved') {{
solved = true;
const elapsed = Math.round((Date.now() - start) / 1000);
console.error('[+] SOLVED in ' + elapsed + 's!');
break;
}}
// Check every 2 seconds
await new Promise(r => setTimeout(r, 2000));
}}
if (!solved) {{
const elapsed = Math.round((Date.now() - start) / 1000);
const finalState = await page.evaluate(() => {{
const solver = document.querySelector('.captcha-solver');
return {{
state: solver?.getAttribute('data-state'),
text: solver?.textContent?.trim(),
html: solver?.outerHTML?.slice(0, 200)
}};
}});
console.error(`[!] TIMEOUT after ${{elapsed}}s. Final state: ${{JSON.stringify(finalState)}}`);
browser.disconnect();
process.exit(1);
}}
const final = await page.evaluate(() => {{
const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
return {{ solved: resp ? resp.value.length > 0 : false, preview: resp?.value?.slice(0,50) || '' }};
const solver = document.querySelector('.captcha-solver');
return {{
solved: true,
state: solver?.getAttribute('data-state'),
text: solver?.textContent?.trim()
}};
}});
browser.disconnect();
console.log(JSON.stringify(final));
}})();
'''
(tmpdir / 's.js').write_text(script)
print("\n[*] Solving CAPTCHA (10-60s)...")
r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=120, capture_output=True, text=True)
print("\n[*] Solving CAPTCHA (this can take up to 150s for 2captcha API)...")
r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=200, capture_output=True, text=True)
print(r.stderr)
assert r.returncode == 0, f"Failed: {r.stderr}"
final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1])
assert final.get('solved'), f"Not solved: {final}"
print(f"[+] SOLVED! {final.get('preview','')[:30]}...")
assert final.get('state') == 'solved', f"State not 'solved': {final}"
print(f"[+] SUCCESS! CAPTCHA solved: {final.get('text','')[:50]}")
finally:
kill_chrome(process, chrome_dir)

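The wait-for-extensions.json polling loop above is now repeated in three tests; a tiny helper along these lines (purely illustrative, not part of this commit) would keep it in one place:

import time
from pathlib import Path

def wait_for_file(path: Path, attempts: int = 20, delay: float = 0.5) -> bool:
    # Poll until the file appears or we give up, mirroring the inline loops above
    for _ in range(attempts):
        if path.exists():
            return True
        time.sleep(delay)
    return path.exists()

# usage inside a test:
# assert wait_for_file(chrome_dir / 'extensions.json'), "extensions.json not created"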
View File

@@ -265,57 +265,60 @@ class Orchestrator:
def runloop(self) -> None:
"""Main orchestrator loop."""
from archivebox.misc.logging import IS_TTY, CONSOLE
import sys
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn
from archivebox.misc.logging import IS_TTY
import archivebox.misc.logging as logging_module
self.on_startup()
# Enable progress bars only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
last_progress_output = ""
progress = Progress(
TextColumn("[cyan]{task.description}"),
BarColumn(bar_width=40),
TaskProgressColumn(),
transient=False,
) if show_progress else None
task_ids = {} # snapshot_id -> task_id
# Replace global CONSOLE with progress.console when active
original_console = logging_module.CONSOLE
original_stderr = logging_module.STDERR
try:
if progress:
progress.start()
# Redirect all logging through progress.console
logging_module.CONSOLE = progress.console
logging_module.STDERR = progress.console
while True:
# Check queues and spawn workers
queue_sizes = self.check_queues_and_spawn_workers()
# Update progress bars (simple inline update)
if show_progress:
# Update progress bars
if progress:
from archivebox.core.models import Snapshot
active_snapshots = list(Snapshot.objects.filter(status='started').iterator(chunk_size=100))
active_snapshots = Snapshot.objects.filter(status='started').iterator(chunk_size=100)
if active_snapshots:
# Build progress string
progress_lines = []
for snapshot in active_snapshots[:5]: # Limit to 5 snapshots
total = snapshot.archiveresult_set.count()
if total == 0:
continue
for snapshot in active_snapshots:
total = snapshot.archiveresult_set.count()
if total == 0:
continue
completed = snapshot.archiveresult_set.filter(
status__in=['succeeded', 'skipped', 'failed']
).count()
completed = snapshot.archiveresult_set.filter(
status__in=['succeeded', 'skipped', 'failed']
).count()
percentage = (completed / total) * 100
bar_width = 30
filled = int(bar_width * completed / total)
bar = '' * filled + '' * (bar_width - filled)
# Create or update task
if snapshot.id not in task_ids:
url = snapshot.url[:60] + '...' if len(snapshot.url) > 60 else snapshot.url
task_ids[snapshot.id] = progress.add_task(url, total=total)
url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
progress_lines.append(f"{url} {bar} {percentage:>3.0f}%")
progress_output = "\n".join(progress_lines)
# Only update if changed
if progress_output != last_progress_output:
# Clear previous lines and print new ones
if last_progress_output:
num_lines = last_progress_output.count('\n') + 1
sys.stderr.write(f"\r\033[{num_lines}A\033[J")
sys.stderr.write(progress_output + "\n")
sys.stderr.flush()
last_progress_output = progress_output
progress.update(task_ids[snapshot.id], completed=completed)
# Track idle state
if self.has_pending_work(queue_sizes) or self.has_running_workers():
@@ -327,12 +330,6 @@ class Orchestrator:
# Check if we should exit
if self.should_exit(queue_sizes):
# Clear progress lines
if show_progress and last_progress_output:
num_lines = last_progress_output.count('\n') + 1
sys.stderr.write(f"\r\033[{num_lines}A\033[J")
sys.stderr.flush()
log_worker_event(
worker_type='Orchestrator',
event='All work complete',
@@ -350,6 +347,12 @@ class Orchestrator:
raise
else:
self.on_shutdown()
finally:
if progress:
# Restore original consoles
logging_module.CONSOLE = original_console
logging_module.STDERR = original_stderr
progress.stop()
def start(self) -> int:
"""

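For readers unfamiliar with rich, the pattern the new runloop relies on boils down to add_task()/update() against a shared Progress instance (standalone sketch, not ArchiveBox code):

import time
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn

with Progress(
    TextColumn("[cyan]{task.description}"),
    BarColumn(bar_width=40),
    TaskProgressColumn(),
) as progress:
    task_id = progress.add_task("https://example.com", total=10)
    for done in range(1, 11):
        progress.update(task_id, completed=done)   # redraws in place, no manual ANSI cursor math
        time.sleep(0.1)
    # progress.console.print(...) can be used for log lines so they don't clobber the bars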
View File

@@ -67,7 +67,7 @@ for test_dir in $TEST_DIRS; do
echo -e "${YELLOW}[RUNNING]${NC} $plugin_name"
if python -m pytest "$test_dir" -v --tb=short 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
if python -m pytest "$test_dir" -p no:django -v --tb=short 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
echo -e "${GREEN}[PASSED]${NC} $plugin_name"
PASSED_PLUGINS=$((PASSED_PLUGINS + 1))
else