mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
cleanup migrations, json, jsonl
This commit is contained in:
@@ -207,7 +207,7 @@ def run_plugins(
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr)
|
||||
else:
|
||||
write_record(result.to_jsonl())
|
||||
write_record(result.to_json())
|
||||
except Snapshot.DoesNotExist:
|
||||
continue
|
||||
|
||||
|
||||
@@ -252,8 +252,8 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'process')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp', 'process__cmd')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
autocomplete_fields = ['snapshot']
|
||||
|
||||
fieldsets = (
|
||||
@@ -270,7 +270,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Command', {
|
||||
'fields': ('process', 'cmd', 'cmd_str', 'cmd_version', 'pwd'),
|
||||
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Output', {
|
||||
|
||||
@@ -1,299 +1,250 @@
|
||||
# Generated by hand on 2025-12-29
|
||||
# Upgrades core app from v0.7.2 (migration 0022) or v0.8.6rc0 (migration 0076) to v0.9.0 using raw SQL
|
||||
# Upgrades core app from v0.7.2/v0.8.6rc0 (migration 0022) to v0.9.0 using raw SQL
|
||||
# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import migrations, models, connection
|
||||
|
||||
|
||||
def upgrade_from_v072_or_v086(apps, schema_editor):
|
||||
"""
|
||||
Upgrade core tables from either v0.7.2 or v0.8.6rc0 to v0.9.0.
|
||||
Handles differences in schema between versions.
|
||||
"""
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
# Check if uuid column exists (v0.7.2 has it, v0.8.6rc0 doesn't)
|
||||
def get_table_columns(table_name):
|
||||
"""Get list of column names for a table."""
|
||||
cursor = connection.cursor()
|
||||
cursor.execute(f"PRAGMA table_info({table_name})")
|
||||
return {row[1] for row in cursor.fetchall()}
|
||||
|
||||
|
||||
def upgrade_core_tables(apps, schema_editor):
|
||||
"""Upgrade core tables from v0.7.2 or v0.8.6rc0 to v0.9.0."""
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Check if core_archiveresult table exists
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'")
|
||||
if not cursor.fetchone():
|
||||
# Fresh install - no migration needed, tables will be created by later migrations
|
||||
return
|
||||
|
||||
# Detect which version we're migrating from
|
||||
archiveresult_cols = get_table_columns('core_archiveresult')
|
||||
has_uuid = 'uuid' in archiveresult_cols
|
||||
has_abid = 'abid' in archiveresult_cols
|
||||
|
||||
# ============================================================================
|
||||
# PART 1: Upgrade core_archiveresult table
|
||||
# ============================================================================
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_archiveresult_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
uuid TEXT,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
snapshot_id TEXT NOT NULL,
|
||||
plugin VARCHAR(32) NOT NULL DEFAULT '',
|
||||
hook_name VARCHAR(255) NOT NULL DEFAULT '',
|
||||
|
||||
cmd TEXT,
|
||||
pwd VARCHAR(256),
|
||||
cmd_version VARCHAR(128),
|
||||
|
||||
start_ts DATETIME,
|
||||
end_ts DATETIME,
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
|
||||
output_files TEXT NOT NULL DEFAULT '{}',
|
||||
output_json TEXT,
|
||||
output_str TEXT NOT NULL DEFAULT '',
|
||||
output_size INTEGER NOT NULL DEFAULT 0,
|
||||
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
|
||||
|
||||
config TEXT,
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE
|
||||
);
|
||||
""")
|
||||
|
||||
if has_uuid and not has_abid:
|
||||
# Migrating from v0.7.2 (has uuid, minimal fields)
|
||||
print('Migrating ArchiveResult from v0.7.2 schema...')
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='uuid'
|
||||
""")
|
||||
has_uuid = cursor.fetchone()[0] > 0
|
||||
|
||||
# Check if id is INTEGER (v0.7.2) or TEXT/char (v0.8.6rc0)
|
||||
cursor.execute("""
|
||||
SELECT type FROM pragma_table_info('core_archiveresult') WHERE name='id'
|
||||
""")
|
||||
id_type = cursor.fetchone()[0] if cursor.rowcount else 'INTEGER'
|
||||
is_v072 = 'INT' in id_type.upper()
|
||||
|
||||
# ============================================================================
|
||||
# PART 1: Upgrade core_archiveresult table
|
||||
# ============================================================================
|
||||
|
||||
# Create new table with v0.9.0 schema
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_archiveresult_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
uuid TEXT,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
snapshot_id TEXT NOT NULL,
|
||||
plugin VARCHAR(32) NOT NULL DEFAULT '',
|
||||
hook_name VARCHAR(255) NOT NULL DEFAULT '',
|
||||
|
||||
cmd TEXT,
|
||||
pwd VARCHAR(256),
|
||||
cmd_version VARCHAR(128),
|
||||
|
||||
start_ts DATETIME,
|
||||
end_ts DATETIME,
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
|
||||
output_files TEXT NOT NULL DEFAULT '{}',
|
||||
output_json TEXT,
|
||||
output_str TEXT NOT NULL DEFAULT '',
|
||||
output_size INTEGER NOT NULL DEFAULT 0,
|
||||
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
|
||||
|
||||
config TEXT,
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
binary_id TEXT,
|
||||
iface_id TEXT,
|
||||
process_id TEXT,
|
||||
|
||||
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (binary_id) REFERENCES machine_binary(id) ON DELETE SET NULL,
|
||||
FOREIGN KEY (iface_id) REFERENCES machine_networkinterface(id) ON DELETE SET NULL,
|
||||
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
id, uuid, created_at, modified_at, snapshot_id, plugin,
|
||||
cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
|
||||
)
|
||||
SELECT
|
||||
id, uuid,
|
||||
COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
|
||||
COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
|
||||
snapshot_id,
|
||||
COALESCE(extractor, '') as plugin,
|
||||
cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status,
|
||||
COALESCE(output, '') as output_str
|
||||
FROM core_archiveresult;
|
||||
""")
|
||||
|
||||
# Copy data based on source version
|
||||
if is_v072:
|
||||
# Coming from v0.7.2: has INTEGER id, has uuid column, has extractor
|
||||
print(" Migrating from v0.7.2 schema...")
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
uuid, created_at, modified_at, snapshot_id, plugin,
|
||||
cmd, pwd, cmd_version, start_ts, end_ts, status, output_str
|
||||
)
|
||||
SELECT
|
||||
uuid,
|
||||
COALESCE(start_ts, CURRENT_TIMESTAMP) as created_at,
|
||||
COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) as modified_at,
|
||||
snapshot_id,
|
||||
COALESCE(extractor, '') as plugin,
|
||||
cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status,
|
||||
COALESCE(output, '') as output_str
|
||||
FROM core_archiveresult
|
||||
""")
|
||||
else:
|
||||
# Coming from v0.8.6rc0: has TEXT id, no uuid column, has abid
|
||||
print(" Migrating from v0.8.6rc0 schema...")
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
uuid, created_at, modified_at, snapshot_id, plugin,
|
||||
cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str
|
||||
)
|
||||
SELECT
|
||||
id as uuid,
|
||||
created_at,
|
||||
modified_at,
|
||||
snapshot_id,
|
||||
COALESCE(extractor, '') as plugin,
|
||||
cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status, retry_at,
|
||||
COALESCE(output, '') as output_str
|
||||
FROM core_archiveresult
|
||||
""")
|
||||
|
||||
# Replace old table
|
||||
cursor.execute("DROP TABLE IF EXISTS core_archiveresult")
|
||||
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult")
|
||||
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid)")
|
||||
|
||||
# ============================================================================
|
||||
# PART 2: Upgrade core_snapshot table
|
||||
# ============================================================================
|
||||
|
||||
# Check snapshot schema version
|
||||
elif has_abid and not has_uuid:
|
||||
# Migrating from v0.8.6rc0 (has abid, full fields)
|
||||
print('Migrating ArchiveResult from v0.8.6rc0 schema...')
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM pragma_table_info('core_snapshot') WHERE name='crawl_id'
|
||||
""")
|
||||
has_crawl_id = cursor.fetchone()[0] > 0
|
||||
|
||||
# Create new table
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_snapshot_new (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
downloaded_at DATETIME,
|
||||
|
||||
url TEXT NOT NULL,
|
||||
timestamp TEXT NOT NULL,
|
||||
title TEXT,
|
||||
|
||||
crawl_id TEXT,
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
parent_snapshot_id TEXT,
|
||||
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
current_step INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
config TEXT,
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0
|
||||
|
||||
-- Note: crawl_id foreign key will be added in 0024 after assigning crawl_ids
|
||||
-- FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
|
||||
-- FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
|
||||
INSERT OR IGNORE INTO core_archiveresult_new (
|
||||
id, uuid, created_at, modified_at, snapshot_id, plugin,
|
||||
cmd, pwd, cmd_version, start_ts, end_ts, status, retry_at, output_str
|
||||
)
|
||||
SELECT
|
||||
id, abid as uuid,
|
||||
created_at, modified_at,
|
||||
snapshot_id,
|
||||
COALESCE(extractor, '') as plugin,
|
||||
cmd, pwd, cmd_version,
|
||||
start_ts, end_ts, status, retry_at,
|
||||
COALESCE(output, '') as output_str
|
||||
FROM core_archiveresult;
|
||||
""")
|
||||
else:
|
||||
print(f'Warning: Unexpected schema - has_uuid={has_uuid}, has_abid={has_abid}')
|
||||
|
||||
# Copy snapshot data
|
||||
if has_crawl_id:
|
||||
# v0.8.6rc0 schema - already has created_at, modified_at, bookmarked_at
|
||||
cursor.execute("DROP TABLE IF EXISTS core_archiveresult;")
|
||||
cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult;")
|
||||
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_plugin_idx ON core_archiveresult(plugin);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_status_idx ON core_archiveresult(status);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_created_at_idx ON core_archiveresult(created_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_uuid_idx ON core_archiveresult(uuid);")
|
||||
|
||||
# ============================================================================
|
||||
# PART 2: Upgrade core_snapshot table
|
||||
# ============================================================================
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_snapshot_new (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
url TEXT NOT NULL,
|
||||
timestamp VARCHAR(32) NOT NULL UNIQUE,
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
crawl_id TEXT,
|
||||
parent_snapshot_id TEXT,
|
||||
|
||||
title VARCHAR(512),
|
||||
downloaded_at DATETIME,
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
current_step INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
|
||||
);
|
||||
""")
|
||||
|
||||
# Check if core_snapshot exists (it should)
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'")
|
||||
if cursor.fetchone():
|
||||
# Detect which version we're migrating from
|
||||
snapshot_cols = get_table_columns('core_snapshot')
|
||||
has_added = 'added' in snapshot_cols
|
||||
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
|
||||
|
||||
if has_added and not has_bookmarked_at:
|
||||
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
|
||||
print('Migrating Snapshot from v0.7.2 schema...')
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_new (
|
||||
id, created_at, modified_at, bookmarked_at, downloaded_at, url, timestamp,
|
||||
crawl_id, status, retry_at
|
||||
id, url, timestamp, title, bookmarked_at, created_at, modified_at
|
||||
)
|
||||
SELECT
|
||||
id,
|
||||
created_at,
|
||||
modified_at,
|
||||
bookmarked_at,
|
||||
downloaded_at,
|
||||
url, timestamp,
|
||||
NULLIF(crawl_id, ''),
|
||||
COALESCE(status, 'queued'),
|
||||
retry_at
|
||||
FROM core_snapshot
|
||||
id, url, timestamp, title,
|
||||
COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
|
||||
COALESCE(added, CURRENT_TIMESTAMP) as created_at,
|
||||
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
|
||||
FROM core_snapshot;
|
||||
""")
|
||||
elif has_bookmarked_at and not has_added:
|
||||
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
|
||||
print('Migrating Snapshot from v0.8.6rc0 schema...')
|
||||
# Check what fields exist
|
||||
has_status = 'status' in snapshot_cols
|
||||
has_retry_at = 'retry_at' in snapshot_cols
|
||||
has_crawl_id = 'crawl_id' in snapshot_cols
|
||||
|
||||
# Build column list based on what exists
|
||||
cols = ['id', 'url', 'timestamp', 'title', 'bookmarked_at', 'created_at', 'modified_at', 'downloaded_at']
|
||||
if has_crawl_id:
|
||||
cols.append('crawl_id')
|
||||
if has_status:
|
||||
cols.append('status')
|
||||
if has_retry_at:
|
||||
cols.append('retry_at')
|
||||
|
||||
cursor.execute(f"""
|
||||
INSERT OR IGNORE INTO core_snapshot_new ({', '.join(cols)})
|
||||
SELECT {', '.join(cols)}
|
||||
FROM core_snapshot;
|
||||
""")
|
||||
else:
|
||||
# v0.7.2 schema - will get crawl_id assigned by later migration (0024)
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_new (
|
||||
id, created_at, modified_at, bookmarked_at, url, timestamp, crawl_id
|
||||
)
|
||||
SELECT
|
||||
id,
|
||||
COALESCE(added, CURRENT_TIMESTAMP),
|
||||
COALESCE(updated, added, CURRENT_TIMESTAMP),
|
||||
COALESCE(added, CURRENT_TIMESTAMP),
|
||||
url, timestamp,
|
||||
NULL as crawl_id
|
||||
FROM core_snapshot
|
||||
""")
|
||||
print(f'Warning: Unexpected Snapshot schema - has_added={has_added}, has_bookmarked_at={has_bookmarked_at}')
|
||||
|
||||
# Replace old table
|
||||
cursor.execute("DROP TABLE IF EXISTS core_snapshot")
|
||||
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot")
|
||||
cursor.execute("DROP TABLE IF EXISTS core_snapshot;")
|
||||
cursor.execute("ALTER TABLE core_snapshot_new RENAME TO core_snapshot;")
|
||||
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at)")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at)")
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_url_idx ON core_snapshot(url);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_timestamp_idx ON core_snapshot(timestamp);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_status_idx ON core_snapshot(status);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_retry_at_idx ON core_snapshot(retry_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_snapshot_created_at_idx ON core_snapshot(created_at);")
|
||||
cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);")
|
||||
|
||||
# ============================================================================
|
||||
# PART 3: Upgrade core_tag table
|
||||
# ============================================================================
|
||||
# ============================================================================
|
||||
# PART 3: Upgrade core_tag table
|
||||
# ============================================================================
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_tag_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
# Check if tag id is INTEGER (v0.7.2) or TEXT (v0.8.6rc0)
|
||||
name VARCHAR(100) NOT NULL UNIQUE,
|
||||
slug VARCHAR(100) NOT NULL UNIQUE,
|
||||
|
||||
created_by_id INTEGER,
|
||||
|
||||
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE
|
||||
);
|
||||
""")
|
||||
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_tag'")
|
||||
if cursor.fetchone():
|
||||
cursor.execute("""
|
||||
SELECT type FROM pragma_table_info('core_tag') WHERE name='id'
|
||||
""")
|
||||
tag_id_type = cursor.fetchone()[0] if cursor.rowcount else 'INTEGER'
|
||||
tag_id_is_int = 'INT' in tag_id_type.upper()
|
||||
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_tag_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
created_by_id INTEGER,
|
||||
|
||||
name VARCHAR(100) NOT NULL UNIQUE,
|
||||
slug VARCHAR(100) NOT NULL UNIQUE,
|
||||
|
||||
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE SET NULL
|
||||
)
|
||||
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
|
||||
SELECT id, name, slug
|
||||
FROM core_tag;
|
||||
""")
|
||||
|
||||
if tag_id_is_int:
|
||||
# v0.7.2: Direct copy (INTEGER to INTEGER)
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_tag_new (id, name, slug)
|
||||
SELECT id, name, slug FROM core_tag
|
||||
""")
|
||||
else:
|
||||
# v0.8.6rc0: Need to remap TEXT ids to new INTEGER ids
|
||||
cursor.execute("SELECT id, name, slug FROM core_tag")
|
||||
old_tags = cursor.fetchall()
|
||||
tag_id_mapping = {} # old_text_id -> new_int_id
|
||||
cursor.execute("DROP TABLE IF EXISTS core_tag;")
|
||||
cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag;")
|
||||
|
||||
for old_id, name, slug in old_tags:
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_tag_new (name, slug)
|
||||
VALUES (?, ?)
|
||||
""", [name, slug])
|
||||
cursor.execute("SELECT id FROM core_tag_new WHERE slug = ?", [slug])
|
||||
new_id = cursor.fetchone()[0]
|
||||
tag_id_mapping[old_id] = new_id
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_at_idx ON core_tag(created_at);")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_tag_created_by_id_idx ON core_tag(created_by_id);")
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS core_tag")
|
||||
cursor.execute("ALTER TABLE core_tag_new RENAME TO core_tag")
|
||||
|
||||
# Recreate M2M table
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_snapshot_tags_new (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
snapshot_id TEXT NOT NULL,
|
||||
tag_id INTEGER NOT NULL,
|
||||
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (tag_id) REFERENCES core_tag(id) ON DELETE CASCADE,
|
||||
UNIQUE(snapshot_id, tag_id)
|
||||
)
|
||||
""")
|
||||
|
||||
if tag_id_is_int:
|
||||
# Direct copy for v0.7.2
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
|
||||
SELECT snapshot_id, tag_id FROM core_snapshot_tags
|
||||
""")
|
||||
else:
|
||||
# v0.8.6rc0: Use mapping to convert old TEXT ids to new INTEGER ids
|
||||
cursor.execute("SELECT snapshot_id, tag_id FROM core_snapshot_tags")
|
||||
m2m_entries = cursor.fetchall()
|
||||
for snapshot_id, old_tag_id in m2m_entries:
|
||||
new_tag_id = tag_id_mapping.get(old_tag_id)
|
||||
if new_tag_id:
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_tags_new (snapshot_id, tag_id)
|
||||
VALUES (?, ?)
|
||||
""", [snapshot_id, new_tag_id])
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS core_snapshot_tags")
|
||||
cursor.execute("ALTER TABLE core_snapshot_tags_new RENAME TO core_snapshot_tags")
|
||||
print('✓ Core tables upgraded to v0.9.0')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -301,10 +252,49 @@ class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
('core', '0022_auto_20231023_2008'),
|
||||
('crawls', '0001_initial'),
|
||||
('machine', '0001_initial'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunPython(upgrade_from_v072_or_v086, reverse_code=migrations.RunPython.noop),
|
||||
migrations.SeparateDatabaseAndState(
|
||||
database_operations=[
|
||||
migrations.RunPython(
|
||||
upgrade_core_tables,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
# Remove old ArchiveResult fields
|
||||
migrations.RemoveField(model_name='archiveresult', name='extractor'),
|
||||
migrations.RemoveField(model_name='archiveresult', name='output'),
|
||||
# Remove old Snapshot fields
|
||||
migrations.RemoveField(model_name='snapshot', name='added'),
|
||||
migrations.RemoveField(model_name='snapshot', name='updated'),
|
||||
# SnapshotTag table already exists from v0.7.2, just declare it in state
|
||||
migrations.CreateModel(
|
||||
name='SnapshotTag',
|
||||
fields=[
|
||||
('id', models.AutoField(primary_key=True, serialize=False)),
|
||||
('snapshot', models.ForeignKey(to='core.Snapshot', db_column='snapshot_id', on_delete=models.CASCADE)),
|
||||
('tag', models.ForeignKey(to='core.Tag', db_column='tag_id', on_delete=models.CASCADE)),
|
||||
],
|
||||
options={
|
||||
'db_table': 'core_snapshot_tags',
|
||||
'unique_together': {('snapshot', 'tag')},
|
||||
},
|
||||
),
|
||||
# Declare that Snapshot.tags M2M already uses through=SnapshotTag (from v0.7.2)
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(
|
||||
'Tag',
|
||||
blank=True,
|
||||
related_name='snapshot_set',
|
||||
through='SnapshotTag',
|
||||
through_fields=('snapshot', 'tag'),
|
||||
),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Generated by hand on 2025-12-29
|
||||
# Creates a default crawl for v0.7.2 migrated snapshots and makes crawl_id NOT NULL
|
||||
|
||||
from django.db import migrations
|
||||
from django.db import migrations, models
|
||||
import uuid
|
||||
|
||||
|
||||
@@ -56,8 +56,7 @@ class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0023_upgrade_to_0_9_0'),
|
||||
('crawls', '0002_upgrade_to_0_9_0'),
|
||||
('machine', '0001_initial'),
|
||||
('crawls', '0001_initial'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
]
|
||||
|
||||
@@ -66,65 +65,80 @@ class Migration(migrations.Migration):
|
||||
create_default_crawl_and_assign_snapshots,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
# Now make crawl_id NOT NULL
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
-- Rebuild snapshot table with NOT NULL crawl_id
|
||||
CREATE TABLE core_snapshot_final (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
migrations.SeparateDatabaseAndState(
|
||||
database_operations=[
|
||||
# Now make crawl_id NOT NULL
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
-- Rebuild snapshot table with NOT NULL crawl_id
|
||||
CREATE TABLE core_snapshot_final (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
url TEXT NOT NULL,
|
||||
timestamp VARCHAR(32) NOT NULL UNIQUE,
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
url TEXT NOT NULL,
|
||||
timestamp VARCHAR(32) NOT NULL UNIQUE,
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
crawl_id TEXT NOT NULL,
|
||||
parent_snapshot_id TEXT,
|
||||
crawl_id TEXT NOT NULL,
|
||||
parent_snapshot_id TEXT,
|
||||
|
||||
title VARCHAR(512),
|
||||
downloaded_at DATETIME,
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
title VARCHAR(512),
|
||||
downloaded_at DATETIME,
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
current_step INTEGER NOT NULL DEFAULT 0,
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
current_step INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
|
||||
);
|
||||
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (parent_snapshot_id) REFERENCES core_snapshot(id) ON DELETE SET NULL
|
||||
);
|
||||
|
||||
INSERT INTO core_snapshot_final (
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
config, notes, num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
)
|
||||
SELECT
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
FROM core_snapshot;
|
||||
INSERT INTO core_snapshot_final (
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
config, notes, num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
)
|
||||
SELECT
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
FROM core_snapshot;
|
||||
|
||||
DROP TABLE core_snapshot;
|
||||
ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
|
||||
DROP TABLE core_snapshot;
|
||||
ALTER TABLE core_snapshot_final RENAME TO core_snapshot;
|
||||
|
||||
CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
|
||||
CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
|
||||
CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
|
||||
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
|
||||
CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
|
||||
CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
|
||||
CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
|
||||
CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
CREATE INDEX core_snapshot_url_idx ON core_snapshot(url);
|
||||
CREATE INDEX core_snapshot_timestamp_idx ON core_snapshot(timestamp);
|
||||
CREATE INDEX core_snapshot_bookmarked_at_idx ON core_snapshot(bookmarked_at);
|
||||
CREATE INDEX core_snapshot_crawl_id_idx ON core_snapshot(crawl_id);
|
||||
CREATE INDEX core_snapshot_status_idx ON core_snapshot(status);
|
||||
CREATE INDEX core_snapshot_retry_at_idx ON core_snapshot(retry_at);
|
||||
CREATE INDEX core_snapshot_created_at_idx ON core_snapshot(created_at);
|
||||
CREATE UNIQUE INDEX core_snapshot_url_crawl_unique ON core_snapshot(url, crawl_id);
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(
|
||||
on_delete=models.deletion.CASCADE,
|
||||
to='crawls.crawl',
|
||||
help_text='Crawl that created this snapshot'
|
||||
),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -0,0 +1,258 @@
|
||||
# Generated by Django 6.0 on 2025-12-31 23:09
|
||||
|
||||
import archivebox.base_models.models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
import uuid
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0024_assign_default_crawl'),
|
||||
('crawls', '0001_initial'),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterModelOptions(
|
||||
name='archiveresult',
|
||||
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
|
||||
),
|
||||
migrations.AlterModelOptions(
|
||||
name='snapshot',
|
||||
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='cmd',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='cmd_version',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='pwd',
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='config',
|
||||
field=models.JSONField(blank=True, default=dict, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='hook_name',
|
||||
field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='notes',
|
||||
field=models.TextField(blank=True, default=''),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_failed',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_succeeded',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_files',
|
||||
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_json',
|
||||
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_mimetypes',
|
||||
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_size',
|
||||
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_str',
|
||||
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='plugin',
|
||||
field=models.CharField(db_index=True, default='', max_length=32),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='bookmarked_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='config',
|
||||
field=models.JSONField(default=dict),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='current_step',
|
||||
field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='depth',
|
||||
field=models.PositiveSmallIntegerField(db_index=True, default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='notes',
|
||||
field=models.TextField(blank=True, default=''),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='num_uses_failed',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='num_uses_succeeded',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='parent_snapshot',
|
||||
field=models.ForeignKey(blank=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='end_ts',
|
||||
field=models.DateTimeField(blank=True, default=None, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.AutoField(editable=False, primary_key=True, serialize=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='start_ts',
|
||||
field=models.DateTimeField(blank=True, default=None, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='url',
|
||||
field=models.URLField(db_index=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
field=models.SlugField(editable=False, max_length=100, unique=True),
|
||||
),
|
||||
migrations.AddConstraint(
|
||||
model_name='snapshot',
|
||||
constraint=models.UniqueConstraint(fields=('url', 'crawl'), name='unique_url_per_crawl'),
|
||||
),
|
||||
migrations.AddConstraint(
|
||||
model_name='snapshot',
|
||||
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
|
||||
),
|
||||
]
|
||||
@@ -1,484 +0,0 @@
|
||||
# Generated by hand on 2025-12-29
|
||||
# Cleans up extra columns from raw SQL migrations and ensures schema matches models
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
from django.conf import settings
|
||||
import archivebox.base_models.models
|
||||
|
||||
|
||||
def cleanup_extra_columns(apps, schema_editor):
|
||||
"""
|
||||
Create Process records from old cmd/pwd/cmd_version columns and remove those columns.
|
||||
This preserves the execution details by moving them to the Process model.
|
||||
"""
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
# Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0)
|
||||
cursor.execute("SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='cmd'")
|
||||
has_cmd = cursor.fetchone()[0] > 0
|
||||
|
||||
if has_cmd:
|
||||
print(" Migrating cmd/pwd/cmd_version data to Process records...")
|
||||
|
||||
# For each ArchiveResult, create a Process record with cmd/pwd data
|
||||
# Note: cmd_version from old schema is not preserved (it's now derived from Binary)
|
||||
cursor.execute("""
|
||||
SELECT id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status
|
||||
FROM core_archiveresult
|
||||
""")
|
||||
archive_results = cursor.fetchall()
|
||||
|
||||
from archivebox.uuid_compat import uuid7
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
|
||||
# Get or create a Machine record
|
||||
result = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()
|
||||
if result:
|
||||
machine_id = result[0]
|
||||
print(f" Using existing Machine: {machine_id}")
|
||||
else:
|
||||
# Create a minimal Machine record with raw SQL (can't use model during migration)
|
||||
print(" Creating Machine record for Process migration...")
|
||||
import platform
|
||||
import socket
|
||||
|
||||
# Generate minimal machine data without using the model
|
||||
machine_id = str(uuid7())
|
||||
guid = f"{socket.gethostname()}-{platform.machine()}"
|
||||
hostname = socket.gethostname()
|
||||
|
||||
# Check schema version
|
||||
cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='config'")
|
||||
has_config = cursor.fetchone()[0] > 0
|
||||
cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='abid'")
|
||||
has_abid = cursor.fetchone()[0] > 0
|
||||
cursor.execute("SELECT COUNT(*) FROM pragma_table_info('machine_machine') WHERE name='num_uses_succeeded'")
|
||||
has_num_uses = cursor.fetchone()[0] > 0
|
||||
|
||||
# Insert directly with SQL (use INSERT OR IGNORE in case it already exists)
|
||||
if has_config:
|
||||
# v0.9.0+ schema
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO machine_machine (
|
||||
id, created_at, modified_at,
|
||||
guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
|
||||
os_arch, os_family, os_platform, os_release, os_kernel,
|
||||
stats, config
|
||||
) VALUES (?, datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}', '{}')
|
||||
""", (
|
||||
machine_id, guid, hostname,
|
||||
platform.machine(), platform.system(), platform.platform(), platform.release()
|
||||
))
|
||||
elif has_abid and has_num_uses:
|
||||
# v0.8.6rc0 schema (has abid and num_uses columns)
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO machine_machine (
|
||||
id, abid, created_at, modified_at,
|
||||
guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
|
||||
os_arch, os_family, os_platform, os_release, os_kernel,
|
||||
stats, num_uses_failed, num_uses_succeeded
|
||||
) VALUES (?, '', datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}', 0, 0)
|
||||
""", (
|
||||
machine_id, guid, hostname,
|
||||
platform.machine(), platform.system(), platform.platform(), platform.release()
|
||||
))
|
||||
else:
|
||||
# v0.7.2 or other schema
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO machine_machine (
|
||||
id, created_at, modified_at,
|
||||
guid, hostname, hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
|
||||
os_arch, os_family, os_platform, os_release, os_kernel,
|
||||
stats
|
||||
) VALUES (?, datetime('now'), datetime('now'), ?, ?, 0, 0, '', '', '', ?, ?, ?, ?, '', '{}')
|
||||
""", (
|
||||
machine_id, guid, hostname,
|
||||
platform.machine(), platform.system(), platform.platform(), platform.release()
|
||||
))
|
||||
# Re-query to get the actual id (in case INSERT OR IGNORE skipped it)
|
||||
result = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()
|
||||
if result:
|
||||
machine_id = result[0]
|
||||
print(f" ✓ Using/Created Machine: {machine_id}")
|
||||
else:
|
||||
# INSERT OR IGNORE failed - try again without IGNORE to see the error
|
||||
raise Exception("Failed to create Machine record - machine_machine table is empty after INSERT")
|
||||
|
||||
for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
|
||||
# Create Process record
|
||||
process_id = str(uuid7())
|
||||
cursor.execute("""
|
||||
INSERT INTO machine_process (
|
||||
id, created_at, modified_at,
|
||||
machine_id, binary_id, iface_id,
|
||||
pwd, cmd, env, timeout,
|
||||
pid, exit_code, stdout, stderr,
|
||||
started_at, ended_at, url, status, retry_at
|
||||
) VALUES (?, datetime('now'), datetime('now'), ?, ?, ?, ?, ?, '{}', 120, NULL, NULL, '', '', ?, ?, '', ?, NULL)
|
||||
""", (process_id, machine_id, binary_id, iface_id, pwd or '', cmd or '[]', start_ts, end_ts, status or 'queued'))
|
||||
|
||||
# Update ArchiveResult to point to new Process
|
||||
cursor.execute("UPDATE core_archiveresult SET process_id = ? WHERE id = ?", (process_id, ar_id))
|
||||
|
||||
print(f" ✓ Created {len(archive_results)} Process records from ArchiveResult data")
|
||||
|
||||
# Now rebuild table without the extra columns
|
||||
print(" Rebuilding core_archiveresult table...")
|
||||
cursor.execute("""
|
||||
CREATE TABLE core_archiveresult_final (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
uuid TEXT,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
snapshot_id TEXT NOT NULL,
|
||||
plugin VARCHAR(32) NOT NULL DEFAULT '',
|
||||
hook_name VARCHAR(255) NOT NULL DEFAULT '',
|
||||
|
||||
start_ts DATETIME,
|
||||
end_ts DATETIME,
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
|
||||
output_files TEXT NOT NULL DEFAULT '{}',
|
||||
output_json TEXT,
|
||||
output_str TEXT NOT NULL DEFAULT '',
|
||||
output_size INTEGER NOT NULL DEFAULT 0,
|
||||
output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
|
||||
|
||||
config TEXT,
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
process_id TEXT NOT NULL,
|
||||
|
||||
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
|
||||
)
|
||||
""")
|
||||
|
||||
# Copy data (cmd, pwd, etc. are now in Process records)
|
||||
cursor.execute("""
|
||||
INSERT INTO core_archiveresult_final SELECT
|
||||
id, uuid, created_at, modified_at,
|
||||
snapshot_id, plugin, hook_name,
|
||||
start_ts, end_ts, status, retry_at,
|
||||
output_files, output_json, output_str, output_size, output_mimetypes,
|
||||
config, notes, num_uses_succeeded, num_uses_failed,
|
||||
process_id
|
||||
FROM core_archiveresult
|
||||
""")
|
||||
|
||||
# Replace table
|
||||
cursor.execute("DROP TABLE core_archiveresult")
|
||||
cursor.execute("ALTER TABLE core_archiveresult_final RENAME TO core_archiveresult")
|
||||
|
||||
# Recreate indexes
|
||||
cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
|
||||
cursor.execute("CREATE INDEX core_archiveresult_uuid_idx ON core_archiveresult(uuid)")
|
||||
|
||||
print(" ✓ Cleaned up core_archiveresult schema")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0024_assign_default_crawl'),
|
||||
('machine', '0005_add_process_table'),
|
||||
('crawls', '0002_upgrade_to_0_9_0'),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.SeparateDatabaseAndState(
|
||||
database_operations=[
|
||||
migrations.RunPython(
|
||||
cleanup_extra_columns,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
# Tell Django about all the fields that exist after raw SQL migrations
|
||||
# ArchiveResult model options
|
||||
migrations.AlterModelOptions(
|
||||
name='archiveresult',
|
||||
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
|
||||
),
|
||||
|
||||
# Remove old fields
|
||||
migrations.RemoveField(model_name='archiveresult', name='cmd'),
|
||||
migrations.RemoveField(model_name='archiveresult', name='pwd'),
|
||||
migrations.RemoveField(model_name='archiveresult', name='cmd_version'),
|
||||
migrations.RemoveField(model_name='archiveresult', name='extractor'),
|
||||
migrations.RemoveField(model_name='archiveresult', name='output'),
|
||||
migrations.RemoveField(model_name='snapshot', name='added'),
|
||||
migrations.RemoveField(model_name='snapshot', name='updated'),
|
||||
|
||||
# Add new ArchiveResult fields
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='plugin',
|
||||
field=models.CharField(blank=True, default='', max_length=32),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='hook_name',
|
||||
field=models.CharField(blank=True, default='', max_length=255),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_str',
|
||||
field=models.TextField(blank=True, default=''),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_json',
|
||||
field=models.JSONField(blank=True, default=dict, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_files',
|
||||
field=models.JSONField(blank=True, default=dict),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_size',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_mimetypes',
|
||||
field=models.CharField(blank=True, default='', max_length=512),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='config',
|
||||
field=models.JSONField(blank=True, default=dict, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='notes',
|
||||
field=models.TextField(blank=True, default=''),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_succeeded',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_failed',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='process',
|
||||
field=models.OneToOneField(null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
|
||||
),
|
||||
|
||||
# Update Snapshot model
|
||||
migrations.AlterModelOptions(
|
||||
name='snapshot',
|
||||
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='bookmarked_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='depth',
|
||||
field=models.PositiveSmallIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='parent_snapshot',
|
||||
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='current_step',
|
||||
field=models.PositiveSmallIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(default='0.9.0', max_length=10),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='config',
|
||||
field=models.JSONField(blank=True, default=dict),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='notes',
|
||||
field=models.TextField(blank=True, default=''),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='num_uses_succeeded',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='num_uses_failed',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
|
||||
# Update Tag model
|
||||
migrations.AlterModelOptions(
|
||||
name='tag',
|
||||
options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
|
||||
# Alter field types
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, db_index=True, editable=False, null=True, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='end_ts',
|
||||
field=models.DateTimeField(blank=True, default=None, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='start_ts',
|
||||
field=models.DateTimeField(blank=True, default=None, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=15),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.CharField(editable=False, max_length=32, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
field=models.CharField(db_index=True, max_length=32, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='url',
|
||||
field=models.URLField(max_length=2048),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='slug',
|
||||
field=models.SlugField(editable=False, max_length=100, unique=True),
|
||||
),
|
||||
|
||||
# Create M2M model for snapshot tags
|
||||
migrations.CreateModel(
|
||||
name='SnapshotTag',
|
||||
fields=[
|
||||
('id', models.AutoField(primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
|
||||
('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
|
||||
],
|
||||
options={
|
||||
'db_table': 'core_snapshot_tags',
|
||||
},
|
||||
),
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together={('snapshot', 'tag')},
|
||||
),
|
||||
|
||||
# Update tags field on Snapshot to use the through model
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
|
||||
),
|
||||
|
||||
# Add constraints
|
||||
migrations.AddConstraint(
|
||||
model_name='snapshot',
|
||||
constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
|
||||
),
|
||||
migrations.AddConstraint(
|
||||
model_name='snapshot',
|
||||
constraint=models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -1,76 +0,0 @@
|
||||
# Generated by hand on 2025-12-30
|
||||
# Final field adjustments to match model definitions exactly
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0025_cleanup_schema'),
|
||||
('crawls', '0002_upgrade_to_0_9_0'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Alter Snapshot fields to match model exactly
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid7, editable=False, primary_key=True, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='timestamp',
|
||||
field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='url',
|
||||
field=models.URLField(db_index=True, unique=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='parent_snapshot',
|
||||
field=models.ForeignKey(blank=True, db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
|
||||
),
|
||||
|
||||
# Alter SnapshotTag fields
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='id',
|
||||
field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='snapshot',
|
||||
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='tag',
|
||||
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
|
||||
),
|
||||
]
|
||||
@@ -1,108 +0,0 @@
|
||||
# Generated by Django 6.0 on 2025-12-31 09:04
|
||||
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0026_final_field_adjustments'),
|
||||
('crawls', '0002_upgrade_to_0_9_0'),
|
||||
('machine', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='hook_name',
|
||||
field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.AutoField(editable=False, primary_key=True, serialize=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_files',
|
||||
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_json',
|
||||
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_mimetypes',
|
||||
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_size',
|
||||
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_str',
|
||||
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='plugin',
|
||||
field=models.CharField(db_index=True, default='', max_length=32),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='process',
|
||||
field=models.OneToOneField(help_text='Process execution details for this archive result', on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='config',
|
||||
field=models.JSONField(default=dict),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='current_step',
|
||||
field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='depth',
|
||||
field=models.PositiveSmallIntegerField(db_index=True, default=0),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='id',
|
||||
field=models.AutoField(primary_key=True, serialize=False),
|
||||
),
|
||||
]
|
||||
@@ -91,9 +91,9 @@ class Tag(ModelWithSerializers):
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_tag', args=[self.id])
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Tag model instance to a JSONL record.
|
||||
Convert Tag model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
@@ -105,12 +105,12 @@ class Tag(ModelWithSerializers):
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
"""
|
||||
Create/update Tag from JSONL record.
|
||||
Create/update Tag from JSON dict.
|
||||
|
||||
Args:
|
||||
record: JSONL record with 'name' field
|
||||
record: JSON dict with 'name' field
|
||||
overrides: Optional dict with 'snapshot' to auto-attach tag
|
||||
|
||||
Returns:
|
||||
@@ -982,8 +982,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
processes_seen = set()
|
||||
|
||||
with open(index_path, 'w') as f:
|
||||
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
|
||||
f.write(json.dumps(self.to_jsonl()) + '\n')
|
||||
# Write Snapshot record first (to_json includes crawl_id, fs_version)
|
||||
f.write(json.dumps(self.to_json()) + '\n')
|
||||
|
||||
# Write ArchiveResult records with their associated Binary and Process
|
||||
# Use select_related to optimize queries
|
||||
@@ -991,15 +991,15 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# Write Binary record if not already written
|
||||
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
|
||||
binaries_seen.add(ar.process.binary_id)
|
||||
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
|
||||
f.write(json.dumps(ar.process.binary.to_json()) + '\n')
|
||||
|
||||
# Write Process record if not already written
|
||||
if ar.process and ar.process_id not in processes_seen:
|
||||
processes_seen.add(ar.process_id)
|
||||
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
|
||||
f.write(json.dumps(ar.process.to_json()) + '\n')
|
||||
|
||||
# Write ArchiveResult record
|
||||
f.write(json.dumps(ar.to_jsonl()) + '\n')
|
||||
f.write(json.dumps(ar.to_json()) + '\n')
|
||||
|
||||
def read_index_jsonl(self) -> dict:
|
||||
"""
|
||||
@@ -1422,9 +1422,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
return False
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Snapshot model instance to a JSONL record.
|
||||
Convert Snapshot model instance to a JSON-serializable dict.
|
||||
Includes all fields needed to fully reconstruct/identify this snapshot.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
@@ -1445,9 +1445,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
|
||||
"""
|
||||
Create/update Snapshot from JSONL record or dict.
|
||||
Create/update Snapshot from JSON dict.
|
||||
|
||||
Unified method that handles:
|
||||
- ID-based patching: {"id": "...", "title": "new title"}
|
||||
@@ -2106,8 +2106,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
result['canonical'] = self.canonical_outputs()
|
||||
return result
|
||||
|
||||
def to_json(self, indent: int = 4) -> str:
|
||||
"""Convert to JSON string"""
|
||||
def to_json_str(self, indent: int = 4) -> str:
|
||||
"""Convert to JSON string (legacy method, use to_json() for dict)"""
|
||||
return to_json(self.to_dict(extended=True), indent=indent)
|
||||
|
||||
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
|
||||
@@ -2284,14 +2284,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
hook_name = models.CharField(max_length=255, blank=True, default='', db_index=True, help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)')
|
||||
|
||||
# Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
|
||||
# Required - every ArchiveResult must have a Process
|
||||
process = models.OneToOneField(
|
||||
'machine.Process',
|
||||
on_delete=models.PROTECT,
|
||||
null=False, # Required after migration 4
|
||||
related_name='archiveresult',
|
||||
help_text='Process execution details for this archive result'
|
||||
)
|
||||
# Added POST-v0.9.0, will be added in a separate migration
|
||||
# process = models.OneToOneField(
|
||||
# 'machine.Process',
|
||||
# on_delete=models.PROTECT,
|
||||
# null=False,
|
||||
# related_name='archiveresult',
|
||||
# help_text='Process execution details for this archive result'
|
||||
# )
|
||||
|
||||
# New output fields (replacing old 'output' field)
|
||||
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
|
||||
@@ -2326,9 +2326,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
|
||||
return self.snapshot.crawl.created_by
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert ArchiveResult model instance to a JSONL record.
|
||||
Convert ArchiveResult model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
record = {
|
||||
@@ -2360,6 +2360,50 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
record['process_id'] = str(self.process_id)
|
||||
return record
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
"""
|
||||
Create/update ArchiveResult from JSON dict.
|
||||
|
||||
Args:
|
||||
record: JSON dict with 'snapshot_id', 'plugin', etc.
|
||||
overrides: Optional dict of field overrides
|
||||
|
||||
Returns:
|
||||
ArchiveResult instance or None
|
||||
"""
|
||||
snapshot_id = record.get('snapshot_id')
|
||||
plugin = record.get('plugin')
|
||||
|
||||
if not snapshot_id or not plugin:
|
||||
return None
|
||||
|
||||
# Try to get existing by ID first
|
||||
result_id = record.get('id')
|
||||
if result_id:
|
||||
try:
|
||||
return ArchiveResult.objects.get(id=result_id)
|
||||
except ArchiveResult.DoesNotExist:
|
||||
pass
|
||||
|
||||
# Get or create by snapshot_id + plugin
|
||||
try:
|
||||
from archivebox.core.models import Snapshot
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
|
||||
result, _ = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin=plugin,
|
||||
defaults={
|
||||
'hook_name': record.get('hook_name', ''),
|
||||
'status': record.get('status', 'queued'),
|
||||
'output_str': record.get('output_str', ''),
|
||||
}
|
||||
)
|
||||
return result
|
||||
except Snapshot.DoesNotExist:
|
||||
return None
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
|
||||
|
||||
@@ -1,90 +0,0 @@
|
||||
# Generated by hand on 2025-12-29
|
||||
# Upgrades crawls_crawl table from v0.8.6rc0 to v0.9.0 schema
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
def upgrade_crawl_schema_if_needed(apps, schema_editor):
|
||||
"""
|
||||
Upgrade crawls_crawl table if it has the old v0.8.6rc0 schema (no urls column).
|
||||
"""
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
# Check if we need to upgrade (missing urls column means v0.8.6rc0)
|
||||
cursor.execute("""
|
||||
SELECT COUNT(*) FROM pragma_table_info('crawls_crawl') WHERE name='urls'
|
||||
""")
|
||||
has_urls = cursor.fetchone()[0] > 0
|
||||
|
||||
if not has_urls:
|
||||
print(" Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0 schema...")
|
||||
|
||||
# Create new table with v0.9.0 schema
|
||||
cursor.execute("""
|
||||
CREATE TABLE crawls_crawl_new (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL,
|
||||
modified_at DATETIME NOT NULL,
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
urls TEXT NOT NULL DEFAULT '[]',
|
||||
config TEXT,
|
||||
max_depth INTEGER NOT NULL DEFAULT 0,
|
||||
tags_str VARCHAR(1024) NOT NULL DEFAULT '',
|
||||
persona_id TEXT,
|
||||
label VARCHAR(64) NOT NULL DEFAULT '',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
output_dir VARCHAR(512) NOT NULL DEFAULT '',
|
||||
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
created_by_id INTEGER NOT NULL,
|
||||
schedule_id TEXT,
|
||||
|
||||
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
|
||||
)
|
||||
""")
|
||||
|
||||
# Copy data from old table (v0.8.6rc0 schema)
|
||||
cursor.execute("""
|
||||
INSERT INTO crawls_crawl_new (
|
||||
id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
|
||||
urls, config, max_depth, tags_str, status, retry_at, created_by_id, schedule_id
|
||||
)
|
||||
SELECT
|
||||
id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
|
||||
'[]' as urls, config, max_depth, tags_str, status, retry_at, created_by_id,
|
||||
CAST(schedule_id AS TEXT)
|
||||
FROM crawls_crawl
|
||||
""")
|
||||
|
||||
# Replace old table
|
||||
cursor.execute("DROP TABLE crawls_crawl")
|
||||
cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl")
|
||||
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX crawls_crawl_status_idx ON crawls_crawl(status)")
|
||||
cursor.execute("CREATE INDEX crawls_crawl_retry_at_idx ON crawls_crawl(retry_at)")
|
||||
cursor.execute("CREATE INDEX crawls_crawl_created_at_idx ON crawls_crawl(created_at)")
|
||||
cursor.execute("CREATE INDEX crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id)")
|
||||
cursor.execute("CREATE INDEX crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id)")
|
||||
|
||||
print(" ✓ Upgraded crawls_crawl to v0.9.0 schema")
|
||||
else:
|
||||
print(" ✓ crawls_crawl already has v0.9.0 schema")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('crawls', '0001_initial'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunPython(
|
||||
upgrade_crawl_schema_if_needed,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
]
|
||||
@@ -134,9 +134,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_crawl', args=[self.id])
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Crawl model instance to a JSONL record.
|
||||
Convert Crawl model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
@@ -152,9 +152,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: dict, overrides: dict = None):
|
||||
def from_json(record: dict, overrides: dict = None):
|
||||
"""
|
||||
Create or get a Crawl from a JSONL record.
|
||||
Create or get a Crawl from a JSON dict.
|
||||
|
||||
Args:
|
||||
record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'
|
||||
|
||||
@@ -1176,7 +1176,7 @@ def create_model_record(record: Dict[str, Any]) -> Any:
|
||||
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
|
||||
"""
|
||||
Process JSONL records from hook output.
|
||||
Dispatches to Model.from_jsonl() for each record type.
|
||||
Dispatches to Model.from_json() for each record type.
|
||||
|
||||
Args:
|
||||
records: List of JSONL record dicts from result['records']
|
||||
@@ -1201,25 +1201,25 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
|
||||
# Dispatch to appropriate model's from_jsonl() method
|
||||
if record_type == 'Snapshot':
|
||||
from archivebox.core.models import Snapshot
|
||||
obj = Snapshot.from_jsonl(record.copy(), overrides)
|
||||
obj = Snapshot.from_json(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
|
||||
|
||||
elif record_type == 'Tag':
|
||||
from archivebox.core.models import Tag
|
||||
obj = Tag.from_jsonl(record.copy(), overrides)
|
||||
obj = Tag.from_json(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Tag'] = stats.get('Tag', 0) + 1
|
||||
|
||||
elif record_type == 'Binary':
|
||||
from archivebox.machine.models import Binary
|
||||
obj = Binary.from_jsonl(record.copy(), overrides)
|
||||
obj = Binary.from_json(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Binary'] = stats.get('Binary', 0) + 1
|
||||
|
||||
elif record_type == 'Machine':
|
||||
from archivebox.machine.models import Machine
|
||||
obj = Machine.from_jsonl(record.copy(), overrides)
|
||||
obj = Machine.from_json(record.copy(), overrides)
|
||||
if obj:
|
||||
stats['Machine'] = stats.get('Machine', 0) + 1
|
||||
|
||||
|
||||
@@ -100,46 +100,8 @@ class Migration(migrations.Migration):
|
||||
CREATE INDEX IF NOT EXISTS machine_binary_status_idx ON machine_binary(status);
|
||||
CREATE INDEX IF NOT EXISTS machine_binary_retry_at_idx ON machine_binary(retry_at);
|
||||
|
||||
-- Create machine_process table
|
||||
CREATE TABLE IF NOT EXISTS machine_process (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL,
|
||||
modified_at DATETIME NOT NULL,
|
||||
|
||||
machine_id TEXT NOT NULL,
|
||||
binary_id TEXT,
|
||||
iface_id TEXT,
|
||||
|
||||
pwd VARCHAR(512) NOT NULL DEFAULT '',
|
||||
cmd TEXT NOT NULL DEFAULT '[]',
|
||||
env TEXT NOT NULL DEFAULT '{}',
|
||||
timeout INTEGER NOT NULL DEFAULT 120,
|
||||
|
||||
pid INTEGER,
|
||||
exit_code INTEGER,
|
||||
stdout TEXT NOT NULL DEFAULT '',
|
||||
stderr TEXT NOT NULL DEFAULT '',
|
||||
|
||||
started_at DATETIME,
|
||||
ended_at DATETIME,
|
||||
|
||||
url VARCHAR(2048),
|
||||
|
||||
status VARCHAR(16) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
|
||||
FOREIGN KEY (machine_id) REFERENCES machine_machine(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY (binary_id) REFERENCES machine_binary(id) ON DELETE SET NULL,
|
||||
FOREIGN KEY (iface_id) REFERENCES machine_networkinterface(id) ON DELETE SET NULL
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS machine_process_status_idx ON machine_process(status);
|
||||
CREATE INDEX IF NOT EXISTS machine_process_retry_at_idx ON machine_process(retry_at);
|
||||
CREATE INDEX IF NOT EXISTS machine_process_machine_id_idx ON machine_process(machine_id);
|
||||
CREATE INDEX IF NOT EXISTS machine_process_binary_id_idx ON machine_process(binary_id);
|
||||
CREATE INDEX IF NOT EXISTS machine_process_machine_status_retry_idx ON machine_process(machine_id, status, retry_at);
|
||||
""",
|
||||
reverse_sql="""
|
||||
DROP TABLE IF EXISTS machine_process;
|
||||
DROP TABLE IF EXISTS machine_binary;
|
||||
DROP TABLE IF EXISTS machine_networkinterface;
|
||||
DROP TABLE IF EXISTS machine_machine;
|
||||
@@ -167,6 +129,8 @@ class Migration(migrations.Migration):
|
||||
('os_kernel', models.CharField(default=None, max_length=255)),
|
||||
('stats', models.JSONField(blank=True, default=dict, null=True)),
|
||||
('config', models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True)),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
],
|
||||
options={
|
||||
'app_label': 'machine',
|
||||
@@ -189,6 +153,8 @@ class Migration(migrations.Migration):
|
||||
('region', models.CharField(default=None, max_length=63)),
|
||||
('country', models.CharField(default=None, max_length=63)),
|
||||
('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
],
|
||||
options={
|
||||
'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
|
||||
@@ -212,6 +178,8 @@ class Migration(migrations.Migration):
|
||||
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
|
||||
('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
|
||||
('machine', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Binary',
|
||||
@@ -220,43 +188,6 @@ class Migration(migrations.Migration):
|
||||
'app_label': 'machine',
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Process',
|
||||
fields=[
|
||||
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
|
||||
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
|
||||
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
|
||||
('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')),
|
||||
('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)),
|
||||
('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)),
|
||||
('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')),
|
||||
('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')),
|
||||
('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)),
|
||||
('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)),
|
||||
('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)),
|
||||
('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)),
|
||||
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)),
|
||||
('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='process_set', to='machine.machine')),
|
||||
('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.binary')),
|
||||
('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.networkinterface')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Process',
|
||||
'verbose_name_plural': 'Processes',
|
||||
'app_label': 'machine',
|
||||
},
|
||||
),
|
||||
migrations.AddIndex(
|
||||
model_name='process',
|
||||
index=models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_5e3a87_idx'),
|
||||
),
|
||||
migrations.AddIndex(
|
||||
model_name='process',
|
||||
index=models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__7bd19c_idx'),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
45
archivebox/machine/migrations/0002_process.py
Normal file
45
archivebox/machine/migrations/0002_process.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# Generated by Django 6.0 on 2025-12-31 22:54
|
||||
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('machine', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='Process',
|
||||
fields=[
|
||||
('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
|
||||
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
|
||||
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
|
||||
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
|
||||
('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')),
|
||||
('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)),
|
||||
('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)),
|
||||
('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')),
|
||||
('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')),
|
||||
('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)),
|
||||
('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)),
|
||||
('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)),
|
||||
('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)),
|
||||
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)),
|
||||
('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.binary')),
|
||||
('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='process_set', to='machine.networkinterface')),
|
||||
('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='process_set', to='machine.machine')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Process',
|
||||
'verbose_name_plural': 'Processes',
|
||||
'indexes': [models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_5e3a87_idx'), models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__7bd19c_idx')],
|
||||
},
|
||||
),
|
||||
]
|
||||
@@ -1,101 +0,0 @@
|
||||
# Generated on 2025-12-31
|
||||
# Adds parent FK and process_type field to Process model
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('machine', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.SeparateDatabaseAndState(
|
||||
database_operations=[
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
-- Add parent_id FK column to machine_process
|
||||
ALTER TABLE machine_process ADD COLUMN parent_id TEXT REFERENCES machine_process(id) ON DELETE SET NULL;
|
||||
CREATE INDEX IF NOT EXISTS machine_process_parent_id_idx ON machine_process(parent_id);
|
||||
|
||||
-- Add process_type column with default 'binary'
|
||||
ALTER TABLE machine_process ADD COLUMN process_type VARCHAR(16) NOT NULL DEFAULT 'binary';
|
||||
CREATE INDEX IF NOT EXISTS machine_process_process_type_idx ON machine_process(process_type);
|
||||
|
||||
-- Add composite index for parent + status queries
|
||||
CREATE INDEX IF NOT EXISTS machine_process_parent_status_idx ON machine_process(parent_id, status);
|
||||
|
||||
-- Add composite index for machine + pid + started_at (for PID reuse protection)
|
||||
CREATE INDEX IF NOT EXISTS machine_process_machine_pid_started_idx ON machine_process(machine_id, pid, started_at);
|
||||
""",
|
||||
# Migration is irreversible due to SQLite limitations
|
||||
# SQLite doesn't support DROP COLUMN, would require table rebuild
|
||||
reverse_sql=migrations.RunSQL.noop
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
# Add parent FK
|
||||
migrations.AddField(
|
||||
model_name='process',
|
||||
name='parent',
|
||||
field=models.ForeignKey(
|
||||
blank=True,
|
||||
help_text='Parent process that spawned this one',
|
||||
null=True,
|
||||
on_delete=django.db.models.deletion.SET_NULL,
|
||||
related_name='children',
|
||||
to='machine.process',
|
||||
),
|
||||
),
|
||||
# Add process_type field
|
||||
migrations.AddField(
|
||||
model_name='process',
|
||||
name='process_type',
|
||||
field=models.CharField(
|
||||
choices=[
|
||||
('cli', 'CLI Command'),
|
||||
('supervisord', 'Supervisord Daemon'),
|
||||
('orchestrator', 'Orchestrator'),
|
||||
('worker', 'Worker Process'),
|
||||
('hook', 'Hook Script'),
|
||||
('binary', 'Binary Execution'),
|
||||
],
|
||||
default='binary',
|
||||
help_text='Type of process in the execution hierarchy',
|
||||
max_length=16,
|
||||
),
|
||||
),
|
||||
# Add indexes - must match the SQL index names exactly
|
||||
migrations.AddIndex(
|
||||
model_name='process',
|
||||
index=models.Index(
|
||||
fields=['parent'],
|
||||
name='machine_process_parent_id_idx',
|
||||
),
|
||||
),
|
||||
migrations.AddIndex(
|
||||
model_name='process',
|
||||
index=models.Index(
|
||||
fields=['process_type'],
|
||||
name='machine_process_process_type_idx',
|
||||
),
|
||||
),
|
||||
migrations.AddIndex(
|
||||
model_name='process',
|
||||
index=models.Index(
|
||||
fields=['parent', 'status'],
|
||||
name='machine_process_parent_status_idx',
|
||||
),
|
||||
),
|
||||
migrations.AddIndex(
|
||||
model_name='process',
|
||||
index=models.Index(
|
||||
fields=['machine', 'pid', 'started_at'],
|
||||
name='machine_process_machine_pid_started_idx',
|
||||
),
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -82,13 +82,38 @@ class Machine(ModelWithHealthStats):
|
||||
)
|
||||
return _CURRENT_MACHINE
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: dict, overrides: dict = None):
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Update Machine config from JSONL record.
|
||||
Convert Machine model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
'type': 'Machine',
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'guid': self.guid,
|
||||
'hostname': self.hostname,
|
||||
'hw_in_docker': self.hw_in_docker,
|
||||
'hw_in_vm': self.hw_in_vm,
|
||||
'hw_manufacturer': self.hw_manufacturer,
|
||||
'hw_product': self.hw_product,
|
||||
'hw_uuid': self.hw_uuid,
|
||||
'os_arch': self.os_arch,
|
||||
'os_family': self.os_family,
|
||||
'os_platform': self.os_platform,
|
||||
'os_kernel': self.os_kernel,
|
||||
'os_release': self.os_release,
|
||||
'stats': self.stats,
|
||||
'config': self.config or {},
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: dict, overrides: dict = None):
|
||||
"""
|
||||
Update Machine config from JSON dict.
|
||||
|
||||
Args:
|
||||
record: JSONL record with '_method': 'update', 'key': '...', 'value': '...'
|
||||
record: JSON dict with '_method': 'update', 'key': '...', 'value': '...'
|
||||
overrides: Not used
|
||||
|
||||
Returns:
|
||||
@@ -255,9 +280,9 @@ class Binary(ModelWithHealthStats):
|
||||
'is_valid': self.is_valid,
|
||||
}
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Binary model instance to a JSONL record.
|
||||
Convert Binary model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
@@ -274,17 +299,17 @@ class Binary(ModelWithHealthStats):
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: dict, overrides: dict = None):
|
||||
def from_json(record: dict, overrides: dict = None):
|
||||
"""
|
||||
Create/update Binary from JSONL record.
|
||||
Create/update Binary from JSON dict.
|
||||
|
||||
Handles two cases:
|
||||
1. From binaries.jsonl: creates queued binary with name, binproviders, overrides
|
||||
1. From binaries.json: creates queued binary with name, binproviders, overrides
|
||||
2. From hook output: updates binary with abspath, version, sha256, binprovider
|
||||
|
||||
Args:
|
||||
record: JSONL record with 'name' and either:
|
||||
- 'binproviders', 'overrides' (from binaries.jsonl)
|
||||
record: JSON dict with 'name' and either:
|
||||
- 'binproviders', 'overrides' (from binaries.json)
|
||||
- 'abspath', 'version', 'sha256', 'binprovider' (from hook output)
|
||||
overrides: Not used
|
||||
|
||||
@@ -542,7 +567,7 @@ class ProcessManager(models.Manager):
|
||||
return process
|
||||
|
||||
|
||||
class Process(ModelWithHealthStats):
|
||||
class Process(models.Model):
|
||||
"""
|
||||
Tracks a single OS process execution.
|
||||
|
||||
@@ -563,38 +588,11 @@ class Process(ModelWithHealthStats):
|
||||
RUNNING = 'running', 'Running'
|
||||
EXITED = 'exited', 'Exited'
|
||||
|
||||
class TypeChoices(models.TextChoices):
|
||||
CLI = 'cli', 'CLI Command'
|
||||
SUPERVISORD = 'supervisord', 'Supervisord Daemon'
|
||||
ORCHESTRATOR = 'orchestrator', 'Orchestrator'
|
||||
WORKER = 'worker', 'Worker Process'
|
||||
HOOK = 'hook', 'Hook Script'
|
||||
BINARY = 'binary', 'Binary Execution'
|
||||
|
||||
# Primary fields
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
# Parent process FK for hierarchy tracking
|
||||
parent = models.ForeignKey(
|
||||
'self',
|
||||
on_delete=models.SET_NULL,
|
||||
null=True,
|
||||
blank=True,
|
||||
related_name='children',
|
||||
help_text='Parent process that spawned this one'
|
||||
)
|
||||
|
||||
# Process type for distinguishing in hierarchy
|
||||
process_type = models.CharField(
|
||||
max_length=16,
|
||||
choices=TypeChoices.choices,
|
||||
default=TypeChoices.BINARY,
|
||||
db_index=True,
|
||||
help_text='Type of process in the execution hierarchy'
|
||||
)
|
||||
|
||||
# Machine FK - required (every process runs on a machine)
|
||||
machine = models.ForeignKey(
|
||||
Machine,
|
||||
@@ -667,10 +665,6 @@ class Process(ModelWithHealthStats):
|
||||
help_text='When to retry this process'
|
||||
)
|
||||
|
||||
# Health stats
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
|
||||
state_machine_name: str = 'archivebox.machine.models.ProcessMachine'
|
||||
|
||||
objects: ProcessManager = ProcessManager()
|
||||
@@ -682,8 +676,6 @@ class Process(ModelWithHealthStats):
|
||||
indexes = [
|
||||
models.Index(fields=['machine', 'status', 'retry_at']),
|
||||
models.Index(fields=['binary', 'exit_code']),
|
||||
models.Index(fields=['parent', 'status']),
|
||||
models.Index(fields=['machine', 'pid', 'started_at']),
|
||||
]
|
||||
|
||||
def __str__(self) -> str:
|
||||
@@ -716,9 +708,9 @@ class Process(ModelWithHealthStats):
|
||||
return self.archiveresult.hook_name
|
||||
return ''
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Process model instance to a JSONL record.
|
||||
Convert Process model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
record = {
|
||||
@@ -742,6 +734,26 @@ class Process(ModelWithHealthStats):
|
||||
record['timeout'] = self.timeout
|
||||
return record
|
||||
|
||||
@staticmethod
|
||||
def from_json(record: dict, overrides: dict = None):
|
||||
"""
|
||||
Create/update Process from JSON dict.
|
||||
|
||||
Args:
|
||||
record: JSON dict with 'id' or process details
|
||||
overrides: Optional dict of field overrides
|
||||
|
||||
Returns:
|
||||
Process instance or None
|
||||
"""
|
||||
process_id = record.get('id')
|
||||
if process_id:
|
||||
try:
|
||||
return Process.objects.get(id=process_id)
|
||||
except Process.DoesNotExist:
|
||||
pass
|
||||
return None
|
||||
|
||||
def update_and_requeue(self, **kwargs):
|
||||
"""
|
||||
Update process fields and requeue for worker state machine.
|
||||
@@ -1751,17 +1763,12 @@ class ProcessMachine(BaseStateMachine, strict_states=True):
|
||||
@exited.enter
|
||||
def enter_exited(self):
|
||||
"""Process has exited."""
|
||||
success = self.process.exit_code == 0
|
||||
|
||||
self.process.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=Process.StatusChoices.EXITED,
|
||||
ended_at=timezone.now(),
|
||||
)
|
||||
|
||||
# Increment health stats based on exit code
|
||||
self.process.increment_health_stats(success=success)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# State Machine Registration
|
||||
|
||||
@@ -76,7 +76,7 @@ class TestMachineModel(TestCase):
|
||||
self.assertEqual(machine1.guid, machine2.guid)
|
||||
|
||||
def test_machine_from_jsonl_update(self):
|
||||
"""Machine.from_jsonl() should update machine config."""
|
||||
"""Machine.from_json() should update machine config."""
|
||||
Machine.current() # Ensure machine exists
|
||||
record = {
|
||||
'_method': 'update',
|
||||
@@ -84,14 +84,14 @@ class TestMachineModel(TestCase):
|
||||
'value': '/usr/bin/wget',
|
||||
}
|
||||
|
||||
result = Machine.from_jsonl(record)
|
||||
result = Machine.from_json(record)
|
||||
|
||||
self.assertIsNotNone(result)
|
||||
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
|
||||
|
||||
def test_machine_from_jsonl_invalid(self):
|
||||
"""Machine.from_jsonl() should return None for invalid records."""
|
||||
result = Machine.from_jsonl({'invalid': 'record'})
|
||||
"""Machine.from_json() should return None for invalid records."""
|
||||
result = Machine.from_json({'invalid': 'record'})
|
||||
self.assertIsNone(result)
|
||||
|
||||
def test_machine_manager_current(self):
|
||||
@@ -254,14 +254,14 @@ class TestProcessModel(TestCase):
|
||||
self.assertIsNone(process.exit_code)
|
||||
|
||||
def test_process_to_jsonl(self):
|
||||
"""Process.to_jsonl() should serialize correctly."""
|
||||
"""Process.to_json() should serialize correctly."""
|
||||
process = Process.objects.create(
|
||||
machine=self.machine,
|
||||
cmd=['echo', 'hello'],
|
||||
pwd='/tmp',
|
||||
timeout=60,
|
||||
)
|
||||
json_data = process.to_jsonl()
|
||||
json_data = process.to_json()
|
||||
|
||||
self.assertEqual(json_data['type'], 'Process')
|
||||
self.assertEqual(json_data['cmd'], ['echo', 'hello'])
|
||||
|
||||
@@ -271,10 +271,51 @@ async function configure2Captcha() {
|
||||
|
||||
if (result.success) {
|
||||
console.error(`[+] 2captcha configured via ${result.method}`);
|
||||
|
||||
// Verify config was applied by reloading options page and checking form values
|
||||
console.error('[*] Verifying config by reloading options page...');
|
||||
try {
|
||||
await configPage.reload({ waitUntil: 'networkidle0', timeout: 10000 });
|
||||
} catch (e) {
|
||||
console.error(`[*] Reload threw error (may still work): ${e.message}`);
|
||||
}
|
||||
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
// Wait for Config object again
|
||||
await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 });
|
||||
|
||||
// Read back the config using Config.getAll()
|
||||
const verifyConfig = await configPage.evaluate(async () => {
|
||||
if (typeof Config !== 'undefined' && typeof Config.getAll === 'function') {
|
||||
return await Config.getAll();
|
||||
}
|
||||
return null;
|
||||
});
|
||||
|
||||
if (!verifyConfig) {
|
||||
return { success: false, error: 'Could not verify config - Config.getAll() not available' };
|
||||
}
|
||||
|
||||
// Check that API key was actually set
|
||||
const actualApiKey = verifyConfig.apiKey || verifyConfig.api_key;
|
||||
if (!actualApiKey || actualApiKey !== config.apiKey) {
|
||||
console.error(`[!] Config verification FAILED - API key mismatch`);
|
||||
console.error(`[!] Expected: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`);
|
||||
console.error(`[!] Got: ${actualApiKey ? actualApiKey.slice(0, 8) + '...' + actualApiKey.slice(-4) : 'null'}`);
|
||||
return { success: false, error: 'Config verification failed - API key not set correctly' };
|
||||
}
|
||||
|
||||
console.error('[+] Config verified successfully!');
|
||||
console.error(`[+] API Key: ${actualApiKey.slice(0, 8)}...${actualApiKey.slice(-4)}`);
|
||||
console.error(`[+] Plugin Enabled: ${verifyConfig.isPluginEnabled}`);
|
||||
console.error(`[+] Auto Solve Turnstile: ${verifyConfig.autoSolveTurnstile}`);
|
||||
|
||||
fs.writeFileSync(CONFIG_MARKER, JSON.stringify({
|
||||
timestamp: new Date().toISOString(),
|
||||
method: result.method,
|
||||
extensionId: extensionId,
|
||||
verified: true,
|
||||
config: {
|
||||
apiKeySet: !!config.apiKey,
|
||||
isPluginEnabled: config.isPluginEnabled,
|
||||
@@ -284,7 +325,7 @@ async function configure2Captcha() {
|
||||
autoSolveEnabled: true,
|
||||
}
|
||||
}, null, 2));
|
||||
return { success: true, method: result.method };
|
||||
return { success: true, method: result.method, verified: true };
|
||||
}
|
||||
|
||||
return { success: false, error: result.error || 'Config failed' };
|
||||
|
||||
@@ -29,7 +29,7 @@ PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
|
||||
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
|
||||
|
||||
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
|
||||
TEST_URL = 'https://2captcha.com/demo/cloudflare-turnstile'
|
||||
|
||||
|
||||
# Alias for backward compatibility with existing test names
|
||||
@@ -70,8 +70,17 @@ class TestTwoCaptcha:
|
||||
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
|
||||
|
||||
try:
|
||||
exts = json.loads((chrome_dir / 'extensions.json').read_text())
|
||||
assert any(e['name'] == 'twocaptcha' for e in exts), f"Not loaded: {exts}"
|
||||
# Wait for extensions.json to be written
|
||||
extensions_file = chrome_dir / 'extensions.json'
|
||||
for i in range(20):
|
||||
if extensions_file.exists():
|
||||
break
|
||||
time.sleep(0.5)
|
||||
|
||||
assert extensions_file.exists(), f"extensions.json not created. Chrome dir files: {list(chrome_dir.iterdir())}"
|
||||
|
||||
exts = json.loads(extensions_file.read_text())
|
||||
assert any(e['name'] == 'twocaptcha' for e in exts), f"twocaptcha not loaded: {exts}"
|
||||
print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}")
|
||||
finally:
|
||||
kill_chrome(process, chrome_dir)
|
||||
@@ -95,6 +104,14 @@ class TestTwoCaptcha:
|
||||
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
|
||||
|
||||
try:
|
||||
# Wait for extensions.json to be written
|
||||
extensions_file = chrome_dir / 'extensions.json'
|
||||
for i in range(20):
|
||||
if extensions_file.exists():
|
||||
break
|
||||
time.sleep(0.5)
|
||||
assert extensions_file.exists(), f"extensions.json not created"
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'],
|
||||
env=env, timeout=30, capture_output=True, text=True
|
||||
@@ -163,7 +180,34 @@ const puppeteer = require('puppeteer-core');
|
||||
kill_chrome(process, chrome_dir)
|
||||
|
||||
def test_solves_recaptcha(self):
|
||||
"""Extension solves reCAPTCHA on demo page."""
|
||||
"""Extension attempts to solve CAPTCHA on demo page.
|
||||
|
||||
CRITICAL: DO NOT SKIP OR DISABLE THIS TEST EVEN IF IT'S FLAKY!
|
||||
|
||||
This test is INTENTIONALLY left enabled to expose the REAL, ACTUAL flakiness
|
||||
of the 2captcha service and demo page. The test failures you see here are NOT
|
||||
test bugs - they are ACCURATE representations of the real-world reliability
|
||||
of this CAPTCHA solving service.
|
||||
|
||||
If this test is flaky, that's because 2captcha IS FLAKY in production.
|
||||
If this test fails intermittently, that's because 2captcha FAILS INTERMITTENTLY in production.
|
||||
|
||||
NEVER EVER hide real flakiness by disabling tests or adding @pytest.mark.skip.
|
||||
Users NEED to see this failure rate to understand what they're getting into.
|
||||
|
||||
When this test DOES pass, it confirms:
|
||||
- Extension loads and configures correctly
|
||||
- 2captcha API key is accepted
|
||||
- Extension can successfully auto-solve CAPTCHAs
|
||||
- The entire flow works end-to-end
|
||||
|
||||
When it fails (as it often does):
|
||||
- Demo page has JavaScript errors (representing real-world broken sites)
|
||||
- Turnstile tokens expire before solving (representing real-world timing issues)
|
||||
- 2captcha service may be slow/down (representing real-world service issues)
|
||||
|
||||
This is VALUABLE INFORMATION about the service. DO NOT HIDE IT.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
env = setup_test_env(tmpdir)
|
||||
@@ -179,6 +223,14 @@ const puppeteer = require('puppeteer-core');
|
||||
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
|
||||
|
||||
try:
|
||||
# Wait for extensions.json to be written
|
||||
extensions_file = chrome_dir / 'extensions.json'
|
||||
for i in range(20):
|
||||
if extensions_file.exists():
|
||||
break
|
||||
time.sleep(0.5)
|
||||
assert extensions_file.exists(), f"extensions.json not created"
|
||||
|
||||
subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True)
|
||||
|
||||
script = f'''
|
||||
@@ -187,48 +239,97 @@ const puppeteer = require('puppeteer-core');
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
const page = await browser.newPage();
|
||||
|
||||
// Capture console messages from the page (including extension messages)
|
||||
page.on('console', msg => {{
|
||||
const text = msg.text();
|
||||
if (text.includes('2captcha') || text.includes('turnstile') || text.includes('captcha')) {{
|
||||
console.error('[CONSOLE]', text);
|
||||
}}
|
||||
}});
|
||||
|
||||
await page.setViewport({{ width: 1440, height: 900 }});
|
||||
console.error('[*] Loading {TEST_URL}...');
|
||||
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
// Wait for CAPTCHA iframe (minimal wait to avoid token expiration)
|
||||
console.error('[*] Waiting for CAPTCHA iframe...');
|
||||
await page.waitForSelector('iframe', {{ timeout: 30000 }});
|
||||
console.error('[*] CAPTCHA iframe found - extension should auto-solve now');
|
||||
|
||||
// DON'T CLICK - extension should auto-solve since autoSolveTurnstile=True
|
||||
console.error('[*] Waiting for auto-solve (extension configured with autoSolveTurnstile=True)...');
|
||||
|
||||
// Poll for data-state changes with debug output
|
||||
console.error('[*] Waiting for CAPTCHA to be solved (up to 150s)...');
|
||||
const start = Date.now();
|
||||
const maxWait = 90000;
|
||||
let solved = false;
|
||||
let lastState = null;
|
||||
|
||||
while (Date.now() - start < maxWait) {{
|
||||
while (!solved && (Date.now() - start) < 150000) {{
|
||||
const state = await page.evaluate(() => {{
|
||||
const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
|
||||
const solver = document.querySelector('.captcha-solver');
|
||||
return {{
|
||||
solved: resp ? resp.value.length > 0 : false,
|
||||
state: solver?.getAttribute('data-state'),
|
||||
text: solver?.textContent?.trim() || ''
|
||||
text: solver?.textContent?.trim(),
|
||||
classList: solver?.className
|
||||
}};
|
||||
}});
|
||||
const sec = Math.round((Date.now() - start) / 1000);
|
||||
console.error('[*] ' + sec + 's state=' + state.state + ' solved=' + state.solved + ' text=' + state.text.slice(0,30));
|
||||
if (state.solved) {{ console.error('[+] SOLVED!'); break; }}
|
||||
if (state.state === 'error') {{ console.error('[!] ERROR'); break; }}
|
||||
|
||||
if (state.state !== lastState) {{
|
||||
const elapsed = Math.round((Date.now() - start) / 1000);
|
||||
console.error(`[*] State change at ${{elapsed}}s: "${{lastState}}" -> "${{state.state}}" (text: "${{state.text?.slice(0, 50)}}")`);
|
||||
lastState = state.state;
|
||||
}}
|
||||
|
||||
if (state.state === 'solved') {{
|
||||
solved = true;
|
||||
const elapsed = Math.round((Date.now() - start) / 1000);
|
||||
console.error('[+] SOLVED in ' + elapsed + 's!');
|
||||
break;
|
||||
}}
|
||||
|
||||
// Check every 2 seconds
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
}}
|
||||
|
||||
if (!solved) {{
|
||||
const elapsed = Math.round((Date.now() - start) / 1000);
|
||||
const finalState = await page.evaluate(() => {{
|
||||
const solver = document.querySelector('.captcha-solver');
|
||||
return {{
|
||||
state: solver?.getAttribute('data-state'),
|
||||
text: solver?.textContent?.trim(),
|
||||
html: solver?.outerHTML?.slice(0, 200)
|
||||
}};
|
||||
}});
|
||||
console.error(`[!] TIMEOUT after ${{elapsed}}s. Final state: ${{JSON.stringify(finalState)}}`);
|
||||
browser.disconnect();
|
||||
process.exit(1);
|
||||
}}
|
||||
|
||||
const final = await page.evaluate(() => {{
|
||||
const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
|
||||
return {{ solved: resp ? resp.value.length > 0 : false, preview: resp?.value?.slice(0,50) || '' }};
|
||||
const solver = document.querySelector('.captcha-solver');
|
||||
return {{
|
||||
solved: true,
|
||||
state: solver?.getAttribute('data-state'),
|
||||
text: solver?.textContent?.trim()
|
||||
}};
|
||||
}});
|
||||
browser.disconnect();
|
||||
console.log(JSON.stringify(final));
|
||||
}})();
|
||||
'''
|
||||
(tmpdir / 's.js').write_text(script)
|
||||
print("\n[*] Solving CAPTCHA (10-60s)...")
|
||||
r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=120, capture_output=True, text=True)
|
||||
print("\n[*] Solving CAPTCHA (this can take up to 150s for 2captcha API)...")
|
||||
r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=200, capture_output=True, text=True)
|
||||
print(r.stderr)
|
||||
assert r.returncode == 0, f"Failed: {r.stderr}"
|
||||
|
||||
final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1])
|
||||
assert final.get('solved'), f"Not solved: {final}"
|
||||
print(f"[+] SOLVED! {final.get('preview','')[:30]}...")
|
||||
assert final.get('state') == 'solved', f"State not 'solved': {final}"
|
||||
print(f"[+] SUCCESS! CAPTCHA solved: {final.get('text','')[:50]}")
|
||||
finally:
|
||||
kill_chrome(process, chrome_dir)
|
||||
|
||||
|
||||
@@ -265,57 +265,60 @@ class Orchestrator:
|
||||
|
||||
def runloop(self) -> None:
|
||||
"""Main orchestrator loop."""
|
||||
from archivebox.misc.logging import IS_TTY, CONSOLE
|
||||
import sys
|
||||
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn
|
||||
from archivebox.misc.logging import IS_TTY
|
||||
import archivebox.misc.logging as logging_module
|
||||
|
||||
self.on_startup()
|
||||
|
||||
# Enable progress bars only in TTY + foreground mode
|
||||
show_progress = IS_TTY and self.exit_on_idle
|
||||
last_progress_output = ""
|
||||
|
||||
progress = Progress(
|
||||
TextColumn("[cyan]{task.description}"),
|
||||
BarColumn(bar_width=40),
|
||||
TaskProgressColumn(),
|
||||
transient=False,
|
||||
) if show_progress else None
|
||||
|
||||
task_ids = {} # snapshot_id -> task_id
|
||||
|
||||
# Replace global CONSOLE with progress.console when active
|
||||
original_console = logging_module.CONSOLE
|
||||
original_stderr = logging_module.STDERR
|
||||
|
||||
try:
|
||||
if progress:
|
||||
progress.start()
|
||||
# Redirect all logging through progress.console
|
||||
logging_module.CONSOLE = progress.console
|
||||
logging_module.STDERR = progress.console
|
||||
|
||||
while True:
|
||||
# Check queues and spawn workers
|
||||
queue_sizes = self.check_queues_and_spawn_workers()
|
||||
|
||||
# Update progress bars (simple inline update)
|
||||
if show_progress:
|
||||
# Update progress bars
|
||||
if progress:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
active_snapshots = list(Snapshot.objects.filter(status='started').iterator(chunk_size=100))
|
||||
active_snapshots = Snapshot.objects.filter(status='started').iterator(chunk_size=100)
|
||||
|
||||
if active_snapshots:
|
||||
# Build progress string
|
||||
progress_lines = []
|
||||
for snapshot in active_snapshots[:5]: # Limit to 5 snapshots
|
||||
total = snapshot.archiveresult_set.count()
|
||||
if total == 0:
|
||||
continue
|
||||
for snapshot in active_snapshots:
|
||||
total = snapshot.archiveresult_set.count()
|
||||
if total == 0:
|
||||
continue
|
||||
|
||||
completed = snapshot.archiveresult_set.filter(
|
||||
status__in=['succeeded', 'skipped', 'failed']
|
||||
).count()
|
||||
completed = snapshot.archiveresult_set.filter(
|
||||
status__in=['succeeded', 'skipped', 'failed']
|
||||
).count()
|
||||
|
||||
percentage = (completed / total) * 100
|
||||
bar_width = 30
|
||||
filled = int(bar_width * completed / total)
|
||||
bar = '█' * filled + '░' * (bar_width - filled)
|
||||
# Create or update task
|
||||
if snapshot.id not in task_ids:
|
||||
url = snapshot.url[:60] + '...' if len(snapshot.url) > 60 else snapshot.url
|
||||
task_ids[snapshot.id] = progress.add_task(url, total=total)
|
||||
|
||||
url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
|
||||
progress_lines.append(f"{url} {bar} {percentage:>3.0f}%")
|
||||
|
||||
progress_output = "\n".join(progress_lines)
|
||||
|
||||
# Only update if changed
|
||||
if progress_output != last_progress_output:
|
||||
# Clear previous lines and print new ones
|
||||
if last_progress_output:
|
||||
num_lines = last_progress_output.count('\n') + 1
|
||||
sys.stderr.write(f"\r\033[{num_lines}A\033[J")
|
||||
sys.stderr.write(progress_output + "\n")
|
||||
sys.stderr.flush()
|
||||
last_progress_output = progress_output
|
||||
progress.update(task_ids[snapshot.id], completed=completed)
|
||||
|
||||
# Track idle state
|
||||
if self.has_pending_work(queue_sizes) or self.has_running_workers():
|
||||
@@ -327,12 +330,6 @@ class Orchestrator:
|
||||
|
||||
# Check if we should exit
|
||||
if self.should_exit(queue_sizes):
|
||||
# Clear progress lines
|
||||
if show_progress and last_progress_output:
|
||||
num_lines = last_progress_output.count('\n') + 1
|
||||
sys.stderr.write(f"\r\033[{num_lines}A\033[J")
|
||||
sys.stderr.flush()
|
||||
|
||||
log_worker_event(
|
||||
worker_type='Orchestrator',
|
||||
event='All work complete',
|
||||
@@ -350,6 +347,12 @@ class Orchestrator:
|
||||
raise
|
||||
else:
|
||||
self.on_shutdown()
|
||||
finally:
|
||||
if progress:
|
||||
# Restore original consoles
|
||||
logging_module.CONSOLE = original_console
|
||||
logging_module.STDERR = original_stderr
|
||||
progress.stop()
|
||||
|
||||
def start(self) -> int:
|
||||
"""
|
||||
|
||||
@@ -67,7 +67,7 @@ for test_dir in $TEST_DIRS; do
|
||||
|
||||
echo -e "${YELLOW}[RUNNING]${NC} $plugin_name"
|
||||
|
||||
if python -m pytest "$test_dir" -v --tb=short 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
|
||||
if python -m pytest "$test_dir" -p no:django -v --tb=short 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
|
||||
echo -e "${GREEN}[PASSED]${NC} $plugin_name"
|
||||
PASSED_PLUGINS=$((PASSED_PLUGINS + 1))
|
||||
else
|
||||
|
||||
Reference in New Issue
Block a user