cleanup migrations, json, jsonl

This commit is contained in:
Nick Sweeting
2025-12-31 15:36:13 -08:00
parent 0930911a15
commit a04e4a7345
21 changed files with 993 additions and 1418 deletions

View File

@@ -1,90 +0,0 @@
# Generated by hand on 2025-12-29
# Upgrades crawls_crawl table from v0.8.6rc0 to v0.9.0 schema
from django.db import migrations
def upgrade_crawl_schema_if_needed(apps, schema_editor):
"""
Upgrade crawls_crawl table if it has the old v0.8.6rc0 schema (no urls column).
"""
with schema_editor.connection.cursor() as cursor:
# Check if we need to upgrade (missing urls column means v0.8.6rc0)
cursor.execute("""
SELECT COUNT(*) FROM pragma_table_info('crawls_crawl') WHERE name='urls'
""")
has_urls = cursor.fetchone()[0] > 0
if not has_urls:
print(" Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0 schema...")
# Create new table with v0.9.0 schema
cursor.execute("""
CREATE TABLE crawls_crawl_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
urls TEXT NOT NULL DEFAULT '[]',
config TEXT,
max_depth INTEGER NOT NULL DEFAULT 0,
tags_str VARCHAR(1024) NOT NULL DEFAULT '',
persona_id TEXT,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
output_dir VARCHAR(512) NOT NULL DEFAULT '',
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
created_by_id INTEGER NOT NULL,
schedule_id TEXT,
FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
)
""")
# Copy data from old table (v0.8.6rc0 schema)
cursor.execute("""
INSERT INTO crawls_crawl_new (
id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
urls, config, max_depth, tags_str, status, retry_at, created_by_id, schedule_id
)
SELECT
id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
'[]' as urls, config, max_depth, tags_str, status, retry_at, created_by_id,
CAST(schedule_id AS TEXT)
FROM crawls_crawl
""")
# Replace old table
cursor.execute("DROP TABLE crawls_crawl")
cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl")
# Create indexes
cursor.execute("CREATE INDEX crawls_crawl_status_idx ON crawls_crawl(status)")
cursor.execute("CREATE INDEX crawls_crawl_retry_at_idx ON crawls_crawl(retry_at)")
cursor.execute("CREATE INDEX crawls_crawl_created_at_idx ON crawls_crawl(created_at)")
cursor.execute("CREATE INDEX crawls_crawl_created_by_id_idx ON crawls_crawl(created_by_id)")
cursor.execute("CREATE INDEX crawls_crawl_schedule_id_idx ON crawls_crawl(schedule_id)")
print(" ✓ Upgraded crawls_crawl to v0.9.0 schema")
else:
print(" ✓ crawls_crawl already has v0.9.0 schema")
class Migration(migrations.Migration):
    """In-place, idempotent upgrade of crawls_crawl to the v0.9.0 schema."""

    dependencies = [
        ('crawls', '0001_initial'),
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        # Forward pass rebuilds the table only when needed; reversing is a no-op
        # because the upgrade check makes re-running forward safe.
        migrations.RunPython(
            code=upgrade_crawl_schema_if_needed,
            reverse_code=migrations.RunPython.noop,
        ),
    ]

View File

@@ -134,9 +134,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def api_url(self) -> str:
return reverse_lazy('api-1:get_crawl', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Crawl model instance to a JSONL record.
Convert Crawl model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
@@ -152,9 +152,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
}
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
def from_json(record: dict, overrides: dict = None):
"""
Create or get a Crawl from a JSONL record.
Create or get a Crawl from a JSON dict.
Args:
record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'