mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
cleanup migrations, json, jsonl
This commit is contained in:
@@ -1,90 +0,0 @@
|
||||
# Generated by hand on 2025-12-29
|
||||
# Upgrades crawls_crawl table from v0.8.6rc0 to v0.9.0 schema
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
def upgrade_crawl_schema_if_needed(apps, schema_editor):
    """
    Upgrade crawls_crawl table if it has the old v0.8.6rc0 schema (no urls column).

    When the ``urls`` column is missing the table is rebuilt: a scratch table
    with the v0.9.0 columns is created, existing rows are copied across
    (``urls`` is filled with ``'[]'`` and ``schedule_id`` is cast to TEXT;
    columns not named in the copy take their declared defaults), the old table
    is dropped, the scratch table is renamed into place and its indexes are
    created. When ``urls`` is already present nothing is changed.

    Args:
        apps: App registry passed in by ``RunPython``; not used here.
        schema_editor: Schema editor whose ``connection`` executes the raw SQL.
            The check relies on ``pragma_table_info``, which is SQLite-specific.
    """
    with schema_editor.connection.cursor() as cursor:
        # Check if we need to upgrade (missing urls column means v0.8.6rc0)
        cursor.execute("""
            SELECT COUNT(*) FROM pragma_table_info('crawls_crawl') WHERE name='urls'
        """)
        if cursor.fetchone()[0] > 0:
            print(" ✓ crawls_crawl already has v0.9.0 schema")
            return

        print(" Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0 schema...")

        # A scratch table left behind by an earlier, interrupted attempt would
        # make the CREATE TABLE below fail, so clear it out first.
        cursor.execute("DROP TABLE IF EXISTS crawls_crawl_new")

        # Create new table with v0.9.0 schema
        cursor.execute("""
            CREATE TABLE crawls_crawl_new (
                id TEXT PRIMARY KEY NOT NULL,
                created_at DATETIME NOT NULL,
                modified_at DATETIME NOT NULL,
                num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                num_uses_failed INTEGER NOT NULL DEFAULT 0,

                urls TEXT NOT NULL DEFAULT '[]',
                config TEXT,
                max_depth INTEGER NOT NULL DEFAULT 0,
                tags_str VARCHAR(1024) NOT NULL DEFAULT '',
                persona_id TEXT,
                label VARCHAR(64) NOT NULL DEFAULT '',
                notes TEXT NOT NULL DEFAULT '',
                output_dir VARCHAR(512) NOT NULL DEFAULT '',

                status VARCHAR(15) NOT NULL DEFAULT 'queued',
                retry_at DATETIME,
                created_by_id INTEGER NOT NULL,
                schedule_id TEXT,

                FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
                FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
            )
        """)

        # Copy data from old table (v0.8.6rc0 schema)
        cursor.execute("""
            INSERT INTO crawls_crawl_new (
                id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
                urls, config, max_depth, tags_str, status, retry_at, created_by_id, schedule_id
            )
            SELECT
                id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
                '[]' as urls, config, max_depth, tags_str, status, retry_at, created_by_id,
                CAST(schedule_id AS TEXT)
            FROM crawls_crawl
        """)

        # Replace old table
        cursor.execute("DROP TABLE crawls_crawl")
        cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl")

        # Create indexes (only after the rename, so they attach to the final table name)
        for column in ("status", "retry_at", "created_at", "created_by_id", "schedule_id"):
            cursor.execute(f"CREATE INDEX crawls_crawl_{column}_idx ON crawls_crawl({column})")

        print(" ✓ Upgraded crawls_crawl to v0.9.0 schema")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Run the conditional crawls_crawl schema upgrade; reversing it is a no-op."""

    dependencies = [
        ('crawls', '0001_initial'),
        # The rebuilt table declares a foreign key to auth_user.
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        # The forward step inspects the live table and only rebuilds it when
        # the urls column is missing; nothing is undone on reverse.
        migrations.RunPython(upgrade_crawl_schema_if_needed, reverse_code=migrations.RunPython.noop),
    ]
|
||||
@@ -134,9 +134,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_crawl', args=[self.id])
|
||||
|
||||
def to_jsonl(self) -> dict:
|
||||
def to_json(self) -> dict:
|
||||
"""
|
||||
Convert Crawl model instance to a JSONL record.
|
||||
Convert Crawl model instance to a JSON-serializable dict.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
@@ -152,9 +152,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: dict, overrides: dict = None):
|
||||
def from_json(record: dict, overrides: dict = None):
|
||||
"""
|
||||
Create or get a Crawl from a JSONL record.
|
||||
Create or get a Crawl from a JSON dict.
|
||||
|
||||
Args:
|
||||
record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'
|
||||
|
||||
Reference in New Issue
Block a user