mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
use full dotted paths for all archivebox imports, add migrations and more fixes
This commit is contained in:
@@ -5,8 +5,12 @@ from django.apps import AppConfig
|
||||
|
||||
class CoreConfig(AppConfig):
    # Django AppConfig for the main archivebox.core app.
    name = 'archivebox.core'
    label = 'core'

    def ready(self):
        """Register the archivebox.core.admin_site as the main django admin site"""
        # Imports are deferred to ready() so the app registry is fully
        # populated before admin_site/models pull in model classes.
        from archivebox.core.admin_site import register_admin_site
        register_admin_site()

        # Import models to register state machines with the registry
        # (import is needed only for its side effects, hence the noqa)
        from archivebox.core import models  # noqa: F401
|
||||
|
||||
57
archivebox/core/migrations/0024_b_clear_config_fields.py
Normal file
57
archivebox/core/migrations/0024_b_clear_config_fields.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# Data migration to clear config fields that may contain invalid JSON
|
||||
# This runs before 0025 to prevent CHECK constraint failures
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
def clear_config_fields(apps, schema_editor):
|
||||
"""Clear all config fields in related tables to avoid JSON validation errors."""
|
||||
db_alias = schema_editor.connection.alias
|
||||
|
||||
# Disable foreign key checks temporarily to allow updates
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
cursor.execute("PRAGMA foreign_keys=OFF")
|
||||
|
||||
tables_to_clear = [
|
||||
('crawls_seed', 'config'),
|
||||
('crawls_crawl', 'config'),
|
||||
('crawls_crawlschedule', 'config') if 'crawlschedule' in dir() else None,
|
||||
('machine_machine', 'stats'),
|
||||
('machine_machine', 'config'),
|
||||
]
|
||||
|
||||
for table_info in tables_to_clear:
|
||||
if table_info is None:
|
||||
continue
|
||||
table_name, field_name = table_info
|
||||
|
||||
try:
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
# Check if table exists first
|
||||
cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
|
||||
if not cursor.fetchone():
|
||||
print(f" Skipping {table_name}.{field_name}: table does not exist")
|
||||
continue
|
||||
|
||||
# Set all to empty JSON object
|
||||
cursor.execute(f"UPDATE {table_name} SET {field_name} = '{{}}' WHERE {field_name} IS NOT NULL")
|
||||
print(f" Cleared {field_name} in {table_name}: {cursor.rowcount} rows")
|
||||
except Exception as e:
|
||||
print(f" Skipping {table_name}.{field_name}: {e}")
|
||||
|
||||
# Re-enable foreign key checks
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
cursor.execute("PRAGMA foreign_keys=ON")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Run the config-clearing data migration ahead of 0025's schema changes."""

    dependencies = [
        ('core', '0023_new_schema'),
        ('crawls', '0001_initial'),
        ('machine', '0001_squashed'),
    ]

    operations = [
        # Forward: blank out legacy config fields; reverse: nothing to undo.
        migrations.RunPython(
            clear_config_fields,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
|
||||
28
archivebox/core/migrations/0024_c_disable_fk_checks.py
Normal file
28
archivebox/core/migrations/0024_c_disable_fk_checks.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# Disable foreign key checks before 0025 to prevent CHECK constraint validation errors
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
def disable_fk_checks(apps, schema_editor):
    """Temporarily disable foreign key checks."""
    connection = schema_editor.connection
    with connection.cursor() as cursor:
        # SQLite-specific: turn off FK enforcement for subsequent migrations
        cursor.execute("PRAGMA foreign_keys=OFF")
        print(" Disabled foreign key checks")
|
||||
|
||||
|
||||
def enable_fk_checks(apps, schema_editor):
    """Re-enable foreign key checks."""
    connection = schema_editor.connection
    with connection.cursor() as cursor:
        # SQLite-specific: restore FK enforcement after migrations complete
        cursor.execute("PRAGMA foreign_keys=ON")
        print(" Enabled foreign key checks")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Disable FK enforcement before 0025's table rebuilds; re-enable on reverse."""

    dependencies = [
        ('core', '0024_b_clear_config_fields'),
    ]

    operations = [
        migrations.RunPython(
            disable_fk_checks,
            reverse_code=enable_fk_checks,
        ),
    ]
|
||||
93
archivebox/core/migrations/0024_d_fix_crawls_config.py
Normal file
93
archivebox/core/migrations/0024_d_fix_crawls_config.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# Fix crawls_crawl config field to avoid CHECK constraint errors during table rebuilds
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
def fix_crawls_config(apps, schema_editor):
    """
    Rebuild crawls_crawl table to fix CHECK constraints and make seed_id nullable.
    Only runs for UPGRADES from 0.8.x (when crawls.0001_initial didn't exist yet).
    For fresh installs, crawls.0001_initial creates the correct schema.
    """
    with schema_editor.connection.cursor() as cursor:
        # Check if this is an upgrade from old 0.8.x or a fresh install
        # In fresh installs, crawls.0001_initial was applied, creating seed FK
        # In upgrades, the table was created by old migrations before 0001_initial existed
        cursor.execute("""
            SELECT COUNT(*) FROM django_migrations
            WHERE app='crawls' AND name='0001_initial'
        """)
        has_crawls_0001 = cursor.fetchone()[0] > 0

        if has_crawls_0001:
            # Fresh install - crawls.0001_initial already created the correct schema
            # Just clear config to avoid CHECK constraint issues
            print(" Fresh install detected - clearing config field only")
            try:
                cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
            except Exception as e:
                # Best-effort: don't abort the whole migration if the table is
                # missing or locked in some edge case
                print(f" Skipping config clear: {e}")
            return

        # Upgrade from 0.8.x - rebuild table to make seed_id nullable and remove CHECK constraint
        print(" Upgrading from 0.8.x - rebuilding crawls_crawl table")
        # NOTE(review): FK enforcement is switched off here but never switched
        # back on inside this function - presumably a later migration restores
        # it (0024_c manages the same PRAGMA); confirm.
        cursor.execute("PRAGMA foreign_keys=OFF")

        # Backup
        cursor.execute("CREATE TABLE crawls_crawl_backup AS SELECT * FROM crawls_crawl")

        # Recreate without config CHECK constraint, with nullable seed_id
        # (SQLite cannot alter constraints in place, so drop + recreate + copy)
        cursor.execute("DROP TABLE crawls_crawl")
        cursor.execute("""
            CREATE TABLE "crawls_crawl" (
                "num_uses_failed" integer unsigned NOT NULL CHECK ("num_uses_failed" >= 0),
                "num_uses_succeeded" integer unsigned NOT NULL CHECK ("num_uses_succeeded" >= 0),
                "id" char(32) NOT NULL PRIMARY KEY,
                "created_at" datetime NOT NULL,
                "modified_at" datetime NOT NULL,
                "urls" text NOT NULL,
                "config" text,
                "max_depth" smallint unsigned NOT NULL CHECK ("max_depth" >= 0),
                "tags_str" varchar(1024) NOT NULL,
                "persona_id" char(32) NULL,
                "label" varchar(64) NOT NULL,
                "notes" text NOT NULL,
                "output_dir" varchar(512) NOT NULL,
                "status" varchar(15) NOT NULL,
                "retry_at" datetime NULL,
                "created_by_id" integer NOT NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED,
                "seed_id" char(32) NULL DEFAULT NULL,
                "schedule_id" char(32) NULL REFERENCES "crawls_crawlschedule" ("id") DEFERRABLE INITIALLY DEFERRED
            )
        """)

        # Restore data (explicit column lists keep the copy stable even if
        # column order ever differs between old and new schemas)
        cursor.execute("""
            INSERT INTO "crawls_crawl" (
                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
            )
            SELECT
                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
            FROM crawls_crawl_backup
        """)

        cursor.execute("DROP TABLE crawls_crawl_backup")

        # NULL out config to avoid any invalid JSON
        cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Repair the crawls_crawl schema for databases upgraded from 0.8.x."""

    dependencies = [
        ('core', '0024_c_disable_fk_checks'),
        ('crawls', '0001_initial'),
    ]

    operations = [
        # Forward-only data/schema fixup; nothing sensible to reverse.
        migrations.RunPython(
            fix_crawls_config,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
|
||||
@@ -8,9 +8,7 @@ import django.db.models.deletion
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0023_new_schema'),
|
||||
('crawls', '0001_initial'),
|
||||
('machine', '0001_squashed'),
|
||||
('core', '0024_d_fix_crawls_config'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
|
||||
@@ -10,6 +10,13 @@ from django.db import migrations, models
|
||||
|
||||
def populate_archiveresult_uuids(apps, schema_editor):
|
||||
"""Generate unique UUIDs for ArchiveResults that don't have one."""
|
||||
# Check if uuid column exists before trying to populate it
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
columns = [row[1] for row in cursor.fetchall()]
|
||||
if 'uuid' not in columns:
|
||||
return # uuid column doesn't exist, skip this data migration
|
||||
|
||||
ArchiveResult = apps.get_model('core', 'ArchiveResult')
|
||||
for result in ArchiveResult.objects.filter(uuid__isnull=True):
|
||||
result.uuid = uuid_compat.uuid7()
|
||||
@@ -21,6 +28,22 @@ def reverse_populate_uuids(apps, schema_editor):
|
||||
pass
|
||||
|
||||
|
||||
def remove_output_dir_if_exists(apps, schema_editor):
    """Remove output_dir columns if they exist."""
    with schema_editor.connection.cursor() as cursor:
        # Same check-then-drop dance for both tables, so loop instead of
        # repeating the stanza per table.
        for table in ('core_archiveresult', 'core_snapshot'):
            cursor.execute(f"PRAGMA table_info({table})")
            existing_columns = {row[1] for row in cursor.fetchall()}
            if 'output_dir' in existing_columns:
                cursor.execute(f"ALTER TABLE {table} DROP COLUMN output_dir")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
@@ -33,82 +56,90 @@ class Migration(migrations.Migration):
|
||||
migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids),
|
||||
|
||||
# Remove output_dir fields (not needed, computed from snapshot)
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='output_dir',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='snapshot',
|
||||
name='output_dir',
|
||||
migrations.RunPython(remove_output_dir_if_exists, reverse_code=migrations.RunPython.noop),
|
||||
|
||||
# Update Django's migration state to match 0.9.x schema
|
||||
# Database already has correct types from 0.8.x, just update state
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
# Archiveresult field alterations
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
field=models.CharField(db_index=True, max_length=32),
|
||||
),
|
||||
# Convert id from AutoField to UUIDField (database already has UUID CHAR(32))
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
|
||||
),
|
||||
|
||||
# Snapshot field alterations
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='bookmarked_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
# No actual database changes needed - schema is already correct from 0.8.x
|
||||
],
|
||||
),
|
||||
|
||||
# Archiveresult field alterations
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
field=models.CharField(db_index=True, max_length=32),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.AutoField(editable=False, primary_key=True, serialize=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
|
||||
),
|
||||
|
||||
# Snapshot field alterations
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='bookmarked_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
|
||||
# SnapshotTag and Tag alterations
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='id',
|
||||
field=models.AutoField(primary_key=True, serialize=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together={('snapshot', 'tag')},
|
||||
# SnapshotTag and Tag alterations - state only, DB already correct
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='id',
|
||||
field=models.AutoField(primary_key=True, serialize=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together={('snapshot', 'tag')},
|
||||
),
|
||||
],
|
||||
database_operations=[],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -13,68 +13,79 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Add new output fields (keep old 'output' temporarily for migration)
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_str',
|
||||
field=models.TextField(
|
||||
blank=True,
|
||||
default='',
|
||||
help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
|
||||
),
|
||||
),
|
||||
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_json',
|
||||
field=models.JSONField(
|
||||
null=True,
|
||||
blank=True,
|
||||
default=None,
|
||||
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
|
||||
),
|
||||
),
|
||||
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_files',
|
||||
field=models.JSONField(
|
||||
default=dict,
|
||||
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
|
||||
),
|
||||
),
|
||||
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_size',
|
||||
field=models.BigIntegerField(
|
||||
default=0,
|
||||
help_text='Total recursive size in bytes of all output files'
|
||||
),
|
||||
),
|
||||
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_mimetypes',
|
||||
field=models.CharField(
|
||||
max_length=512,
|
||||
blank=True,
|
||||
default='',
|
||||
help_text='CSV of mimetypes sorted by size descending'
|
||||
),
|
||||
),
|
||||
|
||||
# Add binary FK (optional)
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='binary',
|
||||
field=models.ForeignKey(
|
||||
'machine.Binary',
|
||||
on_delete=models.SET_NULL,
|
||||
null=True,
|
||||
blank=True,
|
||||
related_name='archiveresults',
|
||||
help_text='Primary binary used by this hook (optional)'
|
||||
),
|
||||
# Add new output fields using SeparateDatabaseAndState to avoid table rebuilds
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_str',
|
||||
field=models.TextField(
|
||||
blank=True,
|
||||
default='',
|
||||
help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_json',
|
||||
field=models.JSONField(
|
||||
null=True,
|
||||
blank=True,
|
||||
default=None,
|
||||
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_files',
|
||||
field=models.JSONField(
|
||||
default=dict,
|
||||
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_size',
|
||||
field=models.BigIntegerField(
|
||||
default=0,
|
||||
help_text='Total recursive size in bytes of all output files'
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='output_mimetypes',
|
||||
field=models.CharField(
|
||||
max_length=512,
|
||||
blank=True,
|
||||
default='',
|
||||
help_text='CSV of mimetypes sorted by size descending'
|
||||
),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='binary',
|
||||
field=models.ForeignKey(
|
||||
'machine.Binary',
|
||||
on_delete=models.SET_NULL,
|
||||
null=True,
|
||||
blank=True,
|
||||
related_name='archiveresults',
|
||||
help_text='Primary binary used by this hook (optional)'
|
||||
),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
ALTER TABLE core_archiveresult ADD COLUMN output_str TEXT DEFAULT '';
|
||||
ALTER TABLE core_archiveresult ADD COLUMN output_json TEXT;
|
||||
ALTER TABLE core_archiveresult ADD COLUMN output_files TEXT DEFAULT '{}';
|
||||
ALTER TABLE core_archiveresult ADD COLUMN output_size BIGINT DEFAULT 0;
|
||||
ALTER TABLE core_archiveresult ADD COLUMN output_mimetypes VARCHAR(512) DEFAULT '';
|
||||
ALTER TABLE core_archiveresult ADD COLUMN binary_id CHAR(32) REFERENCES machine_binary(id);
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -12,27 +12,46 @@ def migrate_output_field(apps, schema_editor):
|
||||
Logic:
|
||||
- If output contains JSON {...}, move to output_json
|
||||
- Otherwise, move to output_str
|
||||
|
||||
Use raw SQL to avoid CHECK constraint issues during migration.
|
||||
"""
|
||||
ArchiveResult = apps.get_model('core', 'ArchiveResult')
|
||||
# Use raw SQL to migrate data without triggering CHECK constraints
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
# Get all archive results
|
||||
cursor.execute("""
|
||||
SELECT id, output FROM core_archiveresult
|
||||
""")
|
||||
|
||||
for ar in ArchiveResult.objects.all().iterator():
|
||||
old_output = ar.output or ''
|
||||
for row in cursor.fetchall():
|
||||
ar_id, old_output = row
|
||||
old_output = old_output or ''
|
||||
|
||||
# Case 1: JSON output
|
||||
if old_output.strip().startswith('{'):
|
||||
try:
|
||||
parsed = json.loads(old_output)
|
||||
ar.output_json = parsed
|
||||
ar.output_str = ''
|
||||
except json.JSONDecodeError:
|
||||
# Not valid JSON, treat as string
|
||||
ar.output_str = old_output
|
||||
|
||||
# Case 2: File path or plain string
|
||||
else:
|
||||
ar.output_str = old_output
|
||||
|
||||
ar.save(update_fields=['output_str', 'output_json'])
|
||||
# Case 1: JSON output
|
||||
if old_output.strip().startswith('{'):
|
||||
try:
|
||||
# Validate it's actual JSON
|
||||
parsed = json.loads(old_output)
|
||||
# Update with JSON - cast to JSON to satisfy CHECK constraint
|
||||
json_str = json.dumps(parsed)
|
||||
cursor.execute("""
|
||||
UPDATE core_archiveresult
|
||||
SET output_str = '', output_json = json(?)
|
||||
WHERE id = ?
|
||||
""", (json_str, ar_id))
|
||||
except json.JSONDecodeError:
|
||||
# Not valid JSON, treat as string
|
||||
cursor.execute("""
|
||||
UPDATE core_archiveresult
|
||||
SET output_str = ?, output_json = NULL
|
||||
WHERE id = ?
|
||||
""", (old_output, ar_id))
|
||||
# Case 2: File path or plain string
|
||||
else:
|
||||
cursor.execute("""
|
||||
UPDATE core_archiveresult
|
||||
SET output_str = ?, output_json = NULL
|
||||
WHERE id = ?
|
||||
""", (old_output, ar_id))
|
||||
|
||||
|
||||
def reverse_migrate(apps, schema_editor):
|
||||
|
||||
@@ -16,43 +16,62 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='binary',
|
||||
field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
|
||||
# Update Django's state only - database already has correct schema from 0029
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='binary',
|
||||
field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_files',
|
||||
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_json',
|
||||
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_mimetypes',
|
||||
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_size',
|
||||
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_str',
|
||||
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
# No database changes needed - columns already exist with correct types
|
||||
],
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_files',
|
||||
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_json',
|
||||
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_mimetypes',
|
||||
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_size',
|
||||
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='output_str',
|
||||
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
|
||||
),
|
||||
migrations.AddConstraint(
|
||||
model_name='snapshot',
|
||||
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
|
||||
# Add unique constraint without table rebuild
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.AddConstraint(
|
||||
model_name='snapshot',
|
||||
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
migrations.RunSQL(
|
||||
sql="CREATE UNIQUE INDEX IF NOT EXISTS unique_timestamp ON core_snapshot (timestamp);",
|
||||
reverse_sql="DROP INDEX IF EXISTS unique_timestamp;",
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -10,20 +10,35 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RenameField(
|
||||
model_name='archiveresult',
|
||||
old_name='extractor',
|
||||
new_name='plugin',
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='hook_name',
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
default='',
|
||||
max_length=255,
|
||||
db_index=True,
|
||||
help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
|
||||
),
|
||||
# Use SeparateDatabaseAndState to avoid table rebuilds that would re-add CHECK constraints
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.RenameField(
|
||||
model_name='archiveresult',
|
||||
old_name='extractor',
|
||||
new_name='plugin',
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='hook_name',
|
||||
field=models.CharField(
|
||||
blank=True,
|
||||
default='',
|
||||
max_length=255,
|
||||
db_index=True,
|
||||
help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
|
||||
),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
ALTER TABLE core_archiveresult RENAME COLUMN extractor TO plugin;
|
||||
ALTER TABLE core_archiveresult ADD COLUMN hook_name VARCHAR(255) DEFAULT '' NOT NULL;
|
||||
CREATE INDEX IF NOT EXISTS core_archiveresult_hook_name_idx ON core_archiveresult (hook_name);
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -11,13 +11,27 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='current_step',
|
||||
field=models.PositiveSmallIntegerField(
|
||||
default=0,
|
||||
db_index=True,
|
||||
help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
|
||||
),
|
||||
# Use SeparateDatabaseAndState to avoid table rebuild that would fail on config NOT NULL constraint
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='current_step',
|
||||
field=models.PositiveSmallIntegerField(
|
||||
default=0,
|
||||
db_index=True,
|
||||
help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
|
||||
),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
ALTER TABLE core_snapshot ADD COLUMN current_step SMALLINT UNSIGNED DEFAULT 0 NOT NULL;
|
||||
CREATE INDEX IF NOT EXISTS core_snapshot_current_step_idx ON core_snapshot (current_step);
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -54,7 +54,7 @@ class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0034_snapshot_current_step'),
|
||||
('crawls', '0004_alter_crawl_output_dir'),
|
||||
('crawls', '0005_drop_seed_id_column'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
@@ -64,16 +64,24 @@ class Migration(migrations.Migration):
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
|
||||
# Step 2: Make crawl non-nullable
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
|
||||
),
|
||||
|
||||
# Step 3: Remove created_by field
|
||||
migrations.RemoveField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
# Step 2 & 3: Update Django's state only - leave created_by_id column in database (unused but harmless)
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
# Make crawl non-nullable
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
|
||||
),
|
||||
# Remove created_by field from Django's state
|
||||
migrations.RemoveField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
# No database changes - crawl_id already exists and NOT NULL constraint will be enforced by model
|
||||
# created_by_id column remains in database but is unused
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -10,10 +10,18 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Remove created_by field from ArchiveResult
|
||||
# Remove created_by field from ArchiveResult (state only)
|
||||
# No data migration needed - created_by can be accessed via snapshot.crawl.created_by
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
# Leave created_by_id column in database (unused but harmless, avoids table rebuild)
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
# No database changes - leave created_by_id column in place to avoid table rebuild
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
# Generated by Django 6.0 on 2025-12-29 06:45
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0036_remove_archiveresult_created_by'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Update Django's state only - database columns remain for backwards compat
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='output_dir',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='snapshot',
|
||||
name='output_dir',
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='config',
|
||||
field=models.JSONField(blank=True, default=dict, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='config',
|
||||
field=models.JSONField(blank=True, default=dict, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='tags',
|
||||
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
# No database changes - columns remain in place to avoid table rebuilds
|
||||
],
|
||||
),
|
||||
]
|
||||
84
archivebox/core/migrations/0038_fix_missing_columns.py
Normal file
84
archivebox/core/migrations/0038_fix_missing_columns.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# Add missing columns to ArchiveResult and remove created_by_id from Snapshot
|
||||
|
||||
from django.db import migrations, models, connection
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
def add_columns_if_not_exist(apps, schema_editor):
|
||||
"""Add columns to ArchiveResult only if they don't already exist."""
|
||||
with connection.cursor() as cursor:
|
||||
# Get existing columns
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
existing_columns = {row[1] for row in cursor.fetchall()}
|
||||
|
||||
# Add num_uses_failed if it doesn't exist
|
||||
if 'num_uses_failed' not in existing_columns:
|
||||
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_failed integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_failed >= 0)")
|
||||
|
||||
# Add num_uses_succeeded if it doesn't exist
|
||||
if 'num_uses_succeeded' not in existing_columns:
|
||||
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_succeeded integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_succeeded >= 0)")
|
||||
|
||||
# Add config if it doesn't exist
|
||||
if 'config' not in existing_columns:
|
||||
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN config text NULL")
|
||||
|
||||
# Add retry_at if it doesn't exist
|
||||
if 'retry_at' not in existing_columns:
|
||||
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN retry_at datetime NULL")
|
||||
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0037_remove_archiveresult_output_dir_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Add missing columns to ArchiveResult
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_failed',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='num_uses_succeeded',
|
||||
field=models.PositiveIntegerField(default=0),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='config',
|
||||
field=models.JSONField(blank=True, default=dict, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='archiveresult',
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
migrations.RunPython(add_columns_if_not_exist, reverse_code=migrations.RunPython.noop),
|
||||
],
|
||||
),
|
||||
|
||||
# Drop created_by_id from Snapshot (database only, already removed from model in 0035)
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
# No state changes - field already removed in 0035
|
||||
],
|
||||
database_operations=[
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
-- Drop index first, then column
|
||||
DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149;
|
||||
ALTER TABLE core_snapshot DROP COLUMN created_by_id;
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
30
archivebox/core/migrations/0039_fix_num_uses_values.py
Normal file
30
archivebox/core/migrations/0039_fix_num_uses_values.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# Fix num_uses_failed and num_uses_succeeded string values to integers
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0038_fix_missing_columns'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Fix string values that got inserted as literals instead of integers
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
UPDATE core_snapshot
|
||||
SET num_uses_failed = 0
|
||||
WHERE typeof(num_uses_failed) = 'text' OR num_uses_failed = 'num_uses_failed';
|
||||
|
||||
UPDATE core_snapshot
|
||||
SET num_uses_succeeded = 0
|
||||
WHERE typeof(num_uses_succeeded) = 'text' OR num_uses_succeeded = 'num_uses_succeeded';
|
||||
|
||||
UPDATE core_snapshot
|
||||
SET depth = 0
|
||||
WHERE typeof(depth) = 'text' OR depth = 'depth';
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
),
|
||||
]
|
||||
@@ -911,7 +911,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
)
|
||||
|
||||
merged = 0
|
||||
for dup in duplicates.iterator():
|
||||
for dup in duplicates.iterator(chunk_size=500):
|
||||
snapshots = list(
|
||||
cls.objects
|
||||
.filter(url=dup['url'], timestamp=dup['timestamp'])
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -91,7 +91,11 @@ def plugin_thumbnail(context, result) -> str:
|
||||
'output_path': output_path,
|
||||
'plugin': plugin,
|
||||
})
|
||||
return mark_safe(tpl.render(ctx))
|
||||
rendered = tpl.render(ctx)
|
||||
# Only return non-empty content (strip whitespace to check)
|
||||
if rendered.strip():
|
||||
return mark_safe(rendered)
|
||||
return ''
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
@@ -119,7 +123,11 @@ def plugin_embed(context, result) -> str:
|
||||
'output_path': output_path,
|
||||
'plugin': plugin,
|
||||
})
|
||||
return mark_safe(tpl.render(ctx))
|
||||
rendered = tpl.render(ctx)
|
||||
# Only return non-empty content (strip whitespace to check)
|
||||
if rendered.strip():
|
||||
return mark_safe(rendered)
|
||||
return ''
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
@@ -147,7 +155,11 @@ def plugin_fullscreen(context, result) -> str:
|
||||
'output_path': output_path,
|
||||
'plugin': plugin,
|
||||
})
|
||||
return mark_safe(tpl.render(ctx))
|
||||
rendered = tpl.render(ctx)
|
||||
# Only return non-empty content (strip whitespace to check)
|
||||
if rendered.strip():
|
||||
return mark_safe(rendered)
|
||||
return ''
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
@@ -539,7 +539,7 @@ from django.http import JsonResponse
|
||||
def live_progress_view(request):
|
||||
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
|
||||
try:
|
||||
from workers.orchestrator import Orchestrator
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from django.db.models import Case, When, Value, IntegerField
|
||||
|
||||
Reference in New Issue
Block a user