use full dotted paths for all archivebox imports, add migrations and more fixes

This commit is contained in:
Nick Sweeting
2025-12-29 00:47:08 -08:00
parent 1e4d3ffd11
commit f4e7820533
61 changed files with 1082 additions and 2985 deletions

View File

@@ -5,8 +5,12 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
    """Django AppConfig for the main archivebox.core app."""
    name = 'archivebox.core'
    label = 'core'

    def ready(self):
        """Register the archivebox.core.admin_site as the main django admin site"""
        # Deferred import: admin_site pulls in Django admin machinery that is
        # only safe to import once the app registry is ready.
        from archivebox.core.admin_site import register_admin_site
        register_admin_site()
        # Import models to register state machines with the registry
        from archivebox.core import models  # noqa: F401

View File

@@ -0,0 +1,57 @@
# Data migration to clear config fields that may contain invalid JSON
# This runs before 0025 to prevent CHECK constraint failures
from django.db import migrations
def clear_config_fields(apps, schema_editor):
"""Clear all config fields in related tables to avoid JSON validation errors."""
db_alias = schema_editor.connection.alias
# Disable foreign key checks temporarily to allow updates
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=OFF")
tables_to_clear = [
('crawls_seed', 'config'),
('crawls_crawl', 'config'),
('crawls_crawlschedule', 'config') if 'crawlschedule' in dir() else None,
('machine_machine', 'stats'),
('machine_machine', 'config'),
]
for table_info in tables_to_clear:
if table_info is None:
continue
table_name, field_name = table_info
try:
with schema_editor.connection.cursor() as cursor:
# Check if table exists first
cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
if not cursor.fetchone():
print(f" Skipping {table_name}.{field_name}: table does not exist")
continue
# Set all to empty JSON object
cursor.execute(f"UPDATE {table_name} SET {field_name} = '{{}}' WHERE {field_name} IS NOT NULL")
print(f" Cleared {field_name} in {table_name}: {cursor.rowcount} rows")
except Exception as e:
print(f" Skipping {table_name}.{field_name}: {e}")
# Re-enable foreign key checks
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=ON")
class Migration(migrations.Migration):
    """Data migration: blank out config/stats JSON columns before 0025 runs."""

    # Must run after the initial schemas of all three apps exist so the raw-SQL
    # cleanup above can see (or safely skip) each table.
    dependencies = [
        ('core', '0023_new_schema'),
        ('crawls', '0001_initial'),
        ('machine', '0001_squashed'),
    ]

    operations = [
        # Reverse is a no-op: the cleared values were invalid JSON anyway and
        # cannot be restored.
        migrations.RunPython(clear_config_fields, reverse_code=migrations.RunPython.noop),
    ]

View File

@@ -0,0 +1,28 @@
# Disable foreign key checks before 0025 to prevent CHECK constraint validation errors
from django.db import migrations
def disable_fk_checks(apps, schema_editor):
    """Switch SQLite foreign-key enforcement off on the migration connection."""
    connection = schema_editor.connection
    with connection.cursor() as cursor:
        cursor.execute("PRAGMA foreign_keys=OFF")
        print(" Disabled foreign key checks")
def enable_fk_checks(apps, schema_editor):
    """Switch SQLite foreign-key enforcement back on for the migration connection."""
    connection = schema_editor.connection
    with connection.cursor() as cursor:
        cursor.execute("PRAGMA foreign_keys=ON")
        print(" Enabled foreign key checks")
class Migration(migrations.Migration):
    """Disable FK checks before 0025 to prevent CHECK constraint validation errors."""

    dependencies = [
        ('core', '0024_b_clear_config_fields'),
    ]

    operations = [
        # Reverse re-enables FK checks so a rollback leaves the connection in
        # its normal enforcing state.
        migrations.RunPython(disable_fk_checks, reverse_code=enable_fk_checks),
    ]

View File

@@ -0,0 +1,93 @@
# Fix crawls_crawl config field to avoid CHECK constraint errors during table rebuilds
from django.db import migrations
def fix_crawls_config(apps, schema_editor):
    """
    Rebuild crawls_crawl table to fix CHECK constraints and make seed_id nullable.

    Only runs for UPGRADES from 0.8.x (when crawls.0001_initial didn't exist yet).
    For fresh installs, crawls.0001_initial creates the correct schema.

    Args:
        apps: historical app registry (unused; raw SQL is used instead).
        schema_editor: active schema editor whose connection is used for all SQL.
    """
    with schema_editor.connection.cursor() as cursor:
        # Check if this is an upgrade from old 0.8.x or a fresh install
        # In fresh installs, crawls.0001_initial was applied, creating seed FK
        # In upgrades, the table was created by old migrations before 0001_initial existed
        cursor.execute("""
            SELECT COUNT(*) FROM django_migrations
            WHERE app='crawls' AND name='0001_initial'
        """)
        has_crawls_0001 = cursor.fetchone()[0] > 0

        if has_crawls_0001:
            # Fresh install - crawls.0001_initial already created the correct schema
            # Just clear config to avoid CHECK constraint issues
            print(" Fresh install detected - clearing config field only")
            try:
                cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
            except Exception as e:
                # Best-effort: swallow and report so the migration never aborts here.
                print(f" Skipping config clear: {e}")
            return

        # Upgrade from 0.8.x - rebuild table to make seed_id nullable and remove CHECK constraint
        print(" Upgrading from 0.8.x - rebuilding crawls_crawl table")
        # FK checks must be off while the table is dropped/recreated; 0024_c
        # already disabled them, this is defensive for the DROP below.
        cursor.execute("PRAGMA foreign_keys=OFF")
        # Backup
        cursor.execute("CREATE TABLE crawls_crawl_backup AS SELECT * FROM crawls_crawl")
        # Recreate without config CHECK constraint, with nullable seed_id
        cursor.execute("DROP TABLE crawls_crawl")
        cursor.execute("""
            CREATE TABLE "crawls_crawl" (
                "num_uses_failed" integer unsigned NOT NULL CHECK ("num_uses_failed" >= 0),
                "num_uses_succeeded" integer unsigned NOT NULL CHECK ("num_uses_succeeded" >= 0),
                "id" char(32) NOT NULL PRIMARY KEY,
                "created_at" datetime NOT NULL,
                "modified_at" datetime NOT NULL,
                "urls" text NOT NULL,
                "config" text,
                "max_depth" smallint unsigned NOT NULL CHECK ("max_depth" >= 0),
                "tags_str" varchar(1024) NOT NULL,
                "persona_id" char(32) NULL,
                "label" varchar(64) NOT NULL,
                "notes" text NOT NULL,
                "output_dir" varchar(512) NOT NULL,
                "status" varchar(15) NOT NULL,
                "retry_at" datetime NULL,
                "created_by_id" integer NOT NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED,
                "seed_id" char(32) NULL DEFAULT NULL,
                "schedule_id" char(32) NULL REFERENCES "crawls_crawlschedule" ("id") DEFERRABLE INITIALLY DEFERRED
            )
        """)
        # Restore data
        cursor.execute("""
            INSERT INTO "crawls_crawl" (
                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
            )
            SELECT
                "num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
                "urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
                "output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
            FROM crawls_crawl_backup
        """)
        cursor.execute("DROP TABLE crawls_crawl_backup")
        # NULL out config to avoid any invalid JSON
        cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
        # NOTE(review): foreign_keys is intentionally left OFF here — 0024_c
        # disabled it for the remainder of this migration run; confirm a later
        # step re-enables it.
class Migration(migrations.Migration):
    """Fix crawls_crawl config field to avoid CHECK constraint errors during rebuilds."""

    # Runs after FK checks are disabled (0024_c) and after crawls' initial
    # schema exists, so the fresh-install/upgrade detection above is reliable.
    dependencies = [
        ('core', '0024_c_disable_fk_checks'),
        ('crawls', '0001_initial'),
    ]

    operations = [
        # Irreversible: the rebuilt table and NULLed config cannot be restored.
        migrations.RunPython(fix_crawls_config, reverse_code=migrations.RunPython.noop),
    ]

View File

@@ -8,9 +8,7 @@ import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('core', '0023_new_schema'),
('crawls', '0001_initial'),
('machine', '0001_squashed'),
('core', '0024_d_fix_crawls_config'),
]
operations = [

View File

@@ -10,6 +10,13 @@ from django.db import migrations, models
def populate_archiveresult_uuids(apps, schema_editor):
"""Generate unique UUIDs for ArchiveResults that don't have one."""
# Check if uuid column exists before trying to populate it
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA table_info(core_archiveresult)")
columns = [row[1] for row in cursor.fetchall()]
if 'uuid' not in columns:
return # uuid column doesn't exist, skip this data migration
ArchiveResult = apps.get_model('core', 'ArchiveResult')
for result in ArchiveResult.objects.filter(uuid__isnull=True):
result.uuid = uuid_compat.uuid7()
@@ -21,6 +28,22 @@ def reverse_populate_uuids(apps, schema_editor):
pass
def remove_output_dir_if_exists(apps, schema_editor):
    """Drop the output_dir column from both tables, if present (idempotent)."""
    with schema_editor.connection.cursor() as cursor:
        for table in ("core_archiveresult", "core_snapshot"):
            cursor.execute(f"PRAGMA table_info({table})")
            column_names = {row[1] for row in cursor.fetchall()}
            if 'output_dir' in column_names:
                cursor.execute(f"ALTER TABLE {table} DROP COLUMN output_dir")
class Migration(migrations.Migration):
dependencies = [
@@ -33,82 +56,90 @@ class Migration(migrations.Migration):
migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids),
# Remove output_dir fields (not needed, computed from snapshot)
migrations.RemoveField(
model_name='archiveresult',
name='output_dir',
),
migrations.RemoveField(
model_name='snapshot',
name='output_dir',
migrations.RunPython(remove_output_dir_if_exists, reverse_code=migrations.RunPython.noop),
# Update Django's migration state to match 0.9.x schema
# Database already has correct types from 0.8.x, just update state
migrations.SeparateDatabaseAndState(
state_operations=[
# Archiveresult field alterations
migrations.AlterField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(db_index=True, max_length=32),
),
# Convert id from AutoField to UUIDField (database already has UUID CHAR(32))
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
# Snapshot field alterations
migrations.AlterField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
],
database_operations=[
# No actual database changes needed - schema is already correct from 0.8.x
],
),
# Archiveresult field alterations
migrations.AlterField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(db_index=True, max_length=32),
),
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
# Snapshot field alterations
migrations.AlterField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
# SnapshotTag and Tag alterations
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
# SnapshotTag and Tag alterations - state only, DB already correct
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
),
],
database_operations=[],
),
]

View File

@@ -13,68 +13,79 @@ class Migration(migrations.Migration):
]
operations = [
# Add new output fields (keep old 'output' temporarily for migration)
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(
blank=True,
default='',
help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(
null=True,
blank=True,
default=None,
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(
default=dict,
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(
default=0,
help_text='Total recursive size in bytes of all output files'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(
max_length=512,
blank=True,
default='',
help_text='CSV of mimetypes sorted by size descending'
),
),
# Add binary FK (optional)
migrations.AddField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(
'machine.Binary',
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name='archiveresults',
help_text='Primary binary used by this hook (optional)'
),
# Add new output fields using SeparateDatabaseAndState to avoid table rebuilds
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddField(
model_name='archiveresult',
name='output_str',
field=models.TextField(
blank=True,
default='',
help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(
null=True,
blank=True,
default=None,
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(
default=dict,
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(
default=0,
help_text='Total recursive size in bytes of all output files'
),
),
migrations.AddField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(
max_length=512,
blank=True,
default='',
help_text='CSV of mimetypes sorted by size descending'
),
),
migrations.AddField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(
'machine.Binary',
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name='archiveresults',
help_text='Primary binary used by this hook (optional)'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_archiveresult ADD COLUMN output_str TEXT DEFAULT '';
ALTER TABLE core_archiveresult ADD COLUMN output_json TEXT;
ALTER TABLE core_archiveresult ADD COLUMN output_files TEXT DEFAULT '{}';
ALTER TABLE core_archiveresult ADD COLUMN output_size BIGINT DEFAULT 0;
ALTER TABLE core_archiveresult ADD COLUMN output_mimetypes VARCHAR(512) DEFAULT '';
ALTER TABLE core_archiveresult ADD COLUMN binary_id CHAR(32) REFERENCES machine_binary(id);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

View File

@@ -12,27 +12,46 @@ def migrate_output_field(apps, schema_editor):
Logic:
- If output contains JSON {...}, move to output_json
- Otherwise, move to output_str
Use raw SQL to avoid CHECK constraint issues during migration.
"""
ArchiveResult = apps.get_model('core', 'ArchiveResult')
# Use raw SQL to migrate data without triggering CHECK constraints
with schema_editor.connection.cursor() as cursor:
# Get all archive results
cursor.execute("""
SELECT id, output FROM core_archiveresult
""")
for ar in ArchiveResult.objects.all().iterator():
old_output = ar.output or ''
for row in cursor.fetchall():
ar_id, old_output = row
old_output = old_output or ''
# Case 1: JSON output
if old_output.strip().startswith('{'):
try:
parsed = json.loads(old_output)
ar.output_json = parsed
ar.output_str = ''
except json.JSONDecodeError:
# Not valid JSON, treat as string
ar.output_str = old_output
# Case 2: File path or plain string
else:
ar.output_str = old_output
ar.save(update_fields=['output_str', 'output_json'])
# Case 1: JSON output
if old_output.strip().startswith('{'):
try:
# Validate it's actual JSON
parsed = json.loads(old_output)
# Update with JSON - cast to JSON to satisfy CHECK constraint
json_str = json.dumps(parsed)
cursor.execute("""
UPDATE core_archiveresult
SET output_str = '', output_json = json(?)
WHERE id = ?
""", (json_str, ar_id))
except json.JSONDecodeError:
# Not valid JSON, treat as string
cursor.execute("""
UPDATE core_archiveresult
SET output_str = ?, output_json = NULL
WHERE id = ?
""", (old_output, ar_id))
# Case 2: File path or plain string
else:
cursor.execute("""
UPDATE core_archiveresult
SET output_str = ?, output_json = NULL
WHERE id = ?
""", (old_output, ar_id))
def reverse_migrate(apps, schema_editor):

View File

@@ -16,43 +16,62 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
# Update Django's state only - database already has correct schema from 0029
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AlterField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
),
],
database_operations=[
# No database changes needed - columns already exist with correct types
],
),
migrations.AlterField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AlterField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
# Add unique constraint without table rebuild
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
),
],
database_operations=[
migrations.RunSQL(
sql="CREATE UNIQUE INDEX IF NOT EXISTS unique_timestamp ON core_snapshot (timestamp);",
reverse_sql="DROP INDEX IF EXISTS unique_timestamp;",
),
],
),
]

View File

@@ -10,20 +10,35 @@ class Migration(migrations.Migration):
]
operations = [
migrations.RenameField(
model_name='archiveresult',
old_name='extractor',
new_name='plugin',
),
migrations.AddField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(
blank=True,
default='',
max_length=255,
db_index=True,
help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
),
# Use SeparateDatabaseAndState to avoid table rebuilds that would re-add CHECK constraints
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.RenameField(
model_name='archiveresult',
old_name='extractor',
new_name='plugin',
),
migrations.AddField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(
blank=True,
default='',
max_length=255,
db_index=True,
help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_archiveresult RENAME COLUMN extractor TO plugin;
ALTER TABLE core_archiveresult ADD COLUMN hook_name VARCHAR(255) DEFAULT '' NOT NULL;
CREATE INDEX IF NOT EXISTS core_archiveresult_hook_name_idx ON core_archiveresult (hook_name);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

View File

@@ -11,13 +11,27 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AddField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(
default=0,
db_index=True,
help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
),
# Use SeparateDatabaseAndState to avoid table rebuild that would fail on config NOT NULL constraint
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(
default=0,
db_index=True,
help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_snapshot ADD COLUMN current_step SMALLINT UNSIGNED DEFAULT 0 NOT NULL;
CREATE INDEX IF NOT EXISTS core_snapshot_current_step_idx ON core_snapshot (current_step);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

View File

@@ -54,7 +54,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0034_snapshot_current_step'),
('crawls', '0004_alter_crawl_output_dir'),
('crawls', '0005_drop_seed_id_column'),
]
operations = [
@@ -64,16 +64,24 @@ class Migration(migrations.Migration):
reverse_code=migrations.RunPython.noop,
),
# Step 2: Make crawl non-nullable
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
),
# Step 3: Remove created_by field
migrations.RemoveField(
model_name='snapshot',
name='created_by',
# Step 2 & 3: Update Django's state only - leave created_by_id column in database (unused but harmless)
migrations.SeparateDatabaseAndState(
state_operations=[
# Make crawl non-nullable
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
),
# Remove created_by field from Django's state
migrations.RemoveField(
model_name='snapshot',
name='created_by',
),
],
database_operations=[
# No database changes - crawl_id already exists and NOT NULL constraint will be enforced by model
# created_by_id column remains in database but is unused
],
),
]

View File

@@ -10,10 +10,18 @@ class Migration(migrations.Migration):
]
operations = [
# Remove created_by field from ArchiveResult
# Remove created_by field from ArchiveResult (state only)
# No data migration needed - created_by can be accessed via snapshot.crawl.created_by
migrations.RemoveField(
model_name='archiveresult',
name='created_by',
# Leave created_by_id column in database (unused but harmless, avoids table rebuild)
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.RemoveField(
model_name='archiveresult',
name='created_by',
),
],
database_operations=[
# No database changes - leave created_by_id column in place to avoid table rebuild
],
),
]

View File

@@ -0,0 +1,44 @@
# Generated by Django 6.0 on 2025-12-29 06:45
from django.db import migrations, models
class Migration(migrations.Migration):
    """State-only cleanup: drop output_dir fields and relax config/tags fields."""

    dependencies = [
        ('core', '0036_remove_archiveresult_created_by'),
    ]

    operations = [
        # Update Django's state only - database columns remain for backwards compat
        migrations.SeparateDatabaseAndState(
            state_operations=[
                migrations.RemoveField(
                    model_name='archiveresult',
                    name='output_dir',
                ),
                migrations.RemoveField(
                    model_name='snapshot',
                    name='output_dir',
                ),
                migrations.AlterField(
                    model_name='archiveresult',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AlterField(
                    model_name='snapshot',
                    name='tags',
                    field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
                ),
            ],
            database_operations=[
                # No database changes - columns remain in place to avoid table rebuilds
            ],
        ),
    ]

View File

@@ -0,0 +1,84 @@
# Add missing columns to ArchiveResult and remove created_by_id from Snapshot
from django.db import migrations, models, connection
import django.utils.timezone
def add_columns_if_not_exist(apps, schema_editor):
    """Add missing ArchiveResult columns, skipping any that already exist.

    Uses raw ALTER TABLE ... ADD COLUMN statements (with defaults) so existing
    rows are backfilled without a full SQLite table rebuild. Idempotent: safe
    to run on databases that already have some or all of the columns.

    Args:
        apps: historical app registry (unused; raw SQL is used instead).
        schema_editor: active schema editor for the database being migrated.
    """
    # FIX: use schema_editor.connection instead of the global django.db
    # connection so the migration targets the correct database when run
    # against a non-default alias (multi-db / --database).
    with schema_editor.connection.cursor() as cursor:
        # Get existing columns
        cursor.execute("PRAGMA table_info(core_archiveresult)")
        existing_columns = {row[1] for row in cursor.fetchall()}

        # Add num_uses_failed if it doesn't exist
        if 'num_uses_failed' not in existing_columns:
            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_failed integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_failed >= 0)")
        # Add num_uses_succeeded if it doesn't exist
        if 'num_uses_succeeded' not in existing_columns:
            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_succeeded integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_succeeded >= 0)")
        # Add config if it doesn't exist
        if 'config' not in existing_columns:
            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN config text NULL")
        # Add retry_at (and its index) if it doesn't exist
        if 'retry_at' not in existing_columns:
            cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN retry_at datetime NULL")
            cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
class Migration(migrations.Migration):
    """Add missing columns to ArchiveResult and drop created_by_id from Snapshot."""

    dependencies = [
        ('core', '0037_remove_archiveresult_output_dir_and_more'),
    ]

    operations = [
        # Add missing columns to ArchiveResult: state via AddField, database via
        # the idempotent add_columns_if_not_exist helper defined above.
        migrations.SeparateDatabaseAndState(
            state_operations=[
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_failed',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='num_uses_succeeded',
                    field=models.PositiveIntegerField(default=0),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AddField(
                    model_name='archiveresult',
                    name='retry_at',
                    field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
                ),
            ],
            database_operations=[
                migrations.RunPython(add_columns_if_not_exist, reverse_code=migrations.RunPython.noop),
            ],
        ),
        # Drop created_by_id from Snapshot (database only, already removed from model in 0035)
        migrations.SeparateDatabaseAndState(
            state_operations=[
                # No state changes - field already removed in 0035
            ],
            database_operations=[
                migrations.RunSQL(
                    sql="""
                        -- Drop index first, then column
                        DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149;
                        ALTER TABLE core_snapshot DROP COLUMN created_by_id;
                    """,
                    reverse_sql=migrations.RunSQL.noop,
                ),
            ],
        ),
    ]

View File

@@ -0,0 +1,30 @@
# Fix num_uses_failed and num_uses_succeeded string values to integers
from django.db import migrations
class Migration(migrations.Migration):
    """Repair counter columns that ended up holding string literals.

    Resets num_uses_failed / num_uses_succeeded / depth to 0 on any Snapshot
    row where the value is text (e.g. the column name itself was inserted as a
    literal by a broken earlier step).
    """

    dependencies = [
        ('core', '0038_fix_missing_columns'),
    ]

    operations = [
        # Fix string values that got inserted as literals instead of integers.
        # The typeof(...) = 'text' test already covers the literal-name case;
        # the explicit equality comparison is kept as a belt-and-braces guard.
        migrations.RunSQL(
            sql="""
                UPDATE core_snapshot
                SET num_uses_failed = 0
                WHERE typeof(num_uses_failed) = 'text' OR num_uses_failed = 'num_uses_failed';

                UPDATE core_snapshot
                SET num_uses_succeeded = 0
                WHERE typeof(num_uses_succeeded) = 'text' OR num_uses_succeeded = 'num_uses_succeeded';

                UPDATE core_snapshot
                SET depth = 0
                WHERE typeof(depth) = 'text' OR depth = 'depth';
            """,
            reverse_sql=migrations.RunSQL.noop,
        ),
    ]

View File

@@ -911,7 +911,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)
merged = 0
for dup in duplicates.iterator():
for dup in duplicates.iterator(chunk_size=500):
snapshots = list(
cls.objects
.filter(url=dup['url'], timestamp=dup['timestamp'])

File diff suppressed because it is too large Load Diff

View File

@@ -91,7 +91,11 @@ def plugin_thumbnail(context, result) -> str:
'output_path': output_path,
'plugin': plugin,
})
return mark_safe(tpl.render(ctx))
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception:
return ''
@@ -119,7 +123,11 @@ def plugin_embed(context, result) -> str:
'output_path': output_path,
'plugin': plugin,
})
return mark_safe(tpl.render(ctx))
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception:
return ''
@@ -147,7 +155,11 @@ def plugin_fullscreen(context, result) -> str:
'output_path': output_path,
'plugin': plugin,
})
return mark_safe(tpl.render(ctx))
rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception:
return ''

View File

@@ -539,7 +539,7 @@ from django.http import JsonResponse
def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
from workers.orchestrator import Orchestrator
from archivebox.workers.orchestrator import Orchestrator
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField