fix extensions dir paths, add personas migration

This commit is contained in:
Nick Sweeting
2025-12-31 01:12:29 -08:00
parent 1bbb9b45a7
commit 3d8c62ffb1
10 changed files with 300 additions and 85 deletions

View File

@@ -10,8 +10,8 @@ import archivebox.base_models.models
def cleanup_extra_columns(apps, schema_editor):
"""
Remove extra columns that were needed for v0.7.2/v0.8.6rc0 migration but don't exist in final models.
The actual models use @property methods to access these values from the process FK.
Create Process records from old cmd/pwd/cmd_version columns and remove those columns.
This preserves the execution details by moving them to the Process model.
"""
with schema_editor.connection.cursor() as cursor:
# Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0)
@@ -19,8 +19,41 @@ def cleanup_extra_columns(apps, schema_editor):
has_cmd = cursor.fetchone()[0] > 0
if has_cmd:
print(" Cleaning up temporary columns from core_archiveresult...")
# Rebuild table without the extra columns
print(" Migrating cmd/pwd/cmd_version data to Process records...")
# For each ArchiveResult, create a Process record with cmd/pwd data
# Note: cmd_version from old schema is not preserved (it's now derived from Binary)
cursor.execute("""
SELECT id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status
FROM core_archiveresult
""")
archive_results = cursor.fetchall()
from archivebox.uuid_compat import uuid7
from archivebox.base_models.models import get_or_create_system_user_pk
machine_id = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()[0]
for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
# Create Process record
process_id = str(uuid7())
cursor.execute("""
INSERT INTO machine_process (
id, created_at, modified_at,
machine_id, binary_id, iface_id,
pwd, cmd, env, timeout,
pid, exit_code, stdout, stderr,
started_at, ended_at, url, status, retry_at
) VALUES (?, datetime('now'), datetime('now'), ?, ?, ?, ?, ?, '{}', 120, NULL, NULL, '', '', ?, ?, '', ?, NULL)
""", (process_id, machine_id, binary_id, iface_id, pwd or '', cmd or '[]', start_ts, end_ts, status or 'queued'))
# Update ArchiveResult to point to new Process
cursor.execute("UPDATE core_archiveresult SET process_id = ? WHERE id = ?", (process_id, ar_id))
print(f" ✓ Created {len(archive_results)} Process records from ArchiveResult data")
# Now rebuild table without the extra columns
print(" Rebuilding core_archiveresult table...")
cursor.execute("""
CREATE TABLE core_archiveresult_final (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -48,14 +81,14 @@ def cleanup_extra_columns(apps, schema_editor):
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
process_id TEXT,
process_id TEXT NOT NULL,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
)
""")
# Copy data (cmd, pwd, etc. are now accessed via process FK)
# Copy data (cmd, pwd, etc. are now in Process records)
cursor.execute("""
INSERT INTO core_archiveresult_final SELECT
id, uuid, created_at, modified_at,

View File

@@ -0,0 +1,108 @@
# Generated by Django 6.0 on 2025-12-31 09:04
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
    """Final schema adjustments for core models after the v0.9.0 upgrade.

    Tightens field definitions on ArchiveResult, Snapshot, and SnapshotTag:
    the ArchiveResult -> Process link becomes a required OneToOneField
    (PROTECT on delete), id/uuid fields get their final types, and several
    fields gain db_index / help_text. All operations are AlterField, so this
    migration changes column constraints and metadata only — no data moves.
    """

    dependencies = [
        # Must run after the data migration that backfills Process records
        # (core 0026) and after the crawls/machine schemas exist.
        ('core', '0026_final_field_adjustments'),
        ('crawls', '0002_upgrade_to_0_9_0'),
        ('machine', '0001_initial'),
    ]

    operations = [
        # --- ArchiveResult field finalization ---
        migrations.AlterField(
            model_name='archiveresult',
            name='hook_name',
            field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.AutoField(editable=False, primary_key=True, serialize=False),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_files',
            field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_json',
            field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_mimetypes',
            field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_size',
            field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output_str',
            field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='plugin',
            field=models.CharField(db_index=True, default='', max_length=32),
        ),
        # Now required (no null=True): every ArchiveResult must have a Process.
        # PROTECT prevents deleting a Process that still backs a result.
        migrations.AlterField(
            model_name='archiveresult',
            name='process',
            field=models.OneToOneField(help_text='Process execution details for this archive result', on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='retry_at',
            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='status',
            field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
        ),
        # NOTE(review): uuid.uuid7 only exists in the stdlib on Python 3.14+;
        # confirm the minimum supported runtime, or use the project's
        # uuid_compat shim as the data migration does.
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
        ),
        # --- Snapshot field finalization ---
        migrations.AlterField(
            model_name='snapshot',
            name='config',
            field=models.JSONField(default=dict),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='crawl',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='current_step',
            field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='depth',
            field=models.PositiveSmallIntegerField(db_index=True, default=0),
        ),
        migrations.AlterField(
            model_name='snapshot',
            name='id',
            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
        # --- SnapshotTag through-table pk ---
        migrations.AlterField(
            model_name='snapshottag',
            name='id',
            field=models.AutoField(primary_key=True, serialize=False),
        ),
    ]

View File

@@ -2321,7 +2321,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
process = models.OneToOneField(
'machine.Process',
on_delete=models.PROTECT,
null=False, # Required after migration 4
null=False,
related_name='archiveresult',
help_text='Process execution details for this archive result'
)