use full dotted paths for all archivebox imports, add migrations and more fixes

This commit is contained in:
Nick Sweeting
2025-12-29 00:47:08 -08:00
parent 1e4d3ffd11
commit f4e7820533
61 changed files with 1082 additions and 2985 deletions

View File

@@ -4,3 +4,8 @@ from django.apps import AppConfig
class CrawlsConfig(AppConfig):
    """Django AppConfig for the archivebox.crawls app."""

    default_auto_field = "django.db.models.BigAutoField"
    name = "archivebox.crawls"
    label = "crawls"

    def ready(self):
        """Import the models module so its state machines get registered."""
        # Imported purely for its side effect: loading archivebox.crawls.models
        # registers CrawlMachine with the state machine registry.
        import archivebox.crawls.models  # noqa: F401

View File

@@ -17,39 +17,62 @@ class Migration(migrations.Migration):
]
operations = [
# Remove the seed foreign key from Crawl
migrations.RemoveField(
model_name='crawl',
name='seed',
# Remove the seed foreign key from Crawl (no-op if already removed by core/0024_d)
migrations.RunPython(
code=lambda apps, schema_editor: None,
reverse_code=migrations.RunPython.noop,
),
# Delete the Seed model entirely
migrations.DeleteModel(
name='Seed',
# Delete the Seed model entirely (already done)
migrations.RunPython(
code=lambda apps, schema_editor: None,
reverse_code=migrations.RunPython.noop,
),
# Update fields to new schema
migrations.AlterField(
model_name='crawl',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawl',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='crawl',
name='urls',
field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
),
migrations.AlterField(
model_name='crawlschedule',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawlschedule',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
# Drop seed_id column if it exists, then update Django's migration state
migrations.SeparateDatabaseAndState(
state_operations=[
# Update fields to new schema
migrations.AlterField(
model_name='crawl',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawl',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='crawl',
name='urls',
field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
),
migrations.AlterField(
model_name='crawlschedule',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawlschedule',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
],
database_operations=[
# Drop seed table and NULL out seed_id FK values
migrations.RunSQL(
sql="""
PRAGMA foreign_keys=OFF;
-- NULL out seed_id values in crawls_crawl
UPDATE crawls_crawl SET seed_id = NULL;
-- Drop seed table if it exists
DROP TABLE IF EXISTS crawls_seed;
PRAGMA foreign_keys=ON;
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

View File

@@ -8,12 +8,21 @@ class Migration(migrations.Migration):
dependencies = [
('crawls', '0002_drop_seed_model'),
('core', '0024_d_fix_crawls_config'), # Depends on config fix
]
operations = [
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
# Update Django's state only to avoid table rebuild that would re-apply old constraints
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
),
],
database_operations=[
# No database changes - output_dir type change is cosmetic for Django admin
],
),
]

View File

@@ -11,9 +11,17 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
# Update Django's state only to avoid table rebuild that would re-apply old constraints
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
),
],
database_operations=[
# No database changes - output_dir type change is cosmetic for Django admin
],
),
]

View File

@@ -0,0 +1,28 @@
# Drop seed_id column from Django's state (leave in database to avoid FK issues)
from django.db import migrations
class Migration(migrations.Migration):
    """Remove Crawl.seed from Django's migration state only.

    The seed_id column is deliberately left in the database: dropping it
    would force a table rebuild that can trip foreign-key mismatch errors
    against crawls_crawlschedule. The stale column is unused but harmless.
    """

    dependencies = [
        ('crawls', '0004_alter_crawl_output_dir'),
    ]

    operations = [
        # Update Django's state only - leave seed_id column in database (unused but harmless)
        # This avoids FK mismatch errors with crawls_crawlschedule
        migrations.SeparateDatabaseAndState(
            state_operations=[
                # Remove seed field from Django's migration state
                migrations.RemoveField(
                    model_name='crawl',
                    name='seed',
                ),
            ],
            database_operations=[
                # No database changes - seed_id column remains to avoid FK rebuild issues
                # crawls_seed table can be manually dropped by DBA if needed
            ],
        ),
    ]

View File

@@ -0,0 +1,35 @@
# Generated by Django 6.0 on 2025-12-29 06:45
import pathlib
from django.db import migrations, models
class Migration(migrations.Migration):
    """State-only migration: align Django's model state with the existing DB.

    Relaxes Crawl.config to null/blank, updates output_dir's FilePathField
    path, and deletes the Seed model from state. No SQL is executed.
    """

    dependencies = [
        ('crawls', '0005_drop_seed_id_column'),
    ]

    operations = [
        # Update Django's state only - database already correct
        migrations.SeparateDatabaseAndState(
            state_operations=[
                migrations.AlterField(
                    model_name='crawl',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AlterField(
                    model_name='crawl',
                    name='output_dir',
                    # NOTE(review): this path is an absolute path baked in from the
                    # machine that ran makemigrations. FilePathField.path is not
                    # stored in the database, but it will differ on other machines
                    # and re-trigger makemigrations there - confirm this is intended.
                    field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
                ),
                migrations.DeleteModel(
                    name='Seed',
                ),
            ],
            database_operations=[
                # No database changes - Seed table already dropped in 0005
                # NOTE(review): 0005 is itself state-only; the actual
                # DROP TABLE crawls_seed happens in 0002's RunSQL - verify.
            ],
        ),
    ]

View File

@@ -65,7 +65,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
modified_at = models.DateTimeField(auto_now=True)
urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
config = models.JSONField(default=dict)
config = models.JSONField(default=dict, null=True, blank=True)
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
persona_id = models.UUIDField(null=True, blank=True)
@@ -77,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
state_machine_name = 'crawls.models.CrawlMachine'
state_machine_name = 'archivebox.crawls.models.CrawlMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
StatusChoices = ModelWithStateMachine.StatusChoices
@@ -190,7 +190,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'created_by_id': self.created_by_id,
'depth': 0,
},
)
@@ -290,7 +289,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'timestamp': timestamp or str(timezone.now().timestamp()),
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'created_by_id': self.created_by_id,
# Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
}
)