use full dotted paths for all archivebox imports, add migrations and more fixes

This commit is contained in:
Nick Sweeting
2025-12-29 00:47:08 -08:00
parent 1e4d3ffd11
commit f4e7820533
61 changed files with 1082 additions and 2985 deletions

View File

@@ -4,3 +4,8 @@ from django.apps import AppConfig
class CrawlsConfig(AppConfig):
    """Django AppConfig for the archivebox.crawls app."""

    default_auto_field = "django.db.models.BigAutoField"
    name = "archivebox.crawls"
    label = "crawls"

    def ready(self):
        """Import the models module so its state machines get registered."""
        # Imported purely for its side effect: loading archivebox.crawls.models
        # registers CrawlMachine with the state machine registry.
        import archivebox.crawls.models  # noqa: F401

View File

@@ -17,39 +17,62 @@ class Migration(migrations.Migration):
]
operations = [
# Remove the seed foreign key from Crawl
migrations.RemoveField(
model_name='crawl',
name='seed',
# Remove the seed foreign key from Crawl (no-op if already removed by core/0024_d)
migrations.RunPython(
code=lambda apps, schema_editor: None,
reverse_code=migrations.RunPython.noop,
),
# Delete the Seed model entirely
migrations.DeleteModel(
name='Seed',
# Delete the Seed model entirely (already done)
migrations.RunPython(
code=lambda apps, schema_editor: None,
reverse_code=migrations.RunPython.noop,
),
# Update fields to new schema
migrations.AlterField(
model_name='crawl',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawl',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='crawl',
name='urls',
field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
),
migrations.AlterField(
model_name='crawlschedule',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawlschedule',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
# Drop seed_id column if it exists, then update Django's migration state
migrations.SeparateDatabaseAndState(
state_operations=[
# Update fields to new schema
migrations.AlterField(
model_name='crawl',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawl',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='crawl',
name='urls',
field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
),
migrations.AlterField(
model_name='crawlschedule',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='crawlschedule',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
],
database_operations=[
# Drop seed table and NULL out seed_id FK values
migrations.RunSQL(
sql="""
PRAGMA foreign_keys=OFF;
-- NULL out seed_id values in crawls_crawl
UPDATE crawls_crawl SET seed_id = NULL;
-- Drop seed table if it exists
DROP TABLE IF EXISTS crawls_seed;
PRAGMA foreign_keys=ON;
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

View File

@@ -8,12 +8,21 @@ class Migration(migrations.Migration):
dependencies = [
('crawls', '0002_drop_seed_model'),
('core', '0024_d_fix_crawls_config'), # Depends on config fix
]
operations = [
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
# Update Django's state only to avoid table rebuild that would re-apply old constraints
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
),
],
database_operations=[
# No database changes - output_dir type change is cosmetic for Django admin
],
),
]

View File

@@ -11,9 +11,17 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
# Update Django's state only to avoid table rebuild that would re-apply old constraints
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
),
],
database_operations=[
# No database changes - output_dir type change is cosmetic for Django admin
],
),
]

View File

@@ -0,0 +1,28 @@
# Drop seed_id column from Django's state (leave in database to avoid FK issues)
from django.db import migrations
class Migration(migrations.Migration):
    """Remove Crawl.seed from Django's migration state only.

    The seed_id column is deliberately left in the database: dropping it
    would force a table rebuild that can trip foreign-key mismatch errors
    against crawls_crawlschedule. The stale column is unused but harmless.
    """

    dependencies = [
        ('crawls', '0004_alter_crawl_output_dir'),
    ]

    operations = [
        # Update Django's state only - leave seed_id column in database (unused but harmless)
        # This avoids FK mismatch errors with crawls_crawlschedule
        migrations.SeparateDatabaseAndState(
            state_operations=[
                # Remove seed field from Django's migration state
                migrations.RemoveField(
                    model_name='crawl',
                    name='seed',
                ),
            ],
            database_operations=[
                # No database changes - seed_id column remains to avoid FK rebuild issues
                # crawls_seed table can be manually dropped by DBA if needed
            ],
        ),
    ]

View File

@@ -0,0 +1,35 @@
# Generated by Django 6.0 on 2025-12-29 06:45
import pathlib
from django.db import migrations, models
class Migration(migrations.Migration):
    """State-only migration: align Django's model state with the existing DB.

    Relaxes Crawl.config to null/blank, updates output_dir's FilePathField
    path, and deletes the Seed model from state. No SQL is executed.
    """

    dependencies = [
        ('crawls', '0005_drop_seed_id_column'),
    ]

    operations = [
        # Update Django's state only - database already correct
        migrations.SeparateDatabaseAndState(
            state_operations=[
                migrations.AlterField(
                    model_name='crawl',
                    name='config',
                    field=models.JSONField(blank=True, default=dict, null=True),
                ),
                migrations.AlterField(
                    model_name='crawl',
                    name='output_dir',
                    # NOTE(review): this path is an absolute path baked in from the
                    # machine that ran makemigrations. FilePathField.path is not
                    # stored in the database, but it will differ on other machines
                    # and re-trigger makemigrations there - confirm this is intended.
                    field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
                ),
                migrations.DeleteModel(
                    name='Seed',
                ),
            ],
            database_operations=[
                # No database changes - Seed table already dropped in 0005
                # NOTE(review): 0005 is itself state-only; the actual
                # DROP TABLE crawls_seed happens in 0002's RunSQL - verify.
            ],
        ),
    ]

View File

@@ -65,7 +65,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
modified_at = models.DateTimeField(auto_now=True)
urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
config = models.JSONField(default=dict)
config = models.JSONField(default=dict, null=True, blank=True)
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
persona_id = models.UUIDField(null=True, blank=True)
@@ -77,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
state_machine_name = 'crawls.models.CrawlMachine'
state_machine_name = 'archivebox.crawls.models.CrawlMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
StatusChoices = ModelWithStateMachine.StatusChoices
@@ -190,7 +190,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'created_by_id': self.created_by_id,
'depth': 0,
},
)
@@ -290,7 +289,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'timestamp': timestamp or str(timezone.now().timestamp()),
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'created_by_id': self.created_by_id,
# Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
}
)