mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
use full dotted paths for all archivebox imports, add migrations and more fixes
This commit is contained in:
@@ -4,3 +4,8 @@ from django.apps import AppConfig
|
||||
class CrawlsConfig(AppConfig):
    """Django application config for the ``archivebox.crawls`` app."""

    # Full dotted module path of the app; `label` keeps the short app label.
    name = 'archivebox.crawls'
    label = 'crawls'
    default_auto_field = 'django.db.models.BigAutoField'

    def ready(self):
        """Import the crawls models so their state machines get registered.

        The import is performed only for its side effect; the imported name
        is intentionally unused (hence the ``noqa``).
        """
        # Imported inside ready() rather than at module top level.
        from archivebox.crawls.models import CrawlMachine  # noqa: F401
|
||||
|
||||
@@ -17,39 +17,62 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Remove the seed foreign key from Crawl
|
||||
migrations.RemoveField(
|
||||
model_name='crawl',
|
||||
name='seed',
|
||||
# Remove the seed foreign key from Crawl (no-op if already removed by core/0024_d)
|
||||
migrations.RunPython(
|
||||
code=lambda apps, schema_editor: None,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
# Delete the Seed model entirely
|
||||
migrations.DeleteModel(
|
||||
name='Seed',
|
||||
# Delete the Seed model entirely (already done)
|
||||
migrations.RunPython(
|
||||
code=lambda apps, schema_editor: None,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
# Update fields to new schema
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='urls',
|
||||
field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='crawlschedule',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='crawlschedule',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
# NULL out seed_id values and drop the crawls_seed table if it exists (the seed_id column itself is kept), then update Django's migration state
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
# Update fields to new schema
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='urls',
|
||||
field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='crawlschedule',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='crawlschedule',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
# Drop seed table and NULL out seed_id FK values
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
PRAGMA foreign_keys=OFF;
|
||||
|
||||
-- NULL out seed_id values in crawls_crawl
|
||||
UPDATE crawls_crawl SET seed_id = NULL;
|
||||
|
||||
-- Drop seed table if it exists
|
||||
DROP TABLE IF EXISTS crawls_seed;
|
||||
|
||||
PRAGMA foreign_keys=ON;
|
||||
""",
|
||||
reverse_sql=migrations.RunSQL.noop,
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -8,12 +8,21 @@ class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('crawls', '0002_drop_seed_model'),
|
||||
('core', '0024_d_fix_crawls_config'), # Depends on config fix
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='output_dir',
|
||||
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
|
||||
# Update Django's state only to avoid table rebuild that would re-apply old constraints
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='output_dir',
|
||||
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
# No database changes - output_dir type change is cosmetic for Django admin
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
@@ -11,9 +11,17 @@ class Migration(migrations.Migration):
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='output_dir',
|
||||
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
|
||||
# Update Django's state only to avoid table rebuild that would re-apply old constraints
|
||||
migrations.SeparateDatabaseAndState(
|
||||
state_operations=[
|
||||
migrations.AlterField(
|
||||
model_name='crawl',
|
||||
name='output_dir',
|
||||
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
|
||||
),
|
||||
],
|
||||
database_operations=[
|
||||
# No database changes - output_dir type change is cosmetic for Django admin
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
28
archivebox/crawls/migrations/0005_drop_seed_id_column.py
Normal file
28
archivebox/crawls/migrations/0005_drop_seed_id_column.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# Drop seed_id column from Django's state (leave in database to avoid FK issues)
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Remove ``Crawl.seed`` from Django's migration state only.

    No SQL is executed: the ``seed_id`` column is left in the database
    (unused but harmless).
    """

    dependencies = [('crawls', '0004_alter_crawl_output_dir')]

    operations = [
        migrations.SeparateDatabaseAndState(
            # Deliberately empty: the seed_id column stays in place to avoid
            # FK rebuild issues / FK mismatch errors with crawls_crawlschedule.
            # The crawls_seed table can be dropped manually by a DBA if needed.
            database_operations=[],
            # Only Django's in-memory model state forgets the field.
            state_operations=[
                migrations.RemoveField(model_name='crawl', name='seed'),
            ],
        ),
    ]
|
||||
@@ -0,0 +1,35 @@
|
||||
# Generated by Django 6.0 on 2025-12-29 06:45
|
||||
|
||||
import pathlib
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Bring Django's migration state in line with the current models.

    Every operation is state-only; this migration runs no SQL.
    """

    dependencies = [('crawls', '0005_drop_seed_id_column')]

    operations = [
        migrations.SeparateDatabaseAndState(
            # Deliberately empty: the database is treated as already correct.
            # 0005_drop_seed_id_column itself performs no database changes, so
            # the Seed table is expected to be gone before this point.
            database_operations=[],
            state_operations=[
                # Crawl.config becomes nullable and may be left blank.
                migrations.AlterField(
                    model_name='crawl',
                    name='config',
                    field=models.JSONField(default=dict, null=True, blank=True),
                ),
                # NOTE(review): this absolute path looks like it was captured
                # from the machine that generated the migration - confirm it
                # is intended to be committed as-is.
                migrations.AlterField(
                    model_name='crawl',
                    name='output_dir',
                    field=models.FilePathField(
                        blank=True,
                        default='',
                        path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive'),
                    ),
                ),
                # Drop the Seed model from Django's state.
                migrations.DeleteModel(name='Seed'),
            ],
        ),
    ]
|
||||
@@ -65,7 +65,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
|
||||
config = models.JSONField(default=dict)
|
||||
config = models.JSONField(default=dict, null=True, blank=True)
|
||||
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
|
||||
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
|
||||
persona_id = models.UUIDField(null=True, blank=True)
|
||||
@@ -77,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
|
||||
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
||||
|
||||
state_machine_name = 'crawls.models.CrawlMachine'
|
||||
state_machine_name = 'archivebox.crawls.models.CrawlMachine'
|
||||
retry_at_field_name = 'retry_at'
|
||||
state_field_name = 'status'
|
||||
StatusChoices = ModelWithStateMachine.StatusChoices
|
||||
@@ -190,7 +190,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
'status': Snapshot.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
'timestamp': str(timezone.now().timestamp()),
|
||||
'created_by_id': self.created_by_id,
|
||||
'depth': 0,
|
||||
},
|
||||
)
|
||||
@@ -290,7 +289,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
'timestamp': timestamp or str(timezone.now().timestamp()),
|
||||
'status': Snapshot.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
'created_by_id': self.created_by_id,
|
||||
# Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user