From 122bd0cc2ed9229818ae233c5e887a988efce8c2 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 29 Dec 2025 02:58:49 -0800 Subject: [PATCH] fix archivebox add --- archivebox/actors/models.py | 2 +- .../migrations/0075_archiveresult_retry_at.py | 19 +++++ ...pshot_retry_at_snapshot_status_and_more.py | 71 +++++++++++++++++ archivebox/crawls/migrations/0001_initial.py | 78 +++++++++++++++++++ archivebox/index/html.py | 4 +- archivebox/index/schema.py | 3 +- .../migrations/0001_initial.py | 26 +++++++ archivebox/seeds/migrations/0001_initial.py | 40 ++++++++++ 8 files changed, 238 insertions(+), 5 deletions(-) create mode 100644 archivebox/core/migrations/0075_archiveresult_retry_at.py create mode 100644 archivebox/core/migrations/0076_snapshot_crawl_snapshot_retry_at_snapshot_status_and_more.py create mode 100644 archivebox/crawls/migrations/0001_initial.py create mode 100644 archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/migrations/0001_initial.py create mode 100644 archivebox/seeds/migrations/0001_initial.py diff --git a/archivebox/actors/models.py b/archivebox/actors/models.py index 31777c1c..01decea1 100644 --- a/archivebox/actors/models.py +++ b/archivebox/actors/models.py @@ -164,7 +164,7 @@ class BaseModelWithStateMachine(models.Model, MachineMixin): @classproperty def ACTIVE_STATE(cls) -> str: - return cls._state_to_str(cls.StateMachineClass.active_state) + return cls._state_to_str(cls.active_state) @classproperty def INITIAL_STATE(cls) -> str: diff --git a/archivebox/core/migrations/0075_archiveresult_retry_at.py b/archivebox/core/migrations/0075_archiveresult_retry_at.py new file mode 100644 index 00000000..dab9b60e --- /dev/null +++ b/archivebox/core/migrations/0075_archiveresult_retry_at.py @@ -0,0 +1,19 @@ +# Generated by Django 5.1.2 on 2025-12-29 10:16 + +import django.utils.timezone +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0074_alter_snapshot_downloaded_at'), + ] + + operations = [ + migrations.AddField( + model_name='archiveresult', + name='retry_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + ] diff --git a/archivebox/core/migrations/0076_snapshot_crawl_snapshot_retry_at_snapshot_status_and_more.py b/archivebox/core/migrations/0076_snapshot_crawl_snapshot_retry_at_snapshot_status_and_more.py new file mode 100644 index 00000000..29865cab --- /dev/null +++ b/archivebox/core/migrations/0076_snapshot_crawl_snapshot_retry_at_snapshot_status_and_more.py @@ -0,0 +1,71 @@ +# Generated by Django 5.1.2 on 2025-12-29 10:16 + +import django.db.models.deletion +import django.utils.timezone +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0075_archiveresult_retry_at'), + ('crawls', '0001_initial'), + ] + + operations = [ + migrations.AddField( + model_name='snapshot', + name='crawl', + field=models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'), + ), + migrations.AddField( + model_name='snapshot', + name='retry_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name='snapshot', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15), + ), + migrations.AlterField( + model_name='archiveresult', + name='cmd', + field=models.JSONField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='end_ts', + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='extractor', + field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], db_index=True, max_length=32), + ), + migrations.AlterField( + model_name='archiveresult', + name='output', + field=models.CharField(blank=True, default=None, max_length=1024, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='pwd', + field=models.CharField(blank=True, default=None, max_length=256, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='snapshot', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'), + ), + migrations.AlterField( + model_name='archiveresult', + name='start_ts', + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15), + ), + ] diff --git a/archivebox/crawls/migrations/0001_initial.py b/archivebox/crawls/migrations/0001_initial.py new file mode 100644 index 00000000..528074bb --- /dev/null +++ b/archivebox/crawls/migrations/0001_initial.py @@ -0,0 +1,78 @@ +# Generated by Django 5.1.2 on 2025-12-29 10:16 + +import abid_utils.models +import charidfield.fields +import django.core.validators +import django.db.models.deletion +import django.utils.timezone +import statemachine.mixins +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('core', '0075_archiveresult_retry_at'), + ('seeds', '0001_initial'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='CrawlSchedule', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('num_uses_failed', models.PositiveIntegerField(default=0)), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('schedule', models.CharField(max_length=64)), + ('is_enabled', models.BooleanField(default=True)), + ('created_at', abid_utils.models.AutoDateTimeField(db_index=True, default=None)), + ('modified_at', models.DateTimeField(auto_now=True)), + ('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + 'abstract': False, + }, + ), + migrations.CreateModel( + name='Crawl', + fields=[ + ('num_uses_failed', models.PositiveIntegerField(default=0)), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('id', models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID')), + ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='crl_', unique=True)), + ('created_at', abid_utils.models.AutoDateTimeField(db_index=True, default=None)), + ('modified_at', models.DateTimeField(auto_now=True)), + ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15)), + ('retry_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ('max_depth', models.PositiveSmallIntegerField(default=0, validators=[django.core.validators.MinValueValidator(0), django.core.validators.MaxValueValidator(4)])), + ('tags_str', models.CharField(blank=True, default='', max_length=1024)), + ('persona', models.CharField(blank=True, default='auto', max_length=32)), + ('config', models.JSONField(default=dict)), + ('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='crawl_set', to=settings.AUTH_USER_MODEL)), + ('seed', models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, related_name='crawl_set', to='seeds.seed')), + ('schedule', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='crawls.crawlschedule')), + ], + options={ + 'verbose_name': 'Crawl', + 'verbose_name_plural': 'Crawls', + }, + bases=(models.Model, statemachine.mixins.MachineMixin), + ), + migrations.CreateModel( + name='Outlink', + fields=[ + ('id', models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID')), + ('src', models.URLField()), + ('dst', models.URLField()), + ('crawl', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='outlink_set', to='crawls.crawl')), + ('via', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='outlink_set', to='core.archiveresult')), + ], + options={ + 'unique_together': {('src', 'dst', 'via')}, + }, + ), + ] diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 24cad5c0..be58ffb0 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -80,8 +80,8 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None: @enforce_types def link_details_template(link: Link) -> str: - - from abx_plugin_wget_extractor.wget import wget_output_path + + from abx_plugin_wget.wget import wget_output_path SAVE_ARCHIVE_DOT_ORG = abx.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 78e80ef9..3f9b1d86 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -427,8 +427,7 @@ class Link: """predict the expected output paths that should be present after archiving""" from abx_plugin_wget.wget import wget_output_path - - FAVICON_CONFIG = abx.pm.hook.get_CONFIGS().favicon + from abx_plugin_favicon.config import FAVICON_CONFIG # TODO: banish this awful duplication from the codebase and import these # from their respective extractor files diff --git a/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/migrations/0001_initial.py b/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/migrations/0001_initial.py new file mode 100644 index 00000000..564aa369 --- /dev/null +++ b/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/migrations/0001_initial.py @@ -0,0 +1,26 @@ +# Generated by Django 5.1.2 on 2025-12-29 10:16 + +from django.db import migrations + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('core', '0074_alter_snapshot_downloaded_at'), + ] + + operations = [ + migrations.CreateModel( + name='SinglefileResult', + fields=[ + ], + options={ + 'proxy': True, + 'indexes': [], + 'constraints': [], + }, + bases=('core.archiveresult',), + ), + ] diff --git a/archivebox/seeds/migrations/0001_initial.py b/archivebox/seeds/migrations/0001_initial.py new file mode 100644 index 00000000..82076ea3 --- /dev/null +++ b/archivebox/seeds/migrations/0001_initial.py @@ -0,0 +1,40 @@ +# Generated by Django 5.1.2 on 2025-12-29 10:16 + +import abid_utils.models +import charidfield.fields +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='Seed', + fields=[ + ('num_uses_failed', models.PositiveIntegerField(default=0)), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('id', models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID')), + ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='src_', unique=True)), + ('uri', models.URLField(max_length=2000)), + ('extractor', models.CharField(default='auto', max_length=32)), + ('tags_str', models.CharField(blank=True, default='', max_length=255)), + ('config', models.JSONField(default=dict)), + ('created_at', abid_utils.models.AutoDateTimeField(db_index=True, default=None)), + ('modified_at', models.DateTimeField(auto_now=True)), + ('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + options={ + 'verbose_name': 'Seed', + 'verbose_name_plural': 'Seeds', + 'unique_together': {('created_by', 'uri', 'extractor')}, + }, + ), + ]