fix archivebox add

This commit is contained in:
Nick Sweeting
2025-12-29 02:58:49 -08:00
parent ec100bfe29
commit 122bd0cc2e
8 changed files with 238 additions and 5 deletions

View File

@@ -164,7 +164,7 @@ class BaseModelWithStateMachine(models.Model, MachineMixin):
@classproperty
def ACTIVE_STATE(cls) -> str:
return cls._state_to_str(cls.StateMachineClass.active_state)
return cls._state_to_str(cls.active_state)
@classproperty
def INITIAL_STATE(cls) -> str:

View File

@@ -0,0 +1,19 @@
# Generated by Django 5.1.2 on 2025-12-29 10:16
import django.utils.timezone
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the ``retry_at`` datetime column to the ``archiveresult`` model."""

    # Must run after the previous core migration.
    dependencies = [
        ('core', '0074_alter_snapshot_downloaded_at'),
    ]

    operations = [
        migrations.AddField(
            model_name='archiveresult',
            name='retry_at',
            # Indexed datetime whose default is the callable django.utils.timezone.now.
            field=models.DateTimeField(
                db_index=True,
                default=django.utils.timezone.now,
            ),
        ),
    ]

View File

@@ -0,0 +1,71 @@
# Generated by Django 5.1.2 on 2025-12-29 10:16
import django.db.models.deletion
import django.utils.timezone
from django.db import migrations, models
class Migration(migrations.Migration):
    """Link ``snapshot`` to ``crawls.crawl`` and refresh ``archiveresult`` fields.

    Adds ``crawl``, ``retry_at`` and ``status`` to ``snapshot``, then alters
    the ``cmd``, ``end_ts``, ``extractor``, ``output``, ``pwd``, ``snapshot``,
    ``start_ts`` and ``status`` fields of ``archiveresult``.
    """

    # The crawls app must exist before snapshot.crawl can point at crawls.crawl.
    dependencies = [
        ('core', '0075_archiveresult_retry_at'),
        ('crawls', '0001_initial'),
    ]

    operations = [
        # --- new columns on snapshot -------------------------------------
        migrations.AddField(
            model_name='snapshot',
            name='crawl',
            # Optional link (null/blank allowed); deleting the crawl cascades.
            field=models.ForeignKey(
                blank=True,
                default=None,
                null=True,
                on_delete=django.db.models.deletion.CASCADE,
                related_name='snapshot_set',
                to='crawls.crawl',
            ),
        ),
        migrations.AddField(
            model_name='snapshot',
            name='retry_at',
            field=models.DateTimeField(
                db_index=True,
                default=django.utils.timezone.now,
            ),
        ),
        migrations.AddField(
            model_name='snapshot',
            name='status',
            field=models.CharField(
                choices=[
                    ('queued', 'Queued'),
                    ('started', 'Started'),
                    ('sealed', 'Sealed'),
                ],
                db_index=True,
                default='queued',
                max_length=15,
            ),
        ),

        # --- altered columns on archiveresult ----------------------------
        migrations.AlterField(
            model_name='archiveresult',
            name='cmd',
            field=models.JSONField(
                blank=True,
                default=None,
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='end_ts',
            field=models.DateTimeField(
                blank=True,
                default=None,
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
            field=models.CharField(
                choices=[
                    ('htmltotext', 'htmltotext'),
                    ('git', 'git'),
                    ('singlefile', 'singlefile'),
                    ('media', 'media'),
                    ('archive_org', 'archive_org'),
                    ('readability', 'readability'),
                    ('mercury', 'mercury'),
                    ('favicon', 'favicon'),
                    ('pdf', 'pdf'),
                    ('headers', 'headers'),
                    ('screenshot', 'screenshot'),
                    ('dom', 'dom'),
                    ('title', 'title'),
                    ('wget', 'wget'),
                ],
                db_index=True,
                max_length=32,
            ),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output',
            field=models.CharField(
                blank=True,
                default=None,
                max_length=1024,
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='pwd',
            field=models.CharField(
                blank=True,
                default=None,
                max_length=256,
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='snapshot',
            field=models.ForeignKey(
                on_delete=django.db.models.deletion.CASCADE,
                to='core.snapshot',
            ),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='start_ts',
            field=models.DateTimeField(
                blank=True,
                default=None,
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='status',
            field=models.CharField(
                choices=[
                    ('queued', 'Queued'),
                    ('started', 'Started'),
                    ('backoff', 'Waiting to retry'),
                    ('succeeded', 'Succeeded'),
                    ('failed', 'Failed'),
                    ('skipped', 'Skipped'),
                ],
                db_index=True,
                default='queued',
                max_length=15,
            ),
        ),
    ]

View File

@@ -0,0 +1,78 @@
# Generated by Django 5.1.2 on 2025-12-29 10:16
import abid_utils.models
import charidfield.fields
import django.core.validators
import django.db.models.deletion
import django.utils.timezone
import statemachine.mixins
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration creating ``CrawlSchedule``, ``Crawl`` and ``Outlink``."""

    initial = True

    # Crawl references seeds.seed, Outlink references core.archiveresult, and
    # both CrawlSchedule and Crawl reference the (swappable) user model.
    dependencies = [
        ('core', '0075_archiveresult_retry_at'),
        ('seeds', '0001_initial'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='CrawlSchedule',
            fields=[
                (
                    'id',
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('num_uses_failed', models.PositiveIntegerField(default=0)),
                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
                ('schedule', models.CharField(max_length=64)),
                ('is_enabled', models.BooleanField(default=True)),
                (
                    'created_at',
                    abid_utils.models.AutoDateTimeField(
                        db_index=True,
                        default=None,
                    ),
                ),
                ('modified_at', models.DateTimeField(auto_now=True)),
                (
                    'created_by',
                    models.ForeignKey(
                        default=None,
                        on_delete=django.db.models.deletion.CASCADE,
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
            options={
                'abstract': False,
            },
        ),
        migrations.CreateModel(
            name='Crawl',
            fields=[
                ('num_uses_failed', models.PositiveIntegerField(default=0)),
                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
                (
                    'id',
                    models.UUIDField(
                        default=None,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        unique=True,
                        verbose_name='ID',
                    ),
                ),
                (
                    'abid',
                    # NOTE(review): the help_text example shows an `snp_` id while
                    # this field's prefix is `crl_` -- presumably copied from the
                    # shared model definition; confirm before changing it there.
                    charidfield.fields.CharIDField(
                        blank=True,
                        db_index=True,
                        default=None,
                        help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)',
                        max_length=30,
                        null=True,
                        prefix='crl_',
                        unique=True,
                    ),
                ),
                (
                    'created_at',
                    abid_utils.models.AutoDateTimeField(
                        db_index=True,
                        default=None,
                    ),
                ),
                ('modified_at', models.DateTimeField(auto_now=True)),
                (
                    'status',
                    models.CharField(
                        choices=[
                            ('queued', 'Queued'),
                            ('started', 'Started'),
                            ('sealed', 'Sealed'),
                        ],
                        db_index=True,
                        default='queued',
                        max_length=15,
                    ),
                ),
                (
                    'retry_at',
                    models.DateTimeField(
                        db_index=True,
                        default=django.utils.timezone.now,
                    ),
                ),
                (
                    'max_depth',
                    # Validators restrict the depth to the range 0..4 inclusive.
                    models.PositiveSmallIntegerField(
                        default=0,
                        validators=[
                            django.core.validators.MinValueValidator(0),
                            django.core.validators.MaxValueValidator(4),
                        ],
                    ),
                ),
                (
                    'tags_str',
                    models.CharField(blank=True, default='', max_length=1024),
                ),
                (
                    'persona',
                    models.CharField(blank=True, default='auto', max_length=32),
                ),
                ('config', models.JSONField(default=dict)),
                (
                    'created_by',
                    models.ForeignKey(
                        default=None,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name='crawl_set',
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
                (
                    'seed',
                    # PROTECT: a seed cannot be deleted while crawls reference it.
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.PROTECT,
                        related_name='crawl_set',
                        to='seeds.seed',
                    ),
                ),
                (
                    'schedule',
                    # Optional; set to NULL when the schedule row is deleted.
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to='crawls.crawlschedule',
                    ),
                ),
            ],
            options={
                'verbose_name': 'Crawl',
                'verbose_name_plural': 'Crawls',
            },
            bases=(models.Model, statemachine.mixins.MachineMixin),
        ),
        migrations.CreateModel(
            name='Outlink',
            fields=[
                (
                    'id',
                    models.UUIDField(
                        default=None,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        unique=True,
                        verbose_name='ID',
                    ),
                ),
                ('src', models.URLField()),
                ('dst', models.URLField()),
                (
                    'crawl',
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name='outlink_set',
                        to='crawls.crawl',
                    ),
                ),
                (
                    'via',
                    # Optional; set to NULL when the archiveresult row is deleted.
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name='outlink_set',
                        to='core.archiveresult',
                    ),
                ),
            ],
            options={
                # At most one row per (src, dst, via) combination.
                'unique_together': {('src', 'dst', 'via')},
            },
        ),
    ]

View File

@@ -80,8 +80,8 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
@enforce_types
def link_details_template(link: Link) -> str:
from abx_plugin_wget_extractor.wget import wget_output_path
from abx_plugin_wget.wget import wget_output_path
SAVE_ARCHIVE_DOT_ORG = abx.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG

View File

@@ -427,8 +427,7 @@ class Link:
"""predict the expected output paths that should be present after archiving"""
from abx_plugin_wget.wget import wget_output_path
FAVICON_CONFIG = abx.pm.hook.get_CONFIGS().favicon
from abx_plugin_favicon.config import FAVICON_CONFIG
# TODO: banish this awful duplication from the codebase and import these
# from their respective extractor files

View File

@@ -0,0 +1,26 @@
# Generated by Django 5.1.2 on 2025-12-29 10:16
from django.db import migrations
class Migration(migrations.Migration):
    """Initial migration registering the ``SinglefileResult`` proxy model."""

    initial = True

    # The proxy's base model lives in the core app.
    dependencies = [
        ('core', '0074_alter_snapshot_downloaded_at'),
    ]

    operations = [
        migrations.CreateModel(
            name='SinglefileResult',
            # Proxy model: declares no fields of its own.
            fields=[],
            options={
                'proxy': True,
                'indexes': [],
                'constraints': [],
            },
            bases=('core.archiveresult',),
        ),
    ]

View File

@@ -0,0 +1,40 @@
# Generated by Django 5.1.2 on 2025-12-29 10:16
import abid_utils.models
import charidfield.fields
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration creating the ``Seed`` model."""

    initial = True

    # Seed.created_by points at the (swappable) user model.
    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name='Seed',
            fields=[
                ('num_uses_failed', models.PositiveIntegerField(default=0)),
                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
                (
                    'id',
                    models.UUIDField(
                        default=None,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        unique=True,
                        verbose_name='ID',
                    ),
                ),
                (
                    'abid',
                    # NOTE(review): the help_text example shows an `snp_` id while
                    # this field's prefix is `src_` -- presumably copied from the
                    # shared model definition; confirm before changing it there.
                    charidfield.fields.CharIDField(
                        blank=True,
                        db_index=True,
                        default=None,
                        help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)',
                        max_length=30,
                        null=True,
                        prefix='src_',
                        unique=True,
                    ),
                ),
                ('uri', models.URLField(max_length=2000)),
                ('extractor', models.CharField(default='auto', max_length=32)),
                (
                    'tags_str',
                    models.CharField(blank=True, default='', max_length=255),
                ),
                ('config', models.JSONField(default=dict)),
                (
                    'created_at',
                    abid_utils.models.AutoDateTimeField(
                        db_index=True,
                        default=None,
                    ),
                ),
                ('modified_at', models.DateTimeField(auto_now=True)),
                (
                    'created_by',
                    models.ForeignKey(
                        default=None,
                        on_delete=django.db.models.deletion.CASCADE,
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
            options={
                'verbose_name': 'Seed',
                'verbose_name_plural': 'Seeds',
                # At most one seed per (created_by, uri, extractor) combination.
                'unique_together': {('created_by', 'uri', 'extractor')},
            },
        ),
    ]