wip major changes

2026-04-05 07:17:52 +10:00 · 2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions
--- a/archivebox/crawls/init.py
+++ b/archivebox/crawls/init.py
@@ -1,10 +1,7 @@
 __package__ = 'archivebox.crawls'
 __order__ = 100

-import abx

-
-@abx.hookimpl
 def register_admin(admin_site):
    from .admin import register_admin as register_crawls_admin
    register_crawls_admin(admin_site)
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -5,19 +5,19 @@ from django.contrib import admin

 from archivebox import DATA_DIR

-from archivebox.base_models.admin import BaseModelAdmin
+from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin

 from core.models import Snapshot
 from crawls.models import Seed, Crawl, CrawlSchedule


-class SeedAdmin(BaseModelAdmin):
+class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
    list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
    sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
    search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')

-    readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
-    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
+    readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents', 'available_config_options')
+    fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'available_config_options', 'created_by', *readonly_fields[:-1])

    list_filter = ('extractor', 'created_by')
    ordering = ['-created_at']
@@ -64,13 +64,13 @@ class SeedAdmin(BaseModelAdmin):



-class CrawlAdmin(BaseModelAdmin):
+class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
    sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
    search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')

-    readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_contents')
-    fields = ('label', 'notes', 'urls', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields)
+    readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_contents', 'available_config_options')
+    fields = ('label', 'notes', 'urls', 'config', 'available_config_options', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields[:-1])

    list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
    ordering = ['-created_at', '-retry_at']
--- a/archivebox/crawls/migrations/0001_initial.py
+++ b/archivebox/crawls/migrations/0001_initial.py
@@ -0,0 +1,119 @@
+# Generated by Django 5.2.9 on 2025-12-24 19:54
+
+import archivebox.base_models.models
+import django.core.validators
+import django.db.models.deletion
+import django.utils.timezone
+import pathlib
+import statemachine.mixins
+import uuid
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # initial schema for the crawls app: Crawl, CrawlSchedule, Seed, Outlink
+
+    initial = True
+
+    dependencies = [
+        ('core', '0001_initial'),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Crawl',
+            fields=[
+                ('num_uses_failed', models.PositiveIntegerField(default=0)),
+                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
+                ('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
+                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
+                ('modified_at', models.DateTimeField(auto_now=True)),
+                ('urls', models.TextField(blank=True, default='')),
+                ('config', models.JSONField(default=dict)),
+                ('max_depth', models.PositiveSmallIntegerField(default=0, validators=[django.core.validators.MinValueValidator(0), django.core.validators.MaxValueValidator(4)])),
+                ('tags_str', models.CharField(blank=True, default='', max_length=1024)),
+                ('persona_id', models.UUIDField(blank=True, null=True)),
+                ('label', models.CharField(blank=True, default='', max_length=64)),
+                ('notes', models.TextField(blank=True, default='')),
+                ('output_dir', models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/archive'))),  # NOTE(review): absolute path of the generating machine is baked in here; confirm it should not be derived from settings
+                ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], db_index=True, default='queued', max_length=15)),
+                ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)),
+                ('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+            ],
+            options={
+                'verbose_name': 'Crawl',
+                'verbose_name_plural': 'Crawls',
+            },
+            bases=(models.Model, statemachine.mixins.MachineMixin),
+        ),
+        migrations.CreateModel(
+            name='CrawlSchedule',
+            fields=[
+                ('num_uses_failed', models.PositiveIntegerField(default=0)),
+                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
+                ('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
+                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
+                ('modified_at', models.DateTimeField(auto_now=True)),
+                ('schedule', models.CharField(max_length=64)),
+                ('is_enabled', models.BooleanField(default=True)),
+                ('label', models.CharField(blank=True, default='', max_length=64)),
+                ('notes', models.TextField(blank=True, default='')),
+                ('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+                ('template', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl')),
+            ],
+            options={
+                'verbose_name': 'Scheduled Crawl',
+                'verbose_name_plural': 'Scheduled Crawls',
+            },
+        ),
+        migrations.AddField(
+            model_name='crawl',
+            name='schedule',
+            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='crawls.crawlschedule'),
+        ),
+        migrations.CreateModel(
+            name='Seed',
+            fields=[
+                ('num_uses_failed', models.PositiveIntegerField(default=0)),
+                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
+                ('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
+                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
+                ('modified_at', models.DateTimeField(auto_now=True)),
+                ('uri', models.URLField(max_length=2048)),
+                ('extractor', models.CharField(default='auto', max_length=32)),
+                ('tags_str', models.CharField(blank=True, default='', max_length=255)),
+                ('label', models.CharField(blank=True, default='', max_length=255)),
+                ('config', models.JSONField(default=dict)),
+                ('output_dir', models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/archive'))),  # NOTE(review): same machine-specific absolute path as Crawl.output_dir; confirm
+                ('notes', models.TextField(blank=True, default='')),
+                ('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+            ],
+            options={
+                'verbose_name': 'Seed',
+                'verbose_name_plural': 'Seeds',
+                'unique_together': {('created_by', 'label'), ('created_by', 'uri', 'extractor')},
+            },
+        ),
+        migrations.AddField(
+            model_name='crawl',
+            name='seed',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, related_name='crawl_set', to='crawls.seed'),
+        ),
+        migrations.CreateModel(
+            name='Outlink',
+            fields=[
+                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
+                ('modified_at', models.DateTimeField(auto_now=True)),
+                ('id', models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
+                ('src', models.URLField()),
+                ('dst', models.URLField()),
+                ('crawl', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='outlink_set', to='crawls.crawl')),
+                ('created_by', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+                ('via', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='outlink_set', to='core.archiveresult')),
+            ],
+            options={
+                'unique_together': {('src', 'dst', 'via')},
+            },
+        ),
+    ]
--- a/archivebox/crawls/migrations/0002_delete_outlink.py
+++ b/archivebox/crawls/migrations/0002_delete_outlink.py
@@ -0,0 +1,16 @@
+# Generated by Django 6.0 on 2025-12-25 02:19
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):  # drops the Outlink model created in crawls.0001_initial
+
+    dependencies = [
+        ('crawls', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.DeleteModel(
+            name='Outlink',
+        ),
+    ]
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -20,7 +20,7 @@ if TYPE_CHECKING:
    from core.models import Snapshot, ArchiveResult


-class Seed(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
+class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats):
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
@@ -101,7 +101,7 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
            self.template.save()


-class Crawl(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
+class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
@@ -157,17 +157,131 @@ class Crawl(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWith
            pass
        root_snapshot, _ = Snapshot.objects.update_or_create(
            crawl=self, url=self.seed.uri,
-            defaults={'status': Snapshot.INITIAL_STATE, 'retry_at': timezone.now(), 'timestamp': str(timezone.now().timestamp())},
+            defaults={
+                'status': Snapshot.INITIAL_STATE,
+                'retry_at': timezone.now(),
+                'timestamp': str(timezone.now().timestamp()),
+                'created_by_id': self.created_by_id,
+                'depth': 0,
+            },
        )
        return root_snapshot

+    def add_url(self, entry: dict) -> bool:
+        """
+        Add a URL to the crawl queue if not already present.

-class Outlink(ModelWithSerializers):
-    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
-    src = models.URLField()
-    dst = models.URLField()
-    crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, blank=False, related_name='outlink_set')
-    via = models.ForeignKey('core.ArchiveResult', on_delete=models.SET_NULL, null=True, blank=True, related_name='outlink_set')
+        Args:
+            entry: dict with 'url', optional 'depth', 'title', 'timestamp', 'tags', 'via_snapshot', 'via_extractor'

-    class Meta:
-        unique_together = (('src', 'dst', 'via'),)
+        Returns:
+            True if URL was added, False if skipped (missing url, duplicate, or depth exceeded)
+        """
+        import json
+
+        url = entry.get('url', '')
+        if not url:
+            return False
+
+        depth = entry.get('depth', 1)
+
+        # Skip if depth exceeds max_depth
+        if depth > self.max_depth:
+            return False
+
+        # Skip if already a Snapshot for this crawl
+        if self.snapshot_set.filter(url=url).exists():
+            return False
+
+        # Check if already in urls: each line is a JSONL object, anything else counts as a plain URL
+        existing_urls = set()
+        for line in self.urls.splitlines():
+            if not line.strip():
+                continue
+            try:
+                existing_entry = json.loads(line)
+            except json.JSONDecodeError:
+                existing_entry = None
+            existing_urls.add(existing_entry.get('url', '') if isinstance(existing_entry, dict) else line.strip())
+
+        if url in existing_urls:
+            return False
+
+        # Append as JSONL
+        jsonl_entry = json.dumps(entry)
+        self.urls = (self.urls.rstrip() + '\n' + jsonl_entry).lstrip('\n')
+        self.save(update_fields=['urls', 'modified_at'])
+        return True
+
+    def create_snapshots_from_urls(self) -> list['Snapshot']:
+        """
+        Create a Snapshot for each URL in self.urls that does not already have one in this crawl.
+
+        Returns:
+            List of newly created Snapshot objects
+        """
+        import json
+        from core.models import Snapshot
+
+        created_snapshots = []
+
+        for line in self.urls.splitlines():
+            if not line.strip():
+                continue
+
+            # Parse JSONL or plain URL. Anything that is not a JSON object
+            # (plain text, or JSON such as a bare string/number/list) is
+            # treated as a plain URL line with default metadata.
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                entry = None
+            if not isinstance(entry, dict):
+                entry = {'url': line.strip()}
+            url = entry.get('url', '')
+            depth = entry.get('depth', 1)
+            title = entry.get('title')
+            timestamp = entry.get('timestamp')
+            tags = entry.get('tags', '')
+
+            if not url:
+                continue
+
+            # Skip if depth exceeds max_depth
+            if depth > self.max_depth:
+                continue
+
+            # Create snapshot if none exists yet for this (url, crawl) pair
+            snapshot, created = Snapshot.objects.get_or_create(
+                url=url,
+                crawl=self,
+                defaults={
+                    'depth': depth,
+                    'title': title,
+                    'timestamp': timestamp or str(timezone.now().timestamp()),
+                    'status': Snapshot.INITIAL_STATE,
+                    'retry_at': timezone.now(),
+                    'created_by_id': self.created_by_id,
+                }
+            )
+
+            if created:
+                created_snapshots.append(snapshot)
+                # Save tags if present (comma-separated string, or already a list)
+                if tags:
+                    snapshot.save_tags(tags.split(',') if isinstance(tags, str) else tags)
+
+        return created_snapshots
+
+    def run(self) -> 'Snapshot':
+        """
+        Execute this Crawl: create the root snapshot, then create snapshots for the URLs in self.urls.
+
+        Called by the state machine when entering the 'started' state.
+
+        Returns:
+            The root Snapshot for this crawl
+        """
+        root_snapshot = self.create_root_snapshot()
+        self.create_snapshots_from_urls()
+        return root_snapshot
--- a/archivebox/crawls/statemachines.py
+++ b/archivebox/crawls/statemachines.py
@@ -73,15 +73,17 @@ class CrawlMachine(StateMachine, strict_states=True):

    @started.enter
    def enter_started(self):
-        print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
-        # lock the crawl object for 2s while we create the root snapshot
+        print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.run()')
+        # lock the crawl object while we create snapshots
        self.crawl.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=5),
            status=Crawl.StatusChoices.QUEUED,
        )
-        assert self.crawl.create_root_snapshot()
-        
-        # only update status to STARTED once root snapshot is created
+
+        # Run the crawl - creates root snapshot and processes queued URLs
+        self.crawl.run()
+
+        # only update status to STARTED once snapshots are created
        self.crawl.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=5),
            status=Crawl.StatusChoices.STARTED,
@@ -94,19 +96,3 @@ class CrawlMachine(StateMachine, strict_states=True):
            retry_at=None,
            status=Crawl.StatusChoices.SEALED,
        )
-
-
-# class CrawlWorker(ActorType[Crawl]):
-#     """The Actor that manages the lifecycle of all Crawl objects"""
-    
-#     Model = Crawl
-#     StateMachineClass = CrawlMachine
-    
-#     ACTIVE_STATE: ClassVar[State] = CrawlMachine.started
-#     FINAL_STATES: ClassVar[list[State]] = CrawlMachine.final_states
-#     STATE_FIELD_NAME: ClassVar[str] = Crawl.state_field_name
-    
-#     MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
-#     MAX_TICK_TIME: ClassVar[int] = 10
-#     CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
-
--- a/archivebox/crawls/tests.py
+++ b/archivebox/crawls/tests.py
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
--- a/archivebox/crawls/views.py
+++ b/archivebox/crawls/views.py
@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.