This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -4,7 +4,7 @@ __order__ = 100
def register_admin(admin_site):
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
from core.admin import register_admin as do_register
from archivebox.core.admin import register_admin as do_register
do_register(admin_site)

View File

@@ -3,11 +3,11 @@ __package__ = 'archivebox.core'
from django.contrib.auth import get_user_model
from core.models import Snapshot, ArchiveResult, Tag
from core.admin_tags import TagAdmin
from core.admin_snapshots import SnapshotAdmin
from core.admin_archiveresults import ArchiveResultAdmin
from core.admin_users import UserAdmin
from archivebox.core.models import Snapshot, ArchiveResult, Tag
from archivebox.core.admin_tags import TagAdmin
from archivebox.core.admin_snapshots import SnapshotAdmin
from archivebox.core.admin_archiveresults import ArchiveResultAdmin
from archivebox.core.admin_users import UserAdmin
def register_admin(admin_site):

View File

@@ -16,7 +16,7 @@ from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
from core.models import ArchiveResult, Snapshot
from archivebox.core.models import ArchiveResult, Snapshot
def render_archiveresults_list(archiveresults_qs, limit=50):
@@ -187,7 +187,7 @@ class ArchiveResultInline(admin.TabularInline):
extra = 0
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
# exclude = ('id',)
ordering = ('end_ts',)
show_change_link = True
@@ -229,17 +229,15 @@ class ArchiveResultInline(admin.TabularInline):
formset.form.base_fields['end_ts'].initial = timezone.now()
formset.form.base_fields['cmd_version'].initial = '-'
formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
formset.form.base_fields['created_by'].initial = request.user
formset.form.base_fields['cmd'].initial = '["-"]'
formset.form.base_fields['output_str'].initial = 'Manually recorded cmd output...'
if obj is not None:
# hidden values for existing entries and new entries
formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
return formset
@@ -252,8 +250,8 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
@@ -279,10 +277,6 @@ class ArchiveResultAdmin(BaseModelAdmin):
'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)
list_filter = ('status', 'plugin', 'start_ts', 'cmd_version')

View File

@@ -38,11 +38,11 @@ def register_admin_site():
# Register admin views for each app
# (Previously handled by ABX plugin system, now called directly)
from core.admin import register_admin as register_core_admin
from crawls.admin import register_admin as register_crawls_admin
from api.admin import register_admin as register_api_admin
from machine.admin import register_admin as register_machine_admin
from workers.admin import register_admin as register_workers_admin
from archivebox.core.admin import register_admin as register_core_admin
from archivebox.crawls.admin import register_admin as register_crawls_admin
from archivebox.api.admin import register_admin as register_api_admin
from archivebox.machine.admin import register_admin as register_machine_admin
from archivebox.workers.admin import register_admin as register_workers_admin
register_core_admin(archivebox_admin)
register_crawls_admin(archivebox_admin)

View File

@@ -23,9 +23,9 @@ from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag, Snapshot
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
from archivebox.core.models import Tag, Snapshot
from archivebox.core.admin_tags import TagInline
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -59,7 +59,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
fieldsets = (
('URL', {
@@ -75,7 +75,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'classes': ('card',),
}),
('Relations', {
'fields': ('crawl', 'created_by', 'tags_str'),
'fields': ('crawl', 'tags_str'),
'classes': ('card',),
}),
('Config', {

View File

@@ -6,7 +6,7 @@ from django.utils.html import format_html, mark_safe
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from core.models import Tag
from archivebox.core.models import Tag
class TagInline(admin.TabularInline):

View File

@@ -4,9 +4,9 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
name = 'core'
name = 'archivebox.core'
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
from archivebox.core.admin_site import register_admin_site
register_admin_site()

View File

@@ -20,7 +20,7 @@ application = get_asgi_application()
# from channels.routing import ProtocolTypeRouter, URLRouter
# from channels.auth import AuthMiddlewareStack
# from channels.security.websocket import AllowedHostsOriginValidator
# from core.routing import websocket_urlpatterns
# from archivebox.core.routing import websocket_urlpatterns
#
# application = ProtocolTypeRouter({
# "http": get_asgi_application(),

View File

@@ -4,10 +4,14 @@ from django import forms
from archivebox.misc.util import URL_REGEX
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
('1', 'depth = 1 (+ URLs one hop away)'),
('2', 'depth = 2 (+ URLs two hops away)'),
('3', 'depth = 3 (+ URLs three hops away)'),
('4', 'depth = 4 (+ URLs four hops away)'),
)
from archivebox.hooks import get_plugins
@@ -18,39 +22,180 @@ def get_plugin_choices():
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
plugins = forms.MultipleChoiceField(
label="Plugins (select at least 1, otherwise all will be used by default)",
# Basic fields
url = forms.RegexField(
label="URLs (one per line)",
regex=URL_REGEX,
min_length='6',
strip=True,
widget=forms.Textarea,
required=True
)
tag = forms.CharField(
label="Tags (comma separated tag1,tag2,tag3)",
strip=True,
required=False,
widget=forms.TextInput(attrs={
'list': 'tag-datalist',
'autocomplete': 'off',
})
)
depth = forms.ChoiceField(
label="Archive depth",
choices=DEPTH_CHOICES,
initial='0',
widget=forms.RadioSelect(attrs={"class": "depth-selection"})
)
notes = forms.CharField(
label="Notes",
strip=True,
required=False,
widget=forms.Textarea(attrs={
'rows': 3,
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
})
)
# Plugin groups
chrome_plugins = forms.MultipleChoiceField(
label="Chrome-dependent plugins",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[], # populated in __init__
)
archiving_plugins = forms.MultipleChoiceField(
label="Archiving",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
parsing_plugins = forms.MultipleChoiceField(
label="Parsing",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
search_plugins = forms.MultipleChoiceField(
label="Search",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
binary_plugins = forms.MultipleChoiceField(
label="Binary providers",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
extension_plugins = forms.MultipleChoiceField(
label="Browser extensions",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
# Advanced options
schedule = forms.CharField(
label="Repeat schedule",
max_length=64,
required=False,
widget=forms.TextInput(attrs={
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
})
)
persona = forms.CharField(
label="Persona (authentication profile)",
max_length=100,
initial='Default',
required=False,
)
overwrite = forms.BooleanField(
label="Overwrite existing snapshots",
initial=False,
required=False,
)
update = forms.BooleanField(
label="Update/retry previously failed URLs",
initial=False,
required=False,
)
index_only = forms.BooleanField(
label="Index only (don't archive yet)",
initial=False,
required=False,
)
config = forms.JSONField(
label="Custom config overrides",
widget=KeyValueWidget(),
initial=dict,
required=False,
widget=forms.SelectMultiple,
choices=[], # populated dynamically in __init__
)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.fields['plugins'].choices = get_plugin_choices()
# TODO: hook these up to the view and put them
# in a collapsible UI section labeled "Advanced"
#
# exclude_patterns = forms.CharField(
# label="Exclude patterns",
# min_length='1',
# required=False,
# initial=URL_DENYLIST,
# )
# timeout = forms.IntegerField(
# initial=TIMEOUT,
# )
# overwrite = forms.BooleanField(
# label="Overwrite any existing Snapshots",
# initial=False,
# )
# index_only = forms.BooleanField(
# label="Add URLs to index without Snapshotting",
# initial=False,
# )
# Import at runtime to avoid circular imports
from archivebox.config.common import ARCHIVING_CONFIG
# Get all plugins
all_plugins = get_plugins()
# Define plugin groups
chrome_dependent = {
'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
}
archiving = {
'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
}
parsing = {
'parse_html_urls', 'parse_jsonl_urls',
'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
}
search = {
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
}
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
# Populate plugin field choices
self.fields['chrome_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
]
self.fields['archiving_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in archiving
]
self.fields['parsing_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in parsing
]
self.fields['search_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in search
]
self.fields['binary_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in binary
]
self.fields['extension_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in extensions
]
# Set update default from config
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
def clean(self):
cleaned_data = super().clean()
# Combine all plugin groups into single list
all_selected_plugins = []
for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
'search_plugins', 'binary_plugins', 'extension_plugins']:
all_selected_plugins.extend(cleaned_data.get(field, []))
# Store combined list for easy access
cleaned_data['plugins'] = all_selected_plugins
return cleaned_data
class TagWidgetMixin:
def format_value(self, value):

View File

@@ -12,7 +12,7 @@ try:
ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
except ImportError:
try:
from config import CONFIG
from archivebox.config import CONFIG
ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
except ImportError:
ARCHIVE_DIR = Path('./archive')

View File

@@ -11,7 +11,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0031_snapshot_parent_snapshot'),
('crawls', '0004_alter_crawl_output_dir'),
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
('machine', '0004_drop_dependency_table'), # Changed from 0003 - wait until Dependency is dropped
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]

View File

@@ -0,0 +1,79 @@
# Generated migration
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
def create_catchall_crawls_and_assign_snapshots(apps, schema_editor):
    """
    Backfill migration: ensure every Snapshot has a Crawl.

    For each user that owns "orphaned" snapshots (crawl IS NULL), create one
    catchall Crawl and point all of that user's orphaned snapshots at it.
    Runs before the NOT NULL constraint is applied to Snapshot.crawl.

    Args:
        apps: Django migration app registry (provides historical models).
        schema_editor: Django schema editor (unused; required by RunPython).
    """
    from collections import defaultdict

    Snapshot = apps.get_model('core', 'Snapshot')
    Crawl = apps.get_model('crawls', 'Crawl')
    User = apps.get_model(settings.AUTH_USER_MODEL)

    snapshots_without_crawl = Snapshot.objects.filter(crawl__isnull=True)
    if not snapshots_without_crawl.exists():
        return

    # Group orphaned snapshot pks by owner so we create exactly one catchall
    # crawl per user. values_list avoids loading full model instances.
    snapshot_pks_by_user = defaultdict(list)
    for pk, user_id in snapshots_without_crawl.values_list('pk', 'created_by_id'):
        snapshot_pks_by_user[user_id].append(pk)

    for user_id, snapshot_pks in snapshot_pks_by_user.items():
        # Username is informational only; tolerate dangling created_by ids.
        try:
            username = User.objects.get(pk=user_id).username
        except User.DoesNotExist:
            username = 'unknown'

        # Create catchall crawl for this user
        crawl = Crawl.objects.create(
            urls=f'# Catchall crawl for {len(snapshot_pks)} snapshots without a crawl',
            max_depth=0,
            label=f'[migration] catchall for user {username}',
            created_by_id=user_id,
        )

        # Single UPDATE per user instead of one save() per snapshot.
        # Equivalent to the per-row save(update_fields=['crawl']): only the
        # crawl column is written either way.
        Snapshot.objects.filter(pk__in=snapshot_pks).update(crawl=crawl)
class Migration(migrations.Migration):
    """Make Snapshot.crawl mandatory and drop Snapshot.created_by.

    Operation ordering matters: orphaned snapshots are first assigned to
    per-user catchall crawls so the NOT NULL constraint can be applied
    safely, and created_by is only removed afterwards because ownership
    then becomes reachable via snapshot.crawl.created_by.
    """

    dependencies = [
        ('core', '0034_snapshot_current_step'),
        ('crawls', '0004_alter_crawl_output_dir'),
    ]

    operations = [
        # Step 1: Assign all snapshots without a crawl to catchall crawls
        # (data migration; noop reverse leaves the catchall crawls in place)
        migrations.RunPython(
            create_catchall_crawls_and_assign_snapshots,
            reverse_code=migrations.RunPython.noop,
        ),
        # Step 2: Make crawl non-nullable
        migrations.AlterField(
            model_name='snapshot',
            name='crawl',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
        ),
        # Step 3: Remove created_by field
        migrations.RemoveField(
            model_name='snapshot',
            name='created_by',
        ),
    ]

View File

@@ -0,0 +1,19 @@
# Generated migration
from django.db import migrations
class Migration(migrations.Migration):
    """Drop ArchiveResult.created_by.

    Depends on 0035 (Snapshot.crawl made non-nullable) so that ownership
    is always reachable via snapshot.crawl.created_by instead.
    """

    dependencies = [
        ('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
    ]

    operations = [
        # Remove created_by field from ArchiveResult
        # No data migration needed - created_by can be accessed via snapshot.crawl.created_by
        migrations.RemoveField(
            model_name='archiveresult',
            name='created_by',
        ),
    ]

View File

@@ -9,6 +9,8 @@ import os
import json
from pathlib import Path
from statemachine import State, registry
from django.db import models
from django.db.models import QuerySet, Value, Case, When, IntegerField
from django.utils.functional import cached_property
@@ -33,10 +35,10 @@ from archivebox.base_models.models import (
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
get_or_create_system_user_pk,
)
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from crawls.models import Crawl
from machine.models import NetworkInterface, Binary
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
from archivebox.workers.tasks import bg_archive_snapshot
from archivebox.crawls.models import Crawl
from archivebox.machine.models import NetworkInterface, Binary
@@ -53,6 +55,7 @@ class Tag(ModelWithSerializers):
snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = "Tag"
verbose_name_plural = "Tags"
@@ -122,6 +125,7 @@ class SnapshotTag(models.Model):
tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
class Meta:
app_label = 'core'
db_table = 'core_snapshot_tags'
unique_together = [('snapshot', 'tag')]
@@ -263,52 +267,6 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
# Import Methods
# =========================================================================
def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
"""Create or update a Snapshot from a SnapshotDict (parser output)"""
import re
from archivebox.config.common import GENERAL_CONFIG
url = link_dict['url']
timestamp = link_dict.get('timestamp')
title = link_dict.get('title')
tags_str = link_dict.get('tags')
tag_list = []
if tags_str:
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
if tag.strip()
))
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = self.filter(url=url).order_by('-created_at').first()
if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
snapshot = self.create(
url=url,
timestamp=timestamp,
title=title,
created_by_id=created_by_id or get_or_create_system_user_pk(),
)
if tag_list:
existing_tags = set(snapshot.tags.values_list('name', flat=True))
new_tags = set(tag_list) | existing_tags
snapshot.save_tags(new_tags)
return snapshot
def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
"""Create or update multiple Snapshots from a list of SnapshotDicts"""
return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
def remove(self, atomic: bool = False) -> tuple:
"""Remove snapshots from the database"""
from django.db import transaction
@@ -320,14 +278,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True) # type: ignore[assignment]
parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
@@ -344,7 +301,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
state_machine_name = 'core.statemachines.SnapshotMachine'
state_machine_name = 'core.models.SnapshotMachine'
state_field_name = 'status'
retry_at_field_name = 'retry_at'
StatusChoices = ModelWithStateMachine.StatusChoices
@@ -354,6 +311,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
archiveresult_set: models.Manager['ArchiveResult']
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
constraints = [
@@ -366,6 +324,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def __str__(self):
return f'[{self.id}] {self.url[:64]}'
@property
def created_by(self):
"""Convenience property to access the user who created this snapshot via its crawl."""
return self.crawl.created_by
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
@@ -395,7 +358,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.fs_version = target
super().save(*args, **kwargs)
if self.crawl and self.url not in self.crawl.urls:
if self.url not in self.crawl.urls:
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
@@ -408,7 +371,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
'crawl_id': str(self.crawl_id),
'depth': self.depth,
'status': self.status,
},
@@ -437,20 +400,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return self.fs_version != self._fs_current_version()
def _fs_next_version(self, version: str) -> str:
"""Get next version in migration chain"""
chain = ['0.7.0', '0.8.0', '0.9.0']
try:
idx = chain.index(version)
return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
except ValueError:
# Unknown version - skip to current
return self._fs_current_version()
def _fs_migrate_from_0_7_0_to_0_8_0(self):
"""Migration from 0.7.0 to 0.8.0 layout (no-op)"""
# 0.7 and 0.8 both used archive/<timestamp>
# Nothing to do!
pass
"""Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
# Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp})
if version in ('0.7.0', '0.8.0'):
return '0.9.0'
return self._fs_current_version()
def _fs_migrate_from_0_8_0_to_0_9_0(self):
"""
@@ -578,7 +532,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return CONSTANTS.ARCHIVE_DIR / self.timestamp
elif version in ('0.9.0', '1.0.0'):
username = self.created_by.username if self.created_by else 'unknown'
username = self.created_by.username
# Use created_at for date grouping (fallback to timestamp)
if self.created_at:
@@ -875,7 +829,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
pwd=result_data.get('pwd', str(self.output_dir)),
start_ts=start_ts,
end_ts=end_ts,
created_by=self.created_by,
)
except:
pass
@@ -1069,6 +1022,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
result = archive_results.get(plugin)
existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
icon = get_plugin_icon(plugin)
# Skip plugins with empty icons that have no output
# (e.g., staticfile only shows when there's actual output)
if not icon.strip() and not existing:
continue
output += format_html(
output_template,
path,
@@ -1139,9 +1098,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def run(self) -> list['ArchiveResult']:
"""
Execute this Snapshot by creating ArchiveResults for all enabled extractors.
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
Called by the state machine when entering the 'started' state.
Called by: SnapshotMachine.enter_started()
Hook Lifecycle:
1. discover_hooks('Snapshot') → finds all plugin hooks
2. For each hook:
- Create ArchiveResult with status=QUEUED
- Store hook_name (e.g., 'on_Snapshot__50_wget.py')
3. ArchiveResults execute independently via ArchiveResultMachine
4. Hook execution happens in ArchiveResult.run(), NOT here
Returns:
list[ArchiveResult]: Newly created pending results
"""
return self.create_pending_archiveresults()
@@ -1152,28 +1122,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Called by the state machine when entering the 'sealed' state.
Kills any background hooks and finalizes their ArchiveResults.
"""
from pathlib import Path
from archivebox.hooks import kill_process
# Kill any background ArchiveResult hooks
if not self.OUTPUT_DIR.exists():
return
for plugin_dir in self.OUTPUT_DIR.iterdir():
if not plugin_dir.is_dir():
continue
pid_file = plugin_dir / 'hook.pid'
if pid_file.exists():
kill_process(pid_file, validate=True) # Use validation
# Find all .pid files in this snapshot's output directory
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
kill_process(pid_file, validate=True)
# Update the ArchiveResult from filesystem
plugin_name = plugin_dir.name
results = self.archiveresult_set.filter(
status=ArchiveResult.StatusChoices.STARTED,
pwd__contains=plugin_name
)
for ar in results:
ar.update_from_output()
# Update all STARTED ArchiveResults from filesystem
results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
for ar in results:
ar.update_from_output()
def has_running_background_hooks(self) -> bool:
"""
@@ -1196,51 +1158,156 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return False
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
"""
Create/update Snapshot from JSONL record.
Create/update Snapshot from JSONL record or dict.
Unified method that handles:
- ID-based patching: {"id": "...", "title": "new title"}
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
- Auto-creates Crawl if not provided
- Optionally queues for extraction
Args:
record: JSONL record with 'url' field and optional metadata
record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
Returns:
Snapshot instance or None
Note:
Filtering (depth, URL allowlist/denylist) should be done by caller
BEFORE calling this method. This method just creates the snapshot.
"""
from archivebox.misc.jsonl import get_or_create_snapshot
import re
from django.utils import timezone
from archivebox.misc.util import parse_date
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.config.common import GENERAL_CONFIG
overrides = overrides or {}
# If 'id' is provided, lookup and patch that specific snapshot
snapshot_id = record.get('id')
if snapshot_id:
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
# Generically update all fields present in record
update_fields = []
for field_name, value in record.items():
# Skip internal fields
if field_name in ('id', 'type'):
continue
# Skip if field doesn't exist on model
if not hasattr(snapshot, field_name):
continue
# Special parsing for date fields
if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
if value and isinstance(value, str):
value = parse_date(value)
# Update field if value is provided and different
if value is not None and getattr(snapshot, field_name) != value:
setattr(snapshot, field_name, value)
update_fields.append(field_name)
if update_fields:
snapshot.save(update_fields=update_fields + ['modified_at'])
return snapshot
except Snapshot.DoesNotExist:
# ID not found, fall through to create-by-URL logic
pass
url = record.get('url')
if not url:
return None
# Apply crawl context metadata
# Determine or create crawl (every snapshot must have a crawl)
crawl = overrides.get('crawl')
snapshot = overrides.get('snapshot') # Parent snapshot
parent_snapshot = overrides.get('snapshot') # Parent snapshot
created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
if crawl:
record.setdefault('crawl_id', str(crawl.id))
record.setdefault('depth', (snapshot.depth + 1 if snapshot else 1))
if snapshot:
record.setdefault('parent_snapshot_id', str(snapshot.id))
# If no crawl provided, inherit from parent or auto-create one
if not crawl:
if parent_snapshot:
# Inherit crawl from parent snapshot
crawl = parent_snapshot.crawl
else:
# Auto-create a single-URL crawl
from archivebox.crawls.models import Crawl
from archivebox.config import CONSTANTS
try:
created_by_id = overrides.get('created_by_id') or (snapshot.created_by_id if snapshot else None)
new_snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(url)
# Queue for extraction
new_snapshot.status = Snapshot.StatusChoices.QUEUED
new_snapshot.retry_at = timezone.now()
new_snapshot.save()
crawl = Crawl.objects.create(
urls=url,
max_depth=0,
label=f'auto-created for {url[:50]}',
created_by_id=created_by_id,
)
return new_snapshot
except ValueError:
return None
# Parse tags
tags_str = record.get('tags', '')
tag_list = []
if tags_str:
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
if tag.strip()
))
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
title = record.get('title')
timestamp = record.get('timestamp')
if snapshot:
# Update existing snapshot
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
else:
# Create new snapshot
if timestamp:
while Snapshot.objects.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
snapshot = Snapshot.objects.create(
url=url,
timestamp=timestamp,
title=title,
crawl=crawl,
)
# Update tags
if tag_list:
existing_tags = set(snapshot.tags.values_list('name', flat=True))
new_tags = set(tag_list) | existing_tags
snapshot.save_tags(new_tags)
# Queue for extraction and update additional fields
update_fields = []
if queue_for_extraction:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
update_fields.extend(['status', 'retry_at'])
# Update additional fields if provided
for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
value = record.get(field_name)
if value is not None and getattr(snapshot, field_name) != value:
setattr(snapshot, field_name, value)
update_fields.append(field_name)
if update_fields:
snapshot.save(update_fields=update_fields + ['modified_at'])
return snapshot
def create_pending_archiveresults(self) -> list['ArchiveResult']:
"""
@@ -1273,7 +1340,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'plugin': plugin,
'status': ArchiveResult.INITIAL_STATE,
'retry_at': timezone.now(),
'created_by_id': self.created_by_id,
},
)
if archiveresult.status == ArchiveResult.INITIAL_STATE:
@@ -1329,6 +1395,36 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.save(update_fields=['current_step', 'modified_at'])
return True
def is_finished_processing(self) -> bool:
    """
    Check if this snapshot has finished processing.

    Used by SnapshotMachine.is_finished() to determine if snapshot is complete.

    NOTE: not a pure predicate — advance_step_if_ready() is called in a loop
    below and (per its implementation) increments self.current_step as a side
    effect, so calling this method can move the snapshot forward a step.

    Returns:
        True if all archiveresults are finished (or no work to do), False otherwise.
    """
    # if no archiveresults exist yet, it's not finished
    # (snapshot.run() has not yet created the pending ArchiveResults)
    if not self.archiveresult_set.exists():
        return False

    # Try to advance step if ready (handles step-based hook execution)
    # This will increment current_step when all foreground hooks in current step are done
    while self.advance_step_if_ready():
        pass  # Keep advancing until we can't anymore

    # if archiveresults exist but are still pending, it's not finished
    if self.pending_archiveresults().exists():
        return False

    # Don't wait for background hooks - they'll be cleaned up on entering sealed state
    # Background hooks in STARTED state are excluded by pending_archiveresults()
    # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
    # we can transition to sealed and cleanup() will kill the background hooks

    # otherwise archiveresults exist and are all finished, so it's finished
    return True
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
@@ -1730,6 +1826,97 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
# =============================================================================
# Snapshot State Machine
# =============================================================================
class SnapshotMachine(BaseStateMachine, strict_states=True):
    """
    State machine for managing Snapshot lifecycle.

    Hook Lifecycle:

    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │  • Waiting for snapshot to be ready                         │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │  1. snapshot.run()                                          │
    │     • discover_hooks('Snapshot') → finds all plugin hooks   │
    │     • create_pending_archiveresults() → creates ONE         │
    │       ArchiveResult per hook (NO execution yet)             │
    │  2. ArchiveResults process independently with their own     │
    │     state machines (see ArchiveResultMachine)               │
    │  3. Advance through steps 0-9 as foreground hooks complete  │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() when is_finished()
    ┌─────────────────────────────────────────────────────────────┐
    │ SEALED State → enter_sealed()                               │
    │  • cleanup() → kills any background hooks still running     │
    │  • Set retry_at=None (no more processing)                   │
    └─────────────────────────────────────────────────────────────┘

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    # Attribute name on this machine holding the bound model instance
    # (i.e. self.snapshot), consumed by BaseStateMachine.
    model_attr_name = 'snapshot'

    # States — values mirror Snapshot.StatusChoices so the DB status column
    # and the machine state stay in lockstep.
    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
    started = State(value=Snapshot.StatusChoices.STARTED)
    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)

    # Tick Event — each tick() either holds the current state (to.itself) or
    # advances it, guarded by the cond/unless predicate methods below.
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(sealed, cond='is_finished')
    )

    def can_start(self) -> bool:
        # A snapshot is only eligible to start once it has a URL to archive.
        can_start = bool(self.snapshot.url)
        return can_start

    def is_finished(self) -> bool:
        """Check if snapshot processing is complete - delegates to model method."""
        return self.snapshot.is_finished_processing()

    @queued.enter
    def enter_queued(self):
        # Persist QUEUED status and make the snapshot immediately eligible
        # for the next worker tick (retry_at=now).
        self.snapshot.update_and_requeue(
            retry_at=timezone.now(),
            status=Snapshot.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        # lock the snapshot while we create the pending archiveresults
        self.snapshot.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
        )

        # Run the snapshot - creates pending archiveresults for all enabled plugins
        self.snapshot.run()

        # unlock the snapshot after we're done + set status = started
        self.snapshot.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
            status=Snapshot.StatusChoices.STARTED,
        )

    @sealed.enter
    def enter_sealed(self):
        # Clean up background hooks
        self.snapshot.cleanup()
        # retry_at=None permanently removes this snapshot from the work queue.
        self.snapshot.update_and_requeue(
            retry_at=None,
            status=Snapshot.StatusChoices.SEALED,
        )
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
@@ -1761,7 +1948,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Note: unique constraint is added by migration 0027 - don't set unique=True here
# or SQLite table recreation in earlier migrations will fail
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -1782,7 +1968,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Binary FK (optional - set when hook reports cmd)
binary = models.ForeignKey(
'machine.Binary',
Binary,
on_delete=models.SET_NULL,
null=True, blank=True,
related_name='archiveresults',
@@ -1798,7 +1984,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
state_machine_name = 'core.statemachines.ArchiveResultMachine'
state_machine_name = 'core.models.ArchiveResultMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
active_state = StatusChoices.STARTED
@@ -1806,12 +1992,18 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
objects = ArchiveResultManager()
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results Log'
def __str__(self):
    # Human-readable form: "[<id>] <url truncated to 64 chars> -> <plugin>"
    truncated_url = self.snapshot.url[:64]
    return '[{}] {} -> {}'.format(self.id, truncated_url, self.plugin)
@property
def created_by(self):
    """Convenience property to access the user who created this archive result via its snapshot's crawl.

    NOTE(review): assumes self.snapshot.crawl is always set; if crawl can be
    None this raises AttributeError — confirm crawl is a required FK on Snapshot.
    """
    return self.snapshot.crawl.created_by
def save(self, *args, **kwargs):
is_new = self._state.adding
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
@@ -1900,6 +2092,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def save_search_index(self):
    # Intentional no-op override — presumably disables the inherited
    # per-object search-index write for ArchiveResults (indexing appears to
    # be handled elsewhere via hooks). TODO(review): confirm against the
    # ModelWith* base classes.
    pass
def cascade_health_update(self, success: bool):
    """Update health stats for self, parent Snapshot, and grandparent Crawl."""
    # Walk up the ownership chain, bumping each object's success/failure counters.
    for record in (self, self.snapshot, self.snapshot.crawl):
        record.increment_health_stats(success)
def run(self):
"""
Execute this ArchiveResult's hook and update status.
@@ -1911,8 +2109,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
from archivebox.config.configset import get_config
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Get merged config with proper context
config = get_config(
crawl=self.snapshot.crawl,
snapshot=self.snapshot,
)
# Determine which hook(s) to run
hooks = []
@@ -1962,10 +2165,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
result = run_hook(
hook,
output_dir=plugin_dir,
config_objects=config_objects,
config=config,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
crawl_id=str(self.snapshot.crawl.id),
depth=self.snapshot.depth,
)
@@ -2112,9 +2315,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Filter Snapshot records for depth/URL constraints
if record_type == 'Snapshot':
if not self.snapshot.crawl:
continue
url = record.get('url')
if not url:
continue
@@ -2132,19 +2332,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
overrides = {
'snapshot': self.snapshot,
'crawl': self.snapshot.crawl,
'created_by_id': self.snapshot.created_by_id,
'created_by_id': self.created_by.pk,
}
process_hook_records(filtered_records, overrides=overrides)
# Update snapshot title if this is the title plugin
plugin_name = get_plugin_name(self.plugin)
if self.status == self.StatusChoices.SUCCEEDED and plugin_name == 'title':
self._update_snapshot_title(plugin_dir)
# Trigger search indexing if succeeded
if self.status == self.StatusChoices.SUCCEEDED:
self.trigger_search_indexing()
# Cleanup PID files and empty logs
pid_file = plugin_dir / 'hook.pid'
pid_file.unlink(missing_ok=True)
@@ -2164,7 +2355,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if not cmd:
return
from machine.models import Machine
from archivebox.machine.models import Machine
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
machine = Machine.current()
@@ -2189,23 +2380,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if binary:
self.binary = binary
def _update_snapshot_title(self, plugin_dir: Path):
    """
    Update snapshot title from title plugin output.

    The title plugin writes title.txt with the extracted page title.
    This updates the Snapshot.title field if the file exists and has content,
    preferring the longer of the existing vs. extracted title.
    """
    title_file = plugin_dir / 'title.txt'
    if not title_file.exists():
        return
    try:
        extracted = title_file.read_text(encoding='utf-8').strip()
        current = self.snapshot.title
        # Only overwrite when we extracted something and it is longer
        # (or there is no title yet).
        if extracted and (not current or len(extracted) > len(current or '')):
            self.snapshot.title = extracted[:512]  # Max length from model
            self.snapshot.save(update_fields=['title', 'modified_at'])
    except Exception:
        pass  # Failed to read title, that's okay
def _url_passes_filters(self, url: str) -> bool:
"""Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
@@ -2216,8 +2390,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Get merged config with proper hierarchy
config = get_config(
user=self.snapshot.created_by if self.snapshot else None,
crawl=self.snapshot.crawl if self.snapshot else None,
user=self.created_by,
crawl=self.snapshot.crawl,
snapshot=self.snapshot,
)
@@ -2256,23 +2430,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return False # No allowlist patterns matched
return True # No filters or passed filters
def trigger_search_indexing(self):
    """Invoke every registered ArchiveResult__index hook so search indexes pick up this result."""
    from archivebox.hooks import discover_hooks, run_hook

    snapshot = self.snapshot
    # Config objects are ordered lowest-to-highest priority (later overrides
    # earlier); the crawl layer is only present when this snapshot has a crawl.
    if snapshot.crawl:
        config_objects = [snapshot.crawl, snapshot]
    else:
        config_objects = [snapshot]

    for hook in discover_hooks('ArchiveResult__index'):
        run_hook(
            hook,
            output_dir=self.output_dir,
            config_objects=config_objects,
            url=snapshot.url,
            snapshot_id=str(snapshot.id),
            plugin=self.plugin,
        )
@property
def output_dir(self) -> Path:
@@ -2285,4 +2442,185 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if not plugin_dir:
return False
pid_file = plugin_dir / 'hook.pid'
return pid_file.exists()
return pid_file.exists()
# =============================================================================
# ArchiveResult State Machine
# =============================================================================
class ArchiveResultMachine(BaseStateMachine, strict_states=True):
    """
    State machine for managing ArchiveResult (single plugin execution) lifecycle.

    Hook Lifecycle:

    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │  • Waiting for its turn to run                              │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │  1. archiveresult.run()                                     │
    │     • Find specific hook by hook_name                       │
    │     • run_hook(script, output_dir, ...) → subprocess        │
    │                                                             │
    │  2a. FOREGROUND hook (returns HookResult):                  │
    │      • update_from_output() immediately                     │
    │        - Read stdout.log                                    │
    │        - Parse JSONL records                                │
    │        - Extract 'ArchiveResult' record → update status     │
    │        - Walk output_dir → populate output_files            │
    │        - Call process_hook_records() for side effects       │
    │                                                             │
    │  2b. BACKGROUND hook (returns None):                        │
    │      • Status stays STARTED                                 │
    │      • Continues running in background                      │
    │      • Killed by Snapshot.cleanup() when sealed             │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() checks status
    ┌─────────────────────────────────────────────────────────────┐
    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
    │  • Set by hook's JSONL output during update_from_output()   │
    │  • Health stats incremented (num_uses_succeeded/failed)     │
    │  • Parent Snapshot health stats also updated                │
    └─────────────────────────────────────────────────────────────┘

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    # Attribute name on this machine holding the bound model instance
    # (i.e. self.archiveresult), consumed by BaseStateMachine.
    model_attr_name = 'archiveresult'

    # States — values mirror ArchiveResult.StatusChoices so the DB status
    # column and the machine state stay in lockstep.
    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
    started = State(value=ArchiveResult.StatusChoices.STARTED)
    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)

    # Tick Event - transitions based on conditions
    # (the is_* predicates just read the status already set by run())
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(succeeded, cond='is_succeeded') |
        started.to(failed, cond='is_failed') |
        started.to(skipped, cond='is_skipped') |
        started.to(backoff, cond='is_backoff') |
        backoff.to.itself(unless='can_start') |
        backoff.to(started, cond='can_start') |
        backoff.to(succeeded, cond='is_succeeded') |
        backoff.to(failed, cond='is_failed') |
        backoff.to(skipped, cond='is_skipped')
    )

    def can_start(self) -> bool:
        # Can only run once the parent snapshot has a URL to operate on.
        can_start = bool(self.archiveresult.snapshot.url)
        return can_start

    def is_succeeded(self) -> bool:
        """Check if extractor plugin succeeded (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Check if extractor plugin failed (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED

    def is_skipped(self) -> bool:
        """Check if extractor plugin was skipped (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED

    def is_backoff(self) -> bool:
        """Check if we should backoff and retry later."""
        # Backoff if status is still started (plugin didn't complete) and output_str is empty
        return (
            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
            not self.archiveresult.output_str
        )

    def is_finished(self) -> bool:
        """Check if extraction has completed (success, failure, or skipped)."""
        return self.archiveresult.status in (
            ArchiveResult.StatusChoices.SUCCEEDED,
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        )

    @queued.enter
    def enter_queued(self):
        # Persist QUEUED status and make this result immediately eligible
        # for the next worker tick; also clears any stale start_ts.
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now(),
            status=ArchiveResult.StatusChoices.QUEUED,
            start_ts=None,
        )  # bump the snapshot's retry_at so they pickup any new changes

    @started.enter
    def enter_started(self):
        from archivebox.machine.models import NetworkInterface

        # Lock the object and mark start time
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
            status=ArchiveResult.StatusChoices.STARTED,
            start_ts=timezone.now(),
            iface=NetworkInterface.current(),
        )

        # Run the plugin - this updates status, output, timestamps, etc.
        self.archiveresult.run()

        # Save the updated result
        self.archiveresult.save()

    @backoff.enter
    def enter_backoff(self):
        # Plugin didn't complete in time — schedule a retry in 60s and
        # clear end_ts since the run is not actually over.
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=60),
            status=ArchiveResult.StatusChoices.BACKOFF,
            end_ts=None,
        )

    @succeeded.enter
    def enter_succeeded(self):
        # retry_at=None removes this result from the work queue for good.
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SUCCEEDED,
            end_ts=timezone.now(),
        )

        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=True)

    @failed.enter
    def enter_failed(self):
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=timezone.now(),
        )

        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=False)

    @skipped.enter
    def enter_skipped(self):
        # Skipped results are terminal but do not affect health stats.
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SKIPPED,
            end_ts=timezone.now(),
        )

    def after_transition(self, event: str, source: State, target: State):
        self.archiveresult.snapshot.update_and_requeue()  # bump snapshot retry time so it picks up all the new changes
# =============================================================================
# State Machine Registration
# =============================================================================

# Manually register state machines with python-statemachine registry
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
# NOTE(review): presumably these registrations are what lets the models'
# state_machine_name strings resolve to these classes — confirm in workers.
registry.register(SnapshotMachine)
registry.register(ArchiveResultMachine)

2638
archivebox/core/models.py.bak Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -30,9 +30,9 @@ LOADED_PLUGINS = archivebox.LOADED_PLUGINS
### Django Core Settings
################################################################################
WSGI_APPLICATION = "core.wsgi.application"
ASGI_APPLICATION = "core.asgi.application"
ROOT_URLCONF = "core.urls"
WSGI_APPLICATION = "archivebox.core.wsgi.application"
ASGI_APPLICATION = "archivebox.core.asgi.application"
ROOT_URLCONF = "archivebox.core.urls"
LOGIN_URL = "/accounts/login/"
LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/")
@@ -55,14 +55,15 @@ INSTALLED_APPS = [
# 3rd-party apps from PyPI
"signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
# Our ArchiveBox-provided apps
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
"machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
"crawls", # handles Crawl and CrawlSchedule models and management
"personas", # handles Persona and session management
"core", # core django model with Snapshot, ArchiveResult, etc.
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# Our ArchiveBox-provided apps (use fully qualified names)
# NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
# "archivebox.config", # ArchiveBox config settings (no models, not a real Django app)
"archivebox.machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
"archivebox.workers", # handles starting and managing background workers and processes (orchestrators and actors)
"archivebox.personas", # handles Persona and session management
"archivebox.core", # core django model with Snapshot, ArchiveResult, etc. (crawls depends on this)
"archivebox.crawls", # handles Crawl and CrawlSchedule models and management (depends on core)
"archivebox.api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# ArchiveBox plugins (hook-based plugins no longer add Django apps)
# Use hooks.py discover_hooks() for plugin functionality
# 3rd-party apps from PyPI that need to be loaded last
@@ -72,15 +73,15 @@ INSTALLED_APPS = [
MIDDLEWARE = [
"core.middleware.TimezoneMiddleware",
"archivebox.core.middleware.TimezoneMiddleware",
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"core.middleware.ReverseProxyAuthMiddleware",
"archivebox.core.middleware.ReverseProxyAuthMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"core.middleware.CacheControlMiddleware",
"archivebox.core.middleware.CacheControlMiddleware",
# Additional middlewares from plugins (if any)
]
@@ -370,15 +371,15 @@ LOGGING = SETTINGS_LOGGING
################################################################################
# Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "api.models.OutboundWebhook"
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
SIGNAL_WEBHOOKS = {
"HOOKS": {
# ... is a special sigil value that means "use the default autogenerated hooks"
"django.contrib.auth.models.User": ...,
"core.models.Snapshot": ...,
"core.models.ArchiveResult": ...,
"core.models.Tag": ...,
"api.models.APIToken": ...,
"archivebox.core.models.Snapshot": ...,
"archivebox.core.models.ArchiveResult": ...,
"archivebox.core.models.Tag": ...,
"archivebox.api.models.APIToken": ...,
},
}
@@ -391,11 +392,11 @@ ADMIN_DATA_VIEWS = {
"URLS": [
{
"route": "config/",
"view": "core.views.live_config_list_view",
"view": "archivebox.core.views.live_config_list_view",
"name": "Configuration",
"items": {
"route": "<str:key>/",
"view": "core.views.live_config_value_view",
"view": "archivebox.core.views.live_config_value_view",
"name": "config_val",
},
},

View File

@@ -1,319 +0,0 @@
__package__ = 'archivebox.core'
import time
import os
from datetime import timedelta
from typing import ClassVar
from django.db.models import F
from django.utils import timezone
from rich import print
from statemachine import State, StateMachine
# from workers.actor import ActorType
from core.models import Snapshot, ArchiveResult
from crawls.models import Crawl
class SnapshotMachine(StateMachine, strict_states=True):
"""
State machine for managing Snapshot lifecycle.
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
"""
model: Snapshot
# States
queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
started = State(value=Snapshot.StatusChoices.STARTED)
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
# Tick Event
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(sealed, cond='is_finished')
)
def __init__(self, snapshot, *args, **kwargs):
self.snapshot = snapshot
super().__init__(snapshot, *args, **kwargs)
def __repr__(self) -> str:
return f'Snapshot[{self.snapshot.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
# Suppressed: queue waiting logs
return can_start
def is_finished(self) -> bool:
# if no archiveresults exist yet, it's not finished
if not self.snapshot.archiveresult_set.exists():
return False
# Try to advance step if ready (handles step-based hook execution)
# This will increment current_step when all foreground hooks in current step are done
while self.snapshot.advance_step_if_ready():
pass # Keep advancing until we can't anymore
# if archiveresults exist but are still pending, it's not finished
if self.snapshot.pending_archiveresults().exists():
return False
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
# Background hooks in STARTED state are excluded by pending_archiveresults()
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
# we can transition to sealed and cleanup() will kill the background hooks
# otherwise archiveresults exist and are all finished, so it's finished
return True
# def on_transition(self, event, state):
# print(f'{self}.on_transition() [blue]{str(state).upper()}[/blue] ➡️ ...')
@queued.enter
def enter_queued(self):
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=timezone.now(),
status=Snapshot.StatusChoices.QUEUED,
)
@started.enter
def enter_started(self):
# Suppressed: state transition logs
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
)
# Run the snapshot - creates pending archiveresults for all enabled plugins
self.snapshot.run()
# unlock the snapshot after we're done + set status = started
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
status=Snapshot.StatusChoices.STARTED,
)
@sealed.enter
def enter_sealed(self):
# Clean up background hooks
self.snapshot.cleanup()
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=None,
status=Snapshot.StatusChoices.SEALED,
)
# class SnapshotWorker(ActorType[Snapshot]):
# """
# The primary actor for progressing Snapshot objects
# through their lifecycle using the SnapshotMachine.
# """
# Model = Snapshot
# StateMachineClass = SnapshotMachine
# ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started # 'started'
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
# MAX_TICK_TIME: ClassVar[int] = 10
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
class ArchiveResultMachine(StateMachine, strict_states=True):
"""
State machine for managing ArchiveResult lifecycle.
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
"""
model: ArchiveResult
# States
queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
started = State(value=ArchiveResult.StatusChoices.STARTED)
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
# Tick Event - transitions based on conditions
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(skipped, cond='is_skipped') |
started.to(backoff, cond='is_backoff') |
backoff.to.itself(unless='can_start') |
backoff.to(started, cond='can_start') |
backoff.to(succeeded, cond='is_succeeded') |
backoff.to(failed, cond='is_failed') |
backoff.to(skipped, cond='is_skipped')
)
def __init__(self, archiveresult, *args, **kwargs):
self.archiveresult = archiveresult
super().__init__(archiveresult, *args, **kwargs)
def __repr__(self) -> str:
return f'ArchiveResult[{self.archiveresult.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url)
# Suppressed: queue waiting logs
return can_start
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
def is_failed(self) -> bool:
"""Check if extractor plugin failed (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
def is_skipped(self) -> bool:
"""Check if extractor plugin was skipped (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
def is_backoff(self) -> bool:
"""Check if we should backoff and retry later."""
# Backoff if status is still started (plugin didn't complete) and output_str is empty
return (
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
not self.archiveresult.output_str
)
def is_finished(self) -> bool:
"""Check if extraction has completed (success, failure, or skipped)."""
return self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
@queued.enter
def enter_queued(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
@started.enter
def enter_started(self):
from machine.models import NetworkInterface
# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
iface=NetworkInterface.current(),
)
# Run the plugin - this updates status, output, timestamps, etc.
self.archiveresult.run()
# Save the updated result
self.archiveresult.save()
# Suppressed: plugin result logs (already logged by worker)
@backoff.enter
def enter_backoff(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF,
end_ts=None,
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save()
@succeeded.enter
def enter_succeeded(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
end_ts=timezone.now(),
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
)
self.archiveresult.save()
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
# Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
@failed.enter
def enter_failed(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
end_ts=timezone.now(),
)
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
# Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
@skipped.enter
def enter_skipped(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,
end_ts=timezone.now(),
)
def after_transition(self, event: str, source: State, target: State):
    """Hook run after every state transition: nudge the parent Snapshot so workers re-check it."""
    # print(f"after '{event}' from '{source.id}' to '{target.id}'")
    self.archiveresult.snapshot.update_for_workers() # bump snapshot retry time so it picks up all the new changes
# class ArchiveResultWorker(ActorType[ArchiveResult]):
# """
# The primary actor for progressing ArchiveResult objects
# through their lifecycle using the ArchiveResultMachine.
# """
# Model = ArchiveResult
# StateMachineClass = ArchiveResultMachine
# ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
# MAX_TICK_TIME: ClassVar[int] = 60
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10

View File

@@ -0,0 +1,20 @@
"""Template tags for accessing config values in templates."""
from typing import Any

from django import template

from archivebox.config.configset import get_config as _get_config
register = template.Library()
@register.simple_tag
def get_config(key: str) -> Any:
    """
    Get a config value by key, or None if the key is unknown.

    Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
    """
    # NOTE: the original annotated the return as `any`, which is the builtin
    # function, not a type — `typing.Any` is the correct annotation.
    try:
        return _get_config(key)
    except (KeyError, AttributeError):
        # Unknown/missing keys render as empty in templates rather than raising.
        return None

View File

@@ -1,3 +1,319 @@
#from django.test import TestCase
"""Tests for the core views, especially AddView."""
# Create your tests here.
import os
import django
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
django.setup()
from django.test import TestCase, Client
from django.contrib.auth.models import User
from django.urls import reverse
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.core.models import Tag
class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form)."""

    def setUp(self):
        """Set up test user and client."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='test@example.com'
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def test_add_view_get_requires_auth(self):
        """Test that GET /add requires authentication."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Should redirect to login or show 403/404
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """Test that GET /add shows the form with all fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check that form fields are present
        self.assertContains(response, 'name="url"')
        self.assertContains(response, 'name="tag"')
        self.assertContains(response, 'name="depth"')
        self.assertContains(response, 'name="notes"')
        self.assertContains(response, 'name="schedule"')
        self.assertContains(response, 'name="persona"')
        self.assertContains(response, 'name="overwrite"')
        self.assertContains(response, 'name="update"')
        self.assertContains(response, 'name="index_only"')
        # Check for plugin groups
        self.assertContains(response, 'name="chrome_plugins"')
        self.assertContains(response, 'name="archiving_plugins"')
        self.assertContains(response, 'name="parsing_plugins"')

    def test_add_view_shows_tag_autocomplete(self):
        """Test that tag autocomplete datalist is rendered."""
        # Create some tags
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check for datalist with tags
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Test that plugin preset buttons are rendered."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, 'Quick Archive')
        self.assertContains(response, 'Full Chrome')
        self.assertContains(response, 'Text Only')
        self.assertContains(response, 'Select All')
        self.assertContains(response, 'Clear All')

    def test_add_view_shows_links_to_resources(self):
        """Test that helpful links are present."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')
        # Link to create new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Test creating a basic crawl without a schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })
        # Should redirect to crawl admin page
        self.assertEqual(response.status_code, 302)
        # Check that crawl was created
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = Crawl.objects.first()
        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)
        # No schedule should be created
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """Test creating a crawl with a repeat schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })
        self.assertEqual(response.status_code, 302)
        # Check that crawl and schedule were created
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)
        crawl = Crawl.objects.first()
        schedule = CrawlSchedule.objects.first()
        # Crawl and schedule should point at each other (schedule uses the crawl as its template)
        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """Test creating a crawl with a cron format schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })
        self.assertEqual(response.status_code, 302)
        schedule = CrawlSchedule.objects.first()
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Test creating a crawl with specific plugins selected."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        plugins = crawl.config.get('PLUGINS', '')
        # Should contain the selected plugins
        self.assertIn('screenshot', plugins)
        self.assertIn('dom', plugins)
        self.assertIn('wget', plugins)

    def test_add_crawl_with_depth_range(self):
        """Test creating crawls with different depth values (0-4)."""
        for depth in range(5):
            response = self.client.post(self.add_url, {
                'url': f'https://example{depth}.com',
                'depth': str(depth),
            })
            self.assertEqual(response.status_code, 302)
        self.assertEqual(Crawl.objects.count(), 5)
        # Crawls ordered by creation time should line up with the submitted depths
        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
            self.assertEqual(crawl.max_depth, i)

    def test_add_crawl_with_advanced_options(self):
        """Test creating a crawl with advanced options."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        config = crawl.config
        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides."""
        # Note: Django test client can't easily POST the KeyValueWidget format,
        # so this test would need to use the form directly or mock the cleaned_data
        # For now, we'll skip this test or mark it as TODO
        pass

    def test_add_empty_urls_fails(self):
        """Test that submitting without URLs fails validation."""
        response = self.client.post(self.add_url, {
            'url': '',
            'depth': '0',
        })
        # Should show form again with errors, not redirect
        self.assertEqual(response.status_code, 200)
        # NOTE(review): this assertFormError(response, ...) signature was deprecated in
        # Django 4.1 and removed in Django 5.0 (pass the form instance instead) —
        # confirm the Django version this project targets.
        self.assertFormError(response, 'form', 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """Test that invalid URLs fail validation."""
        response = self.client.post(self.add_url, {
            'url': 'not-a-url',
            'depth': '0',
        })
        # Should show form again with errors
        self.assertEqual(response.status_code, 200)
        # Check for validation error (URL regex should fail)
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """Test that success message is shown without schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)
        # Check success message mentions crawl creation
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """Test that success message includes schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)
        # Check success message mentions schedule
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Test that crawl creation saves URLs to sources file."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        self.assertEqual(response.status_code, 302)
        # Check that source file was created in sources/ directory
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR
        # Should have created a source file
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

    def test_multiple_tags_are_saved(self):
        """Test that multiple comma-separated tags are saved."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Test that successful submission redirects to crawl admin page."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        crawl = Crawl.objects.first()
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
        # fetch_redirect_response=False: the admin change page itself is not under test here
        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)

View File

@@ -7,10 +7,10 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from core.admin_site import archivebox_admin
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from archivebox.core.admin_site import archivebox_admin
from archivebox.core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from workers.views import JobsDashboardView
from archivebox.workers.views import JobsDashboardView
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE

View File

@@ -23,7 +23,7 @@ from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import archivebox
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION, SAVE_ARCHIVE_DOT_ORG
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
@@ -31,9 +31,9 @@ from archivebox.misc.serve_static import serve_static_with_byterange_support
from archivebox.misc.logging_util import printable_filesize
from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
from crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_extractors, get_extractor_name
@@ -150,7 +150,6 @@ class SnapshotView(View):
'status_color': 'success' if snapshot.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
'warc_path': warc_path,
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
@@ -421,35 +420,34 @@ class AddView(UserPassesTestMixin, FormView):
return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated
def get_context_data(self, **kwargs):
from archivebox.core.models import Tag
return {
**super().get_context_data(**kwargs),
'title': "Add URLs",
'title': "Create Crawl",
# We can't just call request.build_absolute_uri in the template, because it would include query parameters
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION,
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
'stdout': '',
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
def form_valid(self, form):
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
plugins = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = {
"urls": urls,
"tag": tag,
"depth": depth,
"parser": parser,
"update_all": False,
"out_dir": DATA_DIR,
"created_by_id": self.request.user.pk,
}
if plugins:
input_kwargs.update({"plugins": plugins})
# Extract all form fields
tag = form.cleaned_data["tag"]
depth = int(form.cleaned_data["depth"])
plugins = ','.join(form.cleaned_data.get("plugins", []))
schedule = form.cleaned_data.get("schedule", "").strip()
persona = form.cleaned_data.get("persona", "Default")
overwrite = form.cleaned_data.get("overwrite", False)
update = form.cleaned_data.get("update", False)
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
custom_config = form.cleaned_data.get("config", {})
from archivebox.config.permissions import HOSTNAME
@@ -461,33 +459,59 @@ class AddView(UserPassesTestMixin, FormView):
# 2. create a new Crawl with the URLs from the file
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
urls_content = sources_file.read_text()
# Build complete config
config = {
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'DEPTH': depth,
'PLUGINS': plugins or '',
'DEFAULT_PERSONA': persona or 'Default',
}
# Merge custom config overrides
config.update(custom_config)
crawl = Crawl.objects.create(
urls=urls_content,
max_depth=depth,
tags_str=tag,
notes=notes,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
created_by_id=self.request.user.pk,
config={
# 'ONLY_NEW': not update,
# 'INDEX_ONLY': index_only,
# 'OVERWRITE': False,
'DEPTH': depth,
'PLUGINS': plugins or '',
# 'DEFAULT_PERSONA': persona or 'Default',
}
config=config
)
# 3. create a CrawlSchedule if schedule is provided
if schedule:
from crawls.models import CrawlSchedule
crawl_schedule = CrawlSchedule.objects.create(
template=crawl,
schedule=schedule,
is_enabled=True,
label=crawl.label,
notes=f"Auto-created from add page. {notes}".strip(),
created_by_id=self.request.user.pk,
)
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor
# from archivebox.crawls.actors import CrawlActor
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
rough_url_count = urls.count('://')
# Build success message with schedule link if created
schedule_msg = ""
if schedule:
schedule_msg = f" and <a href='{crawl.schedule.admin_change_url}'>scheduled to repeat {schedule}</a>"
messages.success(
self.request,
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
mark_safe(f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. <a href='{crawl.admin_change_url}'>View Crawl →</a>"),
)
# Orchestrator (managed by supervisord) will pick up the queued crawl
@@ -516,8 +540,8 @@ def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
@@ -764,9 +788,9 @@ def key_is_safe(key: str) -> bool:
def find_config_source(key: str, merged_config: dict) -> str:
"""Determine where a config value comes from."""
import os
from machine.models import Machine
from archivebox.machine.models import Machine
# Check if it's from machine config
# Check if it's from archivebox.machine.config
try:
machine = Machine.current()
if machine.config and key in machine.config:
@@ -778,7 +802,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
if key in os.environ:
return 'Environment'
# Check if it's from config file
# Check if it's from archivebox.config.file
from archivebox.config.configset import BaseConfigSet
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
if key in file_config:
@@ -796,7 +820,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Get merged config that includes Machine.config overrides
try:
from machine.models import Machine
from archivebox.machine.models import Machine
machine = Machine.current()
merged_config = get_config()
except Exception as e:
@@ -859,7 +883,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
import os
from machine.models import Machine
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
CONFIGS = get_all_configs()