remove huey

This commit is contained in:
Nick Sweeting
2025-12-24 23:40:18 -08:00
parent 6c769d831c
commit d95f0dc186
105 changed files with 3635 additions and 1402 deletions

View File

@@ -9,25 +9,17 @@ from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
from huey_monitor.admin import TaskModel
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
def result_url(result: TaskModel) -> str:
url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
return format_html('<a href="{url}" class="fade-in-progress-url">See progress...</a>'.format(url=url))
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
model = ArchiveResult
@@ -101,9 +93,9 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor', 'cmd_str', 'output_str')
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']
@@ -144,17 +136,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
def tags_str(self, result):
return result.snapshot.tags_str()
@admin.display(description='Extractor', ordering='extractor')
def extractor_with_icon(self, result):
    """Admin list column: the extractor's icon (with a tooltip) followed by its name."""
    # Icon HTML/emoji comes from the plugin hooks registry.
    icon = get_extractor_icon(result.extractor)
    return format_html(
        '<span title="{}">{}</span> {}',
        result.extractor,
        icon,
        result.extractor,
    )
def cmd_str(self, result):
    """Render the result's command as monospace text (joins list-form commands with spaces)."""
    return format_html(
        '<pre>{}</pre>',
        ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
    )
def output_str(self, result):
# Determine output link path - use output if file exists, otherwise link to index
output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.timestamp,
result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
output_path,
result.output,
)
@@ -185,7 +189,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
is_hidden = filename.startswith('.')
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
return output_str + format_html('</code></pre>')
return output_str + mark_safe('</code></pre>')

View File

@@ -35,8 +35,19 @@ def register_admin_site():
admin.site = archivebox_admin
sites.site = archivebox_admin
# Plugin admin registration is now handled by individual app admins
# No longer using archivebox.pm.hook.register_admin()
# Register admin views for each app
# (Previously handled by ABX plugin system, now called directly)
from core.admin import register_admin as register_core_admin
from crawls.admin import register_admin as register_crawls_admin
from api.admin import register_admin as register_api_admin
from machine.admin import register_admin as register_machine_admin
from workers.admin import register_admin as register_workers_admin
register_core_admin(archivebox_admin)
register_crawls_admin(archivebox_admin)
register_api_admin(archivebox_admin)
register_machine_admin(archivebox_admin)
register_workers_admin(archivebox_admin)
return archivebox_admin

View File

@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline, result_url
from core.admin_archiveresults import ArchiveResultInline
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,10 +54,10 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
@@ -93,12 +93,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
@admin.action(
description="Imported Timestamp"
)
@admin.display(description="Imported Timestamp")
def imported_timestamp(self, obj):
context = RequestContext(self.request, {
'bookmarked_date': obj.bookmarked,
'bookmarked_date': obj.bookmarked_at,
'timestamp': obj.timestamp,
})
@@ -145,22 +143,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def status_info(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
Archived: {} ({} files {}) &nbsp; &nbsp;
Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
Status code: {} &nbsp; &nbsp;<br/>
Server: {} &nbsp; &nbsp;
Content type: {} &nbsp; &nbsp;
Extension: {} &nbsp; &nbsp;
''',
'' if obj.is_archived else '',
obj.num_outputs,
self.size(obj) or '0kb',
f'/archive/{obj.timestamp}/favicon.ico',
obj.status_code or '-',
obj.headers and obj.headers.get('Server') or '-',
obj.headers and obj.headers.get('Content-Type') or '-',
obj.extension or '-',
)
@@ -184,8 +175,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.archive_path,
obj.archive_path,
obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
'fetched' if obj.title else 'pending',
urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
@@ -259,14 +250,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description=" Get Title"
)
def update_titles(self, request, queryset):
from core.models import Snapshot
count = queryset.count()
# Queue snapshots for archiving via the state machine system
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
)
@admin.action(
@@ -275,11 +265,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def update_snapshots(self, request, queryset):
count = queryset.count()
result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
)
@@ -291,11 +281,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
timestamp = timezone.now().isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
messages.success(
request,
mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
)
@admin.action(
@@ -304,11 +294,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def overwrite_snapshots(self, request, queryset):
count = queryset.count()
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
)
@admin.action(

View File

@@ -1,5 +1,7 @@
__package__ = 'archivebox.core'
import sys
from django.apps import AppConfig
@@ -10,6 +12,41 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
# Auto-start the orchestrator when running the web server
self._maybe_start_orchestrator()
def _maybe_start_orchestrator(self):
    """Start the background orchestrator if this process is serving web requests.

    Called from CoreConfig.ready(). Intentionally best-effort: any failure to
    start is logged as a warning rather than crashing Django startup.
    """
    import os
    # Don't start orchestrator during migrations, shell, tests, etc.
    # Only start when running: runserver, daphne, gunicorn, uwsgi
    if not self._is_web_server():
        return
    # Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
    if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
        return
    # Don't start in autoreload child process (avoid double-start)
    # RUN_MAIN == 'true' only inside the reloader's child process, so under
    # `runserver` the parent process skips startup here.
    if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
        return
    try:
        from workers.orchestrator import Orchestrator
        if not Orchestrator.is_running():
            # Start orchestrator as daemon (won't exit on idle when started by server)
            orchestrator = Orchestrator(exit_on_idle=False)
            orchestrator.start()
    except Exception as e:
        # Don't crash the server if orchestrator fails to start
        import logging
        logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
def _is_web_server(self) -> bool:
"""Check if we're running a web server command."""
# Check for common web server indicators
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)

View File

@@ -23,7 +23,11 @@ from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.hooks import (
ARCHIVE_METHODS_INDEXING_PRECEDENCE,
get_extractors, get_extractor_name, get_extractor_icon,
DEFAULT_EXTRACTOR_ICONS,
)
from archivebox.base_models.models import (
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
@@ -343,45 +347,37 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def icons(self) -> str:
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
from django.utils.html import format_html, mark_safe
from collections import defaultdict
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
def calc_icons():
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
else:
archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
path = self.archive_path
canon = self.canonical_outputs()
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
icons = {
"singlefile": "", "wget": "🆆", "dom": "🅷", "pdf": "📄",
"screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
"readability": "🆁", "mercury": "🅼", "warc": "📦"
}
exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
extractor_outputs = defaultdict(lambda: None)
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
for result in archive_results:
if result.extractor == extractor:
extractor_outputs[extractor] = result
# Get all extractors from hooks system (sorted by numeric prefix)
all_extractors = [get_extractor_name(e) for e in get_extractors()]
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
if extractor not in exclude:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
if extractor == "wget":
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
if extractor == "archive_org":
exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))
for extractor in all_extractors:
result = archive_results.get(extractor)
existing = result and result.status == 'succeeded' and result.output
icon = get_extractor_icon(extractor)
output += format_html(
output_template,
path,
canon.get(extractor, extractor + '/'),
str(bool(existing)),
extractor,
icon
)
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
cache_result = cache.get(cache_key)
if cache_result:
@@ -767,12 +763,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
EXTRACTOR_CHOICES = (
('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'),
('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'),
('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'),
('dom', 'dom'), ('title', 'title'), ('wget', 'wget'),
)
@classmethod
def get_extractor_choices(cls):
    """Get extractor choices from discovered hooks (for forms/admin).

    Returns a tuple of (value, label) pairs, both set to the extractor's
    base name, suitable for Django ChoiceFields and admin filters.
    """
    extractors = [get_extractor_name(e) for e in get_extractors()]
    return tuple((e, e) for e in extractors)
# Keep AutoField for backward compatibility with 0.7.x databases
# UUID field is added separately by migration for new records
@@ -783,7 +778,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
modified_at = models.DateTimeField(auto_now=True)
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
# No choices= constraint - extractor names come from plugin system and can be any string
extractor = models.CharField(max_length=32, blank=False, null=False, db_index=True)
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
cmd = models.JSONField(default=None, null=True, blank=True)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
@@ -835,6 +831,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def output_exists(self) -> bool:
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
def embed_path(self) -> Optional[str]:
    """
    Get the relative path to the embeddable output file for this result.

    Returns self.output when it is set (NOTE: existence on disk is not
    checked here — callers should verify), otherwise falls back to the
    extractor's canonical output path, and finally to the '<extractor>/'
    directory.
    """
    if self.output:
        return self.output
    # Try to find output file based on extractor's canonical output path
    canonical = self.snapshot.canonical_outputs()
    extractor_key = f'{self.extractor}_path'
    if extractor_key in canonical:
        return canonical[extractor_key]
    # Fallback to extractor directory
    return f'{self.extractor}/'
def create_output_dir(self):
output_dir = Path(self.snapshot_dir) / self.extractor
output_dir.mkdir(parents=True, exist_ok=True)
@@ -891,6 +906,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
output_dir=extractor_dir,
config_objects=config_objects,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
)
end_ts = timezone.now()
@@ -1000,6 +1016,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
hook,
output_dir=self.output_dir,
config_objects=config_objects,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
extractor=self.extractor,
)

View File

@@ -68,9 +68,6 @@ INSTALLED_APPS = [
# 3rd-party apps from PyPI that need to be loaded last
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
"django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
"bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils
"huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
]
@@ -215,70 +212,6 @@ MIGRATION_MODULES = {"signal_webhooks": None}
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
HUEY = {
"huey_class": "huey.SqliteHuey",
"filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
"name": "commands",
"results": True,
"store_none": True,
"immediate": False,
"utc": True,
"consumer": {
"workers": 1,
"worker_type": "thread",
"initial_delay": 0.1, # Smallest polling interval, same as -d.
"backoff": 1.15, # Exponential backoff using this rate, -b.
"max_delay": 10.0, # Max possible polling interval, -m.
"scheduler_interval": 1, # Check schedule every second, -s.
"periodic": True, # Enable crontab feature.
"check_worker_health": True, # Enable worker health checks.
"health_check_interval": 1, # Check worker health every second.
},
}
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
# https://github.com/gaiacoop/django-huey
DJANGO_HUEY = {
"default": "commands",
"queues": {
HUEY["name"]: HUEY.copy(),
# more registered here at plugin import-time by BaseQueue.register()
# Additional huey queues configured via settings
},
}
class HueyDBRouter:
"""
A router to store all the Huey result k:v / Huey Monitor models in the queue.sqlite3 database.
We keep the databases separate because the queue database receives many more reads/writes per second
and we want to avoid single-write lock contention with the main database. Also all the in-progress task
data is ephemeral/not-important-long-term. This makes it easier to for the user to clear non-critical
temp data by just deleting queue.sqlite3 and leaving index.sqlite3.
"""
route_app_labels = {"huey_monitor", "django_huey", "djhuey"}
db_name = "queue"
def db_for_read(self, model, **hints):
if model._meta.app_label in self.route_app_labels:
return self.db_name
return "default"
def db_for_write(self, model, **hints):
if model._meta.app_label in self.route_app_labels:
return self.db_name
return "default"
def allow_relation(self, obj1, obj2, **hints):
if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
return obj1._meta.app_label == obj2._meta.app_label
return None
def allow_migrate(self, db, app_label, model_name=None, **hints):
if app_label in self.route_app_labels:
return db == self.db_name
return db == "default"
# class FilestoreDBRouter:
@@ -311,7 +244,7 @@ class HueyDBRouter:
# return db == self.db_name
# return db == "default"
DATABASE_ROUTERS = ["core.settings.HueyDBRouter"]
DATABASE_ROUTERS = []
CACHES = {
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},

View File

@@ -1,9 +1,13 @@
from django import template
from django.contrib.admin.templatetags.base import InclusionAdminNode
from django.utils.safestring import mark_safe
from typing import Union
from archivebox.hooks import (
get_extractor_icon, get_extractor_template, get_extractor_name,
)
register = template.Library()
@@ -44,3 +48,115 @@ def url_replace(context, **kwargs):
dict_ = context['request'].GET.copy()
dict_.update(**kwargs)
return dict_.urlencode()
@register.simple_tag
def extractor_icon(extractor: str) -> str:
    """
    Render the icon for an extractor.
    Usage: {% extractor_icon "screenshot" %}
    """
    # Icon comes from the hooks registry; mark_safe because it may contain HTML markup.
    return mark_safe(get_extractor_icon(extractor))
def _render_extractor_template(result, template_type: str) -> str:
    """Render an extractor-provided mini-template for an ArchiveResult.

    Shared implementation for the thumbnail/embed/fullscreen template tags
    (previously three duplicated copies of this logic). Looks up the named
    template for the result's extractor via the hooks registry, renders it
    with a small context, and returns the HTML.

    Returns '' when the extractor provides no such template or when rendering
    fails — deliberately best-effort so a broken plugin template can never
    500 an index page.

    Context variables passed to the template:
        - result: ArchiveResult object
        - snapshot: Parent Snapshot object
        - output_path: Path to output relative to snapshot dir (from embed_path())
        - extractor: Extractor base name
    """
    extractor = get_extractor_name(result.extractor)
    template_str = get_extractor_template(extractor, template_type)
    if not template_str:
        return ''
    # Use embed_path() for the display path (includes canonical paths)
    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
    try:
        tpl = template.Template(template_str)
        ctx = template.Context({
            'result': result,
            'snapshot': result.snapshot,
            'output_path': output_path,
            'extractor': extractor,
        })
        return mark_safe(tpl.render(ctx))
    except Exception:
        return ''


@register.simple_tag(takes_context=True)
def extractor_thumbnail(context, result) -> str:
    """
    Render the thumbnail template for an archive result.
    Usage: {% extractor_thumbnail result %}
    """
    return _render_extractor_template(result, 'thumbnail')


@register.simple_tag(takes_context=True)
def extractor_embed(context, result) -> str:
    """
    Render the embed iframe template for an archive result.
    Usage: {% extractor_embed result %}
    """
    return _render_extractor_template(result, 'embed')


@register.simple_tag(takes_context=True)
def extractor_fullscreen(context, result) -> str:
    """
    Render the fullscreen template for an archive result.
    Usage: {% extractor_fullscreen result %}
    """
    return _render_extractor_template(result, 'fullscreen')
@register.filter
def extractor_name(value: str) -> str:
    """
    Get the base name of an extractor (strips numeric prefix).
    Usage: {{ result.extractor|extractor_name }}

    Thin template-filter wrapper around hooks.get_extractor_name().
    """
    return get_extractor_name(value)

View File

@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from core.admin_site import archivebox_admin
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from workers.views import JobsDashboardView
@@ -43,8 +43,10 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
path('admin/live-progress/', live_progress_view, name='live_progress'),
path('admin/', archivebox_admin.urls),
path("api/", include('api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),

View File

@@ -34,6 +34,7 @@ from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
from crawls.models import Seed, Crawl
from archivebox.hooks import get_extractors, get_extractor_name
@@ -54,8 +55,10 @@ class SnapshotView(View):
@staticmethod
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
# Dict of extractor -> ArchiveResult object
archiveresult_objects = {}
# Dict of extractor -> result info dict (for template compatibility)
archiveresults = {}
results = snapshot.archiveresult_set.all()
@@ -65,18 +68,21 @@ class SnapshotView(View):
abs_path = result.snapshot_dir / (embed_path or 'None')
if (result.status == 'succeeded'
and (result.extractor not in HIDDEN_RESULTS)
and embed_path
and os.access(abs_path, os.R_OK)
and abs_path.exists()):
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
continue
# Store the full ArchiveResult object for template tags
archiveresult_objects[result.extractor] = result
result_info = {
'name': result.extractor,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': abs_path.stat().st_size or '?',
'result': result, # Include the full object for template tags
}
archiveresults[result.extractor] = result_info
@@ -101,11 +107,11 @@ class SnapshotView(View):
}
# iterate through all the files in the snapshot dir and add the biggest ones to1 the result list
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
snap_dir = Path(snapshot.output_dir)
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
return {}
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
extension = result_file.suffix.lstrip('.').lower()
if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
@@ -121,12 +127,16 @@ class SnapshotView(View):
'path': result_file.relative_to(snap_dir),
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
'size': file_size,
'result': None, # No ArchiveResult object for filesystem-discovered files
}
preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury')
# Get available extractors from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_extractors = [get_extractor_name(e) for e in get_extractors()]
preferred_types = tuple(all_extractors)
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
best_result = {'path': 'None'}
best_result = {'path': 'None', 'result': None}
for result_type in preferred_types:
if result_type in archiveresults:
best_result = archiveresults[result_type]
@@ -157,6 +167,7 @@ class SnapshotView(View):
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
'snapshot': snapshot, # Pass the snapshot object for template tags
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
@@ -436,7 +447,7 @@ class AddView(UserPassesTestMixin, FormView):
def form_valid(self, form):
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
parser = form.cleaned_data["parser"]
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
@@ -452,18 +463,19 @@ class AddView(UserPassesTestMixin, FormView):
if extractors:
input_kwargs.update({"extractors": extractors})
from archivebox.config.permissions import HOSTNAME
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
parser=parser,
tag=tag,
created_by=self.request.user.pk,
@@ -472,7 +484,7 @@ class AddView(UserPassesTestMixin, FormView):
# 'INDEX_ONLY': index_only,
# 'OVERWRITE': False,
'DEPTH': depth,
'EXTRACTORS': parser,
'EXTRACTORS': extractors or '',
# 'DEFAULT_PERSONA': persona or 'Default',
})
# 3. create a new Crawl pointing to the Seed
@@ -490,10 +502,15 @@ class AddView(UserPassesTestMixin, FormView):
self.request,
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
# if not bg:
# from workers.orchestrator import Orchestrator
# orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
# orchestrator.start()
# Start orchestrator in background to process the queued crawl
try:
from archivebox.workers.tasks import ensure_orchestrator_running
ensure_orchestrator_running()
except Exception as e:
# Orchestrator may already be running via supervisord, or fail to start
# This is not fatal - the crawl will be processed when orchestrator runs
print(f'[!] Failed to start orchestrator: {e}')
return redirect(crawl.admin_change_url)
@@ -513,6 +530,141 @@ class HealthCheckView(View):
)
import json
from django.http import JsonResponse
def live_progress_view(request):
    """Return a JSON snapshot of live archiving progress for the admin monitor.

    Polled by the admin progress widget. Reports orchestrator status, global
    per-status counts for Crawls/Snapshots/ArchiveResults, and a hierarchical
    view of up to 10 active crawls (each with up to 5 active snapshots, each
    with up to 5 currently-running extractors).

    Returns:
        JsonResponse: HTTP 200 with the stats payload on success, or
        HTTP 500 with ``error``/``traceback`` keys (plus zeroed stats so the
        frontend can render something) if gathering stats fails.
    """
    try:
        # Local imports: keep the view importable even if workers/models move,
        # matching the file's existing function-scope import style.
        from datetime import timedelta

        from django.db.models import Count, Q

        from workers.orchestrator import Orchestrator
        from crawls.models import Crawl
        from core.models import Snapshot, ArchiveResult

        # Orchestrator status (only instantiate to count workers if running)
        orchestrator_running = Orchestrator.is_running()
        total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0

        one_day_ago = timezone.now() - timedelta(days=1)

        # One aggregate query per model instead of one COUNT(*) per status
        # (the original issued 2+2+4 separate queries for these globals).
        crawl_stats = Crawl.objects.aggregate(
            pending=Count('id', filter=Q(status=Crawl.StatusChoices.QUEUED)),
            started=Count('id', filter=Q(status=Crawl.StatusChoices.STARTED)),
            recent=Count('id', filter=Q(created_at__gte=one_day_ago)),   # created in last 24h
        )
        snapshot_stats = Snapshot.objects.aggregate(
            pending=Count('id', filter=Q(status=Snapshot.StatusChoices.QUEUED)),
            started=Count('id', filter=Q(status=Snapshot.StatusChoices.STARTED)),
        )
        result_stats = ArchiveResult.objects.aggregate(
            pending=Count('id', filter=Q(status=ArchiveResult.StatusChoices.QUEUED)),
            started=Count('id', filter=Q(status=ArchiveResult.StatusChoices.STARTED)),
            succeeded=Count('id', filter=Q(status=ArchiveResult.StatusChoices.SUCCEEDED)),
            failed=Count('id', filter=Q(status=ArchiveResult.StatusChoices.FAILED)),
        )

        # Build hierarchical active crawls with nested snapshots and archive results
        active_crawls = []
        for crawl in Crawl.objects.filter(
            status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
        ).order_by('-modified_at')[:10]:
            # One aggregate query per crawl for its snapshot counts
            crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
            snap_counts = crawl_snapshots.aggregate(
                total=Count('id'),
                completed=Count('id', filter=Q(status=Snapshot.StatusChoices.SEALED)),
                pending=Count('id', filter=Q(status=Snapshot.StatusChoices.QUEUED)),
            )
            total_snapshots = snap_counts['total']
            completed_snapshots = snap_counts['completed']
            pending_snapshots = snap_counts['pending']

            # Crawl progress = fraction of its snapshots that are sealed
            crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0

            # Nested: active snapshots for this crawl (most recently modified first)
            active_snapshots_for_crawl = []
            for snapshot in crawl_snapshots.filter(
                status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
            ).order_by('-modified_at')[:5]:
                # One aggregate query per snapshot for its extractor counts
                snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
                extractor_counts = snapshot_results.aggregate(
                    total=Count('id'),
                    succeeded=Count('id', filter=Q(status=ArchiveResult.StatusChoices.SUCCEEDED)),
                    failed=Count('id', filter=Q(status=ArchiveResult.StatusChoices.FAILED)),
                    pending=Count('id', filter=Q(status=ArchiveResult.StatusChoices.QUEUED)),
                )
                total_extractors = extractor_counts['total']
                completed_extractors = extractor_counts['succeeded']
                failed_extractors = extractor_counts['failed']
                pending_extractors = extractor_counts['pending']

                # Snapshot progress = fraction of extractors that have finished
                # (succeeded or failed both count as "done")
                snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0

                # Nested: extractors currently running for this snapshot
                active_extractors = [
                    {
                        'id': str(ar.id),
                        'extractor': ar.extractor,
                        'status': ar.status,
                        'started': ar.start_ts.isoformat() if ar.start_ts else None,
                        # TODO: no per-extractor progress is tracked yet, so a
                        # fixed 50% placeholder is reported for running extractors
                        'progress': 50,
                    }
                    for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
                ]

                active_snapshots_for_crawl.append({
                    'id': str(snapshot.id),
                    'url': snapshot.url[:80],
                    'status': snapshot.status,
                    'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
                    'progress': snapshot_progress,
                    'total_extractors': total_extractors,
                    'completed_extractors': completed_extractors,
                    'failed_extractors': failed_extractors,
                    'pending_extractors': pending_extractors,
                    'active_extractors': active_extractors,
                })

            active_crawls.append({
                'id': str(crawl.id),
                'label': str(crawl)[:60],
                'status': crawl.status,
                'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
                'progress': crawl_progress,
                'max_depth': crawl.max_depth,
                'total_snapshots': total_snapshots,
                'completed_snapshots': completed_snapshots,
                # TODO: Snapshot has no explicit "failed" status tracked here,
                # so failed_snapshots is always reported as 0 for now
                'failed_snapshots': 0,
                'pending_snapshots': pending_snapshots,
                'active_snapshots': active_snapshots_for_crawl,
            })

        return JsonResponse({
            'orchestrator_running': orchestrator_running,
            'total_workers': total_workers,
            'crawls_pending': crawl_stats['pending'],
            'crawls_started': crawl_stats['started'],
            'crawls_recent': crawl_stats['recent'],
            'snapshots_pending': snapshot_stats['pending'],
            'snapshots_started': snapshot_stats['started'],
            'archiveresults_pending': result_stats['pending'],
            'archiveresults_started': result_stats['started'],
            'archiveresults_succeeded': result_stats['succeeded'],
            'archiveresults_failed': result_stats['failed'],
            'active_crawls': active_crawls,
            'server_time': timezone.now().isoformat(),
        })
    except Exception as e:
        # Return the error + traceback with zeroed stats so the frontend
        # poller can surface the failure instead of silently breaking.
        # NOTE(review): exposing a full traceback assumes this endpoint is
        # admin-only — confirm it is not reachable by unauthenticated users.
        import traceback
        return JsonResponse({
            'error': str(e),
            'traceback': traceback.format_exc(),
            'orchestrator_running': False,
            'total_workers': 0,
            'crawls_pending': 0,
            'crawls_started': 0,
            'crawls_recent': 0,
            'snapshots_pending': 0,
            'snapshots_started': 0,
            'archiveresults_pending': 0,
            'archiveresults_started': 0,
            'archiveresults_succeeded': 0,
            'archiveresults_failed': 0,
            'active_crawls': [],
            'server_time': timezone.now().isoformat(),
        }, status=500)
def find_config_section(key: str) -> str:
CONFIGS = get_all_configs()