mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-03 14:27:55 +10:00
remove huey
This commit is contained in:
@@ -42,6 +42,7 @@ def register_urls(api: NinjaAPI) -> NinjaAPI:
|
|||||||
api.add_router('/crawls/', 'api.v1_crawls.router')
|
api.add_router('/crawls/', 'api.v1_crawls.router')
|
||||||
api.add_router('/cli/', 'api.v1_cli.router')
|
api.add_router('/cli/', 'api.v1_cli.router')
|
||||||
api.add_router('/workers/', 'api.v1_workers.router')
|
api.add_router('/workers/', 'api.v1_workers.router')
|
||||||
|
api.add_router('/machine/', 'api.v1_machine.router')
|
||||||
return api
|
return api
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -115,8 +115,9 @@ def cli_add(request, args: AddCommandSchema):
|
|||||||
update=args.update,
|
update=args.update,
|
||||||
index_only=args.index_only,
|
index_only=args.index_only,
|
||||||
overwrite=args.overwrite,
|
overwrite=args.overwrite,
|
||||||
extract=args.extract,
|
plugins=args.extract, # extract in API maps to plugins param
|
||||||
parser=args.parser,
|
parser=args.parser,
|
||||||
|
bg=True, # Always run in background for API calls
|
||||||
)
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
206
archivebox/api/v1_machine.py
Normal file
206
archivebox/api/v1_machine.py
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
__package__ = 'archivebox.api'
|
||||||
|
|
||||||
|
from uuid import UUID
|
||||||
|
from typing import List, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from ninja import Router, Schema, FilterSchema, Field, Query
|
||||||
|
from ninja.pagination import paginate
|
||||||
|
|
||||||
|
from api.v1_core import CustomPagination
|
||||||
|
|
||||||
|
|
||||||
|
router = Router(tags=['Machine and Dependencies'])
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Machine Schemas
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class MachineSchema(Schema):
|
||||||
|
"""Schema for Machine model."""
|
||||||
|
TYPE: str = 'machine.Machine'
|
||||||
|
id: UUID
|
||||||
|
created_at: datetime
|
||||||
|
modified_at: datetime
|
||||||
|
guid: str
|
||||||
|
hostname: str
|
||||||
|
hw_in_docker: bool
|
||||||
|
hw_in_vm: bool
|
||||||
|
hw_manufacturer: str
|
||||||
|
hw_product: str
|
||||||
|
hw_uuid: str
|
||||||
|
os_arch: str
|
||||||
|
os_family: str
|
||||||
|
os_platform: str
|
||||||
|
os_release: str
|
||||||
|
os_kernel: str
|
||||||
|
stats: dict
|
||||||
|
num_uses_succeeded: int
|
||||||
|
num_uses_failed: int
|
||||||
|
|
||||||
|
|
||||||
|
class MachineFilterSchema(FilterSchema):
|
||||||
|
id: Optional[str] = Field(None, q='id__startswith')
|
||||||
|
hostname: Optional[str] = Field(None, q='hostname__icontains')
|
||||||
|
os_platform: Optional[str] = Field(None, q='os_platform__icontains')
|
||||||
|
os_arch: Optional[str] = Field(None, q='os_arch')
|
||||||
|
hw_in_docker: Optional[bool] = Field(None, q='hw_in_docker')
|
||||||
|
hw_in_vm: Optional[bool] = Field(None, q='hw_in_vm')
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Dependency Schemas
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class DependencySchema(Schema):
|
||||||
|
"""Schema for Dependency model."""
|
||||||
|
TYPE: str = 'machine.Dependency'
|
||||||
|
id: UUID
|
||||||
|
created_at: datetime
|
||||||
|
modified_at: datetime
|
||||||
|
bin_name: str
|
||||||
|
bin_providers: str
|
||||||
|
custom_cmds: dict
|
||||||
|
config: dict
|
||||||
|
is_installed: bool
|
||||||
|
installed_count: int
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_is_installed(obj) -> bool:
|
||||||
|
return obj.is_installed
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_installed_count(obj) -> int:
|
||||||
|
return obj.installed_binaries.count()
|
||||||
|
|
||||||
|
|
||||||
|
class DependencyFilterSchema(FilterSchema):
|
||||||
|
id: Optional[str] = Field(None, q='id__startswith')
|
||||||
|
bin_name: Optional[str] = Field(None, q='bin_name__icontains')
|
||||||
|
bin_providers: Optional[str] = Field(None, q='bin_providers__icontains')
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# InstalledBinary Schemas
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class InstalledBinarySchema(Schema):
|
||||||
|
"""Schema for InstalledBinary model."""
|
||||||
|
TYPE: str = 'machine.InstalledBinary'
|
||||||
|
id: UUID
|
||||||
|
created_at: datetime
|
||||||
|
modified_at: datetime
|
||||||
|
machine_id: UUID
|
||||||
|
machine_hostname: str
|
||||||
|
dependency_id: Optional[UUID]
|
||||||
|
dependency_bin_name: Optional[str]
|
||||||
|
name: str
|
||||||
|
binprovider: str
|
||||||
|
abspath: str
|
||||||
|
version: str
|
||||||
|
sha256: str
|
||||||
|
is_valid: bool
|
||||||
|
num_uses_succeeded: int
|
||||||
|
num_uses_failed: int
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_machine_hostname(obj) -> str:
|
||||||
|
return obj.machine.hostname
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_dependency_id(obj) -> Optional[UUID]:
|
||||||
|
return obj.dependency_id
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_dependency_bin_name(obj) -> Optional[str]:
|
||||||
|
return obj.dependency.bin_name if obj.dependency else None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_is_valid(obj) -> bool:
|
||||||
|
return obj.is_valid
|
||||||
|
|
||||||
|
|
||||||
|
class InstalledBinaryFilterSchema(FilterSchema):
|
||||||
|
id: Optional[str] = Field(None, q='id__startswith')
|
||||||
|
name: Optional[str] = Field(None, q='name__icontains')
|
||||||
|
binprovider: Optional[str] = Field(None, q='binprovider')
|
||||||
|
machine_id: Optional[str] = Field(None, q='machine_id__startswith')
|
||||||
|
dependency_id: Optional[str] = Field(None, q='dependency_id__startswith')
|
||||||
|
version: Optional[str] = Field(None, q='version__icontains')
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Machine Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.get("/machines", response=List[MachineSchema], url_name="get_machines")
|
||||||
|
@paginate(CustomPagination)
|
||||||
|
def get_machines(request, filters: MachineFilterSchema = Query(...)):
|
||||||
|
"""List all machines."""
|
||||||
|
from machine.models import Machine
|
||||||
|
return filters.filter(Machine.objects.all()).distinct()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
|
||||||
|
def get_machine(request, machine_id: str):
|
||||||
|
"""Get a specific machine by ID."""
|
||||||
|
from machine.models import Machine
|
||||||
|
from django.db.models import Q
|
||||||
|
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
|
||||||
|
def get_current_machine(request):
|
||||||
|
"""Get the current machine."""
|
||||||
|
from machine.models import Machine
|
||||||
|
return Machine.current()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Dependency Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.get("/dependencies", response=List[DependencySchema], url_name="get_dependencies")
|
||||||
|
@paginate(CustomPagination)
|
||||||
|
def get_dependencies(request, filters: DependencyFilterSchema = Query(...)):
|
||||||
|
"""List all dependencies."""
|
||||||
|
from machine.models import Dependency
|
||||||
|
return filters.filter(Dependency.objects.all()).distinct()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/dependency/{dependency_id}", response=DependencySchema, url_name="get_dependency")
|
||||||
|
def get_dependency(request, dependency_id: str):
|
||||||
|
"""Get a specific dependency by ID or bin_name."""
|
||||||
|
from machine.models import Dependency
|
||||||
|
from django.db.models import Q
|
||||||
|
try:
|
||||||
|
return Dependency.objects.get(Q(id__startswith=dependency_id))
|
||||||
|
except Dependency.DoesNotExist:
|
||||||
|
return Dependency.objects.get(bin_name__iexact=dependency_id)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# InstalledBinary Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.get("/binaries", response=List[InstalledBinarySchema], url_name="get_binaries")
|
||||||
|
@paginate(CustomPagination)
|
||||||
|
def get_binaries(request, filters: InstalledBinaryFilterSchema = Query(...)):
|
||||||
|
"""List all installed binaries."""
|
||||||
|
from machine.models import InstalledBinary
|
||||||
|
return filters.filter(InstalledBinary.objects.all().select_related('machine', 'dependency')).distinct()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/binary/{binary_id}", response=InstalledBinarySchema, url_name="get_binary")
|
||||||
|
def get_binary(request, binary_id: str):
|
||||||
|
"""Get a specific installed binary by ID."""
|
||||||
|
from machine.models import InstalledBinary
|
||||||
|
return InstalledBinary.objects.select_related('machine', 'dependency').get(id__startswith=binary_id)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/binary/by-name/{name}", response=List[InstalledBinarySchema], url_name="get_binaries_by_name")
|
||||||
|
def get_binaries_by_name(request, name: str):
|
||||||
|
"""Get all installed binaries with the given name."""
|
||||||
|
from machine.models import InstalledBinary
|
||||||
|
return list(InstalledBinary.objects.filter(name__iexact=name).select_related('machine', 'dependency'))
|
||||||
@@ -4,125 +4,157 @@ from uuid import UUID
|
|||||||
from typing import List, Any
|
from typing import List, Any
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
from ninja import Router, Schema
|
from ninja import Router, Schema
|
||||||
|
|
||||||
|
|
||||||
router = Router(tags=['Workers and Tasks'])
|
router = Router(tags=['Workers and Tasks'])
|
||||||
|
|
||||||
|
|
||||||
class TaskSchema(Schema):
|
class QueueItemSchema(Schema):
|
||||||
|
"""Schema for a single item in a worker's queue."""
|
||||||
TYPE: str
|
TYPE: str
|
||||||
|
|
||||||
id: UUID
|
id: UUID
|
||||||
description: str
|
|
||||||
|
|
||||||
status: str
|
status: str
|
||||||
retry_at: datetime | None
|
retry_at: datetime | None
|
||||||
|
|
||||||
created_at: datetime
|
created_at: datetime
|
||||||
modified_at: datetime
|
modified_at: datetime
|
||||||
created_by_id: int
|
description: str
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_TYPE(obj) -> str:
|
||||||
|
return f'{obj._meta.app_label}.{obj._meta.model_name}'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_description(obj) -> str:
|
def resolve_description(obj) -> str:
|
||||||
return str(obj)
|
return str(obj)
|
||||||
|
|
||||||
|
|
||||||
class ActorSchema(Schema):
|
class WorkerSchema(Schema):
|
||||||
# TYPE: str = 'workers.actor.ActorType'
|
"""Schema for a Worker type."""
|
||||||
|
name: str
|
||||||
# name: str
|
|
||||||
#pid: int | None
|
|
||||||
idle_count: int
|
|
||||||
launch_kwargs: dict[str, Any]
|
|
||||||
mode: str
|
|
||||||
|
|
||||||
model: str
|
model: str
|
||||||
statemachine: str
|
max_tick_time: int
|
||||||
ACTIVE_STATE: str
|
max_concurrent_tasks: int
|
||||||
EVENT_NAME: str
|
poll_interval: float
|
||||||
CLAIM_ORDER: list[str]
|
idle_timeout: int
|
||||||
CLAIM_FROM_TOP_N: int
|
running_count: int
|
||||||
CLAIM_ATOMIC: bool
|
running_workers: List[dict[str, Any]]
|
||||||
MAX_TICK_TIME: int
|
queue_count: int
|
||||||
MAX_CONCURRENT_ACTORS: int
|
queue: List[QueueItemSchema]
|
||||||
|
|
||||||
future: list[TaskSchema]
|
|
||||||
pending: list[TaskSchema]
|
|
||||||
stalled: list[TaskSchema]
|
|
||||||
active: list[TaskSchema]
|
|
||||||
past: list[TaskSchema]
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_model(obj) -> str:
|
def resolve_model(obj) -> str:
|
||||||
return obj.Model.__name__
|
Model = obj.get_model()
|
||||||
|
return f'{Model._meta.app_label}.{Model._meta.model_name}'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_statemachine(obj) -> str:
|
def resolve_max_tick_time(obj) -> int:
|
||||||
return obj.StateMachineClass.__name__
|
return obj.MAX_TICK_TIME
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_name(obj) -> str:
|
def resolve_max_concurrent_tasks(obj) -> int:
|
||||||
return str(obj)
|
return obj.MAX_CONCURRENT_TASKS
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_ACTIVE_STATE(obj) -> str:
|
def resolve_poll_interval(obj) -> float:
|
||||||
return str(obj.ACTIVE_STATE)
|
return obj.POLL_INTERVAL
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_FINAL_STATES(obj) -> list[str]:
|
def resolve_idle_timeout(obj) -> int:
|
||||||
return [str(state) for state in obj.FINAL_STATES]
|
return obj.IDLE_TIMEOUT
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_future(obj) -> list[TaskSchema]:
|
def resolve_running_count(obj) -> int:
|
||||||
return [obj for obj in obj.qs.filter(obj.future_q).order_by('-retry_at')]
|
return len(obj.get_running_workers())
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_pending(obj) -> list[TaskSchema]:
|
def resolve_running_workers(obj) -> List[dict[str, Any]]:
|
||||||
return [obj for obj in obj.qs.filter(obj.pending_q).order_by('-retry_at')]
|
return obj.get_running_workers()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_stalled(obj) -> list[TaskSchema]:
|
def resolve_queue_count(obj) -> int:
|
||||||
return [obj for obj in obj.qs.filter(obj.stalled_q).order_by('-retry_at')]
|
return obj.get_queue().count()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_active(obj) -> list[TaskSchema]:
|
def resolve_queue(obj) -> List[QueueItemSchema]:
|
||||||
return [obj for obj in obj.qs.filter(obj.active_q).order_by('-retry_at')]
|
return list(obj.get_queue()[:50]) # Limit to 50 items
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def resolve_past(obj) -> list[TaskSchema]:
|
|
||||||
return [obj for obj in obj.qs.filter(obj.final_q).order_by('-modified_at')]
|
|
||||||
|
|
||||||
|
|
||||||
class OrchestratorSchema(Schema):
|
class OrchestratorSchema(Schema):
|
||||||
# TYPE: str = 'workers.orchestrator.Orchestrator'
|
"""Schema for the Orchestrator."""
|
||||||
|
is_running: bool
|
||||||
#pid: int | None
|
poll_interval: float
|
||||||
exit_on_idle: bool
|
idle_timeout: int
|
||||||
mode: str
|
max_workers_per_type: int
|
||||||
|
max_total_workers: int
|
||||||
actors: list[ActorSchema]
|
total_worker_count: int
|
||||||
|
workers: List[WorkerSchema]
|
||||||
@staticmethod
|
|
||||||
def resolve_actors(obj) -> list[ActorSchema]:
|
|
||||||
return [actor() for actor in obj.actor_types.values()]
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/orchestrators", response=List[OrchestratorSchema], url_name="get_orchestrators")
|
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
|
||||||
def get_orchestrators(request):
|
def get_orchestrator(request):
|
||||||
"""List all the task orchestrators (aka Orchestrators) that are currently running"""
|
"""Get the orchestrator status and all worker queues."""
|
||||||
|
|
||||||
from workers.orchestrator import Orchestrator
|
from workers.orchestrator import Orchestrator
|
||||||
|
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||||
|
|
||||||
orchestrator = Orchestrator()
|
orchestrator = Orchestrator()
|
||||||
|
|
||||||
return [orchestrator]
|
# Create temporary worker instances to query their queues
|
||||||
|
workers = [
|
||||||
|
CrawlWorker(worker_id=-1),
|
||||||
|
SnapshotWorker(worker_id=-1),
|
||||||
|
ArchiveResultWorker(worker_id=-1),
|
||||||
|
]
|
||||||
|
|
||||||
|
return {
|
||||||
|
'is_running': orchestrator.is_running(),
|
||||||
|
'poll_interval': orchestrator.POLL_INTERVAL,
|
||||||
|
'idle_timeout': orchestrator.IDLE_TIMEOUT,
|
||||||
|
'max_workers_per_type': orchestrator.MAX_WORKERS_PER_TYPE,
|
||||||
|
'max_total_workers': orchestrator.MAX_TOTAL_WORKERS,
|
||||||
|
'total_worker_count': orchestrator.get_total_worker_count(),
|
||||||
|
'workers': workers,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@router.get("/actors", response=List[ActorSchema], url_name="get_actors")
|
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
|
||||||
def get_actors(request):
|
def get_workers(request):
|
||||||
"""List all the task consumer workers (aka Actors) that are currently running"""
|
"""List all worker types and their current status."""
|
||||||
|
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||||
|
|
||||||
from workers.orchestrator import Orchestrator
|
# Create temporary instances to query their queues
|
||||||
orchestrator = Orchestrator()
|
return [
|
||||||
return orchestrator.actor_types.values()
|
CrawlWorker(worker_id=-1),
|
||||||
|
SnapshotWorker(worker_id=-1),
|
||||||
|
ArchiveResultWorker(worker_id=-1),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/worker/{worker_name}", response=WorkerSchema, url_name="get_worker")
|
||||||
|
def get_worker(request, worker_name: str):
|
||||||
|
"""Get status and queue for a specific worker type."""
|
||||||
|
from workers.worker import WORKER_TYPES
|
||||||
|
|
||||||
|
if worker_name not in WORKER_TYPES:
|
||||||
|
from ninja.errors import HttpError
|
||||||
|
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
|
||||||
|
|
||||||
|
WorkerClass = WORKER_TYPES[worker_name]
|
||||||
|
return WorkerClass(worker_id=-1)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/worker/{worker_name}/queue", response=List[QueueItemSchema], url_name="get_worker_queue")
|
||||||
|
def get_worker_queue(request, worker_name: str, limit: int = 100):
|
||||||
|
"""Get the current queue for a specific worker type."""
|
||||||
|
from workers.worker import WORKER_TYPES
|
||||||
|
|
||||||
|
if worker_name not in WORKER_TYPES:
|
||||||
|
from ninja.errors import HttpError
|
||||||
|
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
|
||||||
|
|
||||||
|
WorkerClass = WORKER_TYPES[worker_name]
|
||||||
|
worker = WorkerClass(worker_id=-1)
|
||||||
|
return list(worker.get_queue()[:limit])
|
||||||
|
|
||||||
|
|
||||||
|
# Progress endpoint moved to core.views.live_progress_view for simplicity
|
||||||
|
|||||||
@@ -2,76 +2,226 @@
|
|||||||
|
|
||||||
__package__ = 'archivebox.base_models'
|
__package__ = 'archivebox.base_models'
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from django import forms
|
||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from django.utils.html import format_html, mark_safe
|
from django.utils.html import format_html, mark_safe
|
||||||
from django_object_actions import DjangoObjectActions
|
from django_object_actions import DjangoObjectActions
|
||||||
|
|
||||||
|
|
||||||
|
class KeyValueWidget(forms.Widget):
|
||||||
|
"""
|
||||||
|
A widget that renders JSON dict as editable key-value input fields
|
||||||
|
with + and - buttons to add/remove rows.
|
||||||
|
Includes autocomplete for available config keys from the plugin system.
|
||||||
|
"""
|
||||||
|
template_name = None # We render manually
|
||||||
|
|
||||||
|
class Media:
|
||||||
|
css = {
|
||||||
|
'all': []
|
||||||
|
}
|
||||||
|
js = []
|
||||||
|
|
||||||
|
def _get_config_options(self):
|
||||||
|
"""Get available config options from plugins."""
|
||||||
|
try:
|
||||||
|
from archivebox.hooks import discover_plugin_configs
|
||||||
|
plugin_configs = discover_plugin_configs()
|
||||||
|
options = {}
|
||||||
|
for plugin_name, schema in plugin_configs.items():
|
||||||
|
for key, prop in schema.get('properties', {}).items():
|
||||||
|
options[key] = {
|
||||||
|
'plugin': plugin_name,
|
||||||
|
'type': prop.get('type', 'string'),
|
||||||
|
'default': prop.get('default', ''),
|
||||||
|
'description': prop.get('description', ''),
|
||||||
|
}
|
||||||
|
return options
|
||||||
|
except Exception:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def render(self, name, value, attrs=None, renderer=None):
|
||||||
|
# Parse JSON value to dict
|
||||||
|
if value is None:
|
||||||
|
data = {}
|
||||||
|
elif isinstance(value, str):
|
||||||
|
try:
|
||||||
|
data = json.loads(value) if value else {}
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
data = {}
|
||||||
|
elif isinstance(value, dict):
|
||||||
|
data = value
|
||||||
|
else:
|
||||||
|
data = {}
|
||||||
|
|
||||||
|
widget_id = attrs.get('id', name) if attrs else name
|
||||||
|
config_options = self._get_config_options()
|
||||||
|
|
||||||
|
# Build datalist options
|
||||||
|
datalist_options = '\n'.join(
|
||||||
|
f'<option value="{self._escape(key)}">{self._escape(opt["description"][:60] or opt["type"])}</option>'
|
||||||
|
for key, opt in sorted(config_options.items())
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build config metadata as JSON for JS
|
||||||
|
config_meta_json = json.dumps(config_options)
|
||||||
|
|
||||||
|
html = f'''
|
||||||
|
<div id="{widget_id}_container" class="key-value-editor" style="max-width: 700px;">
|
||||||
|
<datalist id="{widget_id}_keys">
|
||||||
|
{datalist_options}
|
||||||
|
</datalist>
|
||||||
|
<div id="{widget_id}_rows" class="key-value-rows">
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Render existing key-value pairs
|
||||||
|
row_idx = 0
|
||||||
|
for key, val in data.items():
|
||||||
|
val_str = json.dumps(val) if not isinstance(val, str) else val
|
||||||
|
html += self._render_row(widget_id, row_idx, key, val_str)
|
||||||
|
row_idx += 1
|
||||||
|
|
||||||
|
# Always add one empty row for new entries
|
||||||
|
html += self._render_row(widget_id, row_idx, '', '')
|
||||||
|
|
||||||
|
html += f'''
|
||||||
|
</div>
|
||||||
|
<div style="display: flex; gap: 8px; align-items: center; margin-top: 8px;">
|
||||||
|
<button type="button" onclick="addKeyValueRow_{widget_id}()"
|
||||||
|
style="padding: 4px 12px; cursor: pointer; background: #417690; color: white; border: none; border-radius: 4px;">
|
||||||
|
+ Add Row
|
||||||
|
</button>
|
||||||
|
<span id="{widget_id}_hint" style="font-size: 11px; color: #666; font-style: italic;"></span>
|
||||||
|
</div>
|
||||||
|
<input type="hidden" name="{name}" id="{widget_id}" value="">
|
||||||
|
<script>
|
||||||
|
(function() {{
|
||||||
|
var configMeta_{widget_id} = {config_meta_json};
|
||||||
|
|
||||||
|
function showKeyHint_{widget_id}(key) {{
|
||||||
|
var hint = document.getElementById('{widget_id}_hint');
|
||||||
|
var meta = configMeta_{widget_id}[key];
|
||||||
|
if (meta) {{
|
||||||
|
hint.innerHTML = '<b>' + key + '</b>: ' + (meta.description || meta.type) +
|
||||||
|
(meta.default !== '' ? ' <span style="color:#888">(default: ' + meta.default + ')</span>' : '');
|
||||||
|
}} else {{
|
||||||
|
hint.textContent = key ? 'Custom key: ' + key : '';
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
|
||||||
|
function updateHiddenField_{widget_id}() {{
|
||||||
|
var container = document.getElementById('{widget_id}_rows');
|
||||||
|
var rows = container.querySelectorAll('.key-value-row');
|
||||||
|
var result = {{}};
|
||||||
|
rows.forEach(function(row) {{
|
||||||
|
var keyInput = row.querySelector('.kv-key');
|
||||||
|
var valInput = row.querySelector('.kv-value');
|
||||||
|
if (keyInput && valInput && keyInput.value.trim()) {{
|
||||||
|
var key = keyInput.value.trim();
|
||||||
|
var val = valInput.value.trim();
|
||||||
|
// Try to parse as JSON (for booleans, numbers, etc)
|
||||||
|
try {{
|
||||||
|
if (val === 'true') result[key] = true;
|
||||||
|
else if (val === 'false') result[key] = false;
|
||||||
|
else if (val === 'null') result[key] = null;
|
||||||
|
else if (!isNaN(val) && val !== '') result[key] = Number(val);
|
||||||
|
else if ((val.startsWith('{{') && val.endsWith('}}')) ||
|
||||||
|
(val.startsWith('[') && val.endsWith(']')) ||
|
||||||
|
(val.startsWith('"') && val.endsWith('"')))
|
||||||
|
result[key] = JSON.parse(val);
|
||||||
|
else result[key] = val;
|
||||||
|
}} catch(e) {{
|
||||||
|
result[key] = val;
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
document.getElementById('{widget_id}').value = JSON.stringify(result);
|
||||||
|
}}
|
||||||
|
|
||||||
|
window.addKeyValueRow_{widget_id} = function() {{
|
||||||
|
var container = document.getElementById('{widget_id}_rows');
|
||||||
|
var rows = container.querySelectorAll('.key-value-row');
|
||||||
|
var newIdx = rows.length;
|
||||||
|
var newRow = document.createElement('div');
|
||||||
|
newRow.className = 'key-value-row';
|
||||||
|
newRow.style.cssText = 'display: flex; gap: 8px; margin-bottom: 6px; align-items: center;';
|
||||||
|
newRow.innerHTML = '<input type="text" class="kv-key" placeholder="KEY" list="{widget_id}_keys" ' +
|
||||||
|
'style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
|
||||||
|
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">' +
|
||||||
|
'<input type="text" class="kv-value" placeholder="value" ' +
|
||||||
|
'style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
|
||||||
|
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">' +
|
||||||
|
'<button type="button" onclick="removeKeyValueRow_{widget_id}(this)" ' +
|
||||||
|
'style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>';
|
||||||
|
container.appendChild(newRow);
|
||||||
|
newRow.querySelector('.kv-key').focus();
|
||||||
|
}};
|
||||||
|
|
||||||
|
window.removeKeyValueRow_{widget_id} = function(btn) {{
|
||||||
|
var row = btn.parentElement;
|
||||||
|
row.remove();
|
||||||
|
updateHiddenField_{widget_id}();
|
||||||
|
}};
|
||||||
|
|
||||||
|
window.showKeyHint_{widget_id} = showKeyHint_{widget_id};
|
||||||
|
window.updateHiddenField_{widget_id} = updateHiddenField_{widget_id};
|
||||||
|
|
||||||
|
// Initialize on load
|
||||||
|
document.addEventListener('DOMContentLoaded', function() {{
|
||||||
|
updateHiddenField_{widget_id}();
|
||||||
|
}});
|
||||||
|
// Also run immediately in case DOM is already ready
|
||||||
|
if (document.readyState !== 'loading') {{
|
||||||
|
updateHiddenField_{widget_id}();
|
||||||
|
}}
|
||||||
|
|
||||||
|
// Update on any input change
|
||||||
|
document.getElementById('{widget_id}_rows').addEventListener('input', updateHiddenField_{widget_id});
|
||||||
|
}})();
|
||||||
|
</script>
|
||||||
|
</div>
|
||||||
|
'''
|
||||||
|
return mark_safe(html)
|
||||||
|
|
||||||
|
def _render_row(self, widget_id, idx, key, value):
|
||||||
|
return f'''
|
||||||
|
<div class="key-value-row" style="display: flex; gap: 8px; margin-bottom: 6px; align-items: center;">
|
||||||
|
<input type="text" class="kv-key" value="{self._escape(key)}" placeholder="KEY" list="{widget_id}_keys"
|
||||||
|
style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
|
||||||
|
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">
|
||||||
|
<input type="text" class="kv-value" value="{self._escape(value)}" placeholder="value"
|
||||||
|
style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
|
||||||
|
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">
|
||||||
|
<button type="button" onclick="removeKeyValueRow_{widget_id}(this)"
|
||||||
|
style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>
|
||||||
|
</div>
|
||||||
|
'''
|
||||||
|
|
||||||
|
def _escape(self, s):
|
||||||
|
"""Escape HTML special chars in attribute values."""
|
||||||
|
if not s:
|
||||||
|
return ''
|
||||||
|
return str(s).replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||||
|
|
||||||
|
def value_from_datadict(self, data, files, name):
|
||||||
|
value = data.get(name, '{}')
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
class ConfigEditorMixin:
|
class ConfigEditorMixin:
|
||||||
"""
|
"""
|
||||||
Mixin for admin classes with a config JSON field.
|
Mixin for admin classes with a config JSON field.
|
||||||
|
|
||||||
Provides a readonly field that shows available config options
|
Provides a key-value editor widget with autocomplete for available config keys.
|
||||||
from all discovered plugin schemas.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@admin.display(description='Available Config Options')
|
def formfield_for_dbfield(self, db_field, request, **kwargs):
|
||||||
def available_config_options(self, obj):
|
"""Use KeyValueWidget for the config JSON field."""
|
||||||
"""Show documentation for available config keys."""
|
if db_field.name == 'config':
|
||||||
try:
|
kwargs['widget'] = KeyValueWidget()
|
||||||
from archivebox.hooks import discover_plugin_configs
|
return super().formfield_for_dbfield(db_field, request, **kwargs)
|
||||||
plugin_configs = discover_plugin_configs()
|
|
||||||
except ImportError:
|
|
||||||
return format_html('<i>Plugin config system not available</i>')
|
|
||||||
|
|
||||||
html_parts = [
|
|
||||||
'<details>',
|
|
||||||
'<summary style="cursor: pointer; font-weight: bold; padding: 4px;">',
|
|
||||||
'Click to see available config keys ({})</summary>'.format(
|
|
||||||
sum(len(s.get('properties', {})) for s in plugin_configs.values())
|
|
||||||
),
|
|
||||||
'<div style="max-height: 400px; overflow-y: auto; padding: 8px; background: #f8f8f8; border-radius: 4px; font-family: monospace; font-size: 11px;">',
|
|
||||||
]
|
|
||||||
|
|
||||||
for plugin_name, schema in sorted(plugin_configs.items()):
|
|
||||||
properties = schema.get('properties', {})
|
|
||||||
if not properties:
|
|
||||||
continue
|
|
||||||
|
|
||||||
html_parts.append(f'<div style="margin: 8px 0;"><strong style="color: #333;">{plugin_name}</strong></div>')
|
|
||||||
html_parts.append('<table style="width: 100%; border-collapse: collapse; margin-bottom: 12px;">')
|
|
||||||
html_parts.append('<tr style="background: #eee;"><th style="text-align: left; padding: 4px;">Key</th><th style="text-align: left; padding: 4px;">Type</th><th style="text-align: left; padding: 4px;">Default</th><th style="text-align: left; padding: 4px;">Description</th></tr>')
|
|
||||||
|
|
||||||
for key, prop in sorted(properties.items()):
|
|
||||||
prop_type = prop.get('type', 'string')
|
|
||||||
default = prop.get('default', '')
|
|
||||||
description = prop.get('description', '')
|
|
||||||
|
|
||||||
# Truncate long defaults
|
|
||||||
default_str = str(default)
|
|
||||||
if len(default_str) > 30:
|
|
||||||
default_str = default_str[:27] + '...'
|
|
||||||
|
|
||||||
html_parts.append(
|
|
||||||
f'<tr style="border-bottom: 1px solid #ddd;">'
|
|
||||||
f'<td style="padding: 4px; font-weight: bold;">{key}</td>'
|
|
||||||
f'<td style="padding: 4px; color: #666;">{prop_type}</td>'
|
|
||||||
f'<td style="padding: 4px; color: #666;">{default_str}</td>'
|
|
||||||
f'<td style="padding: 4px;">{description}</td>'
|
|
||||||
f'</tr>'
|
|
||||||
)
|
|
||||||
|
|
||||||
html_parts.append('</table>')
|
|
||||||
|
|
||||||
html_parts.append('</div></details>')
|
|
||||||
html_parts.append(
|
|
||||||
'<p style="margin-top: 8px; color: #666; font-size: 11px;">'
|
|
||||||
'<strong>Usage:</strong> Add key-value pairs in JSON format, e.g., '
|
|
||||||
'<code>{"SAVE_WGET": false, "WGET_TIMEOUT": 120}</code>'
|
|
||||||
'</p>'
|
|
||||||
)
|
|
||||||
|
|
||||||
return mark_safe(''.join(html_parts))
|
|
||||||
|
|
||||||
|
|
||||||
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
|
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
|
||||||
|
|||||||
@@ -72,9 +72,10 @@ def add(urls: str | list[str],
|
|||||||
cli_args[0] = 'archivebox'
|
cli_args[0] = 'archivebox'
|
||||||
cmd_str = ' '.join(cli_args)
|
cmd_str = ' '.join(cli_args)
|
||||||
|
|
||||||
|
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||||
seed = Seed.from_file(
|
seed = Seed.from_file(
|
||||||
sources_file,
|
sources_file,
|
||||||
label=f'{USER}@{HOSTNAME} $ {cmd_str}',
|
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
||||||
parser=parser,
|
parser=parser,
|
||||||
tag=tag,
|
tag=tag,
|
||||||
created_by=created_by_id,
|
created_by=created_by_id,
|
||||||
|
|||||||
@@ -11,21 +11,53 @@ __package__ = "archivebox.config"
|
|||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, Optional, List, Type, TYPE_CHECKING, cast
|
from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast
|
||||||
from configparser import ConfigParser
|
from configparser import ConfigParser
|
||||||
|
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
from pydantic_settings import BaseSettings
|
from pydantic_settings import BaseSettings, PydanticBaseSettingsSource
|
||||||
|
|
||||||
|
|
||||||
|
class IniConfigSettingsSource(PydanticBaseSettingsSource):
|
||||||
|
"""
|
||||||
|
Custom settings source that reads from ArchiveBox.conf (INI format).
|
||||||
|
Flattens all sections into a single namespace.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
|
||||||
|
config_vals = self._load_config_file()
|
||||||
|
field_value = config_vals.get(field_name.upper())
|
||||||
|
return field_value, field_name, False
|
||||||
|
|
||||||
|
def __call__(self) -> Dict[str, Any]:
|
||||||
|
return self._load_config_file()
|
||||||
|
|
||||||
|
def _load_config_file(self) -> Dict[str, Any]:
|
||||||
|
try:
|
||||||
|
from archivebox.config.constants import CONSTANTS
|
||||||
|
config_path = CONSTANTS.CONFIG_FILE
|
||||||
|
except ImportError:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
if not config_path.exists():
|
||||||
|
return {}
|
||||||
|
|
||||||
|
parser = ConfigParser()
|
||||||
|
parser.optionxform = lambda x: x # preserve case
|
||||||
|
parser.read(config_path)
|
||||||
|
|
||||||
|
# Flatten all sections into single namespace (ignore section headers)
|
||||||
|
return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
|
||||||
|
|
||||||
|
|
||||||
class BaseConfigSet(BaseSettings):
|
class BaseConfigSet(BaseSettings):
|
||||||
"""
|
"""
|
||||||
Base class for config sections.
|
Base class for config sections.
|
||||||
|
|
||||||
Automatically loads values from:
|
Automatically loads values from (highest to lowest priority):
|
||||||
1. Environment variables (highest priority)
|
1. Environment variables
|
||||||
2. ArchiveBox.conf file (if exists)
|
2. ArchiveBox.conf file (INI format, flattened)
|
||||||
3. Default values (lowest priority)
|
3. Default values
|
||||||
|
|
||||||
Subclasses define fields with defaults and types:
|
Subclasses define fields with defaults and types:
|
||||||
|
|
||||||
@@ -35,11 +67,30 @@ class BaseConfigSet(BaseSettings):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
# Use env vars with ARCHIVEBOX_ prefix or raw name
|
|
||||||
env_prefix = ""
|
env_prefix = ""
|
||||||
extra = "ignore"
|
extra = "ignore"
|
||||||
validate_default = True
|
validate_default = True
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def settings_customise_sources(
|
||||||
|
cls,
|
||||||
|
settings_cls: Type[BaseSettings],
|
||||||
|
init_settings: PydanticBaseSettingsSource,
|
||||||
|
env_settings: PydanticBaseSettingsSource,
|
||||||
|
dotenv_settings: PydanticBaseSettingsSource,
|
||||||
|
file_secret_settings: PydanticBaseSettingsSource,
|
||||||
|
) -> Tuple[PydanticBaseSettingsSource, ...]:
|
||||||
|
"""
|
||||||
|
Define the order of settings sources (first = highest priority).
|
||||||
|
"""
|
||||||
|
return (
|
||||||
|
init_settings, # 1. Passed to __init__
|
||||||
|
env_settings, # 2. Environment variables
|
||||||
|
IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file
|
||||||
|
# dotenv_settings, # Skip .env files
|
||||||
|
# file_secret_settings, # Skip secrets files
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
|
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
|
||||||
"""Load config values from INI file."""
|
"""Load config values from INI file."""
|
||||||
@@ -47,7 +98,7 @@ class BaseConfigSet(BaseSettings):
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
parser = ConfigParser()
|
parser = ConfigParser()
|
||||||
parser.optionxform = lambda x: x # type: ignore # preserve case
|
parser.optionxform = lambda x: x # preserve case
|
||||||
parser.read(config_path)
|
parser.read(config_path)
|
||||||
|
|
||||||
# Flatten all sections into single namespace
|
# Flatten all sections into single namespace
|
||||||
|
|||||||
@@ -256,7 +256,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||||||
# Show a helpful message when no plugins found
|
# Show a helpful message when no plugins found
|
||||||
rows['Name'].append('(no plugins found)')
|
rows['Name'].append('(no plugins found)')
|
||||||
rows['Source'].append('-')
|
rows['Source'].append('-')
|
||||||
rows['Path'].append(format_html('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
|
rows['Path'].append(mark_safe('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
|
||||||
rows['Hooks'].append('-')
|
rows['Hooks'].append('-')
|
||||||
|
|
||||||
return TableContext(
|
return TableContext(
|
||||||
|
|||||||
@@ -9,25 +9,17 @@ from django.core.exceptions import ValidationError
|
|||||||
from django.urls import reverse, resolve
|
from django.urls import reverse, resolve
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from huey_monitor.admin import TaskModel
|
|
||||||
|
|
||||||
from archivebox.config import DATA_DIR
|
from archivebox.config import DATA_DIR
|
||||||
from archivebox.config.common import SERVER_CONFIG
|
from archivebox.config.common import SERVER_CONFIG
|
||||||
from archivebox.misc.paginators import AccelleratedPaginator
|
from archivebox.misc.paginators import AccelleratedPaginator
|
||||||
from archivebox.base_models.admin import BaseModelAdmin
|
from archivebox.base_models.admin import BaseModelAdmin
|
||||||
|
from archivebox.hooks import get_extractor_icon
|
||||||
|
|
||||||
|
|
||||||
from core.models import ArchiveResult, Snapshot
|
from core.models import ArchiveResult, Snapshot
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def result_url(result: TaskModel) -> str:
|
|
||||||
url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
|
|
||||||
return format_html('<a href="{url}" class="fade-in-progress-url">See progress...</a>'.format(url=url))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ArchiveResultInline(admin.TabularInline):
|
class ArchiveResultInline(admin.TabularInline):
|
||||||
name = 'Archive Results Log'
|
name = 'Archive Results Log'
|
||||||
model = ArchiveResult
|
model = ArchiveResult
|
||||||
@@ -101,9 +93,9 @@ class ArchiveResultInline(admin.TabularInline):
|
|||||||
|
|
||||||
|
|
||||||
class ArchiveResultAdmin(BaseModelAdmin):
|
class ArchiveResultAdmin(BaseModelAdmin):
|
||||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor', 'cmd_str', 'output_str')
|
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
|
||||||
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
|
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
|
||||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary')
|
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
|
||||||
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||||
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
|
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
|
||||||
autocomplete_fields = ['snapshot']
|
autocomplete_fields = ['snapshot']
|
||||||
@@ -144,6 +136,16 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
|||||||
def tags_str(self, result):
|
def tags_str(self, result):
|
||||||
return result.snapshot.tags_str()
|
return result.snapshot.tags_str()
|
||||||
|
|
||||||
|
@admin.display(description='Extractor', ordering='extractor')
|
||||||
|
def extractor_with_icon(self, result):
|
||||||
|
icon = get_extractor_icon(result.extractor)
|
||||||
|
return format_html(
|
||||||
|
'<span title="{}">{}</span> {}',
|
||||||
|
result.extractor,
|
||||||
|
icon,
|
||||||
|
result.extractor,
|
||||||
|
)
|
||||||
|
|
||||||
def cmd_str(self, result):
|
def cmd_str(self, result):
|
||||||
return format_html(
|
return format_html(
|
||||||
'<pre>{}</pre>',
|
'<pre>{}</pre>',
|
||||||
@@ -151,10 +153,12 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def output_str(self, result):
|
def output_str(self, result):
|
||||||
|
# Determine output link path - use output if file exists, otherwise link to index
|
||||||
|
output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
|
||||||
return format_html(
|
return format_html(
|
||||||
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
|
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
|
||||||
result.snapshot.timestamp,
|
result.snapshot.timestamp,
|
||||||
result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
|
output_path,
|
||||||
result.output,
|
result.output,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -185,7 +189,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
|||||||
is_hidden = filename.startswith('.')
|
is_hidden = filename.startswith('.')
|
||||||
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
|
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
|
||||||
|
|
||||||
return output_str + format_html('</code></pre>')
|
return output_str + mark_safe('</code></pre>')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -36,7 +36,18 @@ def register_admin_site():
|
|||||||
admin.site = archivebox_admin
|
admin.site = archivebox_admin
|
||||||
sites.site = archivebox_admin
|
sites.site = archivebox_admin
|
||||||
|
|
||||||
# Plugin admin registration is now handled by individual app admins
|
# Register admin views for each app
|
||||||
# No longer using archivebox.pm.hook.register_admin()
|
# (Previously handled by ABX plugin system, now called directly)
|
||||||
|
from core.admin import register_admin as register_core_admin
|
||||||
|
from crawls.admin import register_admin as register_crawls_admin
|
||||||
|
from api.admin import register_admin as register_api_admin
|
||||||
|
from machine.admin import register_admin as register_machine_admin
|
||||||
|
from workers.admin import register_admin as register_workers_admin
|
||||||
|
|
||||||
|
register_core_admin(archivebox_admin)
|
||||||
|
register_crawls_admin(archivebox_admin)
|
||||||
|
register_api_admin(archivebox_admin)
|
||||||
|
register_machine_admin(archivebox_admin)
|
||||||
|
register_workers_admin(archivebox_admin)
|
||||||
|
|
||||||
return archivebox_admin
|
return archivebox_admin
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
|||||||
|
|
||||||
from core.models import Tag
|
from core.models import Tag
|
||||||
from core.admin_tags import TagInline
|
from core.admin_tags import TagInline
|
||||||
from core.admin_archiveresults import ArchiveResultInline, result_url
|
from core.admin_archiveresults import ArchiveResultInline
|
||||||
|
|
||||||
|
|
||||||
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
|
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
|
||||||
@@ -54,10 +54,10 @@ class SnapshotActionForm(ActionForm):
|
|||||||
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||||
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
|
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
|
||||||
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
||||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
|
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
|
||||||
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
||||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
|
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
|
||||||
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
|
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
|
||||||
ordering = ['-created_at']
|
ordering = ['-created_at']
|
||||||
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
||||||
inlines = [TagInline, ArchiveResultInline]
|
inlines = [TagInline, ArchiveResultInline]
|
||||||
@@ -93,12 +93,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
# self.request = request
|
# self.request = request
|
||||||
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
|
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
|
||||||
|
|
||||||
@admin.action(
|
@admin.display(description="Imported Timestamp")
|
||||||
description="Imported Timestamp"
|
|
||||||
)
|
|
||||||
def imported_timestamp(self, obj):
|
def imported_timestamp(self, obj):
|
||||||
context = RequestContext(self.request, {
|
context = RequestContext(self.request, {
|
||||||
'bookmarked_date': obj.bookmarked,
|
'bookmarked_date': obj.bookmarked_at,
|
||||||
'timestamp': obj.timestamp,
|
'timestamp': obj.timestamp,
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -145,22 +143,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
|
|
||||||
def status_info(self, obj):
|
def status_info(self, obj):
|
||||||
return format_html(
|
return format_html(
|
||||||
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
|
|
||||||
'''
|
'''
|
||||||
Archived: {} ({} files {})
|
Archived: {} ({} files {})
|
||||||
Favicon: <img src="{}" style="height: 20px"/>
|
Favicon: <img src="{}" style="height: 20px"/>
|
||||||
Status code: {} <br/>
|
|
||||||
Server: {}
|
|
||||||
Content type: {}
|
|
||||||
Extension: {}
|
Extension: {}
|
||||||
''',
|
''',
|
||||||
'✅' if obj.is_archived else '❌',
|
'✅' if obj.is_archived else '❌',
|
||||||
obj.num_outputs,
|
obj.num_outputs,
|
||||||
self.size(obj) or '0kb',
|
self.size(obj) or '0kb',
|
||||||
f'/archive/{obj.timestamp}/favicon.ico',
|
f'/archive/{obj.timestamp}/favicon.ico',
|
||||||
obj.status_code or '-',
|
|
||||||
obj.headers and obj.headers.get('Server') or '-',
|
|
||||||
obj.headers and obj.headers.get('Content-Type') or '-',
|
|
||||||
obj.extension or '-',
|
obj.extension or '-',
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -184,8 +175,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
obj.archive_path,
|
obj.archive_path,
|
||||||
obj.archive_path,
|
obj.archive_path,
|
||||||
obj.archive_path,
|
obj.archive_path,
|
||||||
'fetched' if obj.latest_title or obj.title else 'pending',
|
'fetched' if obj.title else 'pending',
|
||||||
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
|
urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...'
|
||||||
) + mark_safe(f' <span class="tags">{tags}</span>')
|
) + mark_safe(f' <span class="tags">{tags}</span>')
|
||||||
|
|
||||||
@admin.display(
|
@admin.display(
|
||||||
@@ -259,14 +250,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
description="ℹ️ Get Title"
|
description="ℹ️ Get Title"
|
||||||
)
|
)
|
||||||
def update_titles(self, request, queryset):
|
def update_titles(self, request, queryset):
|
||||||
from core.models import Snapshot
|
|
||||||
count = queryset.count()
|
count = queryset.count()
|
||||||
|
|
||||||
# Queue snapshots for archiving via the state machine system
|
# Queue snapshots for archiving via the state machine system
|
||||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
|
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
|
||||||
messages.success(
|
messages.success(
|
||||||
request,
|
request,
|
||||||
mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
|
f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
|
||||||
)
|
)
|
||||||
|
|
||||||
@admin.action(
|
@admin.action(
|
||||||
@@ -275,11 +265,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
def update_snapshots(self, request, queryset):
|
def update_snapshots(self, request, queryset):
|
||||||
count = queryset.count()
|
count = queryset.count()
|
||||||
|
|
||||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
|
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
|
||||||
|
|
||||||
messages.success(
|
messages.success(
|
||||||
request,
|
request,
|
||||||
mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
|
f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -291,11 +281,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
timestamp = timezone.now().isoformat('T', 'seconds')
|
timestamp = timezone.now().isoformat('T', 'seconds')
|
||||||
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
|
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
|
||||||
|
|
||||||
result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
|
bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
|
||||||
|
|
||||||
messages.success(
|
messages.success(
|
||||||
request,
|
request,
|
||||||
mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
|
f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
|
||||||
)
|
)
|
||||||
|
|
||||||
@admin.action(
|
@admin.action(
|
||||||
@@ -304,11 +294,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
def overwrite_snapshots(self, request, queryset):
|
def overwrite_snapshots(self, request, queryset):
|
||||||
count = queryset.count()
|
count = queryset.count()
|
||||||
|
|
||||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
|
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
|
||||||
|
|
||||||
messages.success(
|
messages.success(
|
||||||
request,
|
request,
|
||||||
mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
|
f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
|
||||||
)
|
)
|
||||||
|
|
||||||
@admin.action(
|
@admin.action(
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
__package__ = 'archivebox.core'
|
__package__ = 'archivebox.core'
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
from django.apps import AppConfig
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
@@ -11,5 +13,40 @@ class CoreConfig(AppConfig):
|
|||||||
from core.admin_site import register_admin_site
|
from core.admin_site import register_admin_site
|
||||||
register_admin_site()
|
register_admin_site()
|
||||||
|
|
||||||
|
# Auto-start the orchestrator when running the web server
|
||||||
|
self._maybe_start_orchestrator()
|
||||||
|
|
||||||
|
def _maybe_start_orchestrator(self):
|
||||||
|
"""Start the orchestrator if we're running a web server."""
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Don't start orchestrator during migrations, shell, tests, etc.
|
||||||
|
# Only start when running: runserver, daphne, gunicorn, uwsgi
|
||||||
|
if not self._is_web_server():
|
||||||
|
return
|
||||||
|
|
||||||
|
# Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
|
||||||
|
if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
|
||||||
|
return
|
||||||
|
|
||||||
|
# Don't start in autoreload child process (avoid double-start)
|
||||||
|
if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
from workers.orchestrator import Orchestrator
|
||||||
|
|
||||||
|
if not Orchestrator.is_running():
|
||||||
|
# Start orchestrator as daemon (won't exit on idle when started by server)
|
||||||
|
orchestrator = Orchestrator(exit_on_idle=False)
|
||||||
|
orchestrator.start()
|
||||||
|
except Exception as e:
|
||||||
|
# Don't crash the server if orchestrator fails to start
|
||||||
|
import logging
|
||||||
|
logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
|
||||||
|
|
||||||
|
def _is_web_server(self) -> bool:
|
||||||
|
"""Check if we're running a web server command."""
|
||||||
|
# Check for common web server indicators
|
||||||
|
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
|
||||||
|
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)
|
||||||
|
|||||||
@@ -23,7 +23,11 @@ from archivebox.config import CONSTANTS
|
|||||||
from archivebox.misc.system import get_dir_size, atomic_write
|
from archivebox.misc.system import get_dir_size, atomic_write
|
||||||
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
||||||
from archivebox.misc.hashing import get_dir_info
|
from archivebox.misc.hashing import get_dir_info
|
||||||
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
|
from archivebox.hooks import (
|
||||||
|
ARCHIVE_METHODS_INDEXING_PRECEDENCE,
|
||||||
|
get_extractors, get_extractor_name, get_extractor_icon,
|
||||||
|
DEFAULT_EXTRACTOR_ICONS,
|
||||||
|
)
|
||||||
from archivebox.base_models.models import (
|
from archivebox.base_models.models import (
|
||||||
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
|
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
|
||||||
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
|
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
|
||||||
@@ -343,45 +347,37 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
def icons(self) -> str:
|
def icons(self) -> str:
|
||||||
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
|
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
|
||||||
from django.utils.html import format_html, mark_safe
|
from django.utils.html import format_html, mark_safe
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
|
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
|
||||||
|
|
||||||
def calc_icons():
|
def calc_icons():
|
||||||
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
|
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
|
||||||
archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
|
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
|
||||||
else:
|
else:
|
||||||
archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)
|
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
|
||||||
|
|
||||||
path = self.archive_path
|
path = self.archive_path
|
||||||
canon = self.canonical_outputs()
|
canon = self.canonical_outputs()
|
||||||
output = ""
|
output = ""
|
||||||
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
|
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
|
||||||
icons = {
|
|
||||||
"singlefile": "❶", "wget": "🆆", "dom": "🅷", "pdf": "📄",
|
|
||||||
"screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
|
|
||||||
"readability": "🆁", "mercury": "🅼", "warc": "📦"
|
|
||||||
}
|
|
||||||
exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
|
|
||||||
|
|
||||||
extractor_outputs = defaultdict(lambda: None)
|
# Get all extractors from hooks system (sorted by numeric prefix)
|
||||||
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
|
all_extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||||
for result in archive_results:
|
|
||||||
if result.extractor == extractor:
|
|
||||||
extractor_outputs[extractor] = result
|
|
||||||
|
|
||||||
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
|
for extractor in all_extractors:
|
||||||
if extractor not in exclude:
|
result = archive_results.get(extractor)
|
||||||
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
existing = result and result.status == 'succeeded' and result.output
|
||||||
output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
|
icon = get_extractor_icon(extractor)
|
||||||
if extractor == "wget":
|
output += format_html(
|
||||||
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
output_template,
|
||||||
output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
|
path,
|
||||||
if extractor == "archive_org":
|
canon.get(extractor, extractor + '/'),
|
||||||
exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
str(bool(existing)),
|
||||||
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))
|
extractor,
|
||||||
|
icon
|
||||||
|
)
|
||||||
|
|
||||||
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
|
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
|
||||||
|
|
||||||
cache_result = cache.get(cache_key)
|
cache_result = cache.get(cache_key)
|
||||||
if cache_result:
|
if cache_result:
|
||||||
@@ -767,12 +763,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
FAILED = 'failed', 'Failed'
|
FAILED = 'failed', 'Failed'
|
||||||
SKIPPED = 'skipped', 'Skipped'
|
SKIPPED = 'skipped', 'Skipped'
|
||||||
|
|
||||||
EXTRACTOR_CHOICES = (
|
@classmethod
|
||||||
('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'),
|
def get_extractor_choices(cls):
|
||||||
('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'),
|
"""Get extractor choices from discovered hooks (for forms/admin)."""
|
||||||
('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'),
|
extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||||
('dom', 'dom'), ('title', 'title'), ('wget', 'wget'),
|
return tuple((e, e) for e in extractors)
|
||||||
)
|
|
||||||
|
|
||||||
# Keep AutoField for backward compatibility with 0.7.x databases
|
# Keep AutoField for backward compatibility with 0.7.x databases
|
||||||
# UUID field is added separately by migration for new records
|
# UUID field is added separately by migration for new records
|
||||||
@@ -783,7 +778,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
modified_at = models.DateTimeField(auto_now=True)
|
modified_at = models.DateTimeField(auto_now=True)
|
||||||
|
|
||||||
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
|
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
|
||||||
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
|
# No choices= constraint - extractor names come from plugin system and can be any string
|
||||||
|
extractor = models.CharField(max_length=32, blank=False, null=False, db_index=True)
|
||||||
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
|
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
|
||||||
cmd = models.JSONField(default=None, null=True, blank=True)
|
cmd = models.JSONField(default=None, null=True, blank=True)
|
||||||
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
|
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
|
||||||
@@ -835,6 +831,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
def output_exists(self) -> bool:
|
def output_exists(self) -> bool:
|
||||||
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
|
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
|
||||||
|
|
||||||
|
def embed_path(self) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Get the relative path to the embeddable output file for this result.
|
||||||
|
|
||||||
|
Returns the output field if set and file exists, otherwise tries to
|
||||||
|
find a reasonable default based on the extractor type.
|
||||||
|
"""
|
||||||
|
if self.output:
|
||||||
|
return self.output
|
||||||
|
|
||||||
|
# Try to find output file based on extractor's canonical output path
|
||||||
|
canonical = self.snapshot.canonical_outputs()
|
||||||
|
extractor_key = f'{self.extractor}_path'
|
||||||
|
if extractor_key in canonical:
|
||||||
|
return canonical[extractor_key]
|
||||||
|
|
||||||
|
# Fallback to extractor directory
|
||||||
|
return f'{self.extractor}/'
|
||||||
|
|
||||||
def create_output_dir(self):
|
def create_output_dir(self):
|
||||||
output_dir = Path(self.snapshot_dir) / self.extractor
|
output_dir = Path(self.snapshot_dir) / self.extractor
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
@@ -891,6 +906,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
output_dir=extractor_dir,
|
output_dir=extractor_dir,
|
||||||
config_objects=config_objects,
|
config_objects=config_objects,
|
||||||
url=self.snapshot.url,
|
url=self.snapshot.url,
|
||||||
|
snapshot_id=str(self.snapshot.id),
|
||||||
)
|
)
|
||||||
end_ts = timezone.now()
|
end_ts = timezone.now()
|
||||||
|
|
||||||
@@ -1000,6 +1016,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
hook,
|
hook,
|
||||||
output_dir=self.output_dir,
|
output_dir=self.output_dir,
|
||||||
config_objects=config_objects,
|
config_objects=config_objects,
|
||||||
|
url=self.snapshot.url,
|
||||||
snapshot_id=str(self.snapshot.id),
|
snapshot_id=str(self.snapshot.id),
|
||||||
extractor=self.extractor,
|
extractor=self.extractor,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -68,9 +68,6 @@ INSTALLED_APPS = [
|
|||||||
# 3rd-party apps from PyPI that need to be loaded last
|
# 3rd-party apps from PyPI that need to be loaded last
|
||||||
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
|
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
|
||||||
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
|
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
|
||||||
"django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
|
|
||||||
"bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils
|
|
||||||
"huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -215,70 +212,6 @@ MIGRATION_MODULES = {"signal_webhooks": None}
|
|||||||
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
|
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
|
||||||
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
|
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
|
||||||
|
|
||||||
HUEY = {
|
|
||||||
"huey_class": "huey.SqliteHuey",
|
|
||||||
"filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
|
|
||||||
"name": "commands",
|
|
||||||
"results": True,
|
|
||||||
"store_none": True,
|
|
||||||
"immediate": False,
|
|
||||||
"utc": True,
|
|
||||||
"consumer": {
|
|
||||||
"workers": 1,
|
|
||||||
"worker_type": "thread",
|
|
||||||
"initial_delay": 0.1, # Smallest polling interval, same as -d.
|
|
||||||
"backoff": 1.15, # Exponential backoff using this rate, -b.
|
|
||||||
"max_delay": 10.0, # Max possible polling interval, -m.
|
|
||||||
"scheduler_interval": 1, # Check schedule every second, -s.
|
|
||||||
"periodic": True, # Enable crontab feature.
|
|
||||||
"check_worker_health": True, # Enable worker health checks.
|
|
||||||
"health_check_interval": 1, # Check worker health every second.
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
|
|
||||||
# https://github.com/gaiacoop/django-huey
|
|
||||||
DJANGO_HUEY = {
|
|
||||||
"default": "commands",
|
|
||||||
"queues": {
|
|
||||||
HUEY["name"]: HUEY.copy(),
|
|
||||||
# more registered here at plugin import-time by BaseQueue.register()
|
|
||||||
# Additional huey queues configured via settings
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class HueyDBRouter:
|
|
||||||
"""
|
|
||||||
A router to store all the Huey result k:v / Huey Monitor models in the queue.sqlite3 database.
|
|
||||||
We keep the databases separate because the queue database receives many more reads/writes per second
|
|
||||||
and we want to avoid single-write lock contention with the main database. Also all the in-progress task
|
|
||||||
data is ephemeral/not-important-long-term. This makes it easier to for the user to clear non-critical
|
|
||||||
temp data by just deleting queue.sqlite3 and leaving index.sqlite3.
|
|
||||||
"""
|
|
||||||
|
|
||||||
route_app_labels = {"huey_monitor", "django_huey", "djhuey"}
|
|
||||||
db_name = "queue"
|
|
||||||
|
|
||||||
def db_for_read(self, model, **hints):
|
|
||||||
if model._meta.app_label in self.route_app_labels:
|
|
||||||
return self.db_name
|
|
||||||
return "default"
|
|
||||||
|
|
||||||
def db_for_write(self, model, **hints):
|
|
||||||
if model._meta.app_label in self.route_app_labels:
|
|
||||||
return self.db_name
|
|
||||||
return "default"
|
|
||||||
|
|
||||||
def allow_relation(self, obj1, obj2, **hints):
|
|
||||||
if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
|
|
||||||
return obj1._meta.app_label == obj2._meta.app_label
|
|
||||||
return None
|
|
||||||
|
|
||||||
def allow_migrate(self, db, app_label, model_name=None, **hints):
|
|
||||||
if app_label in self.route_app_labels:
|
|
||||||
return db == self.db_name
|
|
||||||
return db == "default"
|
|
||||||
|
|
||||||
|
|
||||||
# class FilestoreDBRouter:
|
# class FilestoreDBRouter:
|
||||||
@@ -311,7 +244,7 @@ class HueyDBRouter:
|
|||||||
# return db == self.db_name
|
# return db == self.db_name
|
||||||
# return db == "default"
|
# return db == "default"
|
||||||
|
|
||||||
DATABASE_ROUTERS = ["core.settings.HueyDBRouter"]
|
DATABASE_ROUTERS = []
|
||||||
|
|
||||||
CACHES = {
|
CACHES = {
|
||||||
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},
|
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},
|
||||||
|
|||||||
@@ -1,9 +1,13 @@
|
|||||||
from django import template
|
from django import template
|
||||||
from django.contrib.admin.templatetags.base import InclusionAdminNode
|
from django.contrib.admin.templatetags.base import InclusionAdminNode
|
||||||
|
from django.utils.safestring import mark_safe
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
|
from archivebox.hooks import (
|
||||||
|
get_extractor_icon, get_extractor_template, get_extractor_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
register = template.Library()
|
register = template.Library()
|
||||||
|
|
||||||
@@ -44,3 +48,115 @@ def url_replace(context, **kwargs):
|
|||||||
dict_ = context['request'].GET.copy()
|
dict_ = context['request'].GET.copy()
|
||||||
dict_.update(**kwargs)
|
dict_.update(**kwargs)
|
||||||
return dict_.urlencode()
|
return dict_.urlencode()
|
||||||
|
|
||||||
|
|
||||||
|
@register.simple_tag
|
||||||
|
def extractor_icon(extractor: str) -> str:
|
||||||
|
"""
|
||||||
|
Render the icon for an extractor.
|
||||||
|
|
||||||
|
Usage: {% extractor_icon "screenshot" %}
|
||||||
|
"""
|
||||||
|
return mark_safe(get_extractor_icon(extractor))
|
||||||
|
|
||||||
|
|
||||||
|
@register.simple_tag(takes_context=True)
|
||||||
|
def extractor_thumbnail(context, result) -> str:
|
||||||
|
"""
|
||||||
|
Render the thumbnail template for an archive result.
|
||||||
|
|
||||||
|
Usage: {% extractor_thumbnail result %}
|
||||||
|
|
||||||
|
Context variables passed to template:
|
||||||
|
- result: ArchiveResult object
|
||||||
|
- snapshot: Parent Snapshot object
|
||||||
|
- output_path: Path to output relative to snapshot dir (from embed_path())
|
||||||
|
- extractor: Extractor base name
|
||||||
|
"""
|
||||||
|
extractor = get_extractor_name(result.extractor)
|
||||||
|
template_str = get_extractor_template(extractor, 'thumbnail')
|
||||||
|
|
||||||
|
if not template_str:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
# Use embed_path() for the display path (includes canonical paths)
|
||||||
|
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
||||||
|
|
||||||
|
# Create a mini template and render it with context
|
||||||
|
try:
|
||||||
|
tpl = template.Template(template_str)
|
||||||
|
ctx = template.Context({
|
||||||
|
'result': result,
|
||||||
|
'snapshot': result.snapshot,
|
||||||
|
'output_path': output_path,
|
||||||
|
'extractor': extractor,
|
||||||
|
})
|
||||||
|
return mark_safe(tpl.render(ctx))
|
||||||
|
except Exception:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
@register.simple_tag(takes_context=True)
|
||||||
|
def extractor_embed(context, result) -> str:
|
||||||
|
"""
|
||||||
|
Render the embed iframe template for an archive result.
|
||||||
|
|
||||||
|
Usage: {% extractor_embed result %}
|
||||||
|
"""
|
||||||
|
extractor = get_extractor_name(result.extractor)
|
||||||
|
template_str = get_extractor_template(extractor, 'embed')
|
||||||
|
|
||||||
|
if not template_str:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
||||||
|
|
||||||
|
try:
|
||||||
|
tpl = template.Template(template_str)
|
||||||
|
ctx = template.Context({
|
||||||
|
'result': result,
|
||||||
|
'snapshot': result.snapshot,
|
||||||
|
'output_path': output_path,
|
||||||
|
'extractor': extractor,
|
||||||
|
})
|
||||||
|
return mark_safe(tpl.render(ctx))
|
||||||
|
except Exception:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
@register.simple_tag(takes_context=True)
|
||||||
|
def extractor_fullscreen(context, result) -> str:
|
||||||
|
"""
|
||||||
|
Render the fullscreen template for an archive result.
|
||||||
|
|
||||||
|
Usage: {% extractor_fullscreen result %}
|
||||||
|
"""
|
||||||
|
extractor = get_extractor_name(result.extractor)
|
||||||
|
template_str = get_extractor_template(extractor, 'fullscreen')
|
||||||
|
|
||||||
|
if not template_str:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
||||||
|
|
||||||
|
try:
|
||||||
|
tpl = template.Template(template_str)
|
||||||
|
ctx = template.Context({
|
||||||
|
'result': result,
|
||||||
|
'snapshot': result.snapshot,
|
||||||
|
'output_path': output_path,
|
||||||
|
'extractor': extractor,
|
||||||
|
})
|
||||||
|
return mark_safe(tpl.render(ctx))
|
||||||
|
except Exception:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
@register.filter
|
||||||
|
def extractor_name(value: str) -> str:
|
||||||
|
"""
|
||||||
|
Get the base name of an extractor (strips numeric prefix).
|
||||||
|
|
||||||
|
Usage: {{ result.extractor|extractor_name }}
|
||||||
|
"""
|
||||||
|
return get_extractor_name(value)
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
|
|||||||
from archivebox.misc.serve_static import serve_static
|
from archivebox.misc.serve_static import serve_static
|
||||||
|
|
||||||
from core.admin_site import archivebox_admin
|
from core.admin_site import archivebox_admin
|
||||||
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
|
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
|
||||||
|
|
||||||
from workers.views import JobsDashboardView
|
from workers.views import JobsDashboardView
|
||||||
|
|
||||||
@@ -43,6 +43,8 @@ urlpatterns = [
|
|||||||
|
|
||||||
|
|
||||||
path('accounts/', include('django.contrib.auth.urls')),
|
path('accounts/', include('django.contrib.auth.urls')),
|
||||||
|
|
||||||
|
path('admin/live-progress/', live_progress_view, name='live_progress'),
|
||||||
path('admin/', archivebox_admin.urls),
|
path('admin/', archivebox_admin.urls),
|
||||||
|
|
||||||
path("api/", include('api.urls'), name='api'),
|
path("api/", include('api.urls'), name='api'),
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ from archivebox.search import query_search_index
|
|||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
from core.forms import AddLinkForm
|
from core.forms import AddLinkForm
|
||||||
from crawls.models import Seed, Crawl
|
from crawls.models import Seed, Crawl
|
||||||
|
from archivebox.hooks import get_extractors, get_extractor_name
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -54,8 +55,10 @@ class SnapshotView(View):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def render_live_index(request, snapshot):
|
def render_live_index(request, snapshot):
|
||||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||||
HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
|
|
||||||
|
|
||||||
|
# Dict of extractor -> ArchiveResult object
|
||||||
|
archiveresult_objects = {}
|
||||||
|
# Dict of extractor -> result info dict (for template compatibility)
|
||||||
archiveresults = {}
|
archiveresults = {}
|
||||||
|
|
||||||
results = snapshot.archiveresult_set.all()
|
results = snapshot.archiveresult_set.all()
|
||||||
@@ -65,18 +68,21 @@ class SnapshotView(View):
|
|||||||
abs_path = result.snapshot_dir / (embed_path or 'None')
|
abs_path = result.snapshot_dir / (embed_path or 'None')
|
||||||
|
|
||||||
if (result.status == 'succeeded'
|
if (result.status == 'succeeded'
|
||||||
and (result.extractor not in HIDDEN_RESULTS)
|
|
||||||
and embed_path
|
and embed_path
|
||||||
and os.access(abs_path, os.R_OK)
|
and os.access(abs_path, os.R_OK)
|
||||||
and abs_path.exists()):
|
and abs_path.exists()):
|
||||||
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
|
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Store the full ArchiveResult object for template tags
|
||||||
|
archiveresult_objects[result.extractor] = result
|
||||||
|
|
||||||
result_info = {
|
result_info = {
|
||||||
'name': result.extractor,
|
'name': result.extractor,
|
||||||
'path': embed_path,
|
'path': embed_path,
|
||||||
'ts': ts_to_date_str(result.end_ts),
|
'ts': ts_to_date_str(result.end_ts),
|
||||||
'size': abs_path.stat().st_size or '?',
|
'size': abs_path.stat().st_size or '?',
|
||||||
|
'result': result, # Include the full object for template tags
|
||||||
}
|
}
|
||||||
archiveresults[result.extractor] = result_info
|
archiveresults[result.extractor] = result_info
|
||||||
|
|
||||||
@@ -101,7 +107,7 @@ class SnapshotView(View):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# iterate through all the files in the snapshot dir and add the biggest ones to1 the result list
|
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
|
||||||
snap_dir = Path(snapshot.output_dir)
|
snap_dir = Path(snapshot.output_dir)
|
||||||
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
|
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
|
||||||
return {}
|
return {}
|
||||||
@@ -121,12 +127,16 @@ class SnapshotView(View):
|
|||||||
'path': result_file.relative_to(snap_dir),
|
'path': result_file.relative_to(snap_dir),
|
||||||
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
|
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
|
||||||
'size': file_size,
|
'size': file_size,
|
||||||
|
'result': None, # No ArchiveResult object for filesystem-discovered files
|
||||||
}
|
}
|
||||||
|
|
||||||
preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury')
|
# Get available extractors from hooks (sorted by numeric prefix for ordering)
|
||||||
|
# Convert to base names for display ordering
|
||||||
|
all_extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||||
|
preferred_types = tuple(all_extractors)
|
||||||
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
|
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
|
||||||
|
|
||||||
best_result = {'path': 'None'}
|
best_result = {'path': 'None', 'result': None}
|
||||||
for result_type in preferred_types:
|
for result_type in preferred_types:
|
||||||
if result_type in archiveresults:
|
if result_type in archiveresults:
|
||||||
best_result = archiveresults[result_type]
|
best_result = archiveresults[result_type]
|
||||||
@@ -157,6 +167,7 @@ class SnapshotView(View):
|
|||||||
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
||||||
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
|
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
|
||||||
'best_result': best_result,
|
'best_result': best_result,
|
||||||
|
'snapshot': snapshot, # Pass the snapshot object for template tags
|
||||||
}
|
}
|
||||||
return render(template_name='core/snapshot_live.html', request=request, context=context)
|
return render(template_name='core/snapshot_live.html', request=request, context=context)
|
||||||
|
|
||||||
@@ -436,7 +447,7 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
def form_valid(self, form):
|
def form_valid(self, form):
|
||||||
urls = form.cleaned_data["url"]
|
urls = form.cleaned_data["url"]
|
||||||
print(f'[+] Adding URL: {urls}')
|
print(f'[+] Adding URL: {urls}')
|
||||||
parser = form.cleaned_data["parser"]
|
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
|
||||||
tag = form.cleaned_data["tag"]
|
tag = form.cleaned_data["tag"]
|
||||||
depth = 0 if form.cleaned_data["depth"] == "0" else 1
|
depth = 0 if form.cleaned_data["depth"] == "0" else 1
|
||||||
extractors = ','.join(form.cleaned_data["archive_methods"])
|
extractors = ','.join(form.cleaned_data["archive_methods"])
|
||||||
@@ -461,9 +472,10 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||||
|
|
||||||
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
||||||
|
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||||
seed = Seed.from_file(
|
seed = Seed.from_file(
|
||||||
sources_file,
|
sources_file,
|
||||||
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
|
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
|
||||||
parser=parser,
|
parser=parser,
|
||||||
tag=tag,
|
tag=tag,
|
||||||
created_by=self.request.user.pk,
|
created_by=self.request.user.pk,
|
||||||
@@ -472,7 +484,7 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
# 'INDEX_ONLY': index_only,
|
# 'INDEX_ONLY': index_only,
|
||||||
# 'OVERWRITE': False,
|
# 'OVERWRITE': False,
|
||||||
'DEPTH': depth,
|
'DEPTH': depth,
|
||||||
'EXTRACTORS': parser,
|
'EXTRACTORS': extractors or '',
|
||||||
# 'DEFAULT_PERSONA': persona or 'Default',
|
# 'DEFAULT_PERSONA': persona or 'Default',
|
||||||
})
|
})
|
||||||
# 3. create a new Crawl pointing to the Seed
|
# 3. create a new Crawl pointing to the Seed
|
||||||
@@ -490,10 +502,15 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
self.request,
|
self.request,
|
||||||
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
|
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
|
||||||
)
|
)
|
||||||
# if not bg:
|
|
||||||
# from workers.orchestrator import Orchestrator
|
# Start orchestrator in background to process the queued crawl
|
||||||
# orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
|
try:
|
||||||
# orchestrator.start()
|
from archivebox.workers.tasks import ensure_orchestrator_running
|
||||||
|
ensure_orchestrator_running()
|
||||||
|
except Exception as e:
|
||||||
|
# Orchestrator may already be running via supervisord, or fail to start
|
||||||
|
# This is not fatal - the crawl will be processed when orchestrator runs
|
||||||
|
print(f'[!] Failed to start orchestrator: {e}')
|
||||||
|
|
||||||
return redirect(crawl.admin_change_url)
|
return redirect(crawl.admin_change_url)
|
||||||
|
|
||||||
@@ -513,6 +530,141 @@ class HealthCheckView(View):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
import json
|
||||||
|
from django.http import JsonResponse
|
||||||
|
|
||||||
|
def live_progress_view(request):
|
||||||
|
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
|
||||||
|
try:
|
||||||
|
from workers.orchestrator import Orchestrator
|
||||||
|
from crawls.models import Crawl
|
||||||
|
from core.models import Snapshot, ArchiveResult
|
||||||
|
|
||||||
|
# Get orchestrator status
|
||||||
|
orchestrator_running = Orchestrator.is_running()
|
||||||
|
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
|
||||||
|
|
||||||
|
# Get model counts by status
|
||||||
|
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
|
||||||
|
crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count()
|
||||||
|
|
||||||
|
# Get recent crawls (last 24 hours)
|
||||||
|
from datetime import timedelta
|
||||||
|
one_day_ago = timezone.now() - timedelta(days=1)
|
||||||
|
crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count()
|
||||||
|
|
||||||
|
snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count()
|
||||||
|
snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count()
|
||||||
|
|
||||||
|
archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
|
||||||
|
archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count()
|
||||||
|
archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
|
||||||
|
archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()
|
||||||
|
|
||||||
|
# Build hierarchical active crawls with nested snapshots and archive results
|
||||||
|
active_crawls = []
|
||||||
|
for crawl in Crawl.objects.filter(
|
||||||
|
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
|
||||||
|
).order_by('-modified_at')[:10]:
|
||||||
|
# Get snapshots for this crawl
|
||||||
|
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
|
||||||
|
total_snapshots = crawl_snapshots.count()
|
||||||
|
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
|
||||||
|
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
|
||||||
|
|
||||||
|
# Calculate crawl progress
|
||||||
|
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
|
||||||
|
|
||||||
|
# Get active snapshots for this crawl
|
||||||
|
active_snapshots_for_crawl = []
|
||||||
|
for snapshot in crawl_snapshots.filter(
|
||||||
|
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
|
||||||
|
).order_by('-modified_at')[:5]:
|
||||||
|
# Get archive results for this snapshot
|
||||||
|
snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
|
||||||
|
total_extractors = snapshot_results.count()
|
||||||
|
completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
|
||||||
|
failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count()
|
||||||
|
pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
|
||||||
|
|
||||||
|
# Calculate snapshot progress
|
||||||
|
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
|
||||||
|
|
||||||
|
# Get active extractors for this snapshot
|
||||||
|
active_extractors = [
|
||||||
|
{
|
||||||
|
'id': str(ar.id),
|
||||||
|
'extractor': ar.extractor,
|
||||||
|
'status': ar.status,
|
||||||
|
'started': ar.start_ts.isoformat() if ar.start_ts else None,
|
||||||
|
'progress': 50,
|
||||||
|
}
|
||||||
|
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
|
||||||
|
]
|
||||||
|
|
||||||
|
active_snapshots_for_crawl.append({
|
||||||
|
'id': str(snapshot.id),
|
||||||
|
'url': snapshot.url[:80],
|
||||||
|
'status': snapshot.status,
|
||||||
|
'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
|
||||||
|
'progress': snapshot_progress,
|
||||||
|
'total_extractors': total_extractors,
|
||||||
|
'completed_extractors': completed_extractors,
|
||||||
|
'failed_extractors': failed_extractors,
|
||||||
|
'pending_extractors': pending_extractors,
|
||||||
|
'active_extractors': active_extractors,
|
||||||
|
})
|
||||||
|
|
||||||
|
active_crawls.append({
|
||||||
|
'id': str(crawl.id),
|
||||||
|
'label': str(crawl)[:60],
|
||||||
|
'status': crawl.status,
|
||||||
|
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
|
||||||
|
'progress': crawl_progress,
|
||||||
|
'max_depth': crawl.max_depth,
|
||||||
|
'total_snapshots': total_snapshots,
|
||||||
|
'completed_snapshots': completed_snapshots,
|
||||||
|
'failed_snapshots': 0,
|
||||||
|
'pending_snapshots': pending_snapshots,
|
||||||
|
'active_snapshots': active_snapshots_for_crawl,
|
||||||
|
})
|
||||||
|
|
||||||
|
return JsonResponse({
|
||||||
|
'orchestrator_running': orchestrator_running,
|
||||||
|
'total_workers': total_workers,
|
||||||
|
'crawls_pending': crawls_pending,
|
||||||
|
'crawls_started': crawls_started,
|
||||||
|
'crawls_recent': crawls_recent,
|
||||||
|
'snapshots_pending': snapshots_pending,
|
||||||
|
'snapshots_started': snapshots_started,
|
||||||
|
'archiveresults_pending': archiveresults_pending,
|
||||||
|
'archiveresults_started': archiveresults_started,
|
||||||
|
'archiveresults_succeeded': archiveresults_succeeded,
|
||||||
|
'archiveresults_failed': archiveresults_failed,
|
||||||
|
'active_crawls': active_crawls,
|
||||||
|
'server_time': timezone.now().isoformat(),
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
return JsonResponse({
|
||||||
|
'error': str(e),
|
||||||
|
'traceback': traceback.format_exc(),
|
||||||
|
'orchestrator_running': False,
|
||||||
|
'total_workers': 0,
|
||||||
|
'crawls_pending': 0,
|
||||||
|
'crawls_started': 0,
|
||||||
|
'crawls_recent': 0,
|
||||||
|
'snapshots_pending': 0,
|
||||||
|
'snapshots_started': 0,
|
||||||
|
'archiveresults_pending': 0,
|
||||||
|
'archiveresults_started': 0,
|
||||||
|
'archiveresults_succeeded': 0,
|
||||||
|
'archiveresults_failed': 0,
|
||||||
|
'active_crawls': [],
|
||||||
|
'server_time': timezone.now().isoformat(),
|
||||||
|
}, status=500)
|
||||||
|
|
||||||
|
|
||||||
def find_config_section(key: str) -> str:
|
def find_config_section(key: str) -> str:
|
||||||
CONFIGS = get_all_configs()
|
CONFIGS = get_all_configs()
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,18 @@
|
|||||||
__package__ = 'archivebox.crawls'
|
__package__ = 'archivebox.crawls'
|
||||||
|
|
||||||
from django.utils.html import format_html, format_html_join
|
import json
|
||||||
from django.contrib import admin
|
from pathlib import Path
|
||||||
|
|
||||||
|
from django.utils.html import format_html, format_html_join, mark_safe
|
||||||
|
from django.contrib import admin, messages
|
||||||
|
from django.urls import path
|
||||||
|
from django.http import JsonResponse
|
||||||
|
from django.views.decorators.http import require_POST
|
||||||
|
|
||||||
from archivebox import DATA_DIR
|
from archivebox import DATA_DIR
|
||||||
|
|
||||||
|
from django_object_actions import action
|
||||||
|
|
||||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||||
|
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
@@ -16,8 +24,8 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
||||||
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
||||||
|
|
||||||
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents', 'available_config_options')
|
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
|
||||||
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'available_config_options', 'created_by', *readonly_fields[:-1])
|
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
|
||||||
|
|
||||||
list_filter = ('extractor', 'created_by')
|
list_filter = ('extractor', 'created_by')
|
||||||
ordering = ['-created_at']
|
ordering = ['-created_at']
|
||||||
@@ -34,19 +42,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(scheduledcrawl.admin_change_url, scheduledcrawl)
|
(scheduledcrawl.admin_change_url, scheduledcrawl)
|
||||||
for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
|
for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Scheduled Crawls yet...</i>')
|
)) or mark_safe('<i>No Scheduled Crawls yet...</i>')
|
||||||
|
|
||||||
def crawls(self, obj):
|
def crawls(self, obj):
|
||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(crawl.admin_change_url, crawl)
|
(crawl.admin_change_url, crawl)
|
||||||
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Crawls yet...</i>')
|
)) or mark_safe('<i>No Crawls yet...</i>')
|
||||||
|
|
||||||
def snapshots(self, obj):
|
def snapshots(self, obj):
|
||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(snapshot.admin_change_url, snapshot)
|
(snapshot.admin_change_url, snapshot)
|
||||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Snapshots yet...</i>')
|
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||||
|
|
||||||
def contents(self, obj):
|
def contents(self, obj):
|
||||||
if obj.uri.startswith('file:///data/'):
|
if obj.uri.startswith('file:///data/'):
|
||||||
@@ -69,13 +77,80 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
|
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
|
||||||
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
|
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
|
||||||
|
|
||||||
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_contents', 'available_config_options')
|
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
|
||||||
fields = ('label', 'notes', 'urls', 'config', 'available_config_options', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields[:-1])
|
fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
|
||||||
|
|
||||||
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
|
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
|
||||||
ordering = ['-created_at', '-retry_at']
|
ordering = ['-created_at', '-retry_at']
|
||||||
list_per_page = 100
|
list_per_page = 100
|
||||||
actions = ["delete_selected"]
|
actions = ["delete_selected"]
|
||||||
|
change_actions = ['recrawl']
|
||||||
|
|
||||||
|
@action(label='Recrawl', description='Create a new crawl with the same settings')
|
||||||
|
def recrawl(self, request, obj):
|
||||||
|
"""Duplicate this crawl as a new crawl with the same seed and settings."""
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
|
new_crawl = Crawl.objects.create(
|
||||||
|
seed=obj.seed,
|
||||||
|
urls=obj.urls,
|
||||||
|
max_depth=obj.max_depth,
|
||||||
|
config=obj.config,
|
||||||
|
schedule=obj.schedule,
|
||||||
|
label=f"{obj.label} (recrawl)" if obj.label else "",
|
||||||
|
notes=obj.notes,
|
||||||
|
created_by=request.user,
|
||||||
|
status=Crawl.StatusChoices.QUEUED,
|
||||||
|
retry_at=timezone.now(),
|
||||||
|
)
|
||||||
|
|
||||||
|
messages.success(
|
||||||
|
request,
|
||||||
|
f'Created new crawl {new_crawl.id} with the same settings. '
|
||||||
|
f'It will start processing shortly.'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Redirect to the new crawl's change page
|
||||||
|
from django.shortcuts import redirect
|
||||||
|
return redirect('admin:crawls_crawl_change', new_crawl.id)
|
||||||
|
|
||||||
|
def get_urls(self):
|
||||||
|
urls = super().get_urls()
|
||||||
|
custom_urls = [
|
||||||
|
path('<path:object_id>/save_seed_contents/',
|
||||||
|
self.admin_site.admin_view(self.save_seed_contents_view),
|
||||||
|
name='crawls_crawl_save_seed_contents'),
|
||||||
|
]
|
||||||
|
return custom_urls + urls
|
||||||
|
|
||||||
|
def save_seed_contents_view(self, request, object_id):
|
||||||
|
"""Handle saving seed file contents via AJAX."""
|
||||||
|
if request.method != 'POST':
|
||||||
|
return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
|
||||||
|
|
||||||
|
try:
|
||||||
|
crawl = Crawl.objects.get(pk=object_id)
|
||||||
|
except Crawl.DoesNotExist:
|
||||||
|
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
|
||||||
|
|
||||||
|
if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
|
||||||
|
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.loads(request.body)
|
||||||
|
contents = data.get('contents', '')
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
|
||||||
|
|
||||||
|
source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Ensure parent directory exists
|
||||||
|
source_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
source_file.write_text(contents)
|
||||||
|
return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
|
||||||
|
except Exception as e:
|
||||||
|
return JsonResponse({'success': False, 'error': str(e)}, status=500)
|
||||||
|
|
||||||
def num_snapshots(self, obj):
|
def num_snapshots(self, obj):
|
||||||
return obj.snapshot_set.count()
|
return obj.snapshot_set.count()
|
||||||
@@ -84,35 +159,175 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
return format_html_join('<br/>', '<a href="{}">{}</a>', (
|
return format_html_join('<br/>', '<a href="{}">{}</a>', (
|
||||||
(snapshot.admin_change_url, snapshot)
|
(snapshot.admin_change_url, snapshot)
|
||||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Snapshots yet...</i>')
|
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||||
|
|
||||||
@admin.display(description='Schedule', ordering='schedule')
|
@admin.display(description='Schedule', ordering='schedule')
|
||||||
def schedule_str(self, obj):
|
def schedule_str(self, obj):
|
||||||
if not obj.schedule:
|
if not obj.schedule:
|
||||||
return format_html('<i>None</i>')
|
return mark_safe('<i>None</i>')
|
||||||
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
|
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
|
||||||
|
|
||||||
@admin.display(description='Seed', ordering='seed')
|
@admin.display(description='Seed', ordering='seed')
|
||||||
def seed_str(self, obj):
|
def seed_str(self, obj):
|
||||||
if not obj.seed:
|
if not obj.seed:
|
||||||
return format_html('<i>None</i>')
|
return mark_safe('<i>None</i>')
|
||||||
return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
|
return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
|
||||||
|
|
||||||
def seed_contents(self, obj):
|
@admin.display(description='URLs')
|
||||||
if not (obj.seed and obj.seed.uri):
|
def seed_urls_editor(self, obj):
|
||||||
return format_html('<i>None</i>')
|
"""Combined editor showing seed URL and file contents."""
|
||||||
|
widget_id = f'seed_urls_{obj.pk}'
|
||||||
|
|
||||||
if obj.seed.uri.startswith('file:///data/'):
|
# Get the seed URI (or use urls field if no seed)
|
||||||
source_file = DATA_DIR / obj.seed.uri.replace('file:///data/', '', 1)
|
seed_uri = ''
|
||||||
contents = ""
|
if obj.seed and obj.seed.uri:
|
||||||
|
seed_uri = obj.seed.uri
|
||||||
|
elif obj.urls:
|
||||||
|
seed_uri = obj.urls
|
||||||
|
|
||||||
|
# Check if it's a local file we can edit
|
||||||
|
is_file = seed_uri.startswith('file:///data/')
|
||||||
|
contents = ""
|
||||||
|
error = None
|
||||||
|
source_file = None
|
||||||
|
|
||||||
|
if is_file:
|
||||||
|
source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
|
||||||
try:
|
try:
|
||||||
contents = source_file.read_text().strip()[:14_000]
|
contents = source_file.read_text().strip()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
contents = f'Error reading {source_file}: {e}'
|
error = f'Error reading {source_file}: {e}'
|
||||||
|
|
||||||
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
|
# Escape for safe HTML embedding
|
||||||
|
escaped_uri = seed_uri.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||||
|
escaped_contents = (contents or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||||
|
|
||||||
return format_html('See URLs here: <a href="{}">{}</a>', obj.seed.uri, obj.seed.uri)
|
# Count lines for auto-expand logic
|
||||||
|
line_count = len(contents.split('\n')) if contents else 0
|
||||||
|
uri_rows = min(max(1, seed_uri.count('\n') + 1), 3)
|
||||||
|
|
||||||
|
html = f'''
|
||||||
|
<div id="{widget_id}_container" style="max-width: 900px;">
|
||||||
|
<!-- Seed URL input (auto-expands) -->
|
||||||
|
<div style="margin-bottom: 12px;">
|
||||||
|
<label style="font-weight: bold; display: block; margin-bottom: 4px;">Seed URL:</label>
|
||||||
|
<textarea id="{widget_id}_uri"
|
||||||
|
style="width: 100%; font-family: monospace; font-size: 13px;
|
||||||
|
padding: 8px; border: 1px solid #ccc; border-radius: 4px;
|
||||||
|
resize: vertical; min-height: 32px; overflow: hidden;"
|
||||||
|
rows="{uri_rows}"
|
||||||
|
placeholder="file:///data/sources/... or https://..."
|
||||||
|
{"readonly" if not obj.pk else ""}>{escaped_uri}</textarea>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{"" if not is_file else f'''
|
||||||
|
<!-- File contents editor -->
|
||||||
|
<div style="margin-bottom: 8px;">
|
||||||
|
<label style="font-weight: bold; display: block; margin-bottom: 4px;">
|
||||||
|
File Contents: <code style="font-weight: normal; color: #666;">{source_file}</code>
|
||||||
|
</label>
|
||||||
|
{"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
|
||||||
|
<textarea id="{widget_id}_contents"
|
||||||
|
style="width: 100%; height: {min(400, max(150, line_count * 18))}px; font-family: monospace; font-size: 12px;
|
||||||
|
padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical;"
|
||||||
|
placeholder="Enter URLs, one per line...">{escaped_contents}</textarea>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div style="display: flex; gap: 12px; align-items: center; flex-wrap: wrap;">
|
||||||
|
<button type="button" id="{widget_id}_save_btn"
|
||||||
|
onclick="saveSeedUrls_{widget_id}()"
|
||||||
|
style="padding: 8px 20px; background: #417690; color: white; border: none;
|
||||||
|
border-radius: 4px; cursor: pointer; font-weight: bold;">
|
||||||
|
Save URLs
|
||||||
|
</button>
|
||||||
|
<span id="{widget_id}_line_count" style="color: #666; font-size: 12px;"></span>
|
||||||
|
<span id="{widget_id}_status" style="color: #666; font-size: 12px;"></span>
|
||||||
|
</div>
|
||||||
|
'''}
|
||||||
|
|
||||||
|
{"" if is_file else f'''
|
||||||
|
<div style="margin-top: 8px; color: #666;">
|
||||||
|
<a href="{seed_uri}" target="_blank">{seed_uri}</a>
|
||||||
|
</div>
|
||||||
|
'''}
|
||||||
|
|
||||||
|
<script>
|
||||||
|
(function() {{
|
||||||
|
var uriInput = document.getElementById('{widget_id}_uri');
|
||||||
|
var contentsInput = document.getElementById('{widget_id}_contents');
|
||||||
|
var status = document.getElementById('{widget_id}_status');
|
||||||
|
var lineCount = document.getElementById('{widget_id}_line_count');
|
||||||
|
var saveBtn = document.getElementById('{widget_id}_save_btn');
|
||||||
|
|
||||||
|
// Auto-resize URI input
|
||||||
|
function autoResizeUri() {{
|
||||||
|
uriInput.style.height = 'auto';
|
||||||
|
uriInput.style.height = Math.min(100, uriInput.scrollHeight) + 'px';
|
||||||
|
}}
|
||||||
|
uriInput.addEventListener('input', autoResizeUri);
|
||||||
|
autoResizeUri();
|
||||||
|
|
||||||
|
if (contentsInput) {{
|
||||||
|
function updateLineCount() {{
|
||||||
|
var lines = contentsInput.value.split('\\n').filter(function(l) {{ return l.trim(); }});
|
||||||
|
lineCount.textContent = lines.length + ' URLs';
|
||||||
|
}}
|
||||||
|
|
||||||
|
contentsInput.addEventListener('input', function() {{
|
||||||
|
updateLineCount();
|
||||||
|
if (status) {{
|
||||||
|
status.textContent = '(unsaved changes)';
|
||||||
|
status.style.color = '#c4820e';
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
|
||||||
|
updateLineCount();
|
||||||
|
}}
|
||||||
|
|
||||||
|
window.saveSeedUrls_{widget_id} = function() {{
|
||||||
|
if (!saveBtn) return;
|
||||||
|
saveBtn.disabled = true;
|
||||||
|
saveBtn.textContent = 'Saving...';
|
||||||
|
if (status) status.textContent = '';
|
||||||
|
|
||||||
|
fetch(window.location.pathname + 'save_seed_contents/', {{
|
||||||
|
method: 'POST',
|
||||||
|
headers: {{
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'X-CSRFToken': document.querySelector('[name=csrfmiddlewaretoken]').value
|
||||||
|
}},
|
||||||
|
body: JSON.stringify({{ contents: contentsInput ? contentsInput.value : '' }})
|
||||||
|
}})
|
||||||
|
.then(function(response) {{ return response.json(); }})
|
||||||
|
.then(function(data) {{
|
||||||
|
if (data.success) {{
|
||||||
|
if (status) {{
|
||||||
|
status.textContent = '✓ ' + data.message;
|
||||||
|
status.style.color = '#28a745';
|
||||||
|
}}
|
||||||
|
}} else {{
|
||||||
|
if (status) {{
|
||||||
|
status.textContent = '✗ ' + data.error;
|
||||||
|
status.style.color = '#dc3545';
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}})
|
||||||
|
.catch(function(err) {{
|
||||||
|
if (status) {{
|
||||||
|
status.textContent = '✗ Error: ' + err;
|
||||||
|
status.style.color = '#dc3545';
|
||||||
|
}}
|
||||||
|
}})
|
||||||
|
.finally(function() {{
|
||||||
|
saveBtn.disabled = false;
|
||||||
|
saveBtn.textContent = 'Save URLs';
|
||||||
|
}});
|
||||||
|
}};
|
||||||
|
}})();
|
||||||
|
</script>
|
||||||
|
</div>
|
||||||
|
'''
|
||||||
|
return mark_safe(html)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -143,14 +358,14 @@ class CrawlScheduleAdmin(BaseModelAdmin):
|
|||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(crawl.admin_change_url, crawl)
|
(crawl.admin_change_url, crawl)
|
||||||
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Crawls yet...</i>')
|
)) or mark_safe('<i>No Crawls yet...</i>')
|
||||||
|
|
||||||
def snapshots(self, obj):
|
def snapshots(self, obj):
|
||||||
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
|
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
|
||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(snapshot.admin_change_url, snapshot)
|
(snapshot.admin_change_url, snapshot)
|
||||||
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
|
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Snapshots yet...</i>')
|
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||||
|
|
||||||
|
|
||||||
def register_admin(admin_site):
|
def register_admin(admin_site):
|
||||||
|
|||||||
@@ -865,3 +865,189 @@ def export_plugin_config_to_env(
|
|||||||
return env
|
return env
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Plugin Template Discovery
|
||||||
|
# =============================================================================
|
||||||
|
#
|
||||||
|
# Plugins can provide custom templates for rendering their output in the UI.
|
||||||
|
# Templates are discovered by filename convention inside each plugin's templates/ dir:
|
||||||
|
#
|
||||||
|
# archivebox/plugins/<plugin_name>/
|
||||||
|
# templates/
|
||||||
|
# icon.html # Icon for admin table view (small inline HTML)
|
||||||
|
# thumbnail.html # Preview thumbnail for snapshot cards
|
||||||
|
# embed.html # Iframe embed content for main preview
|
||||||
|
# fullscreen.html # Fullscreen view template
|
||||||
|
#
|
||||||
|
# Template context variables available:
|
||||||
|
# {{ result }} - ArchiveResult object
|
||||||
|
# {{ snapshot }} - Parent Snapshot object
|
||||||
|
# {{ output_path }} - Path to output file/dir relative to snapshot dir
|
||||||
|
# {{ extractor }} - Extractor name (e.g., 'screenshot', 'singlefile')
|
||||||
|
#
|
||||||
|
|
||||||
|
# Default templates used when plugin doesn't provide one
|
||||||
|
DEFAULT_TEMPLATES = {
|
||||||
|
'icon': '''<span title="{{ extractor }}">{{ icon }}</span>''',
|
||||||
|
'thumbnail': '''
|
||||||
|
<img src="{{ output_path }}"
|
||||||
|
alt="{{ extractor }} output"
|
||||||
|
style="max-width: 100%; max-height: 100px; object-fit: cover;"
|
||||||
|
onerror="this.style.display='none'">
|
||||||
|
''',
|
||||||
|
'embed': '''
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 100%; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts">
|
||||||
|
</iframe>
|
||||||
|
''',
|
||||||
|
'fullscreen': '''
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 100vh; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||||
|
</iframe>
|
||||||
|
''',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Default icons for known extractors (emoji or short HTML)
|
||||||
|
DEFAULT_EXTRACTOR_ICONS = {
|
||||||
|
'screenshot': '📷',
|
||||||
|
'pdf': '📄',
|
||||||
|
'singlefile': '📦',
|
||||||
|
'dom': '🌐',
|
||||||
|
'wget': '📥',
|
||||||
|
'media': '🎬',
|
||||||
|
'git': '📂',
|
||||||
|
'readability': '📖',
|
||||||
|
'mercury': '☿️',
|
||||||
|
'favicon': '⭐',
|
||||||
|
'title': '📝',
|
||||||
|
'headers': '📋',
|
||||||
|
'archive_org': '🏛️',
|
||||||
|
'htmltotext': '📃',
|
||||||
|
'warc': '🗄️',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_plugin_template(extractor: str, template_name: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Get a plugin template by extractor name and template type.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
|
||||||
|
template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Template content as string, or None if not found.
|
||||||
|
"""
|
||||||
|
base_name = get_extractor_name(extractor)
|
||||||
|
|
||||||
|
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||||
|
if not base_dir.exists():
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Look for plugin directory matching extractor name
|
||||||
|
for plugin_dir in base_dir.iterdir():
|
||||||
|
if not plugin_dir.is_dir():
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Match by directory name (exact or partial)
|
||||||
|
if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'):
|
||||||
|
template_path = plugin_dir / 'templates' / f'{template_name}.html'
|
||||||
|
if template_path.exists():
|
||||||
|
return template_path.read_text()
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_extractor_template(extractor: str, template_name: str) -> str:
|
||||||
|
"""
|
||||||
|
Get template for an extractor, falling back to defaults.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
|
||||||
|
template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Template content as string (plugin template or default).
|
||||||
|
"""
|
||||||
|
# Try plugin-provided template first
|
||||||
|
template = get_plugin_template(extractor, template_name)
|
||||||
|
if template:
|
||||||
|
return template
|
||||||
|
|
||||||
|
# Fall back to default template
|
||||||
|
return DEFAULT_TEMPLATES.get(template_name, '')
|
||||||
|
|
||||||
|
|
||||||
|
def get_extractor_icon(extractor: str) -> str:
|
||||||
|
"""
|
||||||
|
Get the icon for an extractor.
|
||||||
|
|
||||||
|
First checks for plugin-provided icon.html template,
|
||||||
|
then falls back to DEFAULT_EXTRACTOR_ICONS.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Icon HTML/emoji string.
|
||||||
|
"""
|
||||||
|
base_name = get_extractor_name(extractor)
|
||||||
|
|
||||||
|
# Try plugin-provided icon template
|
||||||
|
icon_template = get_plugin_template(extractor, 'icon')
|
||||||
|
if icon_template:
|
||||||
|
return icon_template.strip()
|
||||||
|
|
||||||
|
# Fall back to default icon
|
||||||
|
return DEFAULT_EXTRACTOR_ICONS.get(base_name, '📁')
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_extractor_icons() -> Dict[str, str]:
|
||||||
|
"""
|
||||||
|
Get icons for all discovered extractors.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping extractor base names to their icons.
|
||||||
|
"""
|
||||||
|
icons = {}
|
||||||
|
for extractor in get_extractors():
|
||||||
|
base_name = get_extractor_name(extractor)
|
||||||
|
icons[base_name] = get_extractor_icon(extractor)
|
||||||
|
return icons
|
||||||
|
|
||||||
|
|
||||||
|
def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
|
||||||
|
"""
|
||||||
|
Discover all plugin templates organized by extractor.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping extractor names to dicts of template_name -> template_path.
|
||||||
|
e.g., {'screenshot': {'icon': '/path/to/icon.html', 'thumbnail': '/path/to/thumbnail.html'}}
|
||||||
|
"""
|
||||||
|
templates: Dict[str, Dict[str, str]] = {}
|
||||||
|
|
||||||
|
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||||
|
if not base_dir.exists():
|
||||||
|
continue
|
||||||
|
|
||||||
|
for plugin_dir in base_dir.iterdir():
|
||||||
|
if not plugin_dir.is_dir():
|
||||||
|
continue
|
||||||
|
|
||||||
|
templates_dir = plugin_dir / 'templates'
|
||||||
|
if not templates_dir.exists():
|
||||||
|
continue
|
||||||
|
|
||||||
|
plugin_templates = {}
|
||||||
|
for template_file in templates_dir.glob('*.html'):
|
||||||
|
template_name = template_file.stem # icon, thumbnail, embed, fullscreen
|
||||||
|
plugin_templates[template_name] = str(template_file)
|
||||||
|
|
||||||
|
if plugin_templates:
|
||||||
|
templates[plugin_dir.name] = plugin_templates
|
||||||
|
|
||||||
|
return templates
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,16 +3,16 @@ __package__ = 'archivebox.machine'
|
|||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from django.utils.html import format_html
|
from django.utils.html import format_html
|
||||||
|
|
||||||
from archivebox.base_models.admin import BaseModelAdmin
|
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||||
from machine.models import Machine, NetworkInterface, InstalledBinary
|
from machine.models import Machine, NetworkInterface, InstalledBinary, Dependency
|
||||||
|
|
||||||
|
|
||||||
class MachineAdmin(BaseModelAdmin):
|
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||||
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
|
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
|
||||||
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
|
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
|
||||||
|
|
||||||
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
|
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
|
||||||
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed')
|
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed')
|
||||||
|
|
||||||
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
|
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
|
||||||
ordering = ['-created_at']
|
ordering = ['-created_at']
|
||||||
@@ -48,15 +48,43 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||||
|
list_display = ('id', 'created_at', 'bin_name', 'bin_providers', 'is_installed', 'installed_count')
|
||||||
|
sort_fields = ('id', 'created_at', 'bin_name', 'bin_providers')
|
||||||
|
search_fields = ('id', 'bin_name', 'bin_providers')
|
||||||
|
|
||||||
|
readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
|
||||||
|
fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields)
|
||||||
|
|
||||||
|
list_filter = ('bin_providers', 'created_at')
|
||||||
|
ordering = ['-created_at']
|
||||||
|
list_per_page = 100
|
||||||
|
actions = ["delete_selected"]
|
||||||
|
|
||||||
|
@admin.display(description='Installed', boolean=True)
|
||||||
|
def is_installed(self, dependency):
|
||||||
|
return dependency.is_installed
|
||||||
|
|
||||||
|
@admin.display(description='# Binaries')
|
||||||
|
def installed_count(self, dependency):
|
||||||
|
count = dependency.installed_binaries.count()
|
||||||
|
if count:
|
||||||
|
return format_html(
|
||||||
|
'<a href="/admin/machine/installedbinary/?dependency__id__exact={}">{}</a>',
|
||||||
|
dependency.id, count,
|
||||||
|
)
|
||||||
|
return '0'
|
||||||
|
|
||||||
|
|
||||||
class InstalledBinaryAdmin(BaseModelAdmin):
|
class InstalledBinaryAdmin(BaseModelAdmin):
|
||||||
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health')
|
list_display = ('id', 'created_at', 'machine_info', 'name', 'dependency_link', 'binprovider', 'version', 'abspath', 'sha256', 'health')
|
||||||
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
||||||
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
|
||||||
|
|
||||||
readonly_fields = ('created_at', 'modified_at')
|
readonly_fields = ('created_at', 'modified_at')
|
||||||
fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
|
fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
|
||||||
|
|
||||||
list_filter = ('name', 'binprovider', 'machine_id')
|
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
|
||||||
ordering = ['-created_at']
|
ordering = ['-created_at']
|
||||||
list_per_page = 100
|
list_per_page = 100
|
||||||
actions = ["delete_selected"]
|
actions = ["delete_selected"]
|
||||||
@@ -68,8 +96,18 @@ class InstalledBinaryAdmin(BaseModelAdmin):
|
|||||||
installed_binary.machine.id, str(installed_binary.machine.id)[:8], installed_binary.machine.hostname,
|
installed_binary.machine.id, str(installed_binary.machine.id)[:8], installed_binary.machine.hostname,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@admin.display(description='Dependency', ordering='dependency__bin_name')
|
||||||
|
def dependency_link(self, installed_binary):
|
||||||
|
if installed_binary.dependency:
|
||||||
|
return format_html(
|
||||||
|
'<a href="/admin/machine/dependency/{}/change">{}</a>',
|
||||||
|
installed_binary.dependency.id, installed_binary.dependency.bin_name,
|
||||||
|
)
|
||||||
|
return '-'
|
||||||
|
|
||||||
|
|
||||||
def register_admin(admin_site):
|
def register_admin(admin_site):
|
||||||
admin_site.register(Machine, MachineAdmin)
|
admin_site.register(Machine, MachineAdmin)
|
||||||
admin_site.register(NetworkInterface, NetworkInterfaceAdmin)
|
admin_site.register(NetworkInterface, NetworkInterfaceAdmin)
|
||||||
|
admin_site.register(Dependency, DependencyAdmin)
|
||||||
admin_site.register(InstalledBinary, InstalledBinaryAdmin)
|
admin_site.register(InstalledBinary, InstalledBinaryAdmin)
|
||||||
|
|||||||
@@ -37,15 +37,13 @@ def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]:
|
|||||||
"""Apply pending Django migrations"""
|
"""Apply pending Django migrations"""
|
||||||
from django.core.management import call_command
|
from django.core.management import call_command
|
||||||
|
|
||||||
out1, out2 = StringIO(), StringIO()
|
out1 = StringIO()
|
||||||
|
|
||||||
call_command("migrate", interactive=False, database='default', stdout=out1)
|
call_command("migrate", interactive=False, database='default', stdout=out1)
|
||||||
out1.seek(0)
|
out1.seek(0)
|
||||||
call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=out2)
|
|
||||||
out2.seek(0)
|
|
||||||
|
|
||||||
return [
|
return [
|
||||||
line.strip() for line in out1.readlines() + out2.readlines() if line.strip()
|
line.strip() for line in out1.readlines() if line.strip()
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -480,6 +480,138 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:
|
|||||||
return '%3.1f %s' % (num_bytes, 'TB')
|
return '%3.1f %s' % (num_bytes, 'TB')
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
|
def format_duration(seconds: float) -> str:
|
||||||
|
"""Format duration in human-readable form."""
|
||||||
|
if seconds < 1:
|
||||||
|
return f'{seconds*1000:.0f}ms'
|
||||||
|
elif seconds < 60:
|
||||||
|
return f'{seconds:.1f}s'
|
||||||
|
elif seconds < 3600:
|
||||||
|
minutes = int(seconds // 60)
|
||||||
|
secs = int(seconds % 60)
|
||||||
|
return f'{minutes}min {secs}s' if secs else f'{minutes}min'
|
||||||
|
else:
|
||||||
|
hours = int(seconds // 3600)
|
||||||
|
minutes = int((seconds % 3600) // 60)
|
||||||
|
return f'{hours}hr {minutes}min' if minutes else f'{hours}hr'
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
|
def truncate_url(url: str, max_length: int = 60) -> str:
|
||||||
|
"""Truncate URL to max_length, keeping domain and adding ellipsis."""
|
||||||
|
if len(url) <= max_length:
|
||||||
|
return url
|
||||||
|
# Try to keep the domain and beginning of path
|
||||||
|
if '://' in url:
|
||||||
|
protocol, rest = url.split('://', 1)
|
||||||
|
if '/' in rest:
|
||||||
|
domain, path = rest.split('/', 1)
|
||||||
|
available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..."
|
||||||
|
if available > 10:
|
||||||
|
return f'{protocol}://{domain}/{path[:available]}...'
|
||||||
|
# Fallback: just truncate
|
||||||
|
return url[:max_length-3] + '...'
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
|
def log_worker_event(
|
||||||
|
worker_type: str,
|
||||||
|
event: str,
|
||||||
|
indent_level: int = 0,
|
||||||
|
pid: Optional[int] = None,
|
||||||
|
worker_id: Optional[str] = None,
|
||||||
|
url: Optional[str] = None,
|
||||||
|
extractor: Optional[str] = None,
|
||||||
|
metadata: Optional[Dict[str, Any]] = None,
|
||||||
|
error: Optional[Exception] = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Log a worker event with structured metadata and indentation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.)
|
||||||
|
event: Event name (Starting, Completed, Failed, etc.)
|
||||||
|
indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
|
||||||
|
pid: Process ID
|
||||||
|
worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, extractor for ArchiveResultWorker)
|
||||||
|
url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
|
||||||
|
extractor: Extractor name (for ArchiveResultWorker)
|
||||||
|
metadata: Dict of metadata to show in curly braces
|
||||||
|
error: Exception if event is an error
|
||||||
|
"""
|
||||||
|
indent = ' ' * indent_level
|
||||||
|
|
||||||
|
# Build worker identifier
|
||||||
|
worker_parts = [worker_type]
|
||||||
|
if pid:
|
||||||
|
worker_parts.append(f'pid={pid}')
|
||||||
|
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'):
|
||||||
|
worker_parts.append(f'id={worker_id}')
|
||||||
|
if url and worker_type == 'SnapshotWorker':
|
||||||
|
worker_parts.append(f'url={truncate_url(url)}')
|
||||||
|
if extractor and worker_type == 'ArchiveResultWorker':
|
||||||
|
worker_parts.append(f'extractor={extractor}')
|
||||||
|
|
||||||
|
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
|
||||||
|
|
||||||
|
# Build metadata string
|
||||||
|
metadata_str = ''
|
||||||
|
if metadata:
|
||||||
|
# Format metadata nicely
|
||||||
|
meta_parts = []
|
||||||
|
for k, v in metadata.items():
|
||||||
|
if isinstance(v, float):
|
||||||
|
# Format floats nicely (durations, sizes)
|
||||||
|
if 'duration' in k.lower():
|
||||||
|
meta_parts.append(f'{k}: {format_duration(v)}')
|
||||||
|
elif 'size' in k.lower():
|
||||||
|
meta_parts.append(f'{k}: {printable_filesize(int(v))}')
|
||||||
|
else:
|
||||||
|
meta_parts.append(f'{k}: {v:.2f}')
|
||||||
|
elif isinstance(v, int):
|
||||||
|
# Format integers - check if it's a size
|
||||||
|
if 'size' in k.lower() or 'bytes' in k.lower():
|
||||||
|
meta_parts.append(f'{k}: {printable_filesize(v)}')
|
||||||
|
else:
|
||||||
|
meta_parts.append(f'{k}: {v}')
|
||||||
|
elif isinstance(v, (list, tuple)):
|
||||||
|
meta_parts.append(f'{k}: {len(v)}')
|
||||||
|
else:
|
||||||
|
meta_parts.append(f'{k}: {v}')
|
||||||
|
metadata_str = ' {' + ', '.join(meta_parts) + '}'
|
||||||
|
|
||||||
|
# Determine color based on event
|
||||||
|
color = 'white'
|
||||||
|
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
|
||||||
|
color = 'green'
|
||||||
|
elif event in ('Processing...', 'PROCESSING'):
|
||||||
|
color = 'blue'
|
||||||
|
elif event in ('Completed', 'COMPLETED', 'All work complete'):
|
||||||
|
color = 'blue'
|
||||||
|
elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
|
||||||
|
color = 'red'
|
||||||
|
elif event in ('Shutting down', 'SHUTDOWN'):
|
||||||
|
color = 'grey53'
|
||||||
|
|
||||||
|
# Build final message
|
||||||
|
error_str = f' {type(error).__name__}: {error}' if error else ''
|
||||||
|
# Build colored message - worker_label needs to be inside color tags
|
||||||
|
# But first we need to format the color tags separately from the worker label
|
||||||
|
from archivebox.misc.logging import CONSOLE
|
||||||
|
from rich.text import Text
|
||||||
|
|
||||||
|
# Create a Rich Text object for proper formatting
|
||||||
|
text = Text()
|
||||||
|
text.append(indent) # Indentation
|
||||||
|
# Append worker label and event with color
|
||||||
|
text.append(f'{worker_label} {event}{error_str}', style=color)
|
||||||
|
# Append metadata without color
|
||||||
|
text.append(metadata_str)
|
||||||
|
|
||||||
|
CONSOLE.print(text)
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
|
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
|
||||||
return '\n'.join(
|
return '\n'.join(
|
||||||
|
|||||||
1
archivebox/plugins/archive_org/templates/icon.html
Normal file
1
archivebox/plugins/archive_org/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
🏛️
|
||||||
@@ -7,7 +7,7 @@ new plugin-based output structure to the legacy canonical output paths that
|
|||||||
ArchiveBox has historically used. This maintains backward compatibility with
|
ArchiveBox has historically used. This maintains backward compatibility with
|
||||||
existing tools and scripts that expect outputs at specific locations.
|
existing tools and scripts that expect outputs at specific locations.
|
||||||
|
|
||||||
Canonical output paths (from Snapshot.canonical_outputs()):
|
Canonical output paths:
|
||||||
- favicon.ico → favicon/favicon.ico
|
- favicon.ico → favicon/favicon.ico
|
||||||
- singlefile.html → singlefile/singlefile.html
|
- singlefile.html → singlefile/singlefile.html
|
||||||
- readability/content.html → readability/content.html
|
- readability/content.html → readability/content.html
|
||||||
@@ -27,27 +27,20 @@ New plugin outputs:
|
|||||||
- redirects.json → redirects/redirects.json
|
- redirects.json → redirects/redirects.json
|
||||||
- console.jsonl → consolelog/console.jsonl
|
- console.jsonl → consolelog/console.jsonl
|
||||||
|
|
||||||
Usage: on_Snapshot__91_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
|
Usage: on_Snapshot__92_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
|
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
|
||||||
|
DATA_DIR: ArchiveBox data directory
|
||||||
|
ARCHIVE_DIR: Archive output directory
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__package__ = 'archivebox.plugins.canonical_outputs'
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Optional
|
from datetime import datetime, timezone
|
||||||
|
from typing import Dict
|
||||||
# Configure Django if running standalone
|
|
||||||
if __name__ == '__main__':
|
|
||||||
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
|
|
||||||
if parent_dir not in sys.path:
|
|
||||||
sys.path.insert(0, parent_dir)
|
|
||||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
|
||||||
import django
|
|
||||||
django.setup()
|
|
||||||
|
|
||||||
import rich_click as click
|
import rich_click as click
|
||||||
|
|
||||||
@@ -150,10 +143,7 @@ def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
|
|||||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||||
def main(url: str, snapshot_id: str):
|
def main(url: str, snapshot_id: str):
|
||||||
"""Create symlinks from plugin outputs to canonical legacy locations."""
|
"""Create symlinks from plugin outputs to canonical legacy locations."""
|
||||||
from datetime import datetime
|
start_ts = datetime.now(timezone.utc)
|
||||||
from archivebox.core.models import Snapshot
|
|
||||||
|
|
||||||
start_ts = datetime.now()
|
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = None
|
output = None
|
||||||
error = ''
|
error = ''
|
||||||
@@ -161,31 +151,20 @@ def main(url: str, snapshot_id: str):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Check if enabled
|
# Check if enabled
|
||||||
from archivebox.config import CONSTANTS
|
|
||||||
save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
|
save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
|
||||||
|
|
||||||
if not save_canonical:
|
if not save_canonical:
|
||||||
click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)')
|
|
||||||
status = 'skipped'
|
status = 'skipped'
|
||||||
end_ts = datetime.now()
|
click.echo(json.dumps({'status': status, 'output': 'SAVE_CANONICAL_SYMLINKS=false'}))
|
||||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
|
||||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
|
||||||
click.echo(f'STATUS={status}')
|
|
||||||
click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
# Get snapshot
|
# Working directory is the extractor output dir (e.g., <snapshot>/canonical_outputs/)
|
||||||
try:
|
# Parent is the snapshot directory
|
||||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
output_dir = Path.cwd()
|
||||||
except Snapshot.DoesNotExist:
|
snapshot_dir = output_dir.parent
|
||||||
error = f'Snapshot {snapshot_id} not found'
|
|
||||||
raise ValueError(error)
|
|
||||||
|
|
||||||
# Get snapshot directory
|
|
||||||
snapshot_dir = Path(snapshot.output_dir)
|
|
||||||
if not snapshot_dir.exists():
|
if not snapshot_dir.exists():
|
||||||
error = f'Snapshot directory not found: {snapshot_dir}'
|
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
|
||||||
raise FileNotFoundError(error)
|
|
||||||
|
|
||||||
# Create canonical symlinks
|
# Create canonical symlinks
|
||||||
results = create_canonical_symlinks(snapshot_dir)
|
results = create_canonical_symlinks(snapshot_dir)
|
||||||
@@ -203,37 +182,18 @@ def main(url: str, snapshot_id: str):
|
|||||||
status = 'failed'
|
status = 'failed'
|
||||||
click.echo(f'Error: {error}', err=True)
|
click.echo(f'Error: {error}', err=True)
|
||||||
|
|
||||||
end_ts = datetime.now()
|
end_ts = datetime.now(timezone.utc)
|
||||||
duration = (end_ts - start_ts).total_seconds()
|
|
||||||
|
|
||||||
# Print results
|
# Print JSON result for hook runner
|
||||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
result = {
|
||||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
|
||||||
click.echo(f'DURATION={duration:.2f}')
|
|
||||||
if output:
|
|
||||||
click.echo(f'OUTPUT={output}')
|
|
||||||
click.echo(f'STATUS={status}')
|
|
||||||
|
|
||||||
if error:
|
|
||||||
click.echo(f'ERROR={error}', err=True)
|
|
||||||
|
|
||||||
# Print JSON result
|
|
||||||
import json
|
|
||||||
result_json = {
|
|
||||||
'extractor': 'canonical_outputs',
|
|
||||||
'url': url,
|
|
||||||
'snapshot_id': snapshot_id,
|
|
||||||
'status': status,
|
'status': status,
|
||||||
'start_ts': start_ts.isoformat(),
|
|
||||||
'end_ts': end_ts.isoformat(),
|
|
||||||
'duration': round(duration, 2),
|
|
||||||
'output': output,
|
'output': output,
|
||||||
'symlinks_created': symlinks_created,
|
|
||||||
'error': error or None,
|
'error': error or None,
|
||||||
|
'symlinks_created': symlinks_created,
|
||||||
}
|
}
|
||||||
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
|
click.echo(json.dumps(result))
|
||||||
|
|
||||||
sys.exit(0 if status == 'succeeded' else 1)
|
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
@@ -1,149 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install Chrome/Chromium if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure Chrome is installed.
|
|
||||||
Uses playwright to install chromium if no system Chrome found.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def find_chrome():
|
|
||||||
"""Try to find system Chrome/Chromium."""
|
|
||||||
# Comprehensive list of Chrome/Chromium binary names and paths
|
|
||||||
chromium_names_linux = [
|
|
||||||
'chromium',
|
|
||||||
'chromium-browser',
|
|
||||||
'chromium-browser-beta',
|
|
||||||
'chromium-browser-unstable',
|
|
||||||
'chromium-browser-canary',
|
|
||||||
'chromium-browser-dev',
|
|
||||||
]
|
|
||||||
|
|
||||||
chrome_names_linux = [
|
|
||||||
'google-chrome',
|
|
||||||
'google-chrome-stable',
|
|
||||||
'google-chrome-beta',
|
|
||||||
'google-chrome-canary',
|
|
||||||
'google-chrome-unstable',
|
|
||||||
'google-chrome-dev',
|
|
||||||
'chrome',
|
|
||||||
]
|
|
||||||
|
|
||||||
chrome_paths_macos = [
|
|
||||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
|
||||||
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
|
||||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
|
||||||
]
|
|
||||||
|
|
||||||
chrome_paths_linux = [
|
|
||||||
'/usr/bin/google-chrome',
|
|
||||||
'/usr/bin/google-chrome-stable',
|
|
||||||
'/usr/bin/chromium',
|
|
||||||
'/usr/bin/chromium-browser',
|
|
||||||
'/snap/bin/chromium',
|
|
||||||
'/opt/google/chrome/chrome',
|
|
||||||
]
|
|
||||||
|
|
||||||
all_chrome_names = chrome_names_linux + chromium_names_linux
|
|
||||||
all_chrome_paths = chrome_paths_macos + chrome_paths_linux
|
|
||||||
|
|
||||||
# Check env var first
|
|
||||||
env_path = os.environ.get('CHROME_BINARY', '')
|
|
||||||
if env_path and Path(env_path).is_file():
|
|
||||||
return env_path
|
|
||||||
|
|
||||||
# Try shutil.which for various names
|
|
||||||
for name in all_chrome_names:
|
|
||||||
abspath = shutil.which(name)
|
|
||||||
if abspath:
|
|
||||||
return abspath
|
|
||||||
|
|
||||||
# Check common paths
|
|
||||||
for path in all_chrome_paths:
|
|
||||||
if Path(path).is_file():
|
|
||||||
return path
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
# First try to find system Chrome
|
|
||||||
system_chrome = find_chrome()
|
|
||||||
if system_chrome:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'chrome',
|
|
||||||
'abspath': str(system_chrome),
|
|
||||||
'version': None,
|
|
||||||
'sha256': None,
|
|
||||||
'binprovider': 'env',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# If not found in system, try to install chromium via apt/brew
|
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Try chromium-browser or chromium via system package managers
|
|
||||||
for binary_name in ['chromium', 'chromium-browser', 'google-chrome']:
|
|
||||||
try:
|
|
||||||
chrome_binary = Binary(
|
|
||||||
name=binary_name,
|
|
||||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = chrome_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via system package manager
|
|
||||||
loaded = chrome_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'chrome',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# If all attempts failed
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'chrome',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install Chrome/Chromium", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'chrome',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing Chrome: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
Integration tests for chrome_session plugin
|
Integration tests for chrome_session plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
1. Install hook finds system Chrome or installs chromium
|
1. Validate hook checks for Chrome/Chromium binary
|
||||||
2. Verify deps with abx-pkg
|
2. Verify deps with abx-pkg
|
||||||
3. Chrome session script exists
|
3. Chrome session script exists
|
||||||
"""
|
"""
|
||||||
@@ -14,7 +14,7 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_chrome.py'
|
CHROME_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_chrome.py'
|
||||||
CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js'
|
CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js'
|
||||||
|
|
||||||
|
|
||||||
@@ -23,37 +23,50 @@ def test_hook_script_exists():
|
|||||||
assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}"
|
assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}"
|
||||||
|
|
||||||
|
|
||||||
def test_chrome_install_hook():
|
def test_chrome_validate_hook():
|
||||||
"""Test chrome install hook to find or install Chrome/Chromium."""
|
"""Test chrome validate hook checks for Chrome/Chromium binary."""
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(CHROME_INSTALL_HOOK)],
|
[sys.executable, str(CHROME_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'chrome'
|
assert record['name'] == 'chrome'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
|
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'chrome'
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify chrome is available via abx-pkg after hook installation."""
|
"""Verify chrome is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
AptProvider.model_rebuild()
|
||||||
@@ -75,10 +88,10 @@ def test_verify_deps_with_abx_pkg():
|
|||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# If we get here, chrome should still be available from system
|
# If we get here, chrome not available
|
||||||
import shutil
|
import shutil
|
||||||
assert shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome'), \
|
if not (shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome')):
|
||||||
"Chrome should be available after install hook"
|
pytest.skip("Chrome/Chromium not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
6
archivebox/plugins/dom/templates/embed.html
Normal file
6
archivebox/plugins/dom/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- DOM embed - full iframe of captured DOM HTML -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed dom-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/dom/templates/fullscreen.html
Normal file
6
archivebox/plugins/dom/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- DOM fullscreen - full page iframe -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen dom-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/dom/templates/icon.html
Normal file
1
archivebox/plugins/dom/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
🌐
|
||||||
8
archivebox/plugins/dom/templates/thumbnail.html
Normal file
8
archivebox/plugins/dom/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- DOM thumbnail - scaled down iframe preview of captured DOM HTML -->
|
||||||
|
<div class="extractor-thumbnail dom-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
1
archivebox/plugins/favicon/templates/icon.html
Normal file
1
archivebox/plugins/favicon/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
⭐
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install git if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure git is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# git binary and package have same name
|
|
||||||
git_binary = Binary(
|
|
||||||
name='git',
|
|
||||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = git_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via system package manager
|
|
||||||
loaded = git_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'git',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'git',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install git", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'git',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing git: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
6
archivebox/plugins/git/templates/embed.html
Normal file
6
archivebox/plugins/git/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Git embed - directory listing of cloned repo -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed git-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 400px; border: none; background: #fff;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/git/templates/fullscreen.html
Normal file
6
archivebox/plugins/git/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Git fullscreen - full directory listing -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen git-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none; background: #fff;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/git/templates/icon.html
Normal file
1
archivebox/plugins/git/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📂
|
||||||
5
archivebox/plugins/git/templates/thumbnail.html
Normal file
5
archivebox/plugins/git/templates/thumbnail.html
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
<!-- Git thumbnail - shows git repository icon and info -->
|
||||||
|
<div class="extractor-thumbnail git-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f6f8fa; display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 10px;">
|
||||||
|
<span style="font-size: 32px;">📂</span>
|
||||||
|
<span style="font-size: 11px; color: #586069; margin-top: 4px;">Git Repository</span>
|
||||||
|
</div>
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
Integration tests for git plugin
|
Integration tests for git plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
1. Install hook installs git via abx-pkg
|
1. Validate hook checks for git binary
|
||||||
2. Verify deps with abx-pkg
|
2. Verify deps with abx-pkg
|
||||||
3. Standalone git extractor execution
|
3. Standalone git extractor execution
|
||||||
"""
|
"""
|
||||||
@@ -17,50 +17,64 @@ import pytest
|
|||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
|
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
|
||||||
GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py'
|
GIT_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_git.py'
|
||||||
TEST_URL = 'https://github.com/example/repo.git'
|
TEST_URL = 'https://github.com/example/repo.git'
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
assert GIT_HOOK.exists()
|
assert GIT_HOOK.exists()
|
||||||
|
|
||||||
def test_git_install_hook():
|
def test_git_validate_hook():
|
||||||
"""Test git install hook to install git if needed."""
|
"""Test git validate hook checks for git binary."""
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(GIT_INSTALL_HOOK)],
|
[sys.executable, str(GIT_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'git'
|
assert record['name'] == 'git'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'git'
|
||||||
|
assert 'env' in record['bin_providers']
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify git is available via abx-pkg after hook installation."""
|
"""Verify git is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||||
git_loaded = git_binary.load()
|
git_loaded = git_binary.load()
|
||||||
assert git_loaded and git_loaded.abspath, "git should be available after install hook"
|
|
||||||
|
if git_loaded and git_loaded.abspath:
|
||||||
|
assert True, "git is available"
|
||||||
|
else:
|
||||||
|
pytest.skip("git not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
def test_reports_missing_git():
|
def test_reports_missing_git():
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
|||||||
1
archivebox/plugins/headers/templates/icon.html
Normal file
1
archivebox/plugins/headers/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📋
|
||||||
1
archivebox/plugins/htmltotext/templates/icon.html
Normal file
1
archivebox/plugins/htmltotext/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📃
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install yt-dlp if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure yt-dlp is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
PipProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# yt-dlp binary and package have same name
|
|
||||||
ytdlp_binary = Binary(
|
|
||||||
name='yt-dlp',
|
|
||||||
binproviders=[PipProvider(), EnvProvider()]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = ytdlp_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via pip
|
|
||||||
loaded = ytdlp_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'yt-dlp',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'yt-dlp',
|
|
||||||
'bin_providers': 'pip,brew,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install yt-dlp", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'yt-dlp',
|
|
||||||
'bin_providers': 'pip,brew,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing yt-dlp: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
278
archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py
Executable file
278
archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py
Executable file
@@ -0,0 +1,278 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Validation hook for yt-dlp and its dependencies (node, ffmpeg).
|
||||||
|
|
||||||
|
Runs at crawl start to verify yt-dlp and required binaries are available.
|
||||||
|
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import hashlib
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||||
|
"""Get version string from binary."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[abspath, version_flag],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout:
|
||||||
|
first_line = result.stdout.strip().split('\n')[0]
|
||||||
|
return first_line[:64]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_hash(abspath: str) -> str | None:
|
||||||
|
"""Get SHA256 hash of binary."""
|
||||||
|
try:
|
||||||
|
with open(abspath, 'rb') as f:
|
||||||
|
return hashlib.sha256(f.read()).hexdigest()
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_ytdlp() -> dict | None:
|
||||||
|
"""Find yt-dlp binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||||
|
|
||||||
|
class YtdlpBinary(Binary):
|
||||||
|
name: str = 'yt-dlp'
|
||||||
|
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||||
|
|
||||||
|
binary = YtdlpBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'yt-dlp',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('yt-dlp') or os.environ.get('YTDLP_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'yt-dlp',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_node() -> dict | None:
|
||||||
|
"""Find node binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||||
|
|
||||||
|
class NodeBinary(Binary):
|
||||||
|
name: str = 'node'
|
||||||
|
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||||
|
overrides: dict = {'apt': {'packages': ['nodejs']}}
|
||||||
|
|
||||||
|
binary = NodeBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'node',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('node') or os.environ.get('NODE_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'node',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_ffmpeg() -> dict | None:
|
||||||
|
"""Find ffmpeg binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||||
|
|
||||||
|
class FfmpegBinary(Binary):
|
||||||
|
name: str = 'ffmpeg'
|
||||||
|
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||||
|
|
||||||
|
binary = FfmpegBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'ffmpeg',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('ffmpeg') or os.environ.get('FFMPEG_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'ffmpeg',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Check for yt-dlp (required)
|
||||||
|
ytdlp_result = find_ytdlp()
|
||||||
|
|
||||||
|
# Check for node (required for JS extraction)
|
||||||
|
node_result = find_node()
|
||||||
|
|
||||||
|
# Check for ffmpeg (required for video conversion)
|
||||||
|
ffmpeg_result = find_ffmpeg()
|
||||||
|
|
||||||
|
missing_deps = []
|
||||||
|
|
||||||
|
# Emit results for yt-dlp
|
||||||
|
if ytdlp_result and ytdlp_result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': ytdlp_result['name'],
|
||||||
|
'abspath': ytdlp_result['abspath'],
|
||||||
|
'version': ytdlp_result['version'],
|
||||||
|
'sha256': ytdlp_result['sha256'],
|
||||||
|
'binprovider': ytdlp_result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/YTDLP_BINARY',
|
||||||
|
'value': ytdlp_result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if ytdlp_result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/YTDLP_VERSION',
|
||||||
|
'value': ytdlp_result['version'],
|
||||||
|
}))
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'yt-dlp',
|
||||||
|
'bin_providers': 'pip,env',
|
||||||
|
}))
|
||||||
|
missing_deps.append('yt-dlp')
|
||||||
|
|
||||||
|
# Emit results for node
|
||||||
|
if node_result and node_result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': node_result['name'],
|
||||||
|
'abspath': node_result['abspath'],
|
||||||
|
'version': node_result['version'],
|
||||||
|
'sha256': node_result['sha256'],
|
||||||
|
'binprovider': node_result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/NODE_BINARY',
|
||||||
|
'value': node_result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if node_result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/NODE_VERSION',
|
||||||
|
'value': node_result['version'],
|
||||||
|
}))
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'node',
|
||||||
|
'bin_providers': 'apt,brew,env',
|
||||||
|
}))
|
||||||
|
missing_deps.append('node')
|
||||||
|
|
||||||
|
# Emit results for ffmpeg
|
||||||
|
if ffmpeg_result and ffmpeg_result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': ffmpeg_result['name'],
|
||||||
|
'abspath': ffmpeg_result['abspath'],
|
||||||
|
'version': ffmpeg_result['version'],
|
||||||
|
'sha256': ffmpeg_result['sha256'],
|
||||||
|
'binprovider': ffmpeg_result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/FFMPEG_BINARY',
|
||||||
|
'value': ffmpeg_result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if ffmpeg_result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/FFMPEG_VERSION',
|
||||||
|
'value': ffmpeg_result['version'],
|
||||||
|
}))
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'ffmpeg',
|
||||||
|
'bin_providers': 'apt,brew,env',
|
||||||
|
}))
|
||||||
|
missing_deps.append('ffmpeg')
|
||||||
|
|
||||||
|
if missing_deps:
|
||||||
|
print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
9
archivebox/plugins/media/templates/embed.html
Normal file
9
archivebox/plugins/media/templates/embed.html
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
<!-- Media embed - video/audio player -->
|
||||||
|
<div class="extractor-embed media-embed" style="width: 100%; height: 100%; min-height: 400px; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||||
|
<video src="{{ output_path }}"
|
||||||
|
style="max-width: 100%; max-height: 100%;"
|
||||||
|
controls
|
||||||
|
preload="metadata">
|
||||||
|
Your browser does not support the video tag.
|
||||||
|
</video>
|
||||||
|
</div>
|
||||||
10
archivebox/plugins/media/templates/fullscreen.html
Normal file
10
archivebox/plugins/media/templates/fullscreen.html
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
<!-- Media fullscreen - full video/audio player -->
|
||||||
|
<div class="extractor-fullscreen media-fullscreen" style="width: 100%; height: 100vh; background: #000; display: flex; align-items: center; justify-content: center;">
|
||||||
|
<video src="{{ output_path }}"
|
||||||
|
style="max-width: 100%; max-height: 100%;"
|
||||||
|
controls
|
||||||
|
autoplay
|
||||||
|
preload="auto">
|
||||||
|
Your browser does not support the video tag.
|
||||||
|
</video>
|
||||||
|
</div>
|
||||||
1
archivebox/plugins/media/templates/icon.html
Normal file
1
archivebox/plugins/media/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
🎬
|
||||||
14
archivebox/plugins/media/templates/thumbnail.html
Normal file
14
archivebox/plugins/media/templates/thumbnail.html
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
<!-- Media thumbnail - shows video/audio player or placeholder -->
|
||||||
|
<div class="extractor-thumbnail media-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||||
|
<video src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 100px; object-fit: contain;"
|
||||||
|
poster=""
|
||||||
|
preload="metadata"
|
||||||
|
muted
|
||||||
|
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||||
|
</video>
|
||||||
|
<div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
|
||||||
|
<span style="font-size: 32px;">🎬</span>
|
||||||
|
<span>Media</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
@@ -21,7 +21,7 @@ import pytest
|
|||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
|
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
|
||||||
MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py'
|
MEDIA_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_ytdlp.py'
|
||||||
TEST_URL = 'https://example.com/video.mp4'
|
TEST_URL = 'https://example.com/video.mp4'
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
@@ -29,46 +29,72 @@ def test_hook_script_exists():
|
|||||||
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
|
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
|
||||||
|
|
||||||
|
|
||||||
def test_ytdlp_install_hook():
|
def test_ytdlp_validate_hook():
|
||||||
"""Test yt-dlp install hook to install yt-dlp if needed."""
|
"""Test yt-dlp validate hook checks for yt-dlp and dependencies (node, ffmpeg)."""
|
||||||
# Run yt-dlp install hook
|
# Run yt-dlp validate hook
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(MEDIA_INSTALL_HOOK)],
|
[sys.executable, str(MEDIA_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if all binaries found, 1 if any not found
|
||||||
|
# Parse output for InstalledBinary and Dependency records
|
||||||
|
found_binaries = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||||
|
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||||
|
|
||||||
# Verify InstalledBinary JSONL output
|
|
||||||
found_binary = False
|
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'yt-dlp'
|
name = record['name']
|
||||||
assert record['abspath']
|
if name in found_binaries:
|
||||||
found_binary = True
|
assert record['abspath'], f"{name} should have abspath"
|
||||||
break
|
found_binaries[name] = True
|
||||||
|
elif record.get('type') == 'Dependency':
|
||||||
|
name = record['bin_name']
|
||||||
|
if name in found_dependencies:
|
||||||
|
found_dependencies[name] = True
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
# Each binary should either be found (InstalledBinary) or missing (Dependency)
|
||||||
|
for binary_name in ['yt-dlp', 'node', 'ffmpeg']:
|
||||||
|
assert found_binaries[binary_name] or found_dependencies[binary_name], \
|
||||||
|
f"{binary_name} should have either InstalledBinary or Dependency record"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify yt-dlp is available via abx-pkg after hook installation."""
|
"""Verify yt-dlp, node, and ffmpeg are available via abx-pkg."""
|
||||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
PipProvider.model_rebuild()
|
missing_binaries = []
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Verify yt-dlp is available
|
# Verify yt-dlp is available
|
||||||
ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()])
|
ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()])
|
||||||
ytdlp_loaded = ytdlp_binary.load()
|
ytdlp_loaded = ytdlp_binary.load()
|
||||||
assert ytdlp_loaded and ytdlp_loaded.abspath, "yt-dlp should be available after install hook"
|
if not (ytdlp_loaded and ytdlp_loaded.abspath):
|
||||||
|
missing_binaries.append('yt-dlp')
|
||||||
|
|
||||||
|
# Verify node is available (yt-dlp needs it for JS extraction)
|
||||||
|
node_binary = Binary(
|
||||||
|
name='node',
|
||||||
|
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||||
|
)
|
||||||
|
node_loaded = node_binary.load()
|
||||||
|
if not (node_loaded and node_loaded.abspath):
|
||||||
|
missing_binaries.append('node')
|
||||||
|
|
||||||
|
# Verify ffmpeg is available (yt-dlp needs it for video conversion)
|
||||||
|
ffmpeg_binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||||
|
ffmpeg_loaded = ffmpeg_binary.load()
|
||||||
|
if not (ffmpeg_loaded and ffmpeg_loaded.abspath):
|
||||||
|
missing_binaries.append('ffmpeg')
|
||||||
|
|
||||||
|
if missing_binaries:
|
||||||
|
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||||
|
|
||||||
def test_handles_non_media_url():
|
def test_handles_non_media_url():
|
||||||
"""Test that media extractor handles non-media URLs gracefully via hook."""
|
"""Test that media extractor handles non-media URLs gracefully via hook."""
|
||||||
|
|||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install mercury-parser if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure mercury-parser is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
NpmProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Note: npm package is @postlight/mercury-parser, binary is mercury-parser
|
|
||||||
mercury_binary = Binary(
|
|
||||||
name='mercury-parser',
|
|
||||||
binproviders=[NpmProvider(), EnvProvider()],
|
|
||||||
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = mercury_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via npm
|
|
||||||
loaded = mercury_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'mercury-parser',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'mercury-parser',
|
|
||||||
'bin_providers': 'npm,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install mercury-parser", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'mercury-parser',
|
|
||||||
'bin_providers': 'npm,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing mercury-parser: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
123
archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py
Executable file
123
archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py
Executable file
@@ -0,0 +1,123 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Validation hook for postlight-parser binary.
|
||||||
|
|
||||||
|
Runs at crawl start to verify postlight-parser is available.
|
||||||
|
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import hashlib
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_version(abspath: str) -> str | None:
|
||||||
|
"""Get version string from binary."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[abspath, '--version'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout:
|
||||||
|
first_line = result.stdout.strip().split('\n')[0]
|
||||||
|
return first_line[:64]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_hash(abspath: str) -> str | None:
|
||||||
|
"""Get SHA256 hash of binary."""
|
||||||
|
try:
|
||||||
|
with open(abspath, 'rb') as f:
|
||||||
|
return hashlib.sha256(f.read()).hexdigest()
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_mercury() -> dict | None:
|
||||||
|
"""Find postlight-parser binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||||
|
|
||||||
|
class MercuryBinary(Binary):
|
||||||
|
name: str = 'postlight-parser'
|
||||||
|
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||||
|
overrides: dict = {'npm': {'packages': ['@postlight/parser']}}
|
||||||
|
|
||||||
|
binary = MercuryBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'postlight-parser',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('postlight-parser') or os.environ.get('MERCURY_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'postlight-parser',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
result = find_mercury()
|
||||||
|
|
||||||
|
if result and result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': result['name'],
|
||||||
|
'abspath': result['abspath'],
|
||||||
|
'version': result['version'],
|
||||||
|
'sha256': result['sha256'],
|
||||||
|
'binprovider': result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/MERCURY_BINARY',
|
||||||
|
'value': result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/MERCURY_VERSION',
|
||||||
|
'value': result['version'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'postlight-parser',
|
||||||
|
'bin_providers': 'npm,env',
|
||||||
|
}))
|
||||||
|
print(f"postlight-parser binary not found", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
@@ -6,10 +6,10 @@ Usage: on_Snapshot__mercury.py --url=<url> --snapshot-id=<uuid>
|
|||||||
Output: Creates mercury/ directory with content.html, content.txt, article.json
|
Output: Creates mercury/ directory with content.html, content.txt, article.json
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
MERCURY_BINARY: Path to mercury-parser binary
|
MERCURY_BINARY: Path to postlight-parser binary
|
||||||
TIMEOUT: Timeout in seconds (default: 60)
|
TIMEOUT: Timeout in seconds (default: 60)
|
||||||
|
|
||||||
Note: Requires mercury-parser: npm install -g @postlight/mercury-parser
|
Note: Requires postlight-parser: npm install -g @postlight/parser
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@@ -25,7 +25,7 @@ import rich_click as click
|
|||||||
|
|
||||||
# Extractor metadata
|
# Extractor metadata
|
||||||
EXTRACTOR_NAME = 'mercury'
|
EXTRACTOR_NAME = 'mercury'
|
||||||
BIN_NAME = 'mercury-parser'
|
BIN_NAME = 'postlight-parser'
|
||||||
BIN_PROVIDERS = 'npm,env'
|
BIN_PROVIDERS = 'npm,env'
|
||||||
OUTPUT_DIR = 'mercury'
|
OUTPUT_DIR = 'mercury'
|
||||||
|
|
||||||
@@ -42,12 +42,12 @@ def get_env_int(name: str, default: int = 0) -> int:
|
|||||||
|
|
||||||
|
|
||||||
def find_mercury() -> str | None:
|
def find_mercury() -> str | None:
|
||||||
"""Find mercury-parser binary."""
|
"""Find postlight-parser binary."""
|
||||||
mercury = get_env('MERCURY_BINARY')
|
mercury = get_env('MERCURY_BINARY')
|
||||||
if mercury and os.path.isfile(mercury):
|
if mercury and os.path.isfile(mercury):
|
||||||
return mercury
|
return mercury
|
||||||
|
|
||||||
for name in ['mercury-parser', 'mercury']:
|
for name in ['postlight-parser']:
|
||||||
binary = shutil.which(name)
|
binary = shutil.which(name)
|
||||||
if binary:
|
if binary:
|
||||||
return binary
|
return binary
|
||||||
@@ -56,7 +56,7 @@ def find_mercury() -> str | None:
|
|||||||
|
|
||||||
|
|
||||||
def get_version(binary: str) -> str:
|
def get_version(binary: str) -> str:
|
||||||
"""Get mercury-parser version."""
|
"""Get postlight-parser version."""
|
||||||
try:
|
try:
|
||||||
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
||||||
return result.stdout.strip()[:64]
|
return result.stdout.strip()[:64]
|
||||||
@@ -83,12 +83,12 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
|
|
||||||
if result_text.returncode != 0:
|
if result_text.returncode != 0:
|
||||||
stderr = result_text.stderr.decode('utf-8', errors='replace')
|
stderr = result_text.stderr.decode('utf-8', errors='replace')
|
||||||
return False, None, f'mercury-parser failed: {stderr[:200]}'
|
return False, None, f'postlight-parser failed: {stderr[:200]}'
|
||||||
|
|
||||||
try:
|
try:
|
||||||
text_json = json.loads(result_text.stdout)
|
text_json = json.loads(result_text.stdout)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
return False, None, 'mercury-parser returned invalid JSON'
|
return False, None, 'postlight-parser returned invalid JSON'
|
||||||
|
|
||||||
if text_json.get('failed'):
|
if text_json.get('failed'):
|
||||||
return False, None, 'Mercury was not able to extract article'
|
return False, None, 'Mercury was not able to extract article'
|
||||||
@@ -139,7 +139,7 @@ def main(url: str, snapshot_id: str):
|
|||||||
# Find binary
|
# Find binary
|
||||||
binary = find_mercury()
|
binary = find_mercury()
|
||||||
if not binary:
|
if not binary:
|
||||||
print(f'ERROR: mercury-parser binary not found', file=sys.stderr)
|
print(f'ERROR: postlight-parser binary not found', file=sys.stderr)
|
||||||
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
||||||
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|||||||
6
archivebox/plugins/mercury/templates/embed.html
Normal file
6
archivebox/plugins/mercury/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Mercury embed - Mercury parser article view -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed mercury-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/mercury/templates/fullscreen.html
Normal file
6
archivebox/plugins/mercury/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Mercury fullscreen - full Mercury parser article -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen mercury-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/mercury/templates/icon.html
Normal file
1
archivebox/plugins/mercury/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
☿️
|
||||||
8
archivebox/plugins/mercury/templates/thumbnail.html
Normal file
8
archivebox/plugins/mercury/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Mercury thumbnail - shows Mercury parser extracted article content -->
|
||||||
|
<div class="extractor-thumbnail mercury-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 300px; border: none; pointer-events: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
@@ -21,7 +21,7 @@ import pytest
|
|||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
|
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
|
||||||
MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py'
|
MERCURY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_mercury.py'
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
@@ -29,53 +29,70 @@ def test_hook_script_exists():
|
|||||||
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
|
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
|
||||||
|
|
||||||
|
|
||||||
def test_mercury_install_hook():
|
def test_mercury_validate_hook():
|
||||||
"""Test mercury install hook to install mercury-parser if needed."""
|
"""Test mercury validate hook checks for postlight-parser."""
|
||||||
# Run mercury install hook
|
# Run mercury validate hook
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(MERCURY_INSTALL_HOOK)],
|
[sys.executable, str(MERCURY_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'mercury-parser'
|
assert record['name'] == 'postlight-parser'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'postlight-parser'
|
||||||
|
assert 'npm' in record['bin_providers']
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify mercury-parser is available via abx-pkg after hook installation."""
|
"""Verify postlight-parser is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
NpmProvider.model_rebuild()
|
# Verify postlight-parser is available
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Verify mercury-parser is available
|
|
||||||
mercury_binary = Binary(
|
mercury_binary = Binary(
|
||||||
name='mercury-parser',
|
name='postlight-parser',
|
||||||
binproviders=[NpmProvider(), EnvProvider()],
|
binproviders=[NpmProvider(), EnvProvider()],
|
||||||
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
|
overrides={'npm': {'packages': ['@postlight/parser']}}
|
||||||
)
|
)
|
||||||
mercury_loaded = mercury_binary.load()
|
mercury_loaded = mercury_binary.load()
|
||||||
assert mercury_loaded and mercury_loaded.abspath, "mercury-parser should be available after install hook"
|
|
||||||
|
# If validate hook found it (exit 0), this should succeed
|
||||||
|
# If validate hook didn't find it (exit 1), this may fail unless binprovider installed it
|
||||||
|
if mercury_loaded and mercury_loaded.abspath:
|
||||||
|
assert True, "postlight-parser is available"
|
||||||
|
else:
|
||||||
|
pytest.skip("postlight-parser not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
def test_extracts_with_mercury_parser():
|
def test_extracts_with_mercury_parser():
|
||||||
"""Test full workflow: extract with mercury-parser from real HTML via hook."""
|
"""Test full workflow: extract with postlight-parser from real HTML via hook."""
|
||||||
# Prerequisites checked by earlier test
|
# Prerequisites checked by earlier test
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
|||||||
@@ -2,46 +2,28 @@
|
|||||||
"""
|
"""
|
||||||
Create a Merkle tree of all archived outputs.
|
Create a Merkle tree of all archived outputs.
|
||||||
|
|
||||||
This plugin runs after all extractors and post-processing complete (priority 92)
|
This plugin runs after all extractors complete (priority 93) and generates
|
||||||
and generates a cryptographic Merkle tree of all files in the snapshot directory.
|
a cryptographic Merkle tree of all files in the snapshot directory.
|
||||||
This provides:
|
|
||||||
- Tamper detection: verify archive integrity
|
|
||||||
- Efficient updates: only re-hash changed files
|
|
||||||
- Compact proofs: prove file inclusion without sending all files
|
|
||||||
- Deduplication: identify identical content across snapshots
|
|
||||||
|
|
||||||
Output: merkletree/merkletree.json containing:
|
Output: merkletree.json containing root_hash, tree structure, file list, metadata
|
||||||
- root_hash: SHA256 hash of the Merkle root
|
|
||||||
- tree: Full tree structure with internal nodes
|
|
||||||
- files: List of all files with their hashes
|
|
||||||
- metadata: Timestamp, file count, total size
|
|
||||||
|
|
||||||
Usage: on_Snapshot__92_merkletree.py --url=<url> --snapshot-id=<uuid>
|
Usage: on_Snapshot__93_merkletree.py --url=<url> --snapshot-id=<uuid>
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
|
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
|
||||||
|
DATA_DIR: ArchiveBox data directory
|
||||||
|
ARCHIVE_DIR: Archive output directory
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__package__ = 'archivebox.plugins.merkletree'
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
import hashlib
|
import hashlib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone
|
||||||
from typing import Dict, List, Optional, Tuple, Any
|
from typing import Dict, List, Optional, Tuple, Any
|
||||||
|
|
||||||
# Configure Django if running standalone
|
import click
|
||||||
if __name__ == '__main__':
|
|
||||||
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
|
|
||||||
if parent_dir not in sys.path:
|
|
||||||
sys.path.insert(0, parent_dir)
|
|
||||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
|
||||||
import django
|
|
||||||
django.setup()
|
|
||||||
|
|
||||||
import rich_click as click
|
|
||||||
|
|
||||||
|
|
||||||
def sha256_file(filepath: Path) -> str:
|
def sha256_file(filepath: Path) -> str:
|
||||||
@@ -49,12 +31,10 @@ def sha256_file(filepath: Path) -> str:
|
|||||||
h = hashlib.sha256()
|
h = hashlib.sha256()
|
||||||
try:
|
try:
|
||||||
with open(filepath, 'rb') as f:
|
with open(filepath, 'rb') as f:
|
||||||
# Read in 64kb chunks
|
|
||||||
while chunk := f.read(65536):
|
while chunk := f.read(65536):
|
||||||
h.update(chunk)
|
h.update(chunk)
|
||||||
return h.hexdigest()
|
return h.hexdigest()
|
||||||
except (OSError, PermissionError):
|
except (OSError, PermissionError):
|
||||||
# If we can't read the file, return a null hash
|
|
||||||
return '0' * 64
|
return '0' * 64
|
||||||
|
|
||||||
|
|
||||||
@@ -64,74 +44,45 @@ def sha256_data(data: bytes) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
|
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
|
||||||
"""
|
"""Recursively collect all files in snapshot directory."""
|
||||||
Recursively collect all files in snapshot directory.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
snapshot_dir: Root directory to scan
|
|
||||||
exclude_dirs: Directory names to exclude (e.g., ['merkletree', '.git'])
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of (relative_path, sha256_hash, file_size) tuples
|
|
||||||
"""
|
|
||||||
exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
|
exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
|
||||||
files = []
|
files = []
|
||||||
|
|
||||||
for root, dirs, filenames in os.walk(snapshot_dir):
|
for root, dirs, filenames in os.walk(snapshot_dir):
|
||||||
# Filter out excluded directories
|
|
||||||
dirs[:] = [d for d in dirs if d not in exclude_dirs]
|
dirs[:] = [d for d in dirs if d not in exclude_dirs]
|
||||||
|
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
filepath = Path(root) / filename
|
filepath = Path(root) / filename
|
||||||
rel_path = filepath.relative_to(snapshot_dir)
|
rel_path = filepath.relative_to(snapshot_dir)
|
||||||
|
|
||||||
# Skip symlinks (we hash the target, not the link)
|
|
||||||
if filepath.is_symlink():
|
if filepath.is_symlink():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Compute hash and size
|
|
||||||
file_hash = sha256_file(filepath)
|
file_hash = sha256_file(filepath)
|
||||||
file_size = filepath.stat().st_size if filepath.exists() else 0
|
file_size = filepath.stat().st_size if filepath.exists() else 0
|
||||||
|
|
||||||
files.append((rel_path, file_hash, file_size))
|
files.append((rel_path, file_hash, file_size))
|
||||||
|
|
||||||
# Sort by path for deterministic tree
|
|
||||||
files.sort(key=lambda x: str(x[0]))
|
files.sort(key=lambda x: str(x[0]))
|
||||||
return files
|
return files
|
||||||
|
|
||||||
|
|
||||||
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
||||||
"""
|
"""Build a Merkle tree from a list of leaf hashes."""
|
||||||
Build a Merkle tree from a list of leaf hashes.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
file_hashes: List of SHA256 hashes (leaves)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
(root_hash, tree_levels) where tree_levels is a list of hash lists per level
|
|
||||||
"""
|
|
||||||
if not file_hashes:
|
if not file_hashes:
|
||||||
# Empty tree
|
|
||||||
return sha256_data(b''), [[]]
|
return sha256_data(b''), [[]]
|
||||||
|
|
||||||
# Initialize with leaf level
|
|
||||||
tree_levels = [file_hashes.copy()]
|
tree_levels = [file_hashes.copy()]
|
||||||
|
|
||||||
# Build tree bottom-up
|
|
||||||
while len(tree_levels[-1]) > 1:
|
while len(tree_levels[-1]) > 1:
|
||||||
current_level = tree_levels[-1]
|
current_level = tree_levels[-1]
|
||||||
next_level = []
|
next_level = []
|
||||||
|
|
||||||
# Process pairs
|
|
||||||
for i in range(0, len(current_level), 2):
|
for i in range(0, len(current_level), 2):
|
||||||
left = current_level[i]
|
left = current_level[i]
|
||||||
|
|
||||||
if i + 1 < len(current_level):
|
if i + 1 < len(current_level):
|
||||||
# Combine left + right
|
|
||||||
right = current_level[i + 1]
|
right = current_level[i + 1]
|
||||||
combined = left + right
|
combined = left + right
|
||||||
else:
|
else:
|
||||||
# Odd number of nodes: duplicate the last one
|
|
||||||
combined = left + left
|
combined = left + left
|
||||||
|
|
||||||
parent_hash = sha256_data(combined.encode('utf-8'))
|
parent_hash = sha256_data(combined.encode('utf-8'))
|
||||||
@@ -139,67 +90,41 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
|||||||
|
|
||||||
tree_levels.append(next_level)
|
tree_levels.append(next_level)
|
||||||
|
|
||||||
# Root is the single hash at the top level
|
|
||||||
root_hash = tree_levels[-1][0]
|
root_hash = tree_levels[-1][0]
|
||||||
return root_hash, tree_levels
|
return root_hash, tree_levels
|
||||||
|
|
||||||
|
|
||||||
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
|
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
|
||||||
"""
|
"""Create a complete Merkle tree of all files in snapshot directory."""
|
||||||
Create a complete Merkle tree of all files in snapshot directory.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
snapshot_dir: The snapshot directory to scan
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dict containing root_hash, tree structure, file list, and metadata
|
|
||||||
"""
|
|
||||||
# Collect all files
|
|
||||||
files = collect_files(snapshot_dir)
|
files = collect_files(snapshot_dir)
|
||||||
|
|
||||||
# Extract just the hashes for tree building
|
|
||||||
file_hashes = [file_hash for _, file_hash, _ in files]
|
file_hashes = [file_hash for _, file_hash, _ in files]
|
||||||
|
|
||||||
# Build Merkle tree
|
|
||||||
root_hash, tree_levels = build_merkle_tree(file_hashes)
|
root_hash, tree_levels = build_merkle_tree(file_hashes)
|
||||||
|
|
||||||
# Calculate total size
|
|
||||||
total_size = sum(size for _, _, size in files)
|
total_size = sum(size for _, _, size in files)
|
||||||
|
|
||||||
# Prepare file list with metadata
|
|
||||||
file_list = [
|
file_list = [
|
||||||
{
|
{'path': str(path), 'hash': file_hash, 'size': size}
|
||||||
'path': str(path),
|
|
||||||
'hash': file_hash,
|
|
||||||
'size': size,
|
|
||||||
}
|
|
||||||
for path, file_hash, size in files
|
for path, file_hash, size in files
|
||||||
]
|
]
|
||||||
|
|
||||||
# Prepare result
|
return {
|
||||||
result = {
|
|
||||||
'root_hash': root_hash,
|
'root_hash': root_hash,
|
||||||
'tree_levels': tree_levels,
|
'tree_levels': tree_levels,
|
||||||
'files': file_list,
|
'files': file_list,
|
||||||
'metadata': {
|
'metadata': {
|
||||||
'timestamp': datetime.now().isoformat(),
|
'timestamp': datetime.now(timezone.utc).isoformat(),
|
||||||
'file_count': len(files),
|
'file_count': len(files),
|
||||||
'total_size': total_size,
|
'total_size': total_size,
|
||||||
'tree_depth': len(tree_levels),
|
'tree_depth': len(tree_levels),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='URL being archived')
|
@click.option('--url', required=True, help='URL being archived')
|
||||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||||
def main(url: str, snapshot_id: str):
|
def main(url: str, snapshot_id: str):
|
||||||
"""Generate Merkle tree of all archived outputs."""
|
"""Generate Merkle tree of all archived outputs."""
|
||||||
from archivebox.core.models import Snapshot
|
start_ts = datetime.now(timezone.utc)
|
||||||
|
|
||||||
start_ts = datetime.now()
|
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = None
|
output = None
|
||||||
error = ''
|
error = ''
|
||||||
@@ -211,30 +136,19 @@ def main(url: str, snapshot_id: str):
|
|||||||
save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
|
save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
|
||||||
|
|
||||||
if not save_merkletree:
|
if not save_merkletree:
|
||||||
click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)')
|
|
||||||
status = 'skipped'
|
status = 'skipped'
|
||||||
end_ts = datetime.now()
|
click.echo(json.dumps({'status': status, 'output': 'SAVE_MERKLETREE=false'}))
|
||||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
|
||||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
|
||||||
click.echo(f'STATUS={status}')
|
|
||||||
click.echo(f'RESULT_JSON={{"extractor": "merkletree", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
# Get snapshot
|
# Working directory is the extractor output dir (e.g., <snapshot>/merkletree/)
|
||||||
try:
|
# Parent is the snapshot directory
|
||||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
output_dir = Path.cwd()
|
||||||
except Snapshot.DoesNotExist:
|
snapshot_dir = output_dir.parent
|
||||||
error = f'Snapshot {snapshot_id} not found'
|
|
||||||
raise ValueError(error)
|
|
||||||
|
|
||||||
# Get snapshot directory
|
|
||||||
snapshot_dir = Path(snapshot.output_dir)
|
|
||||||
if not snapshot_dir.exists():
|
if not snapshot_dir.exists():
|
||||||
error = f'Snapshot directory not found: {snapshot_dir}'
|
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
|
||||||
raise FileNotFoundError(error)
|
|
||||||
|
|
||||||
# Create output directory
|
# Ensure output directory exists
|
||||||
output_dir = snapshot_dir / 'merkletree'
|
|
||||||
output_dir.mkdir(exist_ok=True)
|
output_dir.mkdir(exist_ok=True)
|
||||||
output_path = output_dir / 'merkletree.json'
|
output_path = output_dir / 'merkletree.json'
|
||||||
|
|
||||||
@@ -246,49 +160,31 @@ def main(url: str, snapshot_id: str):
|
|||||||
json.dump(merkle_data, f, indent=2)
|
json.dump(merkle_data, f, indent=2)
|
||||||
|
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
output = str(output_path)
|
output = 'merkletree.json'
|
||||||
root_hash = merkle_data['root_hash']
|
root_hash = merkle_data['root_hash']
|
||||||
file_count = merkle_data['metadata']['file_count']
|
file_count = merkle_data['metadata']['file_count']
|
||||||
total_size = merkle_data['metadata']['total_size']
|
total_size = merkle_data['metadata']['total_size']
|
||||||
|
|
||||||
click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
|
click.echo(f'Merkle tree: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error = f'{type(e).__name__}: {e}'
|
error = f'{type(e).__name__}: {e}'
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
click.echo(f'Error: {error}', err=True)
|
click.echo(f'Error: {error}', err=True)
|
||||||
|
|
||||||
end_ts = datetime.now()
|
end_ts = datetime.now(timezone.utc)
|
||||||
duration = (end_ts - start_ts).total_seconds()
|
|
||||||
|
|
||||||
# Print results
|
# Print JSON result for hook runner
|
||||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
result = {
|
||||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
|
||||||
click.echo(f'DURATION={duration:.2f}')
|
|
||||||
if output:
|
|
||||||
click.echo(f'OUTPUT={output}')
|
|
||||||
click.echo(f'STATUS={status}')
|
|
||||||
|
|
||||||
if error:
|
|
||||||
click.echo(f'ERROR={error}', err=True)
|
|
||||||
|
|
||||||
# Print JSON result
|
|
||||||
result_json = {
|
|
||||||
'extractor': 'merkletree',
|
|
||||||
'url': url,
|
|
||||||
'snapshot_id': snapshot_id,
|
|
||||||
'status': status,
|
'status': status,
|
||||||
'start_ts': start_ts.isoformat(),
|
|
||||||
'end_ts': end_ts.isoformat(),
|
|
||||||
'duration': round(duration, 2),
|
|
||||||
'output': output,
|
'output': output,
|
||||||
|
'error': error or None,
|
||||||
'root_hash': root_hash,
|
'root_hash': root_hash,
|
||||||
'file_count': file_count,
|
'file_count': file_count,
|
||||||
'error': error or None,
|
|
||||||
}
|
}
|
||||||
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
|
click.echo(json.dumps(result))
|
||||||
|
|
||||||
sys.exit(0 if status == 'succeeded' else 1)
|
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
🔗
|
||||||
@@ -133,7 +133,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='HTML URL to parse')
|
@click.option('--url', required=True, help='HTML URL to parse')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse HTML and extract href URLs."""
|
"""Parse HTML and extract href URLs."""
|
||||||
|
|
||||||
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
|
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
|
||||||
|
|||||||
1
archivebox/plugins/parse_html_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_html_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
🔗
|
||||||
@@ -127,7 +127,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='JSONL file URL to parse')
|
@click.option('--url', required=True, help='JSONL file URL to parse')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse JSONL bookmark file and extract URLs."""
|
"""Parse JSONL bookmark file and extract URLs."""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
1
archivebox/plugins/parse_jsonl_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_jsonl_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📋
|
||||||
@@ -52,7 +52,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
|
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse Netscape bookmark HTML and extract URLs."""
|
"""Parse Netscape bookmark HTML and extract URLs."""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
🔖
|
||||||
@@ -51,7 +51,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
|
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse RSS/Atom feed and extract article URLs."""
|
"""Parse RSS/Atom feed and extract article URLs."""
|
||||||
|
|
||||||
if feedparser is None:
|
if feedparser is None:
|
||||||
|
|||||||
1
archivebox/plugins/parse_rss_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_rss_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📡
|
||||||
@@ -100,7 +100,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
|
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse plain text and extract URLs."""
|
"""Parse plain text and extract URLs."""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
1
archivebox/plugins/parse_txt_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_txt_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📃
|
||||||
5
archivebox/plugins/pdf/templates/embed.html
Normal file
5
archivebox/plugins/pdf/templates/embed.html
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
<!-- PDF embed - full PDF viewer -->
|
||||||
|
<embed src="{{ output_path }}#toolbar=1&navpanes=1"
|
||||||
|
type="application/pdf"
|
||||||
|
class="extractor-embed pdf-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px;">
|
||||||
5
archivebox/plugins/pdf/templates/fullscreen.html
Normal file
5
archivebox/plugins/pdf/templates/fullscreen.html
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
<!-- PDF fullscreen - full PDF viewer -->
|
||||||
|
<embed src="{{ output_path }}#toolbar=1&navpanes=1&view=FitH"
|
||||||
|
type="application/pdf"
|
||||||
|
class="extractor-fullscreen pdf-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh;">
|
||||||
1
archivebox/plugins/pdf/templates/icon.html
Normal file
1
archivebox/plugins/pdf/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📄
|
||||||
6
archivebox/plugins/pdf/templates/thumbnail.html
Normal file
6
archivebox/plugins/pdf/templates/thumbnail.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- PDF thumbnail - shows first page preview -->
|
||||||
|
<div class="extractor-thumbnail pdf-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f5f5f5;">
|
||||||
|
<embed src="{{ output_path }}#toolbar=0&navpanes=0&scrollbar=0&page=1&view=FitH"
|
||||||
|
type="application/pdf"
|
||||||
|
style="width: 100%; height: 200px; margin-top: -20px; pointer-events: none;">
|
||||||
|
</div>
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install readability-extractor if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure readability-extractor is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
NpmProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Note: npm package is from github:ArchiveBox/readability-extractor
|
|
||||||
readability_binary = Binary(
|
|
||||||
name='readability-extractor',
|
|
||||||
binproviders=[NpmProvider(), EnvProvider()],
|
|
||||||
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = readability_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via npm from GitHub repo
|
|
||||||
loaded = readability_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'readability-extractor',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'readability-extractor',
|
|
||||||
'bin_providers': 'npm,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install readability-extractor", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'readability-extractor',
|
|
||||||
'bin_providers': 'npm,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing readability-extractor: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
123
archivebox/plugins/readability/on_Crawl__00_validate_readability.py
Executable file
123
archivebox/plugins/readability/on_Crawl__00_validate_readability.py
Executable file
@@ -0,0 +1,123 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Validation hook for readability-extractor binary.
|
||||||
|
|
||||||
|
Runs at crawl start to verify readability-extractor is available.
|
||||||
|
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import hashlib
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_version(abspath: str) -> str | None:
|
||||||
|
"""Get version string from binary."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[abspath, '--version'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout:
|
||||||
|
first_line = result.stdout.strip().split('\n')[0]
|
||||||
|
return first_line[:64]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_hash(abspath: str) -> str | None:
|
||||||
|
"""Get SHA256 hash of binary."""
|
||||||
|
try:
|
||||||
|
with open(abspath, 'rb') as f:
|
||||||
|
return hashlib.sha256(f.read()).hexdigest()
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_readability() -> dict | None:
|
||||||
|
"""Find readability-extractor binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||||
|
|
||||||
|
class ReadabilityBinary(Binary):
|
||||||
|
name: str = 'readability-extractor'
|
||||||
|
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||||
|
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||||
|
|
||||||
|
binary = ReadabilityBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'readability-extractor',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'readability-extractor',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
result = find_readability()
|
||||||
|
|
||||||
|
if result and result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': result['name'],
|
||||||
|
'abspath': result['abspath'],
|
||||||
|
'version': result['version'],
|
||||||
|
'sha256': result['sha256'],
|
||||||
|
'binprovider': result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/READABILITY_BINARY',
|
||||||
|
'value': result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/READABILITY_VERSION',
|
||||||
|
'value': result['version'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'readability-extractor',
|
||||||
|
'bin_providers': 'npm,env',
|
||||||
|
}))
|
||||||
|
print(f"readability-extractor binary not found", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
6
archivebox/plugins/readability/templates/embed.html
Normal file
6
archivebox/plugins/readability/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Readability embed - reader-mode article view -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed readability-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/readability/templates/fullscreen.html
Normal file
6
archivebox/plugins/readability/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Readability fullscreen - full reader-mode article -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen readability-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/readability/templates/icon.html
Normal file
1
archivebox/plugins/readability/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📖
|
||||||
8
archivebox/plugins/readability/templates/thumbnail.html
Normal file
8
archivebox/plugins/readability/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Readability thumbnail - shows reader-mode extracted article content -->
|
||||||
|
<div class="extractor-thumbnail readability-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 300px; border: none; pointer-events: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
Integration tests for readability plugin
|
Integration tests for readability plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
1. Install hook installs readability-extractor via abx-pkg
|
1. Validate hook checks for readability-extractor binary
|
||||||
2. Verify deps with abx-pkg
|
2. Verify deps with abx-pkg
|
||||||
3. Plugin reports missing dependency correctly
|
3. Plugin reports missing dependency correctly
|
||||||
4. Extraction works against real example.com content
|
4. Extraction works against real example.com content
|
||||||
@@ -21,7 +21,7 @@ import pytest
|
|||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
|
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
|
||||||
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
|
READABILITY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_readability.py'
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
|
|
||||||
@@ -101,48 +101,63 @@ def test_reports_missing_dependency_when_not_installed():
|
|||||||
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
|
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
|
||||||
|
|
||||||
|
|
||||||
def test_readability_install_hook():
|
def test_readability_validate_hook():
|
||||||
"""Test readability install hook to install readability-extractor if needed."""
|
"""Test readability validate hook checks for readability-extractor binary."""
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(READABILITY_INSTALL_HOOK)],
|
[sys.executable, str(READABILITY_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'readability-extractor'
|
assert record['name'] == 'readability-extractor'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'readability-extractor'
|
||||||
|
assert 'npm' in record['bin_providers']
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify readability-extractor is available via abx-pkg after hook installation."""
|
"""Verify readability-extractor is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
NpmProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
readability_binary = Binary(
|
readability_binary = Binary(
|
||||||
name='readability-extractor',
|
name='readability-extractor',
|
||||||
binproviders=[NpmProvider(), EnvProvider()],
|
binproviders=[NpmProvider(), EnvProvider()],
|
||||||
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||||
)
|
)
|
||||||
readability_loaded = readability_binary.load()
|
readability_loaded = readability_binary.load()
|
||||||
assert readability_loaded and readability_loaded.abspath, "readability-extractor should be available after install hook"
|
|
||||||
|
if readability_loaded and readability_loaded.abspath:
|
||||||
|
assert True, "readability-extractor is available"
|
||||||
|
else:
|
||||||
|
pytest.skip("readability-extractor not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
|
|
||||||
def test_extracts_article_after_installation():
|
def test_extracts_article_after_installation():
|
||||||
|
|||||||
5
archivebox/plugins/screenshot/templates/embed.html
Normal file
5
archivebox/plugins/screenshot/templates/embed.html
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
<!-- Screenshot embed - full image view -->
|
||||||
|
<img src="{{ output_path }}"
|
||||||
|
alt="Screenshot of page"
|
||||||
|
class="extractor-embed screenshot-embed"
|
||||||
|
style="max-width: 100%; height: auto;">
|
||||||
8
archivebox/plugins/screenshot/templates/fullscreen.html
Normal file
8
archivebox/plugins/screenshot/templates/fullscreen.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Screenshot fullscreen - zoomable image -->
|
||||||
|
<div style="width: 100%; height: 100vh; overflow: auto; background: #222; display: flex; align-items: start; justify-content: center;">
|
||||||
|
<img src="{{ output_path }}"
|
||||||
|
alt="Screenshot of page"
|
||||||
|
class="extractor-fullscreen screenshot-fullscreen"
|
||||||
|
style="max-width: 100%; cursor: zoom-in;"
|
||||||
|
onclick="this.style.maxWidth = this.style.maxWidth === 'none' ? '100%' : 'none'; this.style.cursor = this.style.maxWidth === 'none' ? 'zoom-out' : 'zoom-in';">
|
||||||
|
</div>
|
||||||
1
archivebox/plugins/screenshot/templates/icon.html
Normal file
1
archivebox/plugins/screenshot/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📷
|
||||||
8
archivebox/plugins/screenshot/templates/thumbnail.html
Normal file
8
archivebox/plugins/screenshot/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Screenshot thumbnail - shows the captured screenshot image -->
|
||||||
|
<img src="{{ output_path }}"
|
||||||
|
alt="Screenshot of page"
|
||||||
|
class="extractor-thumbnail screenshot-thumbnail"
|
||||||
|
style="width: 100%; height: 100px; object-fit: cover; object-position: top center; background: #333;"
|
||||||
|
loading="lazy"
|
||||||
|
onerror="this.style.display='none'; this.nextElementSibling.style.display='block';">
|
||||||
|
<div style="display: none; text-align: center; padding: 20px; color: #999;">📷 Screenshot</div>
|
||||||
6
archivebox/plugins/singlefile/templates/embed.html
Normal file
6
archivebox/plugins/singlefile/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Singlefile embed - full iframe of archived HTML -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed singlefile-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/singlefile/templates/fullscreen.html
Normal file
6
archivebox/plugins/singlefile/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Singlefile fullscreen - full page iframe -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen singlefile-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/singlefile/templates/icon.html
Normal file
1
archivebox/plugins/singlefile/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📦
|
||||||
8
archivebox/plugins/singlefile/templates/thumbnail.html
Normal file
8
archivebox/plugins/singlefile/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Singlefile thumbnail - scaled down iframe preview of archived HTML -->
|
||||||
|
<div class="extractor-thumbnail singlefile-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
1
archivebox/plugins/staticfile/templates/icon.html
Normal file
1
archivebox/plugins/staticfile/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📁
|
||||||
1
archivebox/plugins/title/templates/icon.html
Normal file
1
archivebox/plugins/title/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📝
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install wget if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure wget is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# wget binary and package have same name
|
|
||||||
wget_binary = Binary(
|
|
||||||
name='wget',
|
|
||||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = wget_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via system package manager
|
|
||||||
loaded = wget_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'wget',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'wget',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install wget", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'wget',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing wget: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
6
archivebox/plugins/wget/templates/embed.html
Normal file
6
archivebox/plugins/wget/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Wget embed - full iframe of mirrored site -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed wget-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/wget/templates/fullscreen.html
Normal file
6
archivebox/plugins/wget/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Wget fullscreen - full page iframe of mirrored site -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen wget-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/wget/templates/icon.html
Normal file
1
archivebox/plugins/wget/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📥
|
||||||
8
archivebox/plugins/wget/templates/thumbnail.html
Normal file
8
archivebox/plugins/wget/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Wget thumbnail - scaled down iframe preview of mirrored site -->
|
||||||
|
<div class="extractor-thumbnail wget-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
@@ -2,8 +2,8 @@
|
|||||||
Integration tests for wget plugin
|
Integration tests for wget plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
1. Plugin reports missing dependency correctly
|
1. Validate hook checks for wget binary
|
||||||
2. wget can be installed via brew/apt provider hooks
|
2. Verify deps with abx-pkg
|
||||||
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
|
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
|
||||||
4. Extraction works against real example.com
|
4. Extraction works against real example.com
|
||||||
5. Output files contain actual page content
|
5. Output files contain actual page content
|
||||||
@@ -26,7 +26,7 @@ import pytest
|
|||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
|
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
|
||||||
WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py'
|
WGET_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_wget.py'
|
||||||
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
|
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
|
||||||
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
|
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
@@ -37,45 +37,59 @@ def test_hook_script_exists():
|
|||||||
assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
|
assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
|
||||||
|
|
||||||
|
|
||||||
def test_wget_install_hook():
|
def test_wget_validate_hook():
|
||||||
"""Test wget install hook to install wget if needed."""
|
"""Test wget validate hook checks for wget binary."""
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(WGET_INSTALL_HOOK)],
|
[sys.executable, str(WGET_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'wget'
|
assert record['name'] == 'wget'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'wget'
|
||||||
|
assert 'env' in record['bin_providers']
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify wget is available via abx-pkg after hook installation."""
|
"""Verify wget is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||||
wget_loaded = wget_binary.load()
|
wget_loaded = wget_binary.load()
|
||||||
assert wget_loaded and wget_loaded.abspath, "wget should be available after install hook"
|
|
||||||
|
if wget_loaded and wget_loaded.abspath:
|
||||||
|
assert True, "wget is available"
|
||||||
|
else:
|
||||||
|
pytest.skip("wget not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
|
|
||||||
def test_reports_missing_dependency_when_not_installed():
|
def test_reports_missing_dependency_when_not_installed():
|
||||||
|
|||||||
@@ -110,6 +110,10 @@
|
|||||||
{% block nav-global %}{% endblock %}
|
{% block nav-global %}{% endblock %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{% if has_permission %}
|
||||||
|
{% include 'admin/progress_monitor.html' %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
{% block breadcrumbs %}
|
{% block breadcrumbs %}
|
||||||
<div class="breadcrumbs">
|
<div class="breadcrumbs">
|
||||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||||
|
|||||||
648
archivebox/templates/admin/progress_monitor.html
Normal file
648
archivebox/templates/admin/progress_monitor.html
Normal file
@@ -0,0 +1,648 @@
|
|||||||
|
<style>
|
||||||
|
/* Progress Monitor Container */
|
||||||
|
#progress-monitor {
|
||||||
|
background: linear-gradient(135deg, #0d1117 0%, #161b22 100%);
|
||||||
|
color: #c9d1d9;
|
||||||
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Noto Sans', Helvetica, Arial, sans-serif;
|
||||||
|
font-size: 12px;
|
||||||
|
border-bottom: 1px solid #30363d;
|
||||||
|
position: relative;
|
||||||
|
z-index: 100;
|
||||||
|
}
|
||||||
|
#progress-monitor.hidden {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
#progress-monitor .tree-container {
|
||||||
|
max-height: 350px;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Header Bar */
|
||||||
|
#progress-monitor .header-bar {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
padding: 8px 16px;
|
||||||
|
background: rgba(0,0,0,0.2);
|
||||||
|
border-bottom: 1px solid #30363d;
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 10;
|
||||||
|
}
|
||||||
|
#progress-monitor .header-left {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 16px;
|
||||||
|
}
|
||||||
|
#progress-monitor .header-right {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Orchestrator Status */
|
||||||
|
#progress-monitor .orchestrator-status {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 6px;
|
||||||
|
}
|
||||||
|
#progress-monitor .status-dot {
|
||||||
|
width: 8px;
|
||||||
|
height: 8px;
|
||||||
|
border-radius: 50%;
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
#progress-monitor .status-dot.running {
|
||||||
|
background: #3fb950;
|
||||||
|
box-shadow: 0 0 8px #3fb950;
|
||||||
|
animation: pulse 2s infinite;
|
||||||
|
}
|
||||||
|
#progress-monitor .status-dot.stopped {
|
||||||
|
background: #f85149;
|
||||||
|
}
|
||||||
|
@keyframes pulse {
|
||||||
|
0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; }
|
||||||
|
50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Stats */
|
||||||
|
#progress-monitor .stats {
|
||||||
|
display: flex;
|
||||||
|
gap: 16px;
|
||||||
|
}
|
||||||
|
#progress-monitor .stat {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 4px;
|
||||||
|
}
|
||||||
|
#progress-monitor .stat-label {
|
||||||
|
color: #8b949e;
|
||||||
|
font-size: 10px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
}
|
||||||
|
#progress-monitor .stat-value {
|
||||||
|
font-weight: 600;
|
||||||
|
font-variant-numeric: tabular-nums;
|
||||||
|
}
|
||||||
|
#progress-monitor .stat-value.success { color: #3fb950; }
|
||||||
|
#progress-monitor .stat-value.error { color: #f85149; }
|
||||||
|
#progress-monitor .stat-value.warning { color: #d29922; }
|
||||||
|
#progress-monitor .stat-value.info { color: #58a6ff; }
|
||||||
|
|
||||||
|
/* Toggle Button */
|
||||||
|
#progress-monitor .toggle-btn {
|
||||||
|
background: transparent;
|
||||||
|
border: 1px solid #30363d;
|
||||||
|
color: #8b949e;
|
||||||
|
cursor: pointer;
|
||||||
|
padding: 4px 8px;
|
||||||
|
border-radius: 6px;
|
||||||
|
font-size: 11px;
|
||||||
|
transition: all 0.2s;
|
||||||
|
}
|
||||||
|
#progress-monitor .toggle-btn:hover {
|
||||||
|
background: #21262d;
|
||||||
|
color: #c9d1d9;
|
||||||
|
border-color: #8b949e;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Tree Container */
|
||||||
|
#progress-monitor .tree-container {
|
||||||
|
padding: 12px 16px;
|
||||||
|
}
|
||||||
|
#progress-monitor.collapsed .tree-container {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Idle Message */
|
||||||
|
#progress-monitor .idle-message {
|
||||||
|
color: #8b949e;
|
||||||
|
font-style: italic;
|
||||||
|
padding: 8px 0;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Crawl Item */
|
||||||
|
#progress-monitor .crawl-item {
|
||||||
|
background: #161b22;
|
||||||
|
border: 1px solid #30363d;
|
||||||
|
border-radius: 8px;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-header {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 12px;
|
||||||
|
padding: 10px 14px;
|
||||||
|
background: rgba(0,0,0,0.2);
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-header:hover {
|
||||||
|
background: rgba(88, 166, 255, 0.1);
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-icon {
|
||||||
|
font-size: 16px;
|
||||||
|
width: 20px;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-info {
|
||||||
|
flex: 1;
|
||||||
|
min-width: 0;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-label {
|
||||||
|
font-weight: 600;
|
||||||
|
color: #58a6ff;
|
||||||
|
white-space: nowrap;
|
||||||
|
overflow: hidden;
|
||||||
|
text-overflow: ellipsis;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-meta {
|
||||||
|
font-size: 11px;
|
||||||
|
color: #8b949e;
|
||||||
|
margin-top: 2px;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-stats {
|
||||||
|
display: flex;
|
||||||
|
gap: 12px;
|
||||||
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Progress Bar */
|
||||||
|
#progress-monitor .progress-bar-container {
|
||||||
|
height: 4px;
|
||||||
|
background: #21262d;
|
||||||
|
border-radius: 2px;
|
||||||
|
overflow: hidden;
|
||||||
|
position: relative;
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar {
|
||||||
|
height: 100%;
|
||||||
|
border-radius: 2px;
|
||||||
|
transition: width 0.5s ease-out;
|
||||||
|
position: relative;
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar.crawl {
|
||||||
|
background: linear-gradient(90deg, #238636 0%, #3fb950 100%);
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar.snapshot {
|
||||||
|
background: linear-gradient(90deg, #1f6feb 0%, #58a6ff 100%);
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar.extractor {
|
||||||
|
background: linear-gradient(90deg, #8957e5 0%, #a371f7 100%);
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar.indeterminate {
|
||||||
|
background: linear-gradient(90deg, transparent 0%, #58a6ff 50%, transparent 100%);
|
||||||
|
animation: indeterminate 1.5s infinite linear;
|
||||||
|
width: 30% !important;
|
||||||
|
}
|
||||||
|
@keyframes indeterminate {
|
||||||
|
0% { transform: translateX(-100%); }
|
||||||
|
100% { transform: translateX(400%); }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Crawl Body */
|
||||||
|
#progress-monitor .crawl-body {
|
||||||
|
padding: 0 14px 14px;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-progress {
|
||||||
|
padding: 10px 14px;
|
||||||
|
border-bottom: 1px solid #21262d;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Snapshot List */
|
||||||
|
#progress-monitor .snapshot-list {
|
||||||
|
margin-top: 8px;
|
||||||
|
}
|
||||||
|
#progress-monitor .snapshot-item {
|
||||||
|
background: #0d1117;
|
||||||
|
border: 1px solid #21262d;
|
||||||
|
border-radius: 6px;
|
||||||
|
margin-bottom: 8px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
#progress-monitor .snapshot-header {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 8px 12px;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
/* Snapshot rows inside the progress-monitor crawl tree */
#progress-monitor .snapshot-header:hover {
    background: rgba(88, 166, 255, 0.05);
}
#progress-monitor .snapshot-icon {
    font-size: 14px;
    width: 18px;
    text-align: center;
    color: #58a6ff;
}
#progress-monitor .snapshot-info {
    flex: 1;
    min-width: 0; /* allow the URL to shrink/ellipsize inside the flex row */
}
#progress-monitor .snapshot-url {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 11px;
    color: #c9d1d9;
    white-space: nowrap;
    overflow: hidden;
    text-overflow: ellipsis;
}
#progress-monitor .snapshot-meta {
    font-size: 10px;
    color: #8b949e;
    margin-top: 2px;
}
#progress-monitor .snapshot-progress {
    padding: 0 12px 8px;
}

/* Extractor List */
#progress-monitor .extractor-list {
    padding: 8px 12px;
    background: rgba(0,0,0,0.2);
    border-top: 1px solid #21262d;
}
#progress-monitor .extractor-item {
    display: flex;
    align-items: center;
    gap: 8px;
    padding: 4px 0;
}
#progress-monitor .extractor-icon {
    font-size: 12px;
    width: 16px;
    text-align: center;
}
/* Per-status icon colors; "running" also spins (see @keyframes spin) */
#progress-monitor .extractor-icon.running {
    color: #d29922;
    animation: spin 1s linear infinite;
}
#progress-monitor .extractor-icon.success {
    color: #3fb950;
}
#progress-monitor .extractor-icon.failed {
    color: #f85149;
}
#progress-monitor .extractor-icon.pending {
    color: #8b949e;
}
@keyframes spin {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
}
#progress-monitor .extractor-name {
    flex: 1;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 11px;
}
#progress-monitor .extractor-progress {
    width: 60px;
}

/* Status Badge */
#progress-monitor .status-badge {
    font-size: 10px;
    padding: 2px 6px;
    border-radius: 10px;
    font-weight: 500;
    text-transform: uppercase;
    letter-spacing: 0.3px;
}
#progress-monitor .status-badge.queued {
    background: #21262d;
    color: #8b949e;
}
#progress-monitor .status-badge.started {
    background: rgba(210, 153, 34, 0.2);
    color: #d29922;
}
/* "sealed" and "succeeded" are both terminal-success states */
#progress-monitor .status-badge.sealed,
#progress-monitor .status-badge.succeeded {
    background: rgba(63, 185, 80, 0.2);
    color: #3fb950;
}
#progress-monitor .status-badge.failed {
    background: rgba(248, 81, 73, 0.2);
    color: #f85149;
}

/* Expand/Collapse Icons */
#progress-monitor .expand-icon {
    color: #8b949e;
    font-size: 10px;
    transition: transform 0.2s;
}
#progress-monitor .expand-icon.expanded {
    transform: rotate(90deg); /* ▶ rotated to point down when expanded */
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<div id="progress-monitor">
    <!-- Top bar: orchestrator state dot, aggregate counters, and a details toggle.
         All counter values are placeholders updated by the polling script below. -->
    <div class="header-bar">
        <div class="header-left">
            <div class="orchestrator-status">
                <span class="status-dot stopped" id="orchestrator-dot"></span>
                <span id="orchestrator-text">Stopped</span>
            </div>
            <div class="stats">
                <div class="stat">
                    <span class="stat-label">Workers</span>
                    <span class="stat-value info" id="worker-count">0</span>
                </div>
                <div class="stat">
                    <span class="stat-label">Queued</span>
                    <span class="stat-value warning" id="total-queued">0</span>
                </div>
                <div class="stat">
                    <span class="stat-label">Done</span>
                    <span class="stat-value success" id="total-succeeded">0</span>
                </div>
                <div class="stat">
                    <span class="stat-label">Failed</span>
                    <span class="stat-value error" id="total-failed">0</span>
                </div>
            </div>
        </div>
        <div class="header-right">
            <button class="toggle-btn" id="progress-collapse" title="Toggle details">Details</button>
        </div>
    </div>

    <!-- Body: crawl > snapshot > extractor tree, rendered by the polling script -->
    <div class="tree-container" id="tree-container">
        <div class="idle-message" id="idle-message">No active crawls</div>
        <div id="crawl-tree"></div>
    </div>
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
(function() {
    // Cached DOM references for the progress-monitor widget.
    const monitor = document.getElementById('progress-monitor');
    const collapseBtn = document.getElementById('progress-collapse');
    const treeContainer = document.getElementById('tree-container');
    const crawlTree = document.getElementById('crawl-tree');
    const idleMessage = document.getElementById('idle-message');

    // Handle returned by setInterval; null whenever polling is stopped.
    let pollInterval = null;
    // UI state persisted across page loads via localStorage.
    let isCollapsed = localStorage.getItem('progress-monitor-collapsed') === 'true';
    let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
    let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
|
||||||
|
// Shorten a URL for single-line display: hostname plus a truncated path.
// Falls back to a raw 50-char prefix when the string is not a parseable URL.
function formatUrl(url) {
    let display;
    try {
        const parsed = new URL(url);
        const path = parsed.pathname;
        const suffix = path.length > 30 ? '...' : '';
        display = parsed.hostname + path.substring(0, 30) + suffix;
    } catch {
        const suffix = url.length > 50 ? '...' : '';
        display = url.substring(0, 50) + suffix;
    }
    return display;
}
|
||||||
|
|
||||||
|
// Render one extractor row (icon + name + mini progress bar) as an HTML string.
// Status maps to a CSS class and a glyph; unknown statuses render as "pending".
function renderExtractor(extractor) {
    const STATUS_LOOKUP = {
        started:   ['running', '↻'],
        succeeded: ['success', '✓'],
        failed:    ['failed',  '✗'],
    };
    const [iconClass, icon] = STATUS_LOOKUP[extractor.status] || ['pending', '○'];

    // Finished extractors (success or failure) always show a full bar.
    const isFinished = extractor.status === 'succeeded' || extractor.status === 'failed';
    const barWidth = isFinished ? '100' : extractor.progress;
    const barClass = extractor.status === 'started' ? 'indeterminate' : '';

    return `
        <div class="extractor-item">
            <span class="extractor-icon ${iconClass}">${icon}</span>
            <span class="extractor-name">${extractor.extractor}</span>
            <div class="extractor-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar extractor ${barClass}"
                         style="width: ${barWidth}%"></div>
                </div>
            </div>
        </div>
    `;
}
|
||||||
|
|
||||||
|
// Render one snapshot row: clickable header, progress bar, and (when present)
// a collapsible list of per-extractor progress rows.
// SECURITY FIX: snapshot.url is untrusted (it is the archived page's URL) and
// was previously interpolated into innerHTML unescaped, allowing stored XSS
// via a crafted URL. It is now HTML-escaped before interpolation.
function renderSnapshot(snapshot, crawlId) {
    const escapeHtml = (s) => String(s)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    const snapshotKey = `${crawlId}-${snapshot.id}`;
    const isExpanded = expandedSnapshots.has(snapshotKey);
    const statusIcon = snapshot.status === 'started' ? '↻' : '📄';

    let extractorHtml = '';
    if (snapshot.active_extractors && snapshot.active_extractors.length > 0) {
        extractorHtml = `
            <div class="extractor-list" style="${isExpanded ? '' : 'display:none'}">
                ${snapshot.active_extractors.map(e => renderExtractor(e)).join('')}
            </div>
        `;
    }

    return `
        <div class="snapshot-item" data-snapshot-key="${snapshotKey}">
            <div class="snapshot-header" onclick="window.toggleSnapshot('${snapshotKey}')">
                <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.active_extractors?.length ? '▶' : ''}</span>
                <span class="snapshot-icon">${statusIcon}</span>
                <div class="snapshot-info">
                    <div class="snapshot-url">${escapeHtml(formatUrl(snapshot.url))}</div>
                    <div class="snapshot-meta">
                        ${snapshot.completed_extractors}/${snapshot.total_extractors} extractors
                        ${snapshot.failed_extractors > 0 ? `<span style="color:#f85149">(${snapshot.failed_extractors} failed)</span>` : ''}
                    </div>
                </div>
                <span class="status-badge ${snapshot.status}">${snapshot.status}</span>
            </div>
            <div class="snapshot-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar snapshot ${snapshot.status === 'started' && snapshot.progress === 0 ? 'indeterminate' : ''}"
                         style="width: ${snapshot.progress}%"></div>
                </div>
            </div>
            ${extractorHtml}
        </div>
    `;
}
|
||||||
|
|
||||||
|
// Render one crawl row: header with label/stats, progress bar, and a
// collapsible body containing its snapshot rows.
// SECURITY FIX: crawl.label may derive from user-supplied seed URLs/tags and
// was previously interpolated into innerHTML unescaped (XSS risk); it is now
// HTML-escaped before interpolation.
function renderCrawl(crawl) {
    const escapeHtml = (s) => String(s)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    const isExpanded = expandedCrawls.has(crawl.id);
    const statusIcon = crawl.status === 'started' ? '↻' : '🔍';

    let snapshotsHtml = '';
    if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
        snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
    }

    return `
        <div class="crawl-item" data-crawl-id="${crawl.id}">
            <div class="crawl-header" onclick="window.toggleCrawl('${crawl.id}')">
                <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${crawl.active_snapshots?.length ? '▶' : ''}</span>
                <span class="crawl-icon">${statusIcon}</span>
                <div class="crawl-info">
                    <div class="crawl-label">${escapeHtml(crawl.label)}</div>
                    <div class="crawl-meta">depth: ${crawl.max_depth} | ${crawl.total_snapshots} snapshots</div>
                </div>
                <div class="crawl-stats">
                    <span style="color:#3fb950">${crawl.completed_snapshots} done</span>
                    <span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
                </div>
                <span class="status-badge ${crawl.status}">${crawl.status}</span>
            </div>
            <div class="crawl-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar crawl ${crawl.status === 'started' && crawl.progress === 0 ? 'indeterminate' : ''}"
                         style="width: ${crawl.progress}%"></div>
                </div>
            </div>
            <div class="crawl-body" style="${isExpanded ? '' : 'display:none'}">
                <div class="snapshot-list">
                    ${snapshotsHtml}
                </div>
            </div>
        </div>
    `;
}
|
||||||
|
|
||||||
|
// Expand or collapse one crawl's snapshot list (invoked from inline onclick
// handlers in renderCrawl) and persist the expanded set to localStorage.
window.toggleCrawl = function(crawlId) {
    const row = document.querySelector(`[data-crawl-id="${crawlId}"]`);
    const body = row.querySelector('.crawl-body');
    const arrow = row.querySelector('.expand-icon');

    const nowExpanded = !expandedCrawls.has(crawlId);
    if (nowExpanded) {
        expandedCrawls.add(crawlId);
    } else {
        expandedCrawls.delete(crawlId);
    }
    body.style.display = nowExpanded ? '' : 'none';
    arrow.classList.toggle('expanded', nowExpanded);

    localStorage.setItem('progress-monitor-expanded-crawls', JSON.stringify([...expandedCrawls]));
};
|
||||||
|
|
||||||
|
// Expand or collapse one snapshot's extractor list (invoked from inline
// onclick handlers in renderSnapshot) and persist the expanded set.
// No-op for snapshots that have no extractor list rendered.
window.toggleSnapshot = function(snapshotKey) {
    const row = document.querySelector(`[data-snapshot-key="${snapshotKey}"]`);
    const extractorList = row.querySelector('.extractor-list');
    const arrow = row.querySelector('.expand-icon');

    if (!extractorList) return;

    const nowExpanded = !expandedSnapshots.has(snapshotKey);
    if (nowExpanded) {
        expandedSnapshots.add(snapshotKey);
    } else {
        expandedSnapshots.delete(snapshotKey);
    }
    extractorList.style.display = nowExpanded ? '' : 'none';
    arrow.classList.toggle('expanded', nowExpanded);

    localStorage.setItem('progress-monitor-expanded-snapshots', JSON.stringify([...expandedSnapshots]));
};
|
||||||
|
|
||||||
|
// Re-render the entire widget from one /admin/live-progress/ JSON payload.
// Expects aggregate counters (crawls_pending, snapshots_started, ...) plus an
// active_crawls array of crawl objects (shape consumed by renderCrawl).
function updateProgress(data) {
    // Calculate if there's activity
    const hasActivity = data.active_crawls.length > 0 ||
                        data.crawls_pending > 0 || data.crawls_started > 0 ||
                        data.snapshots_pending > 0 || data.snapshots_started > 0 ||
                        data.archiveresults_pending > 0 || data.archiveresults_started > 0;

    // Update orchestrator status
    const dot = document.getElementById('orchestrator-dot');
    const text = document.getElementById('orchestrator-text');
    if (data.orchestrator_running) {
        dot.classList.remove('stopped');
        dot.classList.add('running');
        text.textContent = 'Running';
    } else {
        dot.classList.remove('running');
        dot.classList.add('stopped');
        text.textContent = 'Stopped';
    }

    // Update stats (queued is the sum of the three pending counters)
    document.getElementById('worker-count').textContent = data.total_workers;
    document.getElementById('total-queued').textContent =
        data.crawls_pending + data.snapshots_pending + data.archiveresults_pending;
    document.getElementById('total-succeeded').textContent = data.archiveresults_succeeded;
    document.getElementById('total-failed').textContent = data.archiveresults_failed;

    // Render crawl tree
    if (data.active_crawls.length > 0) {
        idleMessage.style.display = 'none';
        crawlTree.innerHTML = data.active_crawls.map(c => renderCrawl(c)).join('');
    } else if (hasActivity) {
        // Work is queued/running but no crawl detail available: show a summary line
        idleMessage.style.display = 'none';
        crawlTree.innerHTML = `
            <div class="idle-message">
                ${data.snapshots_started} snapshots processing, ${data.archiveresults_started} extractors running
            </div>
        `;
    } else {
        idleMessage.style.display = '';
        // Build the URL for recent crawls (last 24 hours)
        var yesterday = new Date(Date.now() - 24*60*60*1000).toISOString().split('T')[0];
        var recentUrl = '/admin/crawls/crawl/?created_at__gte=' + yesterday + '&o=-1';
        idleMessage.innerHTML = `No active crawls (${data.crawls_pending} pending, ${data.crawls_started} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent} recent</a>)`;
        crawlTree.innerHTML = '';
    }
}
|
||||||
|
|
||||||
|
// Fetch one progress snapshot from the admin endpoint and render it.
// On API or network failure, surface the error in the idle-message slot
// (in red) and log details to the console.
function fetchProgress() {
    const showError = (prefix, detail) => {
        idleMessage.textContent = prefix + detail;
        idleMessage.style.color = '#f85149';
    };

    fetch('/admin/live-progress/')
        .then((response) => response.json())
        .then((data) => {
            if (data.error) {
                console.error('Progress API error:', data.error, data.traceback);
                showError('API Error: ', data.error);
            }
            updateProgress(data);
        })
        .catch((error) => {
            console.error('Progress fetch error:', error);
            showError('Fetch Error: ', error.message);
        });
}
|
||||||
|
|
||||||
|
// Kick off an immediate fetch, then refresh once per second.
// Idempotent: calling while a timer is already live is a no-op.
function startPolling() {
    if (pollInterval !== null) {
        return;
    }
    fetchProgress();
    pollInterval = setInterval(fetchProgress, 1000);
}
|
||||||
|
|
||||||
|
// Cancel the refresh timer, if one is running, and clear the handle so
// startPolling() can restart it later.
function stopPolling() {
    if (!pollInterval) {
        return;
    }
    clearInterval(pollInterval);
    pollInterval = null;
}
|
||||||
|
|
||||||
|
// Collapse toggle: flip the details panel and persist the preference
collapseBtn.addEventListener('click', function() {
    isCollapsed = !isCollapsed;
    localStorage.setItem('progress-monitor-collapsed', isCollapsed);
    if (isCollapsed) {
        monitor.classList.add('collapsed');
        collapseBtn.textContent = 'Expand';
    } else {
        monitor.classList.remove('collapsed');
        collapseBtn.textContent = 'Details';
    }
});

// Apply initial state (collapsed preference restored from localStorage)
if (isCollapsed) {
    monitor.classList.add('collapsed');
    collapseBtn.textContent = 'Expand';
}

// Start polling when page loads
startPolling();

// Pause polling when tab is hidden (avoids pointless requests); resume on return
document.addEventListener('visibilitychange', function() {
    if (document.hidden) {
        stopPolling();
    } else {
        startPolling();
    }
});
})();
|
||||||
|
</script>
|
||||||
@@ -192,6 +192,42 @@
|
|||||||
border: 0px;
|
border: 0px;
|
||||||
border-top: 3px solid #aa1e55;
|
border-top: 3px solid #aa1e55;
|
||||||
}
|
}
|
||||||
|
/* Full-height wrapper for plugin-rendered fullscreen previews */
#main-frame-wrapper {
    width: 100%;
    height: calc(100vh - 210px);
    border-top: 3px solid #aa1e55;
    overflow: hidden;
}
#main-frame-wrapper iframe {
    width: 100%;
    height: 100%;
    border: none;
}
.full-page-wrapper {
    width: 100%;
    height: calc(100vh - 210px);
}
/* Card thumbnails: fixed-height, non-interactive preview region */
.thumbnail-wrapper {
    height: 100px;
    overflow: hidden;
    background-color: #333;
    pointer-events: none;
}
/* Shrink a full-size page iframe down to thumbnail size via scale(0.25);
   the oversize width/height + negative margins compensate for the scaling */
.thumbnail-wrapper iframe {
    width: 405%;
    height: 430px;
    margin-bottom: -330px;
    margin-left: -1%;
    transform: scale(0.25);
    transform-origin: 0 0;
    border: none;
}
.thumbnail-wrapper img {
    width: 100%;
    height: 100%;
    object-fit: cover;
    object-position: top center;
}
|
||||||
.card.selected-card {
|
.card.selected-card {
|
||||||
border: 2px solid orange;
|
border: 2px solid orange;
|
||||||
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
|
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
|
||||||
@@ -403,12 +439,18 @@
|
|||||||
{# One result card per extractor output; the first card starts selected #}
<div class="card {% if forloop.first %}selected-card{% endif %}">
    <div class="card-body">
        <a href="{{result.path|urlencode}}" target="preview" title="./{{result.path}} (downloaded {{result.ts}})">
            <h4>{% extractor_icon result.name %} {{result.name|extractor_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4>
        </a>
    </div>
    {% if result.result %}
        {# Use plugin-specific thumbnail template when ArchiveResult is available #}
        <div class="card-img-top thumbnail-wrapper">
            {% extractor_thumbnail result.result %}
        </div>
    {% else %}
        {# Fall back to generic iframe for filesystem-discovered files #}
        <iframe class="card-img-top" src="{{result.path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
    {% endif %}
</div>
</div>
{% endfor %}
|
||||||
@@ -431,7 +473,15 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
|
{% if best_result.result %}
    {# Use plugin-specific fullscreen template when ArchiveResult is available #}
    <div id="main-frame-wrapper" class="full-page-wrapper">
        {% extractor_fullscreen best_result.result %}
    </div>
{% else %}
    {# Fall back to generic iframe #}
    <iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
{% endif %}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,23 +1,13 @@
|
|||||||
|
"""
|
||||||
|
Workers admin module.
|
||||||
|
|
||||||
|
The orchestrator/worker system doesn't need Django admin registration
|
||||||
|
as workers are managed via CLI commands and the orchestrator.
|
||||||
|
"""
|
||||||
|
|
||||||
__package__ = 'archivebox.workers'
|
__package__ = 'archivebox.workers'
|
||||||
|
|
||||||
from django.contrib.auth import get_permission_codename
|
|
||||||
|
|
||||||
from huey_monitor.apps import HueyMonitorConfig
|
|
||||||
from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin
|
|
||||||
|
|
||||||
|
|
||||||
HueyMonitorConfig.verbose_name = 'Background Workers'
|
|
||||||
|
|
||||||
|
|
||||||
class CustomTaskModelAdmin(TaskModelAdmin):
|
|
||||||
actions = ["delete_selected"]
|
|
||||||
|
|
||||||
def has_delete_permission(self, request, obj=None):
|
|
||||||
codename = get_permission_codename("delete", self.opts)
|
|
||||||
return request.user.has_perm("%s.%s" % (self.opts.app_label, codename))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def register_admin(admin_site):
    """No models to register - workers are process-based, not Django models.

    Kept as a no-op so the app-registry hook that calls ``register_admin``
    on every app continues to work after the huey removal.
    """
    return None
|
||||||
|
|||||||
0
archivebox/workers/management/__init__.py
Normal file
0
archivebox/workers/management/__init__.py
Normal file
0
archivebox/workers/management/commands/__init__.py
Normal file
0
archivebox/workers/management/commands/__init__.py
Normal file
15
archivebox/workers/management/commands/orchestrator.py
Normal file
15
archivebox/workers/management/commands/orchestrator.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
from workers.orchestrator import Orchestrator
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Django management command that runs the ArchiveBox orchestrator loop."""

    help = 'Run the archivebox orchestrator'

    def add_arguments(self, parser):
        # --daemon keeps the orchestrator alive even when all queues are idle
        parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")

    def handle(self, *args, **kwargs):
        daemon = kwargs.get('daemon', False)
        # daemon mode inverts exit_on_idle: stay resident waiting for new work
        orchestrator = Orchestrator(exit_on_idle=not daemon)
        orchestrator.runloop()
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user