Improve logging and admin UI

This commit is contained in:
Nick Sweeting
2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions

View File

@@ -0,0 +1,3 @@
[SERVER_CONFIG]
# SECURITY: secret keys must never be committed to version control — this key is now
# public and should be rotated; load SECRET_KEY from the environment instead.
SECRET_KEY = amuxg7v5e2l_6jrktp_f3kszlpx4ieqk4rtwda5q6nfiavits4

View File

@@ -13,7 +13,21 @@ class APITokenAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_at', 'created_by', 'expires')
readonly_fields = ('created_at', 'modified_at')
search_fields = ('id', 'created_by__username', 'token')
fields = ('created_by', 'token', 'expires', *readonly_fields)
fieldsets = (
('Token', {
'fields': ('token', 'expires'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('created_by',)
ordering = ['-created_at']
@@ -25,6 +39,29 @@ class CustomWebhookAdmin(WebhookAdmin, BaseModelAdmin):
sort_fields = ('created_at', 'created_by', 'id', 'referenced_model', 'endpoint', 'last_success', 'last_error')
readonly_fields = ('created_at', 'modified_at', *WebhookAdmin.readonly_fields)
fieldsets = (
('Webhook', {
'fields': ('name', 'signal', 'referenced_model', 'endpoint'),
'classes': ('card', 'wide'),
}),
('Authentication', {
'fields': ('auth_token',),
'classes': ('card',),
}),
('Status', {
'fields': ('enabled', 'last_success', 'last_error'),
'classes': ('card',),
}),
('Owner', {
'fields': ('created_by',),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
def register_admin(admin_site):
admin_site.register(APIToken, APITokenAdmin)

View File

@@ -115,12 +115,10 @@ def add(urls: str | list[str],
# - Repeat until max_depth reached
if bg:
# Background mode: start orchestrator and return immediately
print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.start() # Fork to background
# Background mode: just queue work and return (orchestrator via server will pick it up)
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
else:
# Foreground mode: run orchestrator until all work is done
# Foreground mode: run orchestrator inline until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() # Block until complete

View File

@@ -117,11 +117,11 @@ def run_plugins(
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record.get('url'):
# Look up by URL
try:
snap = Snapshot.objects.get(url=record['url'])
# Look up by URL (get most recent if multiple exist)
snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
if snap:
snapshot_ids.add(str(snap.id))
except Snapshot.DoesNotExist:
else:
rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
elif record_type == TYPE_ARCHIVERESULT:

View File

@@ -49,20 +49,45 @@ def install(dry_run: bool=False) -> None:
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
seed = Seed.objects.create(
seed, _created = Seed.objects.get_or_create(
uri='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
defaults={
'extractor': 'auto',
}
)
crawl = Crawl.objects.create(
crawl, created = Crawl.objects.get_or_create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
defaults={
'status': 'queued',
}
)
# If crawl already existed, reset it to queued state so it can be processed again
if not created:
crawl.status = 'queued'
crawl.retry_at = timezone.now()
crawl.save()
print(f'[+] Created dependency detection crawl: {crawl.id}')
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
# Verify the crawl is in the queue
from crawls.models import Crawl as CrawlModel
queued_crawls = CrawlModel.objects.filter(
retry_at__lte=timezone.now()
).exclude(
status__in=CrawlModel.FINAL_STATES
)
print(f'[+] Crawls in queue: {queued_crawls.count()}')
if queued_crawls.exists():
for c in queued_crawls:
print(f' - Crawl {c.id}: status={c.status}, retry_at={c.retry_at}')
print('[+] Running crawl to detect binaries via on_Crawl hooks...')
print()

View File

@@ -56,20 +56,53 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
except IndexError:
pass
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if SHELL_CONFIG.DEBUG:
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if not reload:
runserver_args.append('--noreload') # '--insecure'
if nothreading:
runserver_args.append('--nothreading')
call_command("runserver", *runserver_args)
else:
from workers.supervisord_util import start_server_workers
from workers.supervisord_util import (
get_existing_supervisord_process,
get_worker,
start_server_workers,
tail_multiple_worker_logs,
)
# Check if supervisord is already running
supervisor = get_existing_supervisord_process()
if supervisor:
daphne_proc = get_worker(supervisor, 'worker_daphne')
# If daphne is already running, just tail logs
if daphne_proc and daphne_proc.get('statename') == 'RUNNING':
orchestrator_proc = get_worker(supervisor, 'worker_orchestrator')
print('[yellow][!] ArchiveBox server is already running[/yellow]')
print(f' [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
if orchestrator_proc and orchestrator_proc.get('statename') == 'RUNNING':
print(f' [green]√[/green] Background worker (worker_orchestrator) is RUNNING')
print()
print('[blue][i] Tailing worker logs (Ctrl+C to stop watching)...[/i][/blue]')
print()
# Tail logs for both workers
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
follow=True,
)
return
# Otherwise, daphne is not running - fall through to start it
# No existing workers found - start new ones
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
print()
start_server_workers(host=host, port=port, daemonize=daemonize)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")

View File

@@ -119,12 +119,13 @@ def version(quiet: bool=False,
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
bin_value = config.get(key, '').strip()
# Prioritize Machine.config overrides over base config
bin_value = machine.config.get(key) or config.get(key, '').strip()
if not bin_value:
continue
# Check if it's a path (has slashes) or just a name
is_path = '/' in bin_value
is_path = '/' in str(bin_value)
if is_path:
# It's a full path - match against abspath

View File

@@ -5,7 +5,6 @@ import sys
from datetime import datetime, timezone
from rich.progress import Progress
from rich.console import Console
import django
@@ -27,16 +26,6 @@ STDERR = Console(stderr=True)
logging.CONSOLE = CONSOLE
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0


def bump_startup_progress_bar(advance=1):
    """Advance the transient startup progress bar by *advance* ticks.

    No-op when no progress bar is currently active (i.e. outside the
    ``setup_django()`` startup window where INITIAL_STARTUP_PROGRESS is set).
    """
    global INITIAL_STARTUP_PROGRESS
    global INITIAL_STARTUP_PROGRESS_TASK
    progress = INITIAL_STARTUP_PROGRESS
    if not progress:
        # Nothing to update — startup progress display is not running.
        return
    progress.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance)  # type: ignore
def setup_django_minimal():
# sys.path.append(str(CONSTANTS.PACKAGE_DIR))
# os.environ.setdefault('ARCHIVEBOX_DATA_DIR', str(CONSTANTS.DATA_DIR))
@@ -49,9 +38,7 @@ DJANGO_SET_UP = False
def setup_django(check_db=False, in_memory_db=False) -> None:
from rich.panel import Panel
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
global DJANGO_SET_UP
if DJANGO_SET_UP:
@@ -59,118 +46,100 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
# TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
return
with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=True)
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
# if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
if IS_ROOT and ARCHIVEBOX_USER != 0:
with SudoPermission(uid=0):
# running as root is a special case where it's ok to be a bit slower
# make sure data dir is always owned by the correct user
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
bump_startup_progress_bar()
try:
from django.core.management import call_command
bump_startup_progress_bar()
# if running as root, chown the data dir to the archivebox user to make sure it's accessible to the archivebox user
if IS_ROOT and ARCHIVEBOX_USER != 0:
with SudoPermission(uid=0):
# running as root is a special case where it's ok to be a bit slower
# make sure data dir is always owned by the correct user
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
if in_memory_db:
raise Exception('dont use this anymore')
# some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
try:
from django.core.management import call_command
if in_memory_db:
raise Exception('dont use this anymore')
# some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
django.setup()
call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
try:
django.setup()
bump_startup_progress_bar()
call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
try:
django.setup()
except Exception as e:
bump_startup_progress_bar(advance=1000)
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
if not is_using_meta_cmd:
# show error message to user only if they're not running a meta command / just trying to get help
STDERR.print()
STDERR.print(Panel(
f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
title='\n\n[red][X] Error while trying to load database![/red]',
subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
expand=False,
style='bold red',
))
STDERR.print()
STDERR.print_exception(show_locals=False)
return
bump_startup_progress_bar()
from django.conf import settings
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
if check_db:
# make sure the data dir is owned by a non-root user
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
STDERR.print(f' {CONSTANTS.DATA_DIR}')
except Exception as e:
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
if not is_using_meta_cmd:
# show error message to user only if they're not running a meta command / just trying to get help
STDERR.print()
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
STDERR.print(' cd path/to/your/archive/data')
STDERR.print(' archivebox [command]')
STDERR.print(Panel(
f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n',
title='\n\n[red][X] Error while trying to load database![/red]',
subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]',
expand=False,
style='bold red',
))
STDERR.print()
raise SystemExit(9)
# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
STDERR.print_exception(show_locals=False)
return
bump_startup_progress_bar()
from django.conf import settings
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
sql_index_path = CONSTANTS.DATABASE_FILE
assert os.access(sql_index_path, os.F_OK), (
f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
if check_db:
# make sure the data dir is owned by a non-root user
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
STDERR.print(f' {CONSTANTS.DATA_DIR}')
STDERR.print()
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
STDERR.print(' cd path/to/your/archive/data')
STDERR.print(' archivebox [command]')
STDERR.print()
raise SystemExit(9)
bump_startup_progress_bar()
# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
# if settings.DEBUG_LOGFIRE:
# from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
# SQLite3Instrumentor().instrument()
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
# import logfire
sql_index_path = CONSTANTS.DATABASE_FILE
assert os.access(sql_index_path, os.F_OK), (
f'No database file {sql_index_path} found in: {CONSTANTS.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
# logfire.configure()
# logfire.instrument_django(is_sql_commentor_enabled=True)
# logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)
# https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
# if settings.DEBUG_LOGFIRE:
# from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
# SQLite3Instrumentor().instrument()
# import logfire
# logfire.configure()
# logfire.instrument_django(is_sql_commentor_enabled=True)
# logfire.info(f'Started ArchiveBox v{CONSTANTS.VERSION}', argv=sys.argv)
except KeyboardInterrupt:
raise SystemExit(2)
except KeyboardInterrupt:
raise SystemExit(2)
DJANGO_SET_UP = True
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = None

View File

@@ -19,6 +19,150 @@ from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
def render_archiveresults_list(archiveresults_qs, limit=50):
    """Render a nice inline list view of archive results with status, extractor, output, and actions.

    Args:
        archiveresults_qs: ArchiveResult queryset to render (ordered here by -end_ts).
        limit: maximum number of rows rendered inline before showing a "View all" footer link.

    Returns:
        A mark_safe() HTML fragment suitable for embedding in a readonly admin field.
    """
    # SECURITY FIX: DB-sourced text (output, cmd, pwd, extractor, cmd_version) is
    # interpolated into raw HTML that gets mark_safe()'d, so it must be escaped
    # first to prevent stored XSS in the admin UI (e.g. a crawled page whose
    # extractor output contains '<script>').
    from html import escape

    results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])
    if not results:
        return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')

    # Status colors: (foreground, background) per status value
    status_colors = {
        'succeeded': ('#166534', '#dcfce7'),  # green
        'failed': ('#991b1b', '#fee2e2'),     # red
        'queued': ('#6b7280', '#f3f4f6'),     # gray
        'started': ('#92400e', '#fef3c7'),    # amber
    }
    rows = []
    for idx, result in enumerate(results):
        status = escape(result.status or 'queued')
        color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
        # Get extractor icon (comes from our own plugin registry, not user data)
        icon = get_extractor_icon(result.extractor)
        extractor = escape(str(result.extractor))
        # Format timestamp (server-generated, safe)
        end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
        # Truncate output for display, then escape both the truncated and full forms
        raw_output = result.output or '-'
        output_display = raw_output[:60]
        if len(raw_output) > 60:
            output_display += '...'
        full_output = escape(raw_output)
        output_display = escape(output_display)
        # Get full command as tooltip (escaped: argv may contain arbitrary URLs)
        cmd_str = escape(' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-'))
        # Build output link (escaped: result.output is attacker-influenced path text)
        output_link = escape(f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/')
        # Get version - try cmd_version field
        version = escape(str(result.cmd_version)) if result.cmd_version else '-'
        pwd = escape(str(result.pwd)) if result.pwd else '-'
        # Unique ID for this row's expandable output
        row_id = f'output_{idx}_{str(result.id)[:8]}'
        rows.append(f'''
            <tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
                <td style="padding: 10px 12px; white-space: nowrap;">
                    <span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
                                 font-size: 11px; font-weight: 600; text-transform: uppercase;
                                 color: {color}; background: {bg};">{status}</span>
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{extractor}">
                    {icon}
                </td>
                <td style="padding: 10px 12px; font-weight: 500; color: #334155;">
                    {extractor}
                </td>
                <td style="padding: 10px 12px; max-width: 280px;">
                    <span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
                          style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px; cursor: pointer;"
                          title="Click to expand full output">
                        {output_display}
                    </span>
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
                    {end_time}
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
                    {version}
                </td>
                <td style="padding: 10px 8px; white-space: nowrap;">
                    <div style="display: flex; gap: 4px;">
                        <a href="{output_link}" target="_blank"
                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                           title="View output">📄</a>
                        <a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                           title="Edit">✏️</a>
                    </div>
                </td>
            </tr>
            <tr style="border-bottom: 1px solid #e2e8f0;">
                <td colspan="7" style="padding: 0 12px 10px 12px;">
                    <details id="{row_id}" style="margin: 0;">
                        <summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
                            Details &amp; Output
                        </summary>
                        <div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                                <span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)[:8]}...</code></span>
                                <span style="margin-right: 16px;"><b>Version:</b> <code>{version}</code></span>
                                <span style="margin-right: 16px;"><b>PWD:</b> <code>{pwd}</code></span>
                            </div>
                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                                <b>Output:</b>
                            </div>
                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 12px; white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow: auto;">{full_output}</pre>
                            <div style="font-size: 11px; color: #64748b; margin-top: 8px;">
                                <b>Command:</b>
                            </div>
                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_str}</pre>
                        </div>
                    </details>
                </td>
            </tr>
        ''')

    total_count = archiveresults_qs.count()
    footer = ''
    if total_count > limit:
        footer = f'''
            <tr>
                <td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
                    Showing {limit} of {total_count} results &nbsp;
                    <a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
                       style="color: #2563eb;">View all →</a>
                </td>
            </tr>
        '''

    return mark_safe(f'''
        <div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
            <table style="width: 100%; border-collapse: collapse; font-size: 14px;">
                <thead>
                    <tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
                        <th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                    {footer}
                </tbody>
            </table>
        </div>
    ''')
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
@@ -97,18 +241,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']
fieldsets = (
('Snapshot', {
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
'classes': ('card', 'wide'),
}),
('Extractor', {
'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
'classes': ('card',),
}),
('Timing', {
'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Command', {
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
'classes': ('card',),
}),
('Output', {
'fields': ('output', 'output_summary'),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
paginator = AccelleratedPaginator
save_on_top = True
actions = ['delete_selected']
class Meta:
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results'

View File

@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,13 +54,48 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
fieldsets = (
('URL', {
'fields': ('url', 'title'),
'classes': ('card', 'wide'),
}),
('Status', {
'fields': ('status', 'retry_at', 'status_info'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'),
'classes': ('card',),
}),
('Relations', {
'fields': ('crawl', 'created_by', 'tags_str'),
'classes': ('card',),
}),
('Config', {
'fields': ('config',),
'classes': ('card',),
}),
('Files', {
'fields': ('output_dir',),
'classes': ('card',),
}),
('Actions', {
'fields': ('admin_actions',),
'classes': ('card', 'wide'),
}),
('Archive Results', {
'fields': ('archiveresults_list',),
'classes': ('card', 'wide'),
}),
)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
inlines = [TagInline] # Removed ArchiveResultInline, using custom renderer instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
action_form = SnapshotActionForm
@@ -155,6 +190,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.extension or '-',
)
@admin.display(description='Archive Results')
def archiveresults_list(self, obj):
return render_archiveresults_list(obj.archiveresult_set.all())
@admin.display(
description='Title',
ordering='title',

View File

@@ -51,11 +51,25 @@ class TagAdmin(BaseModelAdmin):
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
search_fields = ('id', 'name', 'slug')
fields = ('name', 'created_by', *readonly_fields)
actions = ['delete_selected', 'merge_tags']
ordering = ['-created_at']
# inlines = [TaggedItemInline]
fieldsets = (
('Tag Info', {
'fields': ('name', 'slug'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
paginator = AccelleratedPaginator

View File

@@ -1,7 +1,5 @@
__package__ = 'archivebox.core'
import sys
from django.apps import AppConfig
@@ -12,41 +10,3 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
# Auto-start the orchestrator when running the web server
self._maybe_start_orchestrator()
def _maybe_start_orchestrator(self):
    """Start the orchestrator if we're running a web server."""
    import os

    # Skip for non-server management commands (migrations, shell, tests, etc.).
    # Only start when running: runserver, daphne, gunicorn, uwsgi
    if not self._is_web_server():
        return

    # Honor an explicit opt-out via the RUN_ORCHESTRATOR env var.
    if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
        return

    # Under runserver's autoreloader, only the reloaded child (RUN_MAIN=true)
    # should spawn the orchestrator, otherwise it would be started twice.
    if 'runserver' in sys.argv and os.environ.get('RUN_MAIN') != 'true':
        return

    try:
        from workers.orchestrator import Orchestrator
        if not Orchestrator.is_running():
            # Start orchestrator as daemon (won't exit on idle when started by server)
            Orchestrator(exit_on_idle=False).start()
    except Exception as e:
        # A broken orchestrator must never take down the web server itself.
        import logging
        logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
def _is_web_server(self) -> bool:
"""Check if we're running a web server command."""
# Check for common web server indicators
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)

View File

@@ -0,0 +1,22 @@
from django.db import migrations, models
class Migration(migrations.Migration):
"""Relax Snapshot.url uniqueness: the same URL may now appear in multiple crawls,
but only once per (url, crawl) pair."""
dependencies = [
('core', '0024_snapshot_crawl'),
]
operations = [
# Remove the unique constraint on url
# (db_index is kept so lookups by url stay fast)
migrations.AlterField(
model_name='snapshot',
name='url',
field=models.URLField(db_index=True, unique=False),
),
# Add unique constraint on (url, crawl) combination
# NOTE(review): rows with crawl IS NULL are effectively exempt from a
# multi-column UniqueConstraint (NULLs compare unequal in SQLite/Postgres)
# — confirm duplicate crawl-less snapshots are acceptable.
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
),
]

View File

@@ -60,7 +60,8 @@ class Tag(ModelWithSerializers):
return self.name
def save(self, *args, **kwargs):
if self._state.adding:
is_new = self._state.adding
if is_new:
self.slug = slugify(self.name)
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
i = None
@@ -72,6 +73,19 @@ class Tag(ModelWithSerializers):
i = (i or 0) + 1
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Tag',
indent_level=0,
metadata={
'id': self.id,
'name': self.name,
'slug': self.slug,
},
)
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
@@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
if tag.strip()
))
try:
snapshot = self.get(url=url)
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = self.filter(url=url).order_by('-created_at').first()
if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
except self.model.DoesNotExist:
else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
@@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
url = models.URLField(unique=True, db_index=True)
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
@@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
class Meta(TypedModelMeta):
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
constraints = [
# Allow same URL in different crawls, but not duplicates within same crawl
models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
]
def __str__(self):
return f'[{self.id}] {self.url[:64]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
@@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Snapshot',
indent_level=2,
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
'depth': self.depth,
'status': self.status,
},
)
def output_dir_parent(self) -> str:
return 'archive'
@@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'
def save(self, *args, **kwargs):
    """Persist the ArchiveResult, logging a DB event the first time it is created."""
    # _state.adding is only True before the initial INSERT, so capture it first.
    creating = self._state.adding
    super().save(*args, **kwargs)
    if not creating:
        return
    # Deferred import to avoid a circular import at module load time.
    from archivebox.misc.logging_util import log_worker_event
    log_worker_event(
        worker_type='DB',
        event='Created ArchiveResult',
        indent_level=3,
        extractor=self.extractor,
        metadata={
            'id': str(self.id),
            'snapshot_id': str(self.snapshot_id),
            'snapshot_url': str(self.snapshot.url)[:64],
            'status': self.status,
        },
    )
@cached_property
def snapshot_dir(self):
return Path(self.snapshot.output_dir)
@@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
extractor_dir = Path(self.snapshot.output_dir) / self.extractor
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Find hook for this extractor
@@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.save()
return
# Use plugin directory name instead of extractor name (removes numeric prefix)
plugin_name = hook.parent.name
extractor_dir = Path(self.snapshot.output_dir) / plugin_name
# Run the hook
start_ts = timezone.now()
result = run_hook(

View File

@@ -45,15 +45,14 @@ class SnapshotMachine(StateMachine, strict_states=True):
super().__init__(snapshot, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]Snapshot\\[{self.snapshot.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'Snapshot[{self.snapshot.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
# Suppressed: queue waiting logs
return can_start
def is_finished(self) -> bool:
@@ -73,15 +72,15 @@ class SnapshotMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=timezone.now(),
status=Snapshot.StatusChoices.QUEUED,
)
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ snapshot.run()')
# Suppressed: state transition logs
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
@@ -95,10 +94,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
status=Snapshot.StatusChoices.STARTED,
)
@sealed.enter
def enter_sealed(self):
print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=None,
status=Snapshot.StatusChoices.SEALED,
@@ -161,15 +160,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
super().__init__(archiveresult, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]ArchiveResult\\[{self.archiveresult.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'ArchiveResult[{self.archiveresult.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
# Suppressed: queue waiting logs
return can_start
def is_succeeded(self) -> bool:
@@ -202,41 +200,34 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')
# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
)
# Run the extractor - this updates status, output, timestamps, etc.
self.archiveresult.run()
# Save the updated result
self.archiveresult.save()
# Log the result
if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')
# Suppressed: extractor result logs (already logged by worker)
@backoff.enter
def enter_backoff(self):
print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF,
@@ -244,10 +235,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save(write_indexes=True)
@succeeded.enter
def enter_succeeded(self):
print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
@@ -270,7 +261,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@failed.enter
def enter_failed(self):
print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -291,7 +282,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@skipped.enter
def enter_skipped(self):
print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,

View File

@@ -503,15 +503,7 @@ class AddView(UserPassesTestMixin, FormView):
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
# Start orchestrator in background to process the queued crawl
try:
from archivebox.workers.tasks import ensure_orchestrator_running
ensure_orchestrator_running()
except Exception as e:
# Orchestrator may already be running via supervisord, or fail to start
# This is not fatal - the crawl will be processed when orchestrator runs
print(f'[!] Failed to start orchestrator: {e}')
# Orchestrator (managed by supervisord) will pick up the queued crawl
return redirect(crawl.admin_change_url)
@@ -539,6 +531,7 @@ def live_progress_view(request):
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
@@ -570,8 +563,26 @@ def live_progress_view(request):
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
total_snapshots = crawl_snapshots.count()
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
if crawl.urls:
urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
elif crawl.seed and crawl.seed.uri:
# Try to get URL count from seed
if crawl.seed.uri.startswith('file:///'):
try:
from pathlib import Path
seed_file = Path(crawl.seed.uri.replace('file://', ''))
if seed_file.exists():
urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
except:
pass
else:
urls_count = 1 # Single URL seed
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
@@ -590,16 +601,24 @@ def live_progress_view(request):
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
# Get active extractors for this snapshot
active_extractors = [
# Get all extractors for this snapshot
# Order: started first, then queued, then completed
all_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
'started': ar.start_ts.isoformat() if ar.start_ts else None,
'progress': 50,
}
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
for ar in snapshot_results.annotate(
status_order=Case(
When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
default=Value(4),
output_field=IntegerField(),
)
).order_by('status_order', 'extractor')
]
active_snapshots_for_crawl.append({
@@ -612,9 +631,17 @@ def live_progress_view(request):
'completed_extractors': completed_extractors,
'failed_extractors': failed_extractors,
'pending_extractors': pending_extractors,
'active_extractors': active_extractors,
'all_extractors': all_extractors,
})
# Check if crawl can start (for debugging stuck crawls)
can_start = bool(crawl.seed and crawl.seed.uri)
seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
# Check if retry_at is in the future (would prevent worker from claiming)
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0
active_crawls.append({
'id': str(crawl.id),
'label': str(crawl)[:60],
@@ -622,11 +649,17 @@ def live_progress_view(request):
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
'progress': crawl_progress,
'max_depth': crawl.max_depth,
'urls_count': urls_count,
'total_snapshots': total_snapshots,
'completed_snapshots': completed_snapshots,
'started_snapshots': started_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'active_snapshots': active_snapshots_for_crawl,
'can_start': can_start,
'seed_uri': seed_uri,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
})
return JsonResponse({

View File

@@ -8,6 +8,7 @@ from django.contrib import admin, messages
from django.urls import path
from django.http import JsonResponse
from django.views.decorators.http import require_POST
from django.db.models import Count, Q
from archivebox import DATA_DIR
@@ -19,13 +20,155 @@ from core.models import Snapshot
from crawls.models import Seed, Crawl, CrawlSchedule
def render_snapshots_list(snapshots_qs, limit=20):
    """Render a nice inline list view of snapshots with status, title, URL, and progress.

    Args:
        snapshots_qs: a Snapshot queryset (unsliced) to render.
        limit: maximum number of rows to show (a footer reports the total if exceeded).

    Returns:
        A mark_safe() HTML table string suitable for a readonly admin field.
    """
    from django.utils.html import escape

    # NOTE: annotate() must be applied *before* slicing -- Django raises an
    # error if a queryset is annotated after a slice has been taken.
    snapshots = snapshots_qs.annotate(
        total_results=Count('archiveresult'),
        succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
        failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
    ).order_by('-created_at')[:limit]
    if not snapshots:
        return mark_safe('<div style="color: #666; font-style: italic; padding: 8px 0;">No Snapshots yet...</div>')

    # Status colors matching Django admin and progress monitor
    status_colors = {
        'queued': ('#6c757d', '#f8f9fa'),  # gray
        'started': ('#856404', '#fff3cd'),  # amber
        'sealed': ('#155724', '#d4edda'),  # green
        'failed': ('#721c24', '#f8d7da'),  # red
    }

    rows = []
    for snapshot in snapshots:
        status = snapshot.status or 'queued'
        color, bg = status_colors.get(status, ('#6c757d', '#f8f9fa'))

        # Calculate progress
        total = snapshot.total_results
        done = snapshot.succeeded_results + snapshot.failed_results
        progress_pct = int((done / total) * 100) if total > 0 else 0
        progress_text = f'{done}/{total}' if total > 0 else '-'

        # Truncate title and URL, then escape: title and url come from archived
        # pages (untrusted input) and this HTML is wrapped in mark_safe() below,
        # so unescaped interpolation would be an XSS vector.
        title = (snapshot.title or 'Untitled')[:60]
        if len(snapshot.title or '') > 60:
            title += '...'
        title = escape(title)
        title_attr = escape(snapshot.title or 'Untitled')
        url_display = snapshot.url[:50]
        if len(snapshot.url) > 50:
            url_display += '...'
        url_display = escape(url_display)
        url_attr = escape(snapshot.url)

        # Format date
        date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'

        rows.append(f'''
            <tr style="border-bottom: 1px solid #eee;">
                <td style="padding: 6px 8px; white-space: nowrap;">
                    <span style="display: inline-block; padding: 2px 8px; border-radius: 10px;
                                 font-size: 11px; font-weight: 500; text-transform: uppercase;
                                 color: {color}; background: {bg};">{status}</span>
                </td>
                <td style="padding: 6px 8px; white-space: nowrap;">
                    <a href="/archive/{snapshot.timestamp}/" style="text-decoration: none;">
                        <img src="/archive/{snapshot.timestamp}/favicon.ico"
                             style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;"
                             onerror="this.style.display='none'"/>
                    </a>
                </td>
                <td style="padding: 6px 8px; max-width: 300px;">
                    <a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
                       title="{title_attr}">{title}</a>
                </td>
                <td style="padding: 6px 8px; max-width: 250px;">
                    <a href="{url_attr}" target="_blank"
                       style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
                       title="{url_attr}">{url_display}</a>
                </td>
                <td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
                    <div style="display: inline-flex; align-items: center; gap: 6px;">
                        <div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
                            <div style="width: {progress_pct}%; height: 100%;
                                        background: {'#28a745' if snapshot.failed_results == 0 else '#ffc107' if snapshot.succeeded_results > 0 else '#dc3545'};
                                        transition: width 0.3s;"></div>
                        </div>
                        <a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
                           style="font-size: 11px; color: #417690; min-width: 35px; text-decoration: none;"
                           title="View archive results">{progress_text}</a>
                    </div>
                </td>
                <td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
                    {date_str}
                </td>
            </tr>
        ''')

    total_count = snapshots_qs.count()
    footer = ''
    if total_count > limit:
        footer = f'''
            <tr>
                <td colspan="6" style="padding: 8px; text-align: center; color: #666; font-size: 12px; background: #f8f9fa;">
                    Showing {limit} of {total_count} snapshots
                </td>
            </tr>
        '''

    return mark_safe(f'''
        <div style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
            <table style="width: 100%; border-collapse: collapse; font-size: 13px;">
                <thead>
                    <tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Status</th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333; width: 24px;"></th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Title</th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
                        <th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                    {footer}
                </tbody>
            </table>
        </div>
    ''')
class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
fieldsets = (
('Source', {
'fields': ('uri', 'contents'),
'classes': ('card', 'wide'),
}),
('Info', {
'fields': ('label', 'notes', 'tags_str'),
'classes': ('card',),
}),
('Settings', {
'fields': ('extractor', 'config'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Crawls', {
'fields': ('scheduled_crawls', 'crawls'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card',),
}),
)
list_filter = ('extractor', 'created_by')
ordering = ['-created_at']
@@ -51,22 +194,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
)) or mark_safe('<i>No Crawls yet...</i>')
def snapshots(self, obj):
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
)) or mark_safe('<i>No Snapshots yet...</i>')
return render_snapshots_list(obj.snapshot_set.all())
def contents(self, obj):
if obj.uri.startswith('file:///data/'):
source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
source_file = obj.get_file_path()
if source_file:
contents = ""
try:
contents = source_file.read_text().strip()[:14_000]
except Exception as e:
contents = f'Error reading {source_file}: {e}'
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
@@ -78,7 +218,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
fieldsets = (
('URLs', {
'fields': ('seed_urls_editor',),
'classes': ('card', 'wide'),
}),
('Info', {
'fields': ('label', 'notes'),
'classes': ('card',),
}),
('Settings', {
'fields': ('max_depth', 'config'),
'classes': ('card',),
}),
('Status', {
'fields': ('status', 'retry_at'),
'classes': ('card',),
}),
('Relations', {
'fields': ('seed', 'schedule', 'created_by'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
@@ -90,6 +260,16 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
def recrawl(self, request, obj):
"""Duplicate this crawl as a new crawl with the same seed and settings."""
from django.utils import timezone
from django.shortcuts import redirect
# Validate seed has a URI (required for crawl to start)
if not obj.seed:
messages.error(request, 'Cannot recrawl: original crawl has no seed.')
return redirect('admin:crawls_crawl_change', obj.id)
if not obj.seed.uri:
messages.error(request, 'Cannot recrawl: seed has no URI.')
return redirect('admin:crawls_crawl_change', obj.id)
new_crawl = Crawl.objects.create(
seed=obj.seed,
@@ -110,8 +290,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
f'It will start processing shortly.'
)
# Redirect to the new crawl's change page
from django.shortcuts import redirect
return redirect('admin:crawls_crawl_change', new_crawl.id)
def get_urls(self):
@@ -133,7 +311,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
except Crawl.DoesNotExist:
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
source_file = crawl.seed.get_file_path() if crawl.seed else None
if not source_file:
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
try:
@@ -142,8 +321,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
except json.JSONDecodeError:
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
try:
# Ensure parent directory exists
source_file.parent.mkdir(parents=True, exist_ok=True)
@@ -156,10 +333,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return obj.snapshot_set.count()
def snapshots(self, obj):
return format_html_join('<br/>', '<a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
)) or mark_safe('<i>No Snapshots yet...</i>')
return render_snapshots_list(obj.snapshot_set.all())
@admin.display(description='Schedule', ordering='schedule')
def schedule_str(self, obj):
@@ -186,13 +360,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
seed_uri = obj.urls
# Check if it's a local file we can edit
is_file = seed_uri.startswith('file:///data/')
source_file = obj.seed.get_file_path() if obj.seed else None
is_file = source_file is not None
contents = ""
error = None
source_file = None
if is_file:
source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
if is_file and source_file:
try:
contents = source_file.read_text().strip()
except Exception as e:
@@ -337,7 +510,29 @@ class CrawlScheduleAdmin(BaseModelAdmin):
search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)
fieldsets = (
('Schedule Info', {
'fields': ('label', 'notes'),
'classes': ('card',),
}),
('Configuration', {
'fields': ('schedule', 'template'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Crawls', {
'fields': ('crawls',),
'classes': ('card', 'wide'),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
list_filter = ('created_by',)
ordering = ['-created_at']
@@ -362,10 +557,7 @@ class CrawlScheduleAdmin(BaseModelAdmin):
def snapshots(self, obj):
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
)) or mark_safe('<i>No Snapshots yet...</i>')
return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids))
def register_admin(admin_site):

View File

@@ -44,9 +44,27 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
def __str__(self):
return f'[{self.id}] {self.uri[:64]}'
def save(self, *args, **kwargs):
    """Persist the Seed, logging a DB event the first time it is created."""
    # _state.adding flips to False after the initial INSERT, so read it first.
    creating = self._state.adding
    super().save(*args, **kwargs)
    if not creating:
        return
    # Deferred import to avoid a circular import at module load time.
    from archivebox.misc.logging_util import log_worker_event
    log_worker_event(
        worker_type='DB',
        event='Created Seed',
        indent_level=0,
        metadata={
            'id': str(self.id),
            'uri': str(self.uri)[:64],
            'extractor': self.extractor,
            'label': self.label or None,
        },
    )
@classmethod
def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
# Use absolute path for file:// URLs so extractors can find the files
source_path = str(source_file.resolve())
seed, _ = cls.objects.get_or_create(
label=label or source_file.name, uri=f'file://{source_path}',
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
@@ -62,6 +80,25 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
def api_url(self) -> str:
return reverse_lazy('api-1:get_seed', args=[self.id])
def get_file_path(self) -> Path | None:
    """
    Get the filesystem path for file:// URIs.
    Handles both old format (file:///data/...) and new format (file:///absolute/path).
    Returns None if URI is not a file:// URI.
    """
    uri = self.uri
    if not uri.startswith('file://'):
        return None
    raw_path = uri[len('file://'):]
    # Old format: a /data/-rooted path is resolved relative to DATA_DIR.
    if raw_path.startswith('/data/'):
        return CONSTANTS.DATA_DIR / raw_path[len('/data/'):]
    # New format: already an absolute filesystem path.
    return Path(raw_path)
@property
def snapshot_set(self) -> QuerySet['Snapshot']:
from core.models import Snapshot
@@ -136,6 +173,23 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
def __str__(self):
    """Short human-readable label: '[<id>] <truncated seed uri>'."""
    seed_uri = self.seed.uri[:64] if self.seed else ''
    return f'[{self.id}] {seed_uri}'
def save(self, *args, **kwargs):
    """Persist the Crawl, logging a DB event the first time it is created."""
    # Capture before super().save(): _state.adding is cleared by the INSERT.
    creating = self._state.adding
    super().save(*args, **kwargs)
    if not creating:
        return
    # Deferred import to avoid a circular import at module load time.
    from archivebox.misc.logging_util import log_worker_event
    log_worker_event(
        worker_type='DB',
        event='Created Crawl',
        indent_level=1,
        metadata={
            'id': str(self.id),
            'seed_uri': str(self.seed.uri)[:64] if self.seed else None,
            'max_depth': self.max_depth,
            'status': self.status,
        },
    )
@classmethod
def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None):
crawl, _ = cls.objects.get_or_create(

View File

@@ -36,13 +36,19 @@ class CrawlMachine(StateMachine, strict_states=True):
super().__init__(crawl, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]Crawl\\[{self.crawl.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.crawl.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'Crawl[{self.crawl.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
return bool(self.crawl.seed and self.crawl.seed.uri)
if not self.crawl.seed:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
return False
if not self.crawl.seed.uri:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
return False
return True
def is_finished(self) -> bool:
from core.models import Snapshot, ArchiveResult
@@ -73,25 +79,121 @@ class CrawlMachine(StateMachine, strict_states=True):
@started.enter
def enter_started(self):
print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.run()')
# Suppressed: state transition logs
# lock the crawl object while we create snapshots
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
status=Crawl.StatusChoices.QUEUED,
)
# Run the crawl - creates root snapshot and processes queued URLs
self.crawl.run()
try:
# Run on_Crawl hooks to validate/install dependencies
self._run_crawl_hooks()
# only update status to STARTED once snapshots are created
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
status=Crawl.StatusChoices.STARTED,
# Run the crawl - creates root snapshot and processes queued URLs
self.crawl.run()
# only update status to STARTED once snapshots are created
self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5),
status=Crawl.StatusChoices.STARTED,
)
except Exception as e:
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
import traceback
traceback.print_exc()
# Re-raise so the worker knows it failed
raise
def _run_crawl_hooks(self):
"""Run on_Crawl hooks to validate/install dependencies."""
from pathlib import Path
from archivebox.hooks import run_hooks, discover_hooks
from archivebox.config import CONSTANTS
# Discover and run all on_Crawl hooks
hooks = discover_hooks('Crawl')
if not hooks:
return
# Create a temporary output directory for hook results
output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}'
output_dir.mkdir(parents=True, exist_ok=True)
# Run all on_Crawl hooks
results = run_hooks(
event_name='Crawl',
output_dir=output_dir,
timeout=60,
config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
crawl_id=str(self.crawl.id),
seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
)
@sealed.enter
# Process hook results - parse JSONL output and create DB objects
self._process_hook_results(results)
def _process_hook_results(self, results: list):
    """Process JSONL output from hooks to create InstalledBinary and update Machine config."""
    import json
    from machine.models import Machine, InstalledBinary

    machine = Machine.current()
    for result in results:
        if result['returncode'] != 0:
            # Hook exited non-zero (possibly a missing dependency); nothing to parse.
            continue
        for raw_line in result['stdout'].strip().split('\n'):
            if not raw_line.strip():
                continue
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                # Not JSON, skip
                continue
            record_type = record.get('type')
            if record_type == 'InstalledBinary':
                # Create or update InstalledBinary record,
                # skipping records missing any essential field.
                if not (record.get('name') and record.get('abspath') and record.get('version')):
                    continue
                InstalledBinary.objects.update_or_create(
                    machine=machine,
                    name=record['name'],
                    defaults={
                        'abspath': record['abspath'],
                        'version': record['version'],
                        'sha256': record.get('sha256') or '',
                        'binprovider': record.get('binprovider') or 'env',
                    }
                )
            elif record_type == 'Machine':
                # Hooks may push config values under 'config/<KEY>'.
                if record.get('_method', 'update') == 'update':
                    key = record.get('key', '')
                    value = record.get('value')
                    if key.startswith('config/'):
                        machine.config[key[len('config/'):]] = value
                        machine.save(update_fields=['config'])
            elif record_type == 'Dependency':
                # Dependency request - could trigger installation.
                # For now just log it (installation hooks would be separate).
                print(f'[yellow]Dependency requested: {record.get("bin_name")}[/yellow]')
@sealed.enter
def enter_sealed(self):
print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None')
# Suppressed: state transition logs
self.crawl.update_for_workers(
retry_at=None,
status=Crawl.StatusChoices.SEALED,

View File

@@ -245,6 +245,14 @@ def run_hook(
env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
# Pass SEARCH_BACKEND_ENGINE from new-style config
try:
from archivebox.config.configset import get_config
search_config = get_config()
env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
except Exception:
env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
# Create output directory if needed
output_dir.mkdir(parents=True, exist_ok=True)

View File

@@ -0,0 +1,2 @@
> /Users/squash/Local/Code/archiveboxes/archivebox-nue/archivebox/cli/archivebox_init.py --force; TS=2025-12-25__08:03:12 VERSION=0.9.0rc1 IN_DOCKER=False IS_TTY=False

View File

@@ -12,7 +12,33 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed')
fieldsets = (
('Identity', {
'fields': ('hostname', 'guid', 'ips'),
'classes': ('card',),
}),
('Hardware', {
'fields': ('hw_manufacturer', 'hw_product', 'hw_uuid', 'hw_in_docker', 'hw_in_vm'),
'classes': ('card',),
}),
('Operating System', {
'fields': ('os_platform', 'os_family', 'os_arch', 'os_kernel', 'os_release'),
'classes': ('card',),
}),
('Statistics', {
'fields': ('stats', 'num_uses_succeeded', 'num_uses_failed'),
'classes': ('card',),
}),
('Configuration', {
'fields': ('config',),
'classes': ('card', 'wide'),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
ordering = ['-created_at']
@@ -33,7 +59,29 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
readonly_fields = ('machine', 'created_at', 'modified_at', 'mac_address', 'ip_public', 'ip_local', 'dns_server')
fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed')
fieldsets = (
('Machine', {
'fields': ('machine',),
'classes': ('card',),
}),
('Network', {
'fields': ('iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),
'classes': ('card',),
}),
('Location', {
'fields': ('hostname', 'isp', 'city', 'region', 'country'),
'classes': ('card',),
}),
('Usage', {
'fields': ('num_uses_succeeded', 'num_uses_failed'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('isp', 'country', 'region')
ordering = ['-created_at']
@@ -54,7 +102,25 @@ class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
search_fields = ('id', 'bin_name', 'bin_providers')
readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields)
fieldsets = (
('Binary', {
'fields': ('bin_name', 'bin_providers', 'is_installed', 'installed_count'),
'classes': ('card',),
}),
('Commands', {
'fields': ('custom_cmds',),
'classes': ('card',),
}),
('Configuration', {
'fields': ('config',),
'classes': ('card', 'wide'),
}),
('Timestamps', {
'fields': ('id', 'created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('bin_providers', 'created_at')
ordering = ['-created_at']
@@ -82,7 +148,29 @@ class InstalledBinaryAdmin(BaseModelAdmin):
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
readonly_fields = ('created_at', 'modified_at')
fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
fieldsets = (
('Binary Info', {
'fields': ('name', 'dependency', 'binprovider'),
'classes': ('card',),
}),
('Location', {
'fields': ('machine', 'abspath'),
'classes': ('card',),
}),
('Version', {
'fields': ('version', 'sha256'),
'classes': ('card',),
}),
('Usage', {
'fields': ('num_uses_succeeded', 'num_uses_failed'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),
}),
)
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
ordering = ['-created_at']

View File

@@ -544,16 +544,21 @@ def log_worker_event(
# Build worker identifier
worker_parts = [worker_type]
if pid:
# Don't add pid/worker_id for DB operations (they happen in whatever process is running)
if pid and worker_type != 'DB':
worker_parts.append(f'pid={pid}')
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'):
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
worker_parts.append(f'id={worker_id}')
if url and worker_type == 'SnapshotWorker':
if url and worker_type in ('SnapshotWorker', 'DB'):
worker_parts.append(f'url={truncate_url(url)}')
if extractor and worker_type == 'ArchiveResultWorker':
if extractor and worker_type in ('ArchiveResultWorker', 'DB'):
worker_parts.append(f'extractor={extractor}')
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
# Format worker label - only add brackets if there are additional identifiers
if len(worker_parts) > 1:
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
else:
worker_label = worker_parts[0]
# Build metadata string
metadata_str = ''
@@ -579,12 +584,14 @@ def log_worker_event(
meta_parts.append(f'{k}: {len(v)}')
else:
meta_parts.append(f'{k}: {v}')
metadata_str = ' {' + ', '.join(meta_parts) + '}'
metadata_str = ' | '.join(meta_parts)
# Determine color based on event
color = 'white'
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
color = 'green'
elif event.startswith('Created'):
color = 'cyan' # DB creation events
elif event in ('Processing...', 'PROCESSING'):
color = 'blue'
elif event in ('Completed', 'COMPLETED', 'All work complete'):
@@ -606,8 +613,9 @@ def log_worker_event(
text.append(indent) # Indentation
# Append worker label and event with color
text.append(f'{worker_label} {event}{error_str}', style=color)
# Append metadata without color
text.append(metadata_str)
# Append metadata without color (add separator if metadata exists)
if metadata_str:
text.append(f' | {metadata_str}')
CONSOLE.print(text)

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'accessibility';
const OUTPUT_DIR = 'accessibility';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'accessibility.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract accessibility info
async function extractAccessibility(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -24,7 +24,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'archive_org'
OUTPUT_DIR = 'archive_org'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'archive.org.txt'

View File

@@ -26,7 +26,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'chrome_cleanup'
CHROME_SESSION_DIR = 'chrome_session'
CHROME_SESSION_DIR = '../chrome_session'
def get_env(name: str, default: str = '') -> str:

View File

@@ -31,7 +31,7 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'consolelog';
const OUTPUT_DIR = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -86,10 +86,7 @@ async function serializeArgs(args) {
async function captureConsoleLogs(url) {
const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Clear existing file

View File

@@ -24,9 +24,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'dom';
const OUTPUT_DIR = 'dom';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.html';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -58,7 +58,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -114,10 +114,7 @@ async function dumpDom(url) {
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -26,7 +26,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'favicon'
OUTPUT_DIR = 'favicon'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'favicon.ico'

View File

@@ -26,7 +26,7 @@ import rich_click as click
EXTRACTOR_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'repo'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:

View File

@@ -22,9 +22,9 @@ const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'headers';
const OUTPUT_DIR = 'headers';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'headers.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
const CHROME_HEADERS_FILE = 'response_headers.json';
// Parse command line arguments
@@ -110,10 +110,7 @@ function fetchHeaders(url) {
}
async function extractHeaders(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Try Chrome session first

View File

@@ -28,7 +28,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'htmltotext'
OUTPUT_DIR = 'htmltotext'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'htmltotext.txt'
@@ -114,9 +114,8 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
if not text or len(text) < 10:
return False, None, 'No meaningful text extracted from HTML'
# Create output directory and write output
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
output_path = output_dir / OUTPUT_FILE
output_path.write_text(text, encoding='utf-8')

View File

@@ -39,7 +39,7 @@ import rich_click as click
EXTRACTOR_NAME = 'media'
BIN_NAME = 'yt-dlp'
BIN_PROVIDERS = 'pip,apt,brew,env'
OUTPUT_DIR = 'media'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -62,7 +62,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
@@ -129,9 +129,8 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
# Build command (later options take precedence)
cmd = [

View File

@@ -27,7 +27,7 @@ import rich_click as click
EXTRACTOR_NAME = 'mercury'
BIN_NAME = 'postlight-parser'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'mercury'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -72,9 +72,8 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
timeout = get_env_int('TIMEOUT', 60)
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
try:
# Get text version

View File

@@ -24,10 +24,10 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'parse_dom_outlinks';
const OUTPUT_DIR = 'parse_dom_outlinks';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'outlinks.json';
const URLS_FILE = 'urls.jsonl'; // For crawl system
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -64,10 +64,7 @@ function getCdpUrl() {
// Extract outlinks
async function extractOutlinks(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'pdf';
const OUTPUT_DIR = 'pdf';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.pdf';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -113,10 +113,7 @@ async function printToPdf(url) {
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -29,7 +29,7 @@ import rich_click as click
EXTRACTOR_NAME = 'readability'
BIN_NAME = 'readability-extractor'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'readability'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -101,9 +101,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
if not html_source:
return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
try:
# Run readability-extractor (outputs JSON by default)

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'redirects';
const OUTPUT_DIR = 'redirects';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'redirects.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Track redirect chain
async function trackRedirects(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -26,8 +26,8 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'responses';
const OUTPUT_DIR = 'responses';
const CHROME_SESSION_DIR = 'chrome_session';
const OUTPUT_DIR = '.';
const CHROME_SESSION_DIR = '../chrome_session';
// Resource types to capture (by default, capture everything)
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
@@ -149,10 +149,8 @@ async function archiveResponses(originalUrl) {
const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
// Create output directories
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
// Create subdirectories for organizing responses
const allDir = path.join(OUTPUT_DIR, 'all');
if (!fs.existsSync(allDir)) {
fs.mkdirSync(allDir, { recursive: true });

View File

@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'screenshot';
const OUTPUT_DIR = 'screenshot';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'screenshot.png';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -116,10 +116,7 @@ async function takeScreenshot(url) {
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Validation hook for ripgrep binary.
Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from ripgrep binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# ripgrep version string: "ripgrep 14.1.0"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
for i, part in enumerate(parts):
if part.lower() == 'ripgrep' and i + 1 < len(parts):
return parts[i + 1]
# Try to find version number pattern
for part in parts:
if part[0].isdigit() and '.' in part:
return part
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_ripgrep() -> dict | None:
    """Locate the ripgrep binary and describe it.

    Resolution order:
      1. RIPGREP_BINARY env var, when it contains a '/' and points at an existing file
      2. ``shutil.which(RIPGREP_BINARY)`` when the env var is a bare binary name
      3. ``shutil.which('rg')``

    Returns a dict with name/abspath/version/sha256/binprovider keys,
    or None when no usable binary is found.
    """
    configured = os.environ.get('RIPGREP_BINARY', '')

    if configured and '/' in configured and Path(configured).is_file():
        # Env var is already a usable path
        abspath = configured
    else:
        # Treat the env var (if any) as a binary name to resolve on PATH
        abspath = shutil.which(configured) if configured else None
        if not abspath:
            abspath = shutil.which('rg')

    if not (abspath and Path(abspath).is_file()):
        return None

    return {
        'name': 'rg',
        'abspath': abspath,
        'version': get_binary_version(abspath),
        'sha256': get_binary_hash(abspath),
        'binprovider': 'env',
    }
def main():
    """Validate the ripgrep binary and emit JSONL records on stdout.

    No-ops (exit 0, no output) unless SEARCH_BACKEND_ENGINE == 'ripgrep'.
    On success, prints an InstalledBinary record plus Machine config-update
    records and exits 0.  If ripgrep cannot be found, prints a Dependency
    request record and exits 1.
    """
    # Check if ripgrep search backend is enabled
    search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').lower()
    if search_backend != 'ripgrep':
        # No-op: ripgrep is not the active search backend
        sys.exit(0)

    result = find_ripgrep()
    if result and result.get('abspath'):
        # Output InstalledBinary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Output Machine config update
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/RIPGREP_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/RIPGREP_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Output Dependency request so an install hook can provision ripgrep
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'rg',
            'bin_providers': 'apt,brew,cargo,env',
        }))
        # Exit non-zero to indicate binary not found
        # (plain string literal: the original used an f-string with no placeholders)
        print("ripgrep binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
Tests for ripgrep binary detection and archivebox install functionality.
Guards against regressions in:
1. Machine.config overrides not being used in version command
2. Ripgrep hook not resolving binary names via shutil.which()
3. SEARCH_BACKEND_ENGINE not being passed to hook environment
"""
import os
import sys
import json
import shutil
import tempfile
import subprocess
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
def test_ripgrep_hook_detects_binary_from_path():
    """The hook must resolve a bare binary name (RIPGREP_BINARY='rg') to a full path via shutil.which()."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    # Skip if rg is not installed
    if shutil.which('rg') is None:
        pytest.skip("ripgrep (rg) not installed")

    # Enable the hook and hand it only the binary *name*, not a path (this was the bug)
    hook_env = {**os.environ, 'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        capture_output=True,
        text=True,
        env=hook_env,
        timeout=10,
    )
    assert proc.returncode == 0, f"Hook failed: {proc.stderr}"

    # The hook emits JSONL: an InstalledBinary record followed by Machine config updates
    records = [json.loads(ln) for ln in proc.stdout.strip().split('\n') if ln.strip()]
    assert len(records) >= 2, "Expected at least 2 JSONL lines (InstalledBinary + Machine config)"

    binary_record = records[0]
    assert binary_record['type'] == 'InstalledBinary'
    assert binary_record['name'] == 'rg'
    assert '/' in binary_record['abspath'], "Expected full path, not just binary name"
    assert Path(binary_record['abspath']).is_file(), "Binary path should exist"
    assert binary_record['version'], "Version should be detected"

    config_record = records[1]
    assert config_record['type'] == 'Machine'
    assert config_record['key'] == 'config/RIPGREP_BINARY'
    assert '/' in config_record['value'], "Machine config should store full path"
def test_ripgrep_hook_skips_when_backend_not_ripgrep():
    """When another search backend is configured, the hook must be a silent no-op."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    # Configure a non-ripgrep backend
    hook_env = {**os.environ, 'SEARCH_BACKEND_ENGINE': 'sqlite'}

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        capture_output=True,
        text=True,
        env=hook_env,
        timeout=10,
    )
    assert proc.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
    assert proc.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"
def test_ripgrep_hook_handles_absolute_path():
    """The hook must also accept RIPGREP_BINARY given as a full absolute path."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    rg_abspath = shutil.which('rg')
    if rg_abspath is None:
        pytest.skip("ripgrep (rg) not installed")

    hook_env = {**os.environ, 'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': rg_abspath}

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        capture_output=True,
        text=True,
        env=hook_env,
        timeout=10,
    )
    assert proc.returncode == 0, f"Hook failed: {proc.stderr}"
    assert proc.stdout.strip(), "Hook should produce output"

    # The first JSONL record should echo back the exact path we configured
    first_record = json.loads(proc.stdout.strip().split('\n')[0])
    assert first_record['abspath'] == rg_abspath
@pytest.mark.django_db
def test_machine_config_overrides_base_config():
    """
    Test that Machine.config overrides take precedence over base config.

    Guards against regression where archivebox version was showing binaries
    as "not installed" even though they were detected and stored in Machine.config.
    """
    from machine.models import Machine, InstalledBinary

    machine = Machine.current()

    # Pretend a hook detected chrome at a path that differs from the base config
    chrome_override = '/custom/path/to/chrome'
    machine.config['CHROME_BINARY'] = chrome_override
    machine.config['CHROME_VERSION'] = '143.0.7499.170'
    machine.save()

    # Record the detection as an InstalledBinary row too
    InstalledBinary.objects.create(
        machine=machine,
        name='chrome',
        abspath=chrome_override,
        version='143.0.7499.170',
        binprovider='env',
    )

    from archivebox.config.configset import get_config
    base_config = get_config()

    # Machine.config should override the base config value
    assert machine.config.get('CHROME_BINARY') == chrome_override

    # The version command must consult Machine.config first, then base config
    # (base config might say 'chromium' while Machine.config holds the full path)
    resolved = machine.config.get('CHROME_BINARY') or base_config.get('CHROME_BINARY', '')
    assert resolved == chrome_override, \
        "Machine.config override should take precedence over base config"
@pytest.mark.django_db
def test_search_backend_engine_passed_to_hooks():
    """
    Test that SEARCH_BACKEND_ENGINE is passed to hook environment.

    Guards against regression where hooks couldn't determine which search backend was active.
    """
    # NOTE: removed an unused function-local `from pathlib import Path`
    # (Path is never used in this test and is already imported at module level)
    from archivebox.hooks import build_hook_environment
    from archivebox.config.configset import get_config

    config = get_config()
    search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')

    env = build_hook_environment(overrides=None)

    assert 'SEARCH_BACKEND_ENGINE' in env, \
        "SEARCH_BACKEND_ENGINE must be in hook environment"
    assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \
        f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
@pytest.mark.django_db
def test_install_creates_installedbinary_records():
    """
    Test that archivebox install creates InstalledBinary records for detected binaries.

    This is an integration test that verifies the full install flow.
    """
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk

    machine = Machine.current()
    binaries_before = InstalledBinary.objects.filter(machine=machine).count()

    # Build an install-style crawl, mirroring what `archivebox install` does
    system_user_pk = get_or_create_system_user_pk()
    seed, _created = Seed.objects.get_or_create(
        uri='archivebox://test-install',
        label='Test dependency detection',
        created_by_id=system_user_pk,
        defaults={'extractor': 'auto'},
    )
    crawl = Crawl.objects.create(
        seed=seed,
        max_depth=0,
        created_by_id=system_user_pk,
        status='queued',
    )

    # Ticking the state machine from 'queued' -> started runs the on_Crawl hooks
    CrawlMachine(crawl).send('tick')

    # Hooks should have registered at least one new binary
    binaries_after = InstalledBinary.objects.filter(machine=machine).count()
    assert binaries_after > binaries_before, \
        "archivebox install should create InstalledBinary records"

    # At least one common binary should have been picked up
    common_binaries = ['git', 'wget', 'node']
    detected = [
        bin_name for bin_name in common_binaries
        if InstalledBinary.objects.filter(machine=machine, name=bin_name).exists()
    ]
    assert detected, f"At least one of {common_binaries} should be detected"

    # Any recorded path must be a full path, never a bare binary name
    # (version may legitimately be empty for some binaries)
    for binary in InstalledBinary.objects.filter(machine=machine):
        if binary.abspath:
            assert '/' in binary.abspath, \
                f"{binary.name} should have full path, not just name: {binary.abspath}"
@pytest.mark.django_db
def test_ripgrep_only_detected_when_backend_enabled():
    """
    Test that ripgrep is only detected when SEARCH_BACKEND_ENGINE='ripgrep'.

    Guards against ripgrep being installed/detected when not needed.
    """
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk
    # NOTE: removed unused `from django.conf import settings` import

    if not shutil.which('rg'):
        pytest.skip("ripgrep (rg) not installed")

    machine = Machine.current()
    # Clear any existing ripgrep records so detection state is unambiguous
    InstalledBinary.objects.filter(machine=machine, name='rg').delete()

    # Test 1: With ripgrep backend - should be detected
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}
        created_by_id = get_or_create_system_user_pk()
        seed = Seed.objects.create(
            uri='archivebox://test-rg-enabled',
            label='Test ripgrep detection enabled',
            created_by_id=created_by_id,
            extractor='auto',
        )
        crawl = Crawl.objects.create(
            seed=seed,
            max_depth=0,
            created_by_id=created_by_id,
            status='queued',
        )
        sm = CrawlMachine(crawl)
        sm.send('tick')

        # Ripgrep should be detected
        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
        assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"

    # Clear records again before the negative case
    InstalledBinary.objects.filter(machine=machine, name='rg').delete()

    # Test 2: With different backend - should NOT be detected
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}
        seed2 = Seed.objects.create(
            uri='archivebox://test-rg-disabled',
            label='Test ripgrep detection disabled',
            created_by_id=created_by_id,
            extractor='auto',
        )
        crawl2 = Crawl.objects.create(
            seed=seed2,
            max_depth=0,
            created_by_id=created_by_id,
            status='queued',
        )
        sm2 = CrawlMachine(crawl2)
        sm2.send('tick')

        # Ripgrep should NOT be detected
        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
        assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"


if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -29,7 +29,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sonic'
OUTPUT_DIR = 'search_index'
OUTPUT_DIR = '.'
# Text file patterns to index
INDEXABLE_FILES = [

View File

@@ -27,7 +27,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sqlite'
OUTPUT_DIR = 'search_index'
OUTPUT_DIR = '.'
# Text file patterns to index, in priority order
INDEXABLE_FILES = [

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'seo';
const OUTPUT_DIR = 'seo';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'seo.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract SEO metadata
async function extractSeo(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -40,7 +40,7 @@ const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
const OUTPUT_DIR = 'singlefile';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';
/**
@@ -102,8 +102,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
.filter(fn => fn.endsWith('.html'))
);
// Ensure output directory exists
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
@@ -170,8 +169,7 @@ async function saveSinglefileWithCLI(url, options = {}) {
return null;
}
// Ensure output directory exists
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Build command

View File

@@ -41,7 +41,7 @@ import rich_click as click
EXTRACTOR_NAME = 'singlefile'
BIN_NAME = 'single-file'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'singlefile'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'singlefile.html'
@@ -65,7 +65,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
@@ -135,7 +135,7 @@ def get_version(binary: str) -> str:
return ''
CHROME_SESSION_DIR = 'chrome_session'
CHROME_SESSION_DIR = '../chrome_session'
def get_cdp_url() -> str | None:
@@ -203,9 +203,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
if extra_args:
cmd.extend(extra_args.split())
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
output_path = output_dir / OUTPUT_FILE
cmd.extend([url, str(output_path)])
@@ -274,7 +273,7 @@ def main(url: str, snapshot_id: str):
sys.exit(1)
version = get_version(binary)
cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
cmd_str = f'{binary} {url} {OUTPUT_FILE}'
# Run extraction
success, output, error = save_singlefile(url, binary)

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'ssl';
const OUTPUT_DIR = 'ssl';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract SSL details
async function extractSsl(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Only extract SSL for HTTPS URLs

View File

@@ -31,8 +31,8 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'staticfile'
OUTPUT_DIR = 'staticfile'
CHROME_SESSION_DIR = 'chrome_session'
OUTPUT_DIR = '.'
CHROME_SESSION_DIR = '../chrome_session'
# Content-Types that indicate static files
# These can't be meaningfully processed by Chrome-based extractors
@@ -214,9 +214,8 @@ def download_file(url: str) -> tuple[bool, str | None, str]:
if content_length and int(content_length) > max_size:
return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
# Determine filename
filename = get_filename_from_url(url)

View File

@@ -21,9 +21,9 @@ const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'title';
const OUTPUT_DIR = 'title';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'title.txt';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -162,10 +162,7 @@ async function getTitleFromCdp(cdpUrl) {
}
async function extractTitle(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Try Chrome session first

View File

@@ -43,7 +43,7 @@ import rich_click as click
EXTRACTOR_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'wget'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -66,7 +66,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""

File diff suppressed because it is too large Load Diff

View File

@@ -57,13 +57,24 @@
box-shadow: 0 0 8px #3fb950;
animation: pulse 2s infinite;
}
#progress-monitor .status-dot.idle {
background: #d29922;
box-shadow: 0 0 4px #d29922;
}
#progress-monitor .status-dot.stopped {
background: #f85149;
background: #6e7681;
}
#progress-monitor .status-dot.flash {
animation: flash 0.3s ease-out;
}
@keyframes pulse {
0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; }
50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; }
}
@keyframes flash {
0% { transform: scale(1.5); }
100% { transform: scale(1); }
}
/* Stats */
#progress-monitor .stats {
@@ -89,6 +100,19 @@
#progress-monitor .stat-value.error { color: #f85149; }
#progress-monitor .stat-value.warning { color: #d29922; }
#progress-monitor .stat-value.info { color: #58a6ff; }
#progress-monitor .stat.clickable {
cursor: pointer;
padding: 2px 6px;
margin: -2px -6px;
border-radius: 4px;
transition: background 0.2s;
}
#progress-monitor .stat.clickable:hover {
background: rgba(255,255,255,0.1);
}
#progress-monitor .stat.clickable:active {
background: rgba(255,255,255,0.2);
}
/* Toggle Button */
#progress-monitor .toggle-btn {
@@ -259,48 +283,86 @@
padding: 0 12px 8px;
}
/* Extractor List */
/* Extractor List - Compact Badge Layout */
#progress-monitor .extractor-list {
padding: 8px 12px;
background: rgba(0,0,0,0.2);
border-top: 1px solid #21262d;
display: flex;
flex-wrap: wrap;
gap: 4px;
}
#progress-monitor .extractor-item {
#progress-monitor .extractor-badge {
position: relative;
display: inline-flex;
align-items: center;
gap: 4px;
padding: 3px 8px;
border-radius: 4px;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 10px;
background: #21262d;
overflow: hidden;
white-space: nowrap;
}
#progress-monitor .extractor-badge .progress-fill {
position: absolute;
top: 0;
left: 0;
bottom: 0;
z-index: 0;
transition: width 0.3s ease-out;
}
#progress-monitor .extractor-badge .badge-content {
position: relative;
z-index: 1;
display: flex;
align-items: center;
gap: 8px;
padding: 4px 0;
gap: 4px;
}
#progress-monitor .extractor-icon {
font-size: 12px;
width: 16px;
text-align: center;
#progress-monitor .extractor-badge.queued {
color: #8b949e;
}
#progress-monitor .extractor-icon.running {
#progress-monitor .extractor-badge.queued .progress-fill {
background: rgba(110, 118, 129, 0.2);
width: 0%;
}
#progress-monitor .extractor-badge.started {
color: #d29922;
animation: spin 1s linear infinite;
}
#progress-monitor .extractor-icon.success {
#progress-monitor .extractor-badge.started .progress-fill {
background: rgba(210, 153, 34, 0.3);
width: 50%;
animation: progress-pulse 1.5s ease-in-out infinite;
}
@keyframes progress-pulse {
0%, 100% { opacity: 0.5; }
50% { opacity: 1; }
}
#progress-monitor .extractor-badge.succeeded {
color: #3fb950;
}
#progress-monitor .extractor-icon.failed {
#progress-monitor .extractor-badge.succeeded .progress-fill {
background: rgba(63, 185, 80, 0.25);
width: 100%;
}
#progress-monitor .extractor-badge.failed {
color: #f85149;
}
#progress-monitor .extractor-icon.pending {
color: #8b949e;
#progress-monitor .extractor-badge.failed .progress-fill {
background: rgba(248, 81, 73, 0.25);
width: 100%;
}
#progress-monitor .extractor-badge .badge-icon {
font-size: 10px;
}
#progress-monitor .extractor-badge.started .badge-icon {
animation: spin 1s linear infinite;
}
@keyframes spin {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
}
#progress-monitor .extractor-name {
flex: 1;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 11px;
}
#progress-monitor .extractor-progress {
width: 60px;
}
/* Status Badge */
#progress-monitor .status-badge {
@@ -356,11 +418,11 @@
<span class="stat-label">Queued</span>
<span class="stat-value warning" id="total-queued">0</span>
</div>
<div class="stat">
<div class="stat clickable" id="stat-succeeded" title="Click to reset counter">
<span class="stat-label">Done</span>
<span class="stat-value success" id="total-succeeded">0</span>
</div>
<div class="stat">
<div class="stat clickable" id="stat-failed" title="Click to reset counter">
<span class="stat-label">Failed</span>
<span class="stat-value error" id="total-failed">0</span>
</div>
@@ -390,6 +452,24 @@
let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
// Baselines for resettable counters
let succeededBaseline = parseInt(localStorage.getItem('progress-succeeded-baseline') || '0');
let failedBaseline = parseInt(localStorage.getItem('progress-failed-baseline') || '0');
let lastSucceeded = 0;
let lastFailed = 0;
// Click handlers for resetting counters
document.getElementById('stat-succeeded').addEventListener('click', function() {
succeededBaseline = lastSucceeded;
localStorage.setItem('progress-succeeded-baseline', succeededBaseline);
document.getElementById('total-succeeded').textContent = '0';
});
document.getElementById('stat-failed').addEventListener('click', function() {
failedBaseline = lastFailed;
localStorage.setItem('progress-failed-baseline', failedBaseline);
document.getElementById('total-failed').textContent = '0';
});
function formatUrl(url) {
try {
const u = new URL(url);
@@ -400,24 +480,18 @@
}
function renderExtractor(extractor) {
const iconClass = extractor.status === 'started' ? 'running' :
extractor.status === 'succeeded' ? 'success' :
extractor.status === 'failed' ? 'failed' : 'pending';
const icon = extractor.status === 'started' ? '&#8635;' :
extractor.status === 'succeeded' ? '&#10003;' :
extractor.status === 'failed' ? '&#10007;' : '&#9675;';
return `
<div class="extractor-item">
<span class="extractor-icon ${iconClass}">${icon}</span>
<span class="extractor-name">${extractor.extractor}</span>
<div class="extractor-progress">
<div class="progress-bar-container">
<div class="progress-bar extractor ${extractor.status === 'started' ? 'indeterminate' : ''}"
style="width: ${extractor.status === 'succeeded' ? '100' : extractor.status === 'failed' ? '100' : extractor.progress}%"></div>
</div>
</div>
</div>
<span class="extractor-badge ${extractor.status}">
<span class="progress-fill"></span>
<span class="badge-content">
<span class="badge-icon">${icon}</span>
<span>${extractor.extractor}</span>
</span>
</span>
`;
}
@@ -427,10 +501,14 @@
const statusIcon = snapshot.status === 'started' ? '&#8635;' : '&#128196;';
let extractorHtml = '';
if (snapshot.active_extractors && snapshot.active_extractors.length > 0) {
if (snapshot.all_extractors && snapshot.all_extractors.length > 0) {
// Sort extractors alphabetically by name to prevent reordering on updates
const sortedExtractors = [...snapshot.all_extractors].sort((a, b) =>
a.extractor.localeCompare(b.extractor)
);
extractorHtml = `
<div class="extractor-list" style="${isExpanded ? '' : 'display:none'}">
${snapshot.active_extractors.map(e => renderExtractor(e)).join('')}
${sortedExtractors.map(e => renderExtractor(e)).join('')}
</div>
`;
}
@@ -438,7 +516,7 @@
return `
<div class="snapshot-item" data-snapshot-key="${snapshotKey}">
<div class="snapshot-header" onclick="window.toggleSnapshot('${snapshotKey}')">
<span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.active_extractors?.length ? '&#9654;' : ''}</span>
<span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.all_extractors?.length ? '&#9654;' : ''}</span>
<span class="snapshot-icon">${statusIcon}</span>
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
@@ -469,6 +547,40 @@
snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
}
// Show warning if crawl is stuck (queued but can't start)
let warningHtml = '';
if (crawl.status === 'queued' && !crawl.can_start) {
warningHtml = `
<div style="padding: 8px 14px; background: rgba(248, 81, 73, 0.1); border-top: 1px solid #f85149; color: #f85149; font-size: 11px;">
⚠️ Crawl cannot start: ${crawl.seed_uri ? 'unknown error' : 'no seed URI'}
</div>
`;
} else if (crawl.status === 'queued' && crawl.retry_at_future) {
// Queued but retry_at is in future (was claimed by worker, will retry)
warningHtml = `
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
</div>
`;
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
// Queued and waiting to be picked up by worker
warningHtml = `
<div style="padding: 8px 14px; background: rgba(210, 153, 34, 0.1); border-top: 1px solid #d29922; color: #d29922; font-size: 11px;">
⏳ Waiting for worker to pick up...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
</div>
`;
}
// Show snapshot info or URL count if no snapshots yet
let metaText = `depth: ${crawl.max_depth}`;
if (crawl.total_snapshots > 0) {
metaText += ` | ${crawl.total_snapshots} snapshots`;
} else if (crawl.urls_count > 0) {
metaText += ` | ${crawl.urls_count} URLs`;
} else if (crawl.seed_uri) {
metaText += ` | ${crawl.seed_uri.substring(0, 40)}${crawl.seed_uri.length > 40 ? '...' : ''}`;
}
return `
<div class="crawl-item" data-crawl-id="${crawl.id}">
<div class="crawl-header" onclick="window.toggleCrawl('${crawl.id}')">
@@ -476,10 +588,11 @@
<span class="crawl-icon">${statusIcon}</span>
<div class="crawl-info">
<div class="crawl-label">${crawl.label}</div>
<div class="crawl-meta">depth: ${crawl.max_depth} | ${crawl.total_snapshots} snapshots</div>
<div class="crawl-meta">${metaText}</div>
</div>
<div class="crawl-stats">
<span style="color:#3fb950">${crawl.completed_snapshots} done</span>
<span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
<span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
</div>
<span class="status-badge ${crawl.status}">${crawl.status}</span>
@@ -490,6 +603,7 @@
style="width: ${crawl.progress}%"></div>
</div>
</div>
${warningHtml}
<div class="crawl-body" style="${isExpanded ? '' : 'display:none'}">
<div class="snapshot-list">
${snapshotsHtml}
@@ -542,25 +656,48 @@
data.snapshots_pending > 0 || data.snapshots_started > 0 ||
data.archiveresults_pending > 0 || data.archiveresults_started > 0;
// Update orchestrator status
// Update orchestrator status - show "Running" only when there's actual activity
// Don't distinguish between "Stopped" and "Idle" since orchestrator starts/stops frequently
const dot = document.getElementById('orchestrator-dot');
const text = document.getElementById('orchestrator-text');
if (data.orchestrator_running) {
dot.classList.remove('stopped');
const hasWorkers = data.total_workers > 0;
if (hasWorkers || hasActivity) {
dot.classList.remove('stopped', 'idle');
dot.classList.add('running');
text.textContent = 'Running';
} else {
dot.classList.remove('running');
dot.classList.add('stopped');
text.textContent = 'Stopped';
// No activity - show as idle (whether orchestrator process exists or not)
dot.classList.remove('stopped', 'running');
dot.classList.add('idle');
text.textContent = 'Idle';
}
// Pulse the dot to show we got fresh data
dot.classList.add('flash');
setTimeout(() => dot.classList.remove('flash'), 300);
// Update stats
document.getElementById('worker-count').textContent = data.total_workers;
document.getElementById('total-queued').textContent =
data.crawls_pending + data.snapshots_pending + data.archiveresults_pending;
document.getElementById('total-succeeded').textContent = data.archiveresults_succeeded;
document.getElementById('total-failed').textContent = data.archiveresults_failed;
// Store raw values and display relative to baseline
lastSucceeded = data.archiveresults_succeeded;
lastFailed = data.archiveresults_failed;
// If baseline is higher than current (e.g. after DB reset), reset baseline
if (succeededBaseline > lastSucceeded) {
succeededBaseline = 0;
localStorage.setItem('progress-succeeded-baseline', '0');
}
if (failedBaseline > lastFailed) {
failedBaseline = 0;
localStorage.setItem('progress-failed-baseline', '0');
}
document.getElementById('total-succeeded').textContent = lastSucceeded - succeededBaseline;
document.getElementById('total-failed').textContent = lastFailed - failedBaseline;
// Render crawl tree
if (data.active_crawls.length > 0) {

View File

@@ -7,9 +7,14 @@ class Command(BaseCommand):
help = 'Run the archivebox orchestrator'
def add_arguments(self, parser):
parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")
parser.add_argument(
'--exit-on-idle',
action='store_true',
default=False,
help="Exit when all work is complete (default: run forever)"
)
def handle(self, *args, **kwargs):
daemon = kwargs.get('daemon', False)
orchestrator = Orchestrator(exit_on_idle=not daemon)
exit_on_idle = kwargs.get('exit_on_idle', False)
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
orchestrator.runloop()

View File

@@ -12,16 +12,17 @@ Architecture:
└── Each worker spawns task subprocesses via CLI
Usage:
# Embedded in other commands (exits when done)
# Default: runs forever (for use as subprocess of server)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.runloop()
# Exit when done (for embedded use in other commands)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
# Daemon mode (runs forever)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.start() # fork and return
# Or run via CLI
archivebox orchestrator [--daemon]
archivebox manage orchestrator # runs forever
archivebox manage orchestrator --exit-on-idle # exits when done
"""
__package__ = 'archivebox.workers'
@@ -45,6 +46,14 @@ from .pid_utils import (
)
def _run_orchestrator_process(exit_on_idle: bool) -> None:
"""Top-level function for multiprocessing (must be picklable)."""
from archivebox.config.django import setup_django
setup_django()
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
orchestrator.runloop()
class Orchestrator:
"""
Manages worker processes by polling queues and spawning workers as needed.
@@ -277,12 +286,12 @@ class Orchestrator:
Fork orchestrator as a background process.
Returns the PID of the new process.
"""
def run_orchestrator():
from archivebox.config.django import setup_django
setup_django()
self.runloop()
proc = Process(target=run_orchestrator, name='orchestrator')
# Use module-level function to avoid pickle errors with local functions
proc = Process(
target=_run_orchestrator_process,
args=(self.exit_on_idle,),
name='orchestrator'
)
proc.start()
assert proc.pid is not None

View File

@@ -28,7 +28,7 @@ WORKERS_DIR_NAME = "workers"
ORCHESTRATOR_WORKER = {
"name": "worker_orchestrator",
"command": "archivebox manage orchestrator",
"command": "archivebox manage orchestrator", # runs forever by default
"autostart": "true",
"autorestart": "true",
"stdout_logfile": "logs/worker_orchestrator.log",
@@ -332,14 +332,14 @@ def stop_worker(supervisor, daemon_name):
def tail_worker_logs(log_path: str):
get_or_create_supervisord_process(daemonize=False)
from rich.live import Live
from rich.table import Table
table = Table()
table.add_column("TS")
table.add_column("URL")
try:
with Live(table, refresh_per_second=1) as live: # update 4 times a second to feel fluid
with open(log_path, 'r') as f:
@@ -352,6 +352,83 @@ def tail_worker_logs(log_path: str):
except SystemExit:
pass
def tail_multiple_worker_logs(log_files: list[str], follow=True):
"""Tail multiple log files simultaneously, interleaving their output."""
import select
from pathlib import Path
# Convert relative paths to absolute paths
log_paths = []
for log_file in log_files:
log_path = Path(log_file)
if not log_path.is_absolute():
log_path = CONSTANTS.DATA_DIR / log_path
# Create log file if it doesn't exist
if not log_path.exists():
log_path.parent.mkdir(parents=True, exist_ok=True)
log_path.touch()
log_paths.append(log_path)
# Open all log files
file_handles = []
for log_path in log_paths:
try:
f = open(log_path, 'r')
# Seek to end of file if following
if follow:
f.seek(0, 2) # Seek to end
file_handles.append((log_path.name, f))
except Exception as e:
print(f"[yellow]Warning: Could not open {log_path}: {e}[/yellow]")
if not file_handles:
print("[red]No log files could be opened[/red]")
return
# Print which logs we're tailing
log_names = [name for name, _ in file_handles]
print(f"[dim]Tailing: {', '.join(log_names)}[/dim]")
print()
try:
while follow:
# Read available lines from all files
for log_name, f in file_handles:
line = f.readline()
if line:
# Colorize based on log source
if 'orchestrator' in log_name.lower():
color = 'cyan'
elif 'daphne' in log_name.lower():
color = 'green'
else:
color = 'white'
# Strip ANSI codes if present (supervisord does this but just in case)
import re
line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip())
if line_clean:
print(f'[{color}][{log_name}][/{color}] {line_clean}')
# Small sleep to avoid busy-waiting
time.sleep(0.1)
except (KeyboardInterrupt, BrokenPipeError, IOError):
print("\n[yellow][i] Stopped tailing logs[/i][/yellow]")
except SystemExit:
pass
finally:
# Close all file handles
for _, f in file_handles:
try:
f.close()
except Exception:
pass
def watch_worker(supervisor, daemon_name, interval=5):
"""loop continuously and monitor worker's health"""
while True:

View File

@@ -3,6 +3,9 @@ Background task functions for queuing work to the orchestrator.
These functions queue Snapshots/Crawls for processing by setting their status
to QUEUED, which the orchestrator workers will pick up and process.
NOTE: These functions do NOT start the orchestrator - they assume it's already
running via `archivebox server` (supervisord) or will be run inline by the CLI.
"""
__package__ = 'archivebox.workers'
@@ -10,16 +13,6 @@ __package__ = 'archivebox.workers'
from django.utils import timezone
def ensure_orchestrator_running():
"""Ensure the orchestrator is running to process queued items."""
from .orchestrator import Orchestrator
if not Orchestrator.is_running():
# Start orchestrator in background
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.start()
def bg_add(add_kwargs: dict) -> int:
"""
Add URLs and queue them for archiving.
@@ -36,9 +29,6 @@ def bg_add(add_kwargs: dict) -> int:
result = add(**add_kwargs)
# Ensure orchestrator is running to process the new snapshots
ensure_orchestrator_running()
return len(result) if result else 0
@@ -66,10 +56,6 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
)
queued_count += 1
# Ensure orchestrator is running to process the queued snapshots
if queued_count > 0:
ensure_orchestrator_running()
return queued_count
@@ -90,9 +76,6 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
# Ensure orchestrator is running to process the queued snapshot
ensure_orchestrator_running()
return 1
return 0

View File

@@ -67,8 +67,8 @@ class Worker:
# Configuration (can be overridden by subclasses)
MAX_TICK_TIME: ClassVar[int] = 60
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
POLL_INTERVAL: ClassVar[float] = 0.5
IDLE_TIMEOUT: ClassVar[int] = 3 # Exit after N idle iterations (set to 0 to never exit)
POLL_INTERVAL: ClassVar[float] = 1.0
IDLE_TIMEOUT: ClassVar[int] = 10 # Exit after N idle iterations (10 sec at 1.0 poll interval)
def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
self.worker_id = worker_id