tons of fixes with codex

2026-04-06 07:47:53 +10:00 · 2026-01-19 01:00:53 -08:00
parent eaf7256345
commit c7b2217cd6
184 changed files with 3943 additions and 2420 deletions
--- a/archivebox/workers/orchestrator.py
+++ b/archivebox/workers/orchestrator.py
@@ -83,6 +83,10 @@ class Orchestrator:
        # In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker
        if self.exit_on_idle:
            self.MAX_CRAWL_WORKERS = 1
+            # Faster UI updates for interactive runs
+            self.POLL_INTERVAL = 0.25
+            # Exit quickly once idle in foreground mode
+            self.IDLE_TIMEOUT = 1
    
    def __repr__(self) -> str:
        return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
@@ -111,8 +115,14 @@ class Orchestrator:
        # Clean up any stale Process records from previous runs
        stale_count = Process.cleanup_stale_running()

-        # Clean up orphaned Chrome processes from previous crashes
-        chrome_count = Process.cleanup_orphaned_chrome()
+        # Foreground runs should start fast; skip expensive orphan cleanup unless in daemon mode.
+        chrome_count = 0
+        orphaned_workers = 0
+        if not self.exit_on_idle:
+            # Clean up orphaned Chrome processes from previous crashes
+            chrome_count = Process.cleanup_orphaned_chrome()
+            # Clean up orphaned workers from previous crashes
+            orphaned_workers = Process.cleanup_orphaned_workers()

        # Collect startup metadata
        metadata = {
@@ -123,6 +133,8 @@ class Orchestrator:
            metadata['cleaned_stale_pids'] = stale_count
        if chrome_count:
            metadata['cleaned_orphaned_chrome'] = chrome_count
+        if orphaned_workers:
+            metadata['cleaned_orphaned_workers'] = orphaned_workers

        log_worker_event(
            worker_type='Orchestrator',
@@ -135,30 +147,26 @@ class Orchestrator:
    def terminate_all_workers(self) -> None:
        """Terminate all running worker processes."""
        from archivebox.machine.models import Process
-        import signal
-
-        # Get all running worker processes
-        running_workers = Process.objects.filter(
-            process_type=Process.TypeChoices.WORKER,
-            status__in=['running', 'started']
-        )
+        # Get running worker processes scoped to this orchestrator when possible
+        if getattr(self, 'db_process', None):
+            running_workers = self._get_scoped_running_workers()
+        else:
+            running_workers = Process.objects.filter(
+                process_type=Process.TypeChoices.WORKER,
+                status=Process.StatusChoices.RUNNING,
+            )

        for worker_process in running_workers:
            try:
-                # Send SIGTERM to gracefully terminate the worker
-                os.kill(worker_process.pid, signal.SIGTERM)
-            except ProcessLookupError:
-                # Process already dead
-                pass
+                # Gracefully terminate the worker and update Process status
+                worker_process.terminate(graceful_timeout=5.0)
            except Exception:
-                # Ignore other errors during shutdown
                pass

    def on_shutdown(self, error: BaseException | None = None) -> None:
        """Called when orchestrator shuts down."""
-        # Terminate all worker processes in exit_on_idle mode
-        if self.exit_on_idle:
-            self.terminate_all_workers()
+        # Terminate all worker processes on shutdown
+        self.terminate_all_workers()

        # Update Process record status
        if hasattr(self, 'db_process') and self.db_process:
@@ -188,11 +196,26 @@ class Orchestrator:
            Process.cleanup_stale_running()
            self._last_cleanup_time = now

+        if self.crawl_id and getattr(self, 'db_process', None):
+            return self._get_scoped_running_workers().count()
+
        return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)

    def get_running_workers_for_type(self, WorkerClass: Type[Worker]) -> int:
        """Get count of running workers for a specific worker type."""
+        if self.crawl_id and getattr(self, 'db_process', None):
+            return self._get_scoped_running_workers().filter(worker_type=WorkerClass.name).count()
        return len(WorkerClass.get_running_workers())
+
+    def _get_scoped_running_workers(self):
+        """Get running workers scoped to this orchestrator process tree."""
+        from archivebox.machine.models import Process
+
+        descendants = self.db_process.get_descendants(include_self=False)
+        return descendants.filter(
+            process_type=Process.TypeChoices.WORKER,
+            status=Process.StatusChoices.RUNNING,
+        )
    
    def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
        """Determine if we should spawn a new worker."""
@@ -208,8 +231,11 @@ class Orchestrator:
            max_workers = 1  # Default for unknown types

        # Check worker limit
-        running_workers = WorkerClass.get_running_workers()
-        running_count = len(running_workers)
+        if self.crawl_id and getattr(self, 'db_process', None) and WorkerClass.name != 'binary':
+            running_count = self._get_scoped_running_workers().filter(worker_type=WorkerClass.name).count()
+        else:
+            running_workers = WorkerClass.get_running_workers()
+            running_count = len(running_workers)

        if running_count >= max_workers:
            return False
@@ -225,9 +251,13 @@ class Orchestrator:
        """Spawn a new worker process. Returns PID or None if spawn failed."""
        try:
            print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
-            pid = WorkerClass.start(crawl_id=self.crawl_id)
+            pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id)
            print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')

+            if self.exit_on_idle:
+                # Foreground runs have MAX_CRAWL_WORKERS=1; avoid blocking startup on registration.
+                return pid
+
            # CRITICAL: Block until worker registers itself in Process table
            # This prevents race condition where orchestrator spawns multiple workers
            # before any of them finish on_startup() and register
@@ -316,7 +346,7 @@ class Orchestrator:
        if binary_count > 0:
            running_binary_workers_list = BinaryWorker.get_running_workers()
            if len(running_binary_workers_list) == 0:
-                BinaryWorker.start()
+                BinaryWorker.start(parent=self.db_process)

        # Check if any BinaryWorkers are still running
        running_binary_workers = len(BinaryWorker.get_running_workers())
@@ -344,7 +374,7 @@ class Orchestrator:
                # Claim next crawl
                crawl = crawl_queue.first()
                if crawl and self._claim_crawl(crawl):
-                    CrawlWorker.start(crawl_id=str(crawl.id))
+                    CrawlWorker.start(parent=self.db_process, crawl_id=str(crawl.id))

        return queue_sizes

@@ -463,7 +493,7 @@ class Orchestrator:

                with Live(
                    progress_layout.get_layout(),
-                    refresh_per_second=4,
+                    refresh_per_second=8,
                    screen=True,
                    console=orchestrator_console,
                ):
@@ -521,41 +551,147 @@ class Orchestrator:
                    else:
                        status = "Idle"

+                    binary_workers_count = worker_counts.get('binary', 0)
                    # Update orchestrator status
                    progress_layout.update_orchestrator_status(
                        status=status,
                        crawl_queue_count=crawl_queue_count,
                        crawl_workers_count=crawl_workers_count,
+                        binary_queue_count=queue_sizes.get('binary', 0),
+                        binary_workers_count=binary_workers_count,
                        max_crawl_workers=self.MAX_CRAWL_WORKERS,
                    )

-                    # Update CrawlWorker logs by tailing Process stdout/stderr
-                    if crawl_workers_count > 0:
-                        from archivebox.machine.models import Process
-                        crawl_worker_process = Process.objects.filter(
-                            process_type=Process.TypeChoices.WORKER,
-                            worker_type='crawl',
-                            status__in=['running', 'started']
-                        ).first()
-                        if crawl_worker_process:
-                            progress_layout.update_crawl_worker_logs(crawl_worker_process)
+                    # Update crawl queue tree (active + recently completed)
+                    from archivebox.crawls.models import Crawl
+                    from archivebox.core.models import Snapshot, ArchiveResult
+                    recent_cutoff = timezone.now() - timedelta(minutes=5)
+                    pending_snapshot_candidates: list[Snapshot] = []
+                    hooks_by_snapshot: dict[str, list] = {}

-                    # Log queue size changes
-                    if queue_sizes != last_queue_sizes:
-                        for worker_type, count in queue_sizes.items():
-                            old_count = last_queue_sizes.get(worker_type, 0)
-                            if count != old_count:
-                                if count > old_count:
-                                    progress_layout.log_event(
-                                        f"{worker_type.capitalize()} queue: {old_count} → {count}",
-                                        style="yellow"
-                                    )
-                                else:
-                                    progress_layout.log_event(
-                                        f"{worker_type.capitalize()} queue: {old_count} → {count}",
-                                        style="green"
-                                    )
-                        last_queue_sizes = queue_sizes.copy()
+                    active_qs = Crawl.objects.exclude(status__in=Crawl.FINAL_STATES)
+                    if self.crawl_id:
+                        active_qs = active_qs.filter(id=self.crawl_id)
+                    active_qs = active_qs.order_by('retry_at')
+
+                    recent_done_qs = Crawl.objects.filter(
+                        status__in=Crawl.FINAL_STATES,
+                        modified_at__gte=recent_cutoff,
+                    )
+                    if self.crawl_id:
+                        recent_done_qs = recent_done_qs.filter(id=self.crawl_id)
+                    recent_done_qs = recent_done_qs.order_by('-modified_at')
+
+                    crawls = list(active_qs)
+                    active_ids = {c.id for c in crawls}
+                    for crawl in recent_done_qs:
+                        if crawl.id not in active_ids:
+                            crawls.append(crawl)
+
+                    def _abbrev(text: str, max_len: int = 80) -> str:
+                        return text if len(text) <= max_len else f"{text[:max_len - 3]}..."
+
+                    tree_data: list[dict] = []
+                    for crawl in crawls:
+                        urls = crawl.get_urls_list()
+                        url_count = len(urls)
+                        label = f"{url_count} url" + ("s" if url_count != 1 else "")
+                        label = _abbrev(label)
+
+                        snapshots = []
+                        snap_qs = Snapshot.objects.filter(crawl_id=crawl.id)
+                        active_snaps = list(
+                            snap_qs.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED])
+                            .order_by('created_at')[:16]
+                        )
+                        recent_snaps = list(
+                            snap_qs.filter(status__in=Snapshot.FINAL_STATES)
+                            .order_by('-modified_at')[:8]
+                        )
+                        snap_ids = {s.id for s in active_snaps}
+                        for s in recent_snaps:
+                            if s.id not in snap_ids:
+                                active_snaps.append(s)
+
+                        for snap in active_snaps:
+                            total = snap.archiveresult_set.count()
+                            completed = snap.archiveresult_set.filter(status__in=[
+                                ArchiveResult.StatusChoices.SUCCEEDED,
+                                ArchiveResult.StatusChoices.SKIPPED,
+                                ArchiveResult.StatusChoices.FAILED,
+                            ]).count()
+                            running = snap.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED).count()
+                            try:
+                                from archivebox.config.configset import get_config
+                                from archivebox.hooks import discover_hooks
+                                hooks_list = discover_hooks('Snapshot', config=get_config(snapshot=snap))
+                                total_hooks = len(hooks_list)
+                                hooks_by_snapshot[str(snap.id)] = hooks_list
+                            except Exception:
+                                total_hooks = total
+                            pending = max(total_hooks - completed - running, 0)
+                            snap_label = _abbrev(snap.url or str(snap.id), max_len=60)
+                            snapshots.append({
+                                'id': str(snap.id),
+                                'status': snap.status,
+                                'label': snap_label,
+                                'hooks': {'completed': completed, 'running': running, 'pending': pending} if total else {},
+                            })
+                            pending_snapshot_candidates.append(snap)
+
+                        tree_data.append({
+                            'id': str(crawl.id),
+                            'status': crawl.status,
+                            'label': label,
+                            'snapshots': snapshots,
+                        })
+
+                    progress_layout.update_crawl_tree(tree_data)
+
+                    # Update running process panels (tail stdout/stderr for each running process)
+                    from archivebox.machine.models import Process
+                    if self.crawl_id and getattr(self, 'db_process', None):
+                        process_qs = self.db_process.get_descendants(include_self=False)
+                        process_qs = process_qs.filter(status=Process.StatusChoices.RUNNING)
+                    else:
+                        process_qs = Process.objects.filter(
+                            status=Process.StatusChoices.RUNNING,
+                        ).exclude(process_type=Process.TypeChoices.ORCHESTRATOR)
+
+                    running_processes = [
+                        proc for proc in process_qs.order_by('process_type', 'worker_type', 'started_at')
+                        if proc.is_running
+                    ]
+                    pending_processes = []
+                    try:
+                        from types import SimpleNamespace
+                        for snap in pending_snapshot_candidates:
+                            hooks_list = hooks_by_snapshot.get(str(snap.id), [])
+                            if not hooks_list:
+                                continue
+                            existing = set(
+                                snap.archiveresult_set.exclude(hook_name='').values_list('hook_name', flat=True)
+                            )
+                            for hook_path in hooks_list:
+                                if hook_path.name in existing:
+                                    continue
+                                pending_processes.append(SimpleNamespace(
+                                    process_type='hook',
+                                    worker_type='',
+                                    pid=None,
+                                    cmd=['', str(hook_path)],
+                                    url=snap.url,
+                                    status='queued',
+                                    started_at=None,
+                                    timeout=None,
+                                    pwd=None,
+                                ))
+                    except Exception:
+                        pending_processes = []
+
+                    progress_layout.update_process_panels(running_processes, pending=pending_processes)
+
+                    last_queue_sizes = queue_sizes.copy()

                    # Update snapshot progress
                    from archivebox.core.models import Snapshot
@@ -641,11 +777,10 @@ class Orchestrator:
                            # Hooks created but none started yet
                            current_plugin = "waiting"

-                        # Update snapshot worker (show even if no hooks yet)
                        # Debug: Log first time we see this snapshot
-                        if snapshot.id not in progress_layout.snapshot_to_worker:
+                        if snapshot.id not in snapshot_progress:
                            progress_layout.log_event(
-                                f"Assigning to worker: {snapshot.url[:50]}",
+                                f"Tracking snapshot: {snapshot.url[:50]}",
                                style="grey53"
                            )

@@ -656,17 +791,21 @@ class Orchestrator:
                        if prev_progress != curr_progress:
                            prev_total, prev_completed, prev_plugin = prev_progress

-                            # Log hooks created
-                            if total > prev_total:
-                                progress_layout.log_event(
-                                    f"Hooks created: {total} for {snapshot.url[:40]}",
-                                    style="cyan"
-                                )
-
                            # Log hook completion
                            if completed > prev_completed:
+                                completed_ar = snapshot.archiveresult_set.filter(
+                                    status__in=['succeeded', 'skipped', 'failed']
+                                ).order_by('-end_ts', '-modified_at').first()
+                                hook_label = ''
+                                if completed_ar:
+                                    hook_name = completed_ar.hook_name or completed_ar.plugin or ''
+                                    if hook_name:
+                                        hook_label = hook_name.split('__')[-1] if '__' in hook_name else hook_name
+                                        hook_label = hook_label.replace('.py', '').replace('.js', '').replace('.sh', '').replace('.bg', '')
+                                if not hook_label:
+                                    hook_label = f"{completed}/{total}"
                                progress_layout.log_event(
-                                    f"Hook completed: {completed}/{total} for {snapshot.url[:40]}",
+                                    f"Hook completed: {hook_label}",
                                    style="green"
                                )

@@ -686,23 +825,15 @@ class Orchestrator:
                                style="red"
                            )

-                        progress_layout.update_snapshot_worker(
-                            snapshot_id=snapshot.id,
-                            url=snapshot.url,
-                            total=max(total, 1),  # Show at least 1 to avoid division by zero
-                            completed=completed,
-                            current_plugin=current_plugin,
-                        )
+                        # No per-snapshot panels; logs only

-                    # Remove snapshots that are no longer active
-                    for snapshot_id in list(progress_layout.snapshot_to_worker.keys()):
+                    # Cleanup progress tracking for completed snapshots
+                    for snapshot_id in list(snapshot_progress.keys()):
                        if snapshot_id not in active_ids:
                            progress_layout.log_event(
                                f"Snapshot completed/removed",
                                style="blue"
                            )
-                            progress_layout.remove_snapshot_worker(snapshot_id)
-                            # Also clean up progress tracking
                            if snapshot_id in snapshot_progress:
                                del snapshot_progress[snapshot_id]

@@ -734,6 +865,7 @@ class Orchestrator:
            if progress_layout:
                progress_layout.log_event("Interrupted by user", style="red")
            print()  # Newline after ^C
+            self.on_shutdown(error=KeyboardInterrupt())
        except BaseException as e:
            if progress_layout:
                progress_layout.log_event(f"Error: {e}", style="red")
--- a/archivebox/workers/tests/test_orchestrator.py
+++ b/archivebox/workers/tests/test_orchestrator.py
@@ -215,6 +215,46 @@ class TestOrchestratorWithProcess(TestCase):
            mock_count.assert_called()
            self.assertTrue(result)

+    def test_orchestrator_scoped_worker_count(self):
+        """Orchestrator with crawl_id should count only descendant workers."""
+        import time
+        from archivebox.machine.models import Process, Machine
+
+        machine = Machine.current()
+        orchestrator = Orchestrator(exit_on_idle=True, crawl_id='test-crawl')
+
+        orchestrator.db_process = Process.objects.create(
+            machine=machine,
+            process_type=Process.TypeChoices.ORCHESTRATOR,
+            status=Process.StatusChoices.RUNNING,
+            pid=12345,
+            started_at=timezone.now(),
+        )
+
+        # Prevent cleanup from marking fake PIDs as exited
+        orchestrator._last_cleanup_time = time.time()
+
+        Process.objects.create(
+            machine=machine,
+            process_type=Process.TypeChoices.WORKER,
+            worker_type='crawl',
+            status=Process.StatusChoices.RUNNING,
+            pid=12346,
+            parent=orchestrator.db_process,
+            started_at=timezone.now(),
+        )
+
+        Process.objects.create(
+            machine=machine,
+            process_type=Process.TypeChoices.WORKER,
+            worker_type='crawl',
+            status=Process.StatusChoices.RUNNING,
+            pid=12347,
+            started_at=timezone.now(),
+        )
+
+        self.assertEqual(orchestrator.get_total_worker_count(), 1)
+

 class TestProcessBasedWorkerTracking(TestCase):
    """Test Process model methods that replace pid_utils functionality."""
--- a/archivebox/workers/worker.py
+++ b/archivebox/workers/worker.py
@@ -23,6 +23,7 @@ from django.db.models import QuerySet
 from django.utils import timezone
 from django.conf import settings

+from statemachine.exceptions import TransitionNotAllowed
 from rich import print

 from archivebox.misc.logging_util import log_worker_event
@@ -450,13 +451,34 @@ class CrawlWorker(Worker):
    def runloop(self) -> None:
        """Run crawl state machine, spawn SnapshotWorkers."""
        import sys
+        from archivebox.crawls.models import Crawl
        self.on_startup()

        try:
            print(f'🔄 CrawlWorker starting for crawl {self.crawl_id}', file=sys.stderr)

+            if self.crawl.status == Crawl.StatusChoices.SEALED:
+                print(
+                    '✅ This crawl has already completed and there are no tasks remaining.\n'
+                    '   To re-crawl it, create a new crawl with the same URLs, e.g.\n'
+                    '   archivebox crawl create <urls> | archivebox run',
+                    file=sys.stderr,
+                )
+                return
+
            # Advance state machine: QUEUED → STARTED (triggers run() via @started.enter)
-            self.crawl.sm.tick()
+            try:
+                self.crawl.sm.tick()
+            except TransitionNotAllowed:
+                if self.crawl.status == Crawl.StatusChoices.SEALED:
+                    print(
+                        '✅ This crawl has already completed and there are no tasks remaining.\n'
+                        '   To re-crawl it, create a new crawl with the same URLs, e.g.\n'
+                        '   archivebox crawl create <urls> | archivebox run',
+                        file=sys.stderr,
+                    )
+                    return
+                raise
            self.crawl.refresh_from_db()
            print(f'🔄 tick() complete, crawl status={self.crawl.status}', file=sys.stderr)

@@ -509,13 +531,20 @@ class CrawlWorker(Worker):
            status__in=['running', 'started'],
        )

-        # Extract snapshot IDs from their pwd (contains snapshot ID at the end)
+        # Extract snapshot IDs from worker cmd args (more reliable than pwd paths)
        running_snapshot_ids = []
        for proc in running_processes:
-            if proc.pwd:
-                # pwd is like: /path/to/archive/{timestamp}
-                # We need to match this against snapshot.output_dir
-                running_snapshot_ids.append(proc.pwd)
+            cmd = proc.cmd or []
+            snapshot_id = None
+            for i, part in enumerate(cmd):
+                if part == '--snapshot-id' and i + 1 < len(cmd):
+                    snapshot_id = cmd[i + 1]
+                    break
+                if part.startswith('--snapshot-id='):
+                    snapshot_id = part.split('=', 1)[1]
+                    break
+            if snapshot_id:
+                running_snapshot_ids.append(snapshot_id)

        # Find snapshots that don't have a running worker
        all_snapshots = Snapshot.objects.filter(
@@ -526,7 +555,7 @@ class CrawlWorker(Worker):
        # Filter out snapshots that already have workers
        pending_snapshots = [
            snap for snap in all_snapshots
-            if snap.output_dir not in running_snapshot_ids
+            if str(snap.id) not in running_snapshot_ids
        ][:self.MAX_SNAPSHOT_WORKERS - running_count]

        with open(debug_log, 'a') as f:
@@ -631,7 +660,6 @@ class SnapshotWorker(Worker):
        b. If foreground: wait for completion
        c. If background: track but continue to next hook
        d. Update ArchiveResult status
-        e. Advance current_step when all step's hooks complete
    4. When all hooks done: seal snapshot
    5. On shutdown: SIGTERM all background hooks
    """
@@ -662,7 +690,7 @@ class SnapshotWorker(Worker):

    def runloop(self) -> None:
        """Execute all hooks sequentially."""
-        from archivebox.hooks import discover_hooks, is_background_hook, extract_step
+        from archivebox.hooks import discover_hooks, is_background_hook
        from archivebox.core.models import ArchiveResult
        from archivebox.config.configset import get_config

@@ -679,8 +707,7 @@ class SnapshotWorker(Worker):
            # Execute each hook sequentially
            for hook_path in hooks:
                hook_name = hook_path.name
-                plugin = self._extract_plugin_name(hook_name)
-                hook_step = extract_step(hook_name)
+                plugin = self._extract_plugin_name(hook_path, hook_name)
                is_background = is_background_hook(hook_name)

                # Create ArchiveResult for THIS HOOK (not per plugin)
@@ -724,16 +751,18 @@ class SnapshotWorker(Worker):
                        pid=self.pid,
                    )

-                # Check if we can advance to next step
-                self._try_advance_step()
+                # Reap any background hooks that finished while we worked
+                self._reap_background_hooks()

-            # All hooks launched (or completed) - seal using state machine
+            # All hooks launched (or completed) - terminate bg hooks and seal
+            self._finalize_background_hooks()
            # This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing
            self.snapshot.sm.seal()
            self.snapshot.refresh_from_db()

        except Exception as e:
            # Mark snapshot as sealed even on error (still triggers cleanup)
+            self._finalize_background_hooks()
            self.snapshot.sm.seal()
            self.snapshot.refresh_from_db()
            raise
@@ -753,7 +782,6 @@ class SnapshotWorker(Worker):
            script=hook_path,
            output_dir=output_dir,
            config=config,
-            timeout=120,
            parent=self.db_process,
            url=str(self.snapshot.url),
            snapshot_id=str(self.snapshot.id),
@@ -773,12 +801,22 @@ class SnapshotWorker(Worker):
        except TimeoutError:
            # Hook exceeded timeout - kill it
            process.kill(signal_num=9)
-            exit_code = -1
+            exit_code = process.exit_code or 137

        # Update ArchiveResult from hook output
        ar.update_from_output()
        ar.end_ts = timezone.now()

+        # Apply hook-emitted JSONL records regardless of exit code
+        from archivebox.hooks import extract_records_from_process, process_hook_records
+
+        records = extract_records_from_process(process)
+        if records:
+            process_hook_records(
+                records,
+                overrides={'snapshot': self.snapshot, 'crawl': self.snapshot.crawl},
+            )
+
        # Determine final status from hook exit code
        if exit_code == 0:
            ar.status = ar.StatusChoices.SUCCEEDED
@@ -787,34 +825,53 @@ class SnapshotWorker(Worker):

        ar.save(update_fields=['status', 'end_ts', 'modified_at'])

-    def _try_advance_step(self) -> None:
-        """Advance current_step if all foreground hooks in current step are done."""
-        from django.db.models import Q
+    def _finalize_background_hooks(self) -> None:
+        """Gracefully terminate background hooks and update their ArchiveResults."""
+        if getattr(self, '_background_hooks_finalized', False):
+            return
+
+        self._background_hooks_finalized = True
+
+        # Send SIGTERM and wait up to each hook's remaining timeout
+        self._terminate_background_hooks(
+            background_processes=self.background_processes,
+            worker_type='SnapshotWorker',
+            indent_level=2,
+        )
+
+        # Clear to avoid double-termination during on_shutdown
+        self.background_processes = {}
+
+        # Update STARTED background results now that hooks are done
        from archivebox.core.models import ArchiveResult

-        current_step = self.snapshot.current_step
-
-        # Single query: foreground hooks in current step that aren't finished
-        # Foreground hooks: hook_name doesn't contain '.bg.'
-        pending_foreground = self.snapshot.archiveresult_set.filter(
-            Q(hook_name__contains=f'__{current_step}_') &  # Current step
-            ~Q(hook_name__contains='.bg.') &  # Not background
-            ~Q(status__in=ArchiveResult.FINAL_STATES)  # Not finished
-        ).exists()
-
-        if pending_foreground:
-            return  # Still waiting for hooks
-
-        # All foreground hooks done - advance!
-        self.snapshot.current_step += 1
-        self.snapshot.save(update_fields=['current_step', 'modified_at'])
-
-        log_worker_event(
-            worker_type='SnapshotWorker',
-            event=f'Advanced to step {self.snapshot.current_step}',
-            indent_level=2,
-            pid=self.pid,
+        started_bg = self.snapshot.archiveresult_set.filter(
+            status=ArchiveResult.StatusChoices.STARTED,
+            hook_name__contains='.bg.',
        )
+        for ar in started_bg:
+            ar.update_from_output()
+
+    def _reap_background_hooks(self) -> None:
+        """Update ArchiveResults for background hooks that already exited."""
+        if getattr(self, '_background_hooks_finalized', False):
+            return
+        if not self.background_processes:
+            return
+
+        from archivebox.core.models import ArchiveResult
+
+        for hook_name, process in list(self.background_processes.items()):
+            exit_code = process.poll()
+            if exit_code is None:
+                continue
+
+            ar = self.snapshot.archiveresult_set.filter(hook_name=hook_name).first()
+            if ar and ar.status == ArchiveResult.StatusChoices.STARTED:
+                ar.update_from_output()
+
+            # Remove completed hook from tracking
+            self.background_processes.pop(hook_name, None)

    def on_shutdown(self, error: BaseException | None = None) -> None:
        """
@@ -834,12 +891,15 @@ class SnapshotWorker(Worker):
        super().on_shutdown(error)

    @staticmethod
-    def _extract_plugin_name(hook_name: str) -> str:
-        """Extract plugin name from hook filename."""
-        # on_Snapshot__50_wget.py -> wget
-        name = hook_name.split('__')[-1]  # Get part after last __
+    def _extract_plugin_name(hook_path: Path, hook_name: str) -> str:
+        """Extract plugin name from hook path (fallback to filename)."""
+        plugin_dir = hook_path.parent.name
+        if plugin_dir not in ('plugins', '.'):
+            return plugin_dir
+        # Fallback: on_Snapshot__50_wget.py -> wget
+        name = hook_name.split('__')[-1]
        name = name.replace('.py', '').replace('.js', '').replace('.sh', '')
-        name = name.replace('.bg', '')  # Remove .bg suffix
+        name = name.replace('.bg', '')
        return name


@@ -888,7 +948,7 @@ class BinaryWorker(Worker):
                machine=machine,
                status=Binary.StatusChoices.QUEUED,
                retry_at__lte=timezone.now()
-            ).order_by('retry_at')
+            ).order_by('retry_at', 'created_at', 'name')

    def runloop(self) -> None:
        """Install binary(ies)."""