""" Orchestrator for managing worker processes. The Orchestrator polls the Crawl queue and spawns CrawlWorkers as needed. Orchestrator (takes list of specific crawls | polls for pending queued crawls forever) spawns: └── CrawlWorker(s) (one per active Crawl) └── SnapshotWorker(s) (one per Snapshot, up to limit) └── Hook Processes (sequential, forked by SnapshotWorker) e.g on_Snapshot__23_save_pdf.js on_Snapshot__24_save_screenshot.js ... Usage: # Default: runs forever (for use as subprocess of server) orchestrator = Orchestrator(exit_on_idle=False) orchestrator.runloop() # Exit when done (for embedded use in other commands) orchestrator = Orchestrator(exit_on_idle=True) orchestrator.runloop() # Or run via CLI archivebox manage orchestrator # runs forever archivebox manage orchestrator --exit-on-idle # exits when done """ __package__ = 'archivebox.workers' import os import sys import time from typing import Type from datetime import datetime, timedelta from multiprocessing import Process as MPProcess from pathlib import Path from django.db import connections from django.utils import timezone from rich import print from archivebox.misc.logging_util import log_worker_event from .worker import Worker, BinaryWorker, CrawlWorker def _run_orchestrator_process(exit_on_idle: bool) -> None: """Top-level function for multiprocessing (must be picklable).""" import os os.environ['ARCHIVEBOX_ORCHESTRATOR_PROCESS'] = '1' from archivebox.config.django import setup_django setup_django() orchestrator = Orchestrator(exit_on_idle=exit_on_idle) orchestrator.runloop() class Orchestrator: """ Manages worker processes by polling queues and spawning workers as needed. The orchestrator: 1. Polls Crawl queue 2. If crawls exist and fewer than MAX_CRAWL_WORKERS are running, spawns CrawlWorkers 3. Monitors worker health and cleans up stale PIDs 4. Exits when queue is empty (unless daemon mode) Architecture: - Orchestrator spawns CrawlWorkers (one per active Crawl) - Each CrawlWorker spawns SnapshotWorkers (one per Snapshot, up to limit) - Each SnapshotWorker runs hooks sequentially for its snapshot """ # BinaryWorker (singleton daemon) and CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator WORKER_TYPES: list[Type[Worker]] = [BinaryWorker, CrawlWorker] # Configuration POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds) IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit) MAX_CRAWL_WORKERS: int = 8 # Max crawls processing simultaneously MAX_BINARY_WORKERS: int = 1 # Max binaries installing simultaneously (sequential only) def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None): self.exit_on_idle = exit_on_idle self.crawl_id = crawl_id # If set, only process work for this crawl self.pid: int = os.getpid() self.pid_file = None self.idle_count: int = 0 self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running() self._last_hard_timeout_check: float = 0.0 # Throttle hard timeout enforcement # In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker if self.exit_on_idle: self.MAX_CRAWL_WORKERS = 1 # Faster UI updates for interactive runs self.POLL_INTERVAL = 0.25 # Exit quickly once idle in foreground mode self.IDLE_TIMEOUT = 1 def __repr__(self) -> str: return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]' @classmethod def is_running(cls) -> bool: """Check if an orchestrator is already running.""" from archivebox.machine.models import Process # Clean up stale processes before counting Process.cleanup_stale_running() return Process.get_running_count(process_type=Process.TypeChoices.ORCHESTRATOR) > 0 def on_startup(self) -> None: """Called when orchestrator starts.""" from archivebox.machine.models import Process self.pid = os.getpid() # Register orchestrator process in database with explicit type self.db_process = Process.current() # Ensure the process type is correctly set to ORCHESTRATOR if self.db_process.process_type != Process.TypeChoices.ORCHESTRATOR: self.db_process.process_type = Process.TypeChoices.ORCHESTRATOR self.db_process.save(update_fields=['process_type']) # Clean up any stale Process records from previous runs stale_count = Process.cleanup_stale_running() # Foreground runs should start fast; skip expensive orphan cleanup unless in daemon mode. chrome_count = 0 orphaned_workers = 0 if not self.exit_on_idle: # Clean up orphaned Chrome processes from previous crashes chrome_count = Process.cleanup_orphaned_chrome() # Clean up orphaned workers from previous crashes orphaned_workers = Process.cleanup_orphaned_workers() # Collect startup metadata metadata = { 'max_crawl_workers': self.MAX_CRAWL_WORKERS, 'poll_interval': self.POLL_INTERVAL, } if stale_count: metadata['cleaned_stale_pids'] = stale_count if chrome_count: metadata['cleaned_orphaned_chrome'] = chrome_count if orphaned_workers: metadata['cleaned_orphaned_workers'] = orphaned_workers log_worker_event( worker_type='Orchestrator', event='Starting...', indent_level=0, pid=self.pid, metadata=metadata, ) def terminate_all_workers(self) -> None: """Terminate all running worker processes.""" from archivebox.machine.models import Process # Get running worker processes scoped to this orchestrator when possible if getattr(self, 'db_process', None): running_workers = self._get_scoped_running_workers() else: running_workers = Process.objects.filter( process_type=Process.TypeChoices.WORKER, status=Process.StatusChoices.RUNNING, ) for worker_process in running_workers: try: # Gracefully terminate the worker and update Process status worker_process.terminate(graceful_timeout=5.0) except Exception: pass def on_shutdown(self, error: BaseException | None = None) -> None: """Called when orchestrator shuts down.""" # Terminate all worker processes on shutdown self.terminate_all_workers() # Update Process record status if hasattr(self, 'db_process') and self.db_process: # KeyboardInterrupt is a graceful shutdown, not an error self.db_process.exit_code = 1 if error and not isinstance(error, KeyboardInterrupt) else 0 self.db_process.status = self.db_process.StatusChoices.EXITED self.db_process.ended_at = timezone.now() self.db_process.save() log_worker_event( worker_type='Orchestrator', event='Shutting down', indent_level=0, pid=self.pid, error=error if isinstance(error, Exception) and not isinstance(error, KeyboardInterrupt) else None, ) def get_total_worker_count(self) -> int: """Get total count of running workers across all types.""" from archivebox.machine.models import Process import time # Throttle cleanup to once every 30 seconds to avoid performance issues CLEANUP_THROTTLE_SECONDS = 30 now = time.time() if now - self._last_cleanup_time > CLEANUP_THROTTLE_SECONDS: Process.cleanup_stale_running() self._last_cleanup_time = now if self.crawl_id and getattr(self, 'db_process', None): return self._get_scoped_running_workers().count() return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES) def get_running_workers_for_type(self, WorkerClass: Type[Worker]) -> int: """Get count of running workers for a specific worker type.""" if self.crawl_id and getattr(self, 'db_process', None): return self._get_scoped_running_workers().filter(worker_type=WorkerClass.name).count() return len(WorkerClass.get_running_workers()) def _get_scoped_running_workers(self): """Get running workers scoped to this orchestrator process tree.""" from archivebox.machine.models import Process descendants = self.db_process.get_descendants(include_self=False) return descendants.filter( process_type=Process.TypeChoices.WORKER, status=Process.StatusChoices.RUNNING, ) def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool: """Determine if we should spawn a new worker.""" if queue_count == 0: return False # Get appropriate limit based on worker type if WorkerClass.name == 'crawl': max_workers = self.MAX_CRAWL_WORKERS elif WorkerClass.name == 'binary': max_workers = self.MAX_BINARY_WORKERS # Force sequential: only 1 binary at a time else: max_workers = 1 # Default for unknown types # Check worker limit if self.crawl_id and getattr(self, 'db_process', None) and WorkerClass.name != 'binary': running_count = self._get_scoped_running_workers().filter(worker_type=WorkerClass.name).count() else: running_workers = WorkerClass.get_running_workers() running_count = len(running_workers) if running_count >= max_workers: return False # Check if we already have enough workers for the queue size # Spawn more gradually - don't flood with workers if running_count > 0 and queue_count <= running_count * WorkerClass.MAX_CONCURRENT_TASKS: return False return True def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None: """Spawn a new worker process. Returns PID or None if spawn failed.""" try: pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id) # CRITICAL: Block until worker registers itself in Process table # This prevents race condition where orchestrator spawns multiple workers # before any of them finish on_startup() and register from archivebox.machine.models import Process import time timeout = 5.0 # seconds to wait for worker registration poll_interval = 0.1 # check every 100ms elapsed = 0.0 spawn_time = timezone.now() while elapsed < timeout: # Check if worker process is registered with strict criteria: # 1. Correct PID # 2. WORKER process type # 3. RUNNING status # 4. Parent is this orchestrator # 5. Started recently (within last 10 seconds) worker_process = Process.objects.filter( pid=pid, process_type=Process.TypeChoices.WORKER, status=Process.StatusChoices.RUNNING, parent_id=self.db_process.id, started_at__gte=spawn_time - timedelta(seconds=10), ).first() if worker_process: # Worker successfully registered! return pid time.sleep(poll_interval) elapsed += poll_interval # Timeout - worker failed to register log_worker_event( worker_type='Orchestrator', event='Worker failed to register in time', indent_level=0, pid=self.pid, metadata={'worker_type': WorkerClass.name, 'worker_pid': pid, 'timeout': timeout}, ) return None except Exception as e: log_worker_event( worker_type='Orchestrator', event='Failed to spawn worker', indent_level=0, pid=self.pid, metadata={'worker_type': WorkerClass.name}, error=e, ) return None def check_queues_and_spawn_workers(self) -> dict[str, int]: """ Check Binary and Crawl queues and spawn workers as needed. Returns dict of queue sizes. """ from archivebox.crawls.models import Crawl from archivebox.machine.models import Binary, Machine queue_sizes = {} self._enforce_hard_timeouts() materialized_schedule_count = self._materialize_due_schedules() # Check Binary queue machine = Machine.current() binary_queue = Binary.objects.filter( machine=machine, status=Binary.StatusChoices.QUEUED, retry_at__lte=timezone.now() ).order_by('retry_at') binary_count = binary_queue.count() queue_sizes['binary'] = binary_count # Spawn BinaryWorker if needed (singleton - max 1 BinaryWorker, processes ALL binaries) if binary_count > 0: running_binary_workers_list = BinaryWorker.get_running_workers() if len(running_binary_workers_list) == 0: BinaryWorker.start(parent=self.db_process) # Check if any BinaryWorkers are still running running_binary_workers = len(BinaryWorker.get_running_workers()) # Check Crawl queue crawl_queue = Crawl.objects.filter( retry_at__lte=timezone.now() ).exclude( status__in=Crawl.FINAL_STATES ) # Prevent duplicate CrawlWorkers for the same crawl (even across orchestrators) from archivebox.machine.models import Process running_crawl_ids: set[str] = set() running_crawl_workers = Process.objects.filter( process_type=Process.TypeChoices.WORKER, worker_type='crawl', status=Process.StatusChoices.RUNNING, ).values_list('env', flat=True) for env in running_crawl_workers: if isinstance(env, dict): crawl_id = env.get('CRAWL_ID') if crawl_id: running_crawl_ids.add(str(crawl_id)) if running_crawl_ids: crawl_queue = crawl_queue.exclude(id__in=running_crawl_ids) # Apply crawl_id filter if set if self.crawl_id: crawl_queue = crawl_queue.filter(id=self.crawl_id) crawl_queue = crawl_queue.order_by('retry_at') crawl_count = crawl_queue.count() queue_sizes['crawl'] = crawl_count # CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running # This ensures all binaries are installed before snapshots start processing if binary_count == 0 and running_binary_workers == 0 and materialized_schedule_count == 0: # Spawn CrawlWorker if needed if self.should_spawn_worker(CrawlWorker, crawl_count): # Claim next crawl crawl = crawl_queue.first() if crawl and self._claim_crawl(crawl): CrawlWorker.start(parent=self.db_process, crawl_id=str(crawl.id)) return queue_sizes def _refresh_db_connections(self) -> None: """ Drop long-lived DB connections before each poll tick. The daemon orchestrator must observe rows created by sibling processes (server requests, CLI helpers, docker-compose run invocations). With SQLite, reusing the same connection indefinitely can miss externally committed rows until the process reconnects. """ connections.close_all() def _should_process_schedules(self) -> bool: return (not self.exit_on_idle) and (self.crawl_id is None) def _materialize_due_schedules(self) -> int: if not self._should_process_schedules(): return 0 from archivebox.crawls.models import CrawlSchedule now = timezone.now() due_schedules = CrawlSchedule.objects.filter(is_enabled=True).select_related('template', 'template__created_by') materialized_count = 0 for schedule in due_schedules: if not schedule.is_due(now): continue schedule.enqueue(queued_at=now) materialized_count += 1 return materialized_count def _enforce_hard_timeouts(self) -> None: """Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits.""" import time from datetime import timedelta from archivebox.config.constants import CONSTANTS from archivebox.machine.models import Process from archivebox.core.models import Snapshot, ArchiveResult from archivebox.crawls.models import Crawl throttle_seconds = 30 now_ts = time.time() if now_ts - self._last_hard_timeout_check < throttle_seconds: return self._last_hard_timeout_check = now_ts now = timezone.now() # Hard limit for hook processes / archiveresults hook_cutoff = now - timedelta(seconds=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS) overdue_hooks = Process.objects.filter( process_type=Process.TypeChoices.HOOK, status=Process.StatusChoices.RUNNING, started_at__lt=hook_cutoff, ).select_related('archiveresult') for proc in overdue_hooks: try: proc.kill_tree(graceful_timeout=0.0) except Exception: pass ar = getattr(proc, 'archiveresult', None) if ar and ar.status == ArchiveResult.StatusChoices.STARTED: ar.status = ArchiveResult.StatusChoices.FAILED ar.end_ts = now ar.retry_at = None ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at']) # Hard limit for snapshots snapshot_cutoff = now - timedelta(seconds=CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS) overdue_snapshots = Snapshot.objects.filter( status=Snapshot.StatusChoices.STARTED, modified_at__lt=snapshot_cutoff, ) overdue_snapshot_ids = {str(s.id) for s in overdue_snapshots} if overdue_snapshot_ids: running_snapshot_workers = Process.objects.filter( process_type=Process.TypeChoices.WORKER, worker_type='snapshot', status=Process.StatusChoices.RUNNING, ) for proc in running_snapshot_workers: env = proc.env or {} if isinstance(env, dict) and str(env.get('SNAPSHOT_ID', '')) in overdue_snapshot_ids: try: proc.terminate(graceful_timeout=1.0) except Exception: pass for snapshot in overdue_snapshots: running_hooks = Process.objects.filter( archiveresult__snapshot=snapshot, process_type=Process.TypeChoices.HOOK, status=Process.StatusChoices.RUNNING, ).distinct() for process in running_hooks: try: process.kill_tree(graceful_timeout=0.0) except Exception: continue snapshot.archiveresult_set.filter( status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED], ).update( status=ArchiveResult.StatusChoices.FAILED, end_ts=now, retry_at=None, modified_at=now, ) snapshot.cleanup() snapshot.status = Snapshot.StatusChoices.SEALED snapshot.retry_at = None snapshot.save(update_fields=['status', 'retry_at', 'modified_at']) crawl = snapshot.crawl if crawl and crawl.is_finished(): crawl.status = crawl.StatusChoices.SEALED crawl.retry_at = None crawl.save(update_fields=['status', 'retry_at', 'modified_at']) # Reconcile snapshot/crawl state with running archiveresults started_snapshot_ids = list( ArchiveResult.objects.filter( status=ArchiveResult.StatusChoices.STARTED, ).values_list('snapshot_id', flat=True).distinct() ) if started_snapshot_ids: Snapshot.objects.filter( id__in=started_snapshot_ids, ).exclude( status=Snapshot.StatusChoices.SEALED, ).exclude( status=Snapshot.StatusChoices.STARTED, ).update( status=Snapshot.StatusChoices.STARTED, retry_at=None, modified_at=now, ) Crawl.objects.filter( snapshot_set__id__in=started_snapshot_ids, status=Crawl.StatusChoices.QUEUED, ).distinct().update( status=Crawl.StatusChoices.STARTED, retry_at=None, modified_at=now, ) # If a snapshot is sealed, any still-started archiveresults should be failed sealed_snapshot_ids = list( Snapshot.objects.filter(status=Snapshot.StatusChoices.SEALED).values_list('id', flat=True) ) if sealed_snapshot_ids: started_ars = ArchiveResult.objects.filter( snapshot_id__in=sealed_snapshot_ids, status=ArchiveResult.StatusChoices.STARTED, ).select_related('process') for ar in started_ars: process_id = getattr(ar, 'process_id', None) if process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING: try: ar.process.kill_tree(graceful_timeout=0.0) except Exception: pass ar.status = ArchiveResult.StatusChoices.FAILED ar.end_ts = now ar.retry_at = None ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at']) # Clear queued/started snapshots that belong to sealed crawls Snapshot.objects.filter( crawl__status=Crawl.StatusChoices.SEALED, status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED], ).update( status=Snapshot.StatusChoices.SEALED, retry_at=None, modified_at=now, ) def _claim_crawl(self, crawl) -> bool: """Atomically claim a due crawl using the shared retry_at lock lifecycle.""" from archivebox.crawls.models import Crawl return Crawl.claim_for_worker(crawl, lock_seconds=24 * 60 * 60) def has_pending_work(self, queue_sizes: dict[str, int]) -> bool: """Check if any queue has pending work.""" return any(count > 0 for count in queue_sizes.values()) def has_running_workers(self) -> bool: """Check if any workers are still running.""" return self.get_total_worker_count() > 0 def has_future_work(self) -> bool: """Check if there's work scheduled for the future (retry_at > now) in Crawl queue.""" from archivebox.crawls.models import Crawl # Build filter for future work, respecting crawl_id if set qs = Crawl.objects.filter( retry_at__gt=timezone.now() ).exclude( status__in=Crawl.FINAL_STATES ) # Apply crawl_id filter if set if self.crawl_id: qs = qs.filter(id=self.crawl_id) return qs.count() > 0 def on_tick(self, queue_sizes: dict[str, int]) -> None: """Called each orchestrator tick. Override for custom behavior.""" # Tick logging suppressed to reduce noise pass def on_idle(self) -> None: """Called when orchestrator is idle (no work, no workers).""" # Idle logging suppressed to reduce noise pass def should_exit(self, queue_sizes: dict[str, int]) -> bool: """Determine if orchestrator should exit.""" if not self.exit_on_idle: return False if self.IDLE_TIMEOUT == 0: return False # Don't exit if there's pending or future work if self.has_pending_work(queue_sizes): return False if self.has_running_workers(): return False if self.has_future_work(): return False # Exit after idle timeout return self.idle_count >= self.IDLE_TIMEOUT def runloop(self) -> None: """Main orchestrator loop.""" from rich.live import Live from archivebox.misc.progress_layout import ArchiveBoxProgressLayout import os is_tty = sys.stdout.isatty() # Enable progress layout only in TTY + foreground mode show_progress = is_tty and self.exit_on_idle # When stdout is not a TTY, it may be reserved for JSONL pipeline output. # Keep the plain progress view, but emit it to stderr instead of stdout. plain_output = not is_tty self.on_startup() if not show_progress: # No progress layout - optionally emit plain lines for non-TTY output progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id) if plain_output else None self._run_orchestrator_loop(progress_layout, plain_output=plain_output) else: # Redirect worker subprocess output to /dev/null devnull_fd = os.open(os.devnull, os.O_WRONLY) # Save original stdout/stderr (make 2 copies - one for Console, one for restoring) original_stdout = sys.stdout.fileno() original_stderr = sys.stderr.fileno() stdout_for_console = os.dup(original_stdout) stdout_for_restore = os.dup(original_stdout) stderr_for_restore = os.dup(original_stderr) try: # Redirect stdout/stderr to /dev/null (workers will inherit this) os.dup2(devnull_fd, original_stdout) os.dup2(devnull_fd, original_stderr) # Create Console using saved stdout (not the redirected one) from rich.console import Console import archivebox.misc.logging as logging_module orchestrator_console = Console(file=os.fdopen(stdout_for_console, 'w'), force_terminal=True) # Update global CONSOLE so orchestrator logs appear too original_console = logging_module.CONSOLE logging_module.CONSOLE = orchestrator_console # Create layout and run with Live display progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id) with Live( progress_layout.get_layout(), refresh_per_second=8, screen=True, console=orchestrator_console, ): self._run_orchestrator_loop(progress_layout, plain_output=False) # Restore original console logging_module.CONSOLE = original_console finally: # Restore stdout/stderr os.dup2(stdout_for_restore, original_stdout) os.dup2(stderr_for_restore, original_stderr) # Cleanup try: os.close(devnull_fd) os.close(stdout_for_restore) os.close(stderr_for_restore) except OSError: pass # stdout_for_console is closed by orchestrator_console def _run_orchestrator_loop(self, progress_layout, plain_output: bool = False): """Run the main orchestrator loop with optional progress display.""" last_snapshot_count = None tick_count = 0 last_plain_lines: set[tuple[str, str]] = set() # Track snapshot progress to detect changes snapshot_progress = {} # snapshot_id -> (total, completed, current_plugin) try: while True: tick_count += 1 # Refresh DB state before polling so this long-lived daemon sees # work created by other processes using the same collection. self._refresh_db_connections() # Check queues and spawn workers queue_sizes = self.check_queues_and_spawn_workers() # Get worker counts for each type worker_counts = { WorkerClass.name: len(WorkerClass.get_running_workers()) for WorkerClass in self.WORKER_TYPES } # Update layout if enabled if progress_layout: # Get crawl queue and worker counts crawl_queue_count = queue_sizes.get('crawl', 0) crawl_workers_count = worker_counts.get('crawl', 0) # Determine orchestrator status if crawl_workers_count > 0: status = "Working" elif crawl_queue_count > 0: status = "Spawning" else: status = "Idle" binary_workers_count = worker_counts.get('binary', 0) # Update orchestrator status progress_layout.update_orchestrator_status( status=status, crawl_queue_count=crawl_queue_count, crawl_workers_count=crawl_workers_count, binary_queue_count=queue_sizes.get('binary', 0), binary_workers_count=binary_workers_count, max_crawl_workers=self.MAX_CRAWL_WORKERS, ) # Update crawl queue tree (active + recently completed) from archivebox.crawls.models import Crawl from archivebox.core.models import Snapshot, ArchiveResult recent_cutoff = timezone.now() - timedelta(minutes=5) pending_snapshot_candidates: list[Snapshot] = [] hooks_by_snapshot: dict[str, list] = {} active_qs = Crawl.objects.exclude(status__in=Crawl.FINAL_STATES) if self.crawl_id: active_qs = active_qs.filter(id=self.crawl_id) active_qs = active_qs.order_by('retry_at') recent_done_qs = Crawl.objects.filter( status__in=Crawl.FINAL_STATES, modified_at__gte=recent_cutoff, ) if self.crawl_id: recent_done_qs = recent_done_qs.filter(id=self.crawl_id) recent_done_qs = recent_done_qs.order_by('-modified_at') crawls = list(active_qs) active_ids = {c.id for c in crawls} for crawl in recent_done_qs: if crawl.id not in active_ids: crawls.append(crawl) def _abbrev(text: str, max_len: int = 80) -> str: return text if len(text) <= max_len else f"{text[:max_len - 3]}..." def _format_size(num_bytes: int | None) -> str: if not num_bytes: return '' size = float(num_bytes) for unit in ('b', 'kb', 'mb', 'gb', 'tb'): if size < 1024 or unit == 'tb': return f"{size:.1f}{unit}" size /= 1024 return '' def _format_seconds(total_seconds: float | None) -> str: if total_seconds is None: return '' seconds = max(0.0, float(total_seconds)) return f"{seconds:.1f}s" def _tail_stderr_line(proc) -> str: try: path = getattr(proc, 'stderr_file', None) if not path or not path.exists(): return '' with open(path, 'rb') as f: f.seek(0, os.SEEK_END) size = f.tell() f.seek(max(0, size - 4096)) data = f.read().decode('utf-8', errors='ignore') lines = [ln.strip() for ln in data.splitlines() if ln.strip()] return lines[-1] if lines else '' except Exception: return '' tree_data: list[dict] = [] for crawl in crawls: urls = crawl.get_urls_list() url_count = len(urls) label = f"{url_count} url" + ("s" if url_count != 1 else "") label = _abbrev(label) snapshots = [] snap_qs = Snapshot.objects.filter(crawl_id=crawl.id) active_snaps = list( snap_qs.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]) .order_by('created_at')[:16] ) recent_snaps = list( snap_qs.filter(status__in=Snapshot.FINAL_STATES) .order_by('-modified_at')[:8] ) snap_ids = {s.id for s in active_snaps} for s in recent_snaps: if s.id not in snap_ids: active_snaps.append(s) for snap in active_snaps: try: from archivebox.config.configset import get_config from archivebox.hooks import discover_hooks snap_config = get_config(snapshot=snap) hooks_list = discover_hooks('Snapshot', config=snap_config) hooks_by_snapshot[str(snap.id)] = hooks_list from archivebox.hooks import get_plugin_special_config hook_timeouts = {} for hook_path in hooks_list: plugin_name = hook_path.parent.name try: hook_timeouts[hook_path.name] = int(get_plugin_special_config(plugin_name, snap_config)['timeout']) except Exception: pass except Exception: hooks_list = [] hook_timeouts = {} try: from archivebox import DATA_DIR data_dir = Path(DATA_DIR) snap_path = snap.output_dir try: rel = Path(snap_path) if rel.is_absolute(): rel = rel.relative_to(data_dir) snap_path = f"./{rel}" if not str(rel).startswith("./") else str(rel) except Exception: snap_path = str(snap_path) ars = list( snap.archiveresult_set.select_related('process').order_by('start_ts') ) ar_by_hook = {ar.hook_name: ar for ar in ars if ar.hook_name} except Exception: snap_path = '' ar_by_hook = {} plugin_hooks: dict[str, list[dict]] = {} now = timezone.now() for hook_path in hooks_list: hook_name = hook_path.name is_bg = '.bg.' in hook_name ar = ar_by_hook.get(hook_name) status = 'pending' is_running = False is_pending = True elapsed = '' timeout = '' size = '' stderr_tail = '' if ar: process_id = getattr(ar, 'process_id', None) if process_id and ar.process: stderr_tail = _tail_stderr_line(ar.process) if ar.status == ArchiveResult.StatusChoices.STARTED: status = 'started' is_running = True is_pending = False start_ts = ar.start_ts or (ar.process.started_at if process_id and ar.process else None) if start_ts: elapsed = _format_seconds((now - start_ts).total_seconds()) hook_timeout = None if process_id and ar.process and ar.process.timeout: hook_timeout = ar.process.timeout hook_timeout = hook_timeout or hook_timeouts.get(hook_name) if hook_timeout: timeout = _format_seconds(hook_timeout) else: status = ar.status if process_id and ar.process and ar.process.exit_code == 137: status = 'failed' is_pending = False start_ts = ar.start_ts or (ar.process.started_at if process_id and ar.process else None) end_ts = ar.end_ts or (ar.process.ended_at if process_id and ar.process else None) if start_ts and end_ts: elapsed = _format_seconds((end_ts - start_ts).total_seconds()) size = _format_size(getattr(ar, 'output_size', None)) else: hook_timeout = hook_timeouts.get(hook_name) if hook_timeout: timeout = _format_seconds(hook_timeout) elapsed = _format_seconds(0) plugin_name = hook_path.parent.name if plugin_name in ('plugins', '.'): plugin_name = hook_name.split('__')[-1].split('.')[0] plugin_hooks.setdefault(plugin_name, []).append({ 'status': status, 'size': size, 'elapsed': elapsed, 'timeout': timeout, 'is_bg': is_bg, 'is_running': is_running, 'is_pending': is_pending, 'hook_name': hook_name, 'stderr': stderr_tail, }) hooks = [] for plugin_name, hook_entries in plugin_hooks.items(): running = next((h for h in hook_entries if h['is_running']), None) pending = next((h for h in hook_entries if h['is_pending']), None) any_failed = any(h['status'] == ArchiveResult.StatusChoices.FAILED for h in hook_entries) any_succeeded = any(h['status'] == ArchiveResult.StatusChoices.SUCCEEDED for h in hook_entries) any_skipped = any(h['status'] == ArchiveResult.StatusChoices.SKIPPED for h in hook_entries) stderr_tail = '' if running: status = 'started' is_running = True is_pending = False is_bg = running['is_bg'] elapsed = running.get('elapsed', '') timeout = running.get('timeout', '') stderr_tail = running.get('stderr', '') size = '' elif pending: status = 'pending' is_running = False is_pending = True is_bg = pending['is_bg'] elapsed = pending.get('elapsed', '') or _format_seconds(0) timeout = pending.get('timeout', '') stderr_tail = pending.get('stderr', '') size = '' else: is_running = False is_pending = False is_bg = any(h['is_bg'] for h in hook_entries) if any_failed: status = 'failed' elif any_succeeded: status = 'succeeded' elif any_skipped: status = 'skipped' else: status = 'skipped' for h in hook_entries: if h.get('stderr'): stderr_tail = h['stderr'] break total_elapsed = 0.0 has_elapsed = False for h in hook_entries: if h.get('elapsed'): try: total_elapsed += float(h['elapsed'].rstrip('s')) has_elapsed = True except Exception: pass elapsed = _format_seconds(total_elapsed) if has_elapsed else '' max_output = 0 # Use the largest output_size we already computed on ArchiveResult ar_sizes = [ ar_by_hook[h['hook_name']].output_size for h in hook_entries if h.get('hook_name') in ar_by_hook and getattr(ar_by_hook[h['hook_name']], 'output_size', 0) ] if ar_sizes: max_output = max(ar_sizes) size = _format_size(max_output) if max_output else '' timeout = '' hooks.append({ 'status': status, 'path': f"./{plugin_name}", 'size': size, 'elapsed': elapsed, 'timeout': timeout, 'is_bg': is_bg, 'is_running': is_running, 'is_pending': is_pending, 'stderr': stderr_tail, }) snap_label = _abbrev(f"{str(snap.id)[-8:]} {snap.url or ''}".strip(), max_len=80) snapshots.append({ 'id': str(snap.id), 'status': snap.status, 'label': snap_label, 'output_path': snap_path, 'hooks': hooks, }) pending_snapshot_candidates.append(snap) tree_data.append({ 'id': str(crawl.id), 'status': crawl.status, 'label': label, 'snapshots': snapshots, }) progress_layout.update_crawl_tree(tree_data) # Update running process panels (tail stdout/stderr for each running process) from archivebox.machine.models import Process if self.crawl_id and getattr(self, 'db_process', None): process_qs = self.db_process.get_descendants(include_self=False) process_qs = process_qs.filter(status=Process.StatusChoices.RUNNING) else: process_qs = Process.objects.filter( status=Process.StatusChoices.RUNNING, ).exclude(process_type=Process.TypeChoices.ORCHESTRATOR) running_processes = [ proc for proc in process_qs.order_by('process_type', 'worker_type', 'started_at') if proc.is_running ] pending_processes = [] try: from types import SimpleNamespace for snap in pending_snapshot_candidates: hooks_list = hooks_by_snapshot.get(str(snap.id), []) if not hooks_list: continue existing = set( snap.archiveresult_set.exclude(hook_name='').values_list('hook_name', flat=True) ) for hook_path in hooks_list: if hook_path.name in existing: continue pending_processes.append(SimpleNamespace( process_type='hook', worker_type='', pid=None, cmd=['', str(hook_path)], url=snap.url, status='queued', started_at=None, timeout=None, pwd=None, )) except Exception: pending_processes = [] progress_layout.update_process_panels(running_processes, pending=pending_processes) # Update snapshot progress from archivebox.core.models import Snapshot # Get all started snapshots (optionally filtered by crawl_id) snapshot_filter: dict[str, str | datetime] = {'status': 'started'} if self.crawl_id: snapshot_filter['crawl_id'] = self.crawl_id else: # Only if processing all crawls, filter by recent modified_at to avoid stale snapshots recent_cutoff = timezone.now() - timedelta(minutes=5) snapshot_filter['modified_at__gte'] = recent_cutoff active_snapshots = list(Snapshot.objects.filter(**snapshot_filter)) # Log snapshot count changes and details if len(active_snapshots) != last_snapshot_count: if last_snapshot_count is not None: if len(active_snapshots) > last_snapshot_count: progress_layout.log_event( f"Active snapshots: {last_snapshot_count} → {len(active_snapshots)}", style="cyan" ) # Log which snapshots started for snapshot in active_snapshots[-1:]: # Just show the newest one progress_layout.log_event( f"Started: {snapshot.url[:60]}", style="green" ) # Log SnapshotWorker count from archivebox.machine.models import Process all_workers = Process.objects.filter( process_type=Process.TypeChoices.WORKER, status__in=['running', 'started'] ).count() progress_layout.log_event( f"Workers running: {all_workers} ({crawl_workers_count} CrawlWorkers)", style="grey53" ) else: progress_layout.log_event( f"Active snapshots: {last_snapshot_count} → {len(active_snapshots)}", style="blue" ) last_snapshot_count = len(active_snapshots) # Track which snapshots are still active active_ids = set() for snapshot in active_snapshots: active_ids.add(snapshot.id) total = snapshot.archiveresult_set.count() completed = snapshot.archiveresult_set.filter( status__in=['succeeded', 'skipped', 'failed'] ).count() # Count hooks by status for debugging queued = snapshot.archiveresult_set.filter(status='queued').count() # Find currently running hook (ordered by hook_name to get lowest step number) current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first() if not current_ar: # If nothing running, show next queued item (ordered to get next in sequence) current_ar = snapshot.archiveresult_set.filter(status='queued').order_by('hook_name').first() current_plugin = '' if current_ar: # Use hook_name if available, otherwise plugin name hook_name = current_ar.hook_name or current_ar.plugin or '' # Extract just the hook name without path (e.g., "on_Snapshot__50_wget.py" -> "wget") if hook_name: # Clean up the name: remove prefix and extension clean_name = hook_name.split('__')[-1] if '__' in hook_name else hook_name clean_name = clean_name.replace('.py', '').replace('.sh', '').replace('.bg', '') current_plugin = clean_name elif total == 0: # Snapshot just started, hooks not created yet current_plugin = "initializing" elif queued > 0: # Hooks created but none started yet current_plugin = "waiting" # Debug: Log first time we see this snapshot if snapshot.id not in snapshot_progress: progress_layout.log_event( f"Tracking snapshot: {snapshot.url[:50]}", style="grey53" ) # Track progress changes prev_progress = snapshot_progress.get(snapshot.id, (0, 0, '')) curr_progress = (total, completed, current_plugin) if prev_progress != curr_progress: prev_total, prev_completed, prev_plugin = prev_progress # Log hook completion if completed > prev_completed: completed_ar = snapshot.archiveresult_set.filter( status__in=['succeeded', 'skipped', 'failed'] ).order_by('-end_ts', '-modified_at').first() hook_label = '' if completed_ar: hook_name = completed_ar.hook_name or completed_ar.plugin or '' if hook_name: hook_label = hook_name.split('__')[-1] if '__' in hook_name else hook_name hook_label = hook_label.replace('.py', '').replace('.js', '').replace('.sh', '').replace('.bg', '') if not hook_label: hook_label = f"{completed}/{total}" progress_layout.log_event( f"Hook completed: {hook_label}", style="green" ) # Log plugin change if current_plugin and current_plugin != prev_plugin: progress_layout.log_event( f"Running: {current_plugin} ({snapshot.url[:40]})", style="yellow" ) snapshot_progress[snapshot.id] = curr_progress # Debug: Every 10 ticks, log detailed status if stuck at initializing if tick_count % 10 == 0 and total == 0 and current_plugin == "initializing": progress_layout.log_event( f"DEBUG: Snapshot stuck at initializing (status={snapshot.status})", style="red" ) # No per-snapshot panels; logs only # Cleanup progress tracking for completed snapshots for snapshot_id in list(snapshot_progress.keys()): if snapshot_id not in active_ids: progress_layout.log_event( "Snapshot completed/removed", style="blue" ) if snapshot_id in snapshot_progress: del snapshot_progress[snapshot_id] if plain_output: plain_lines = progress_layout.plain_lines() new_lines = [line for line in plain_lines if line not in last_plain_lines] if new_lines: ts = timezone.now().strftime("%Y-%m-%d %H:%M:%S") for panel, line in new_lines: if line: print(f"[{ts}] [{panel}] {line}", file=sys.stderr) last_plain_lines = set(plain_lines) # Track idle state has_pending = self.has_pending_work(queue_sizes) has_running = self.has_running_workers() if has_pending or has_running: self.idle_count = 0 self.on_tick(queue_sizes) else: self.idle_count += 1 self.on_idle() # Check if we should exit if self.should_exit(queue_sizes): if progress_layout: progress_layout.log_event("All work complete", style="green") log_worker_event( worker_type='Orchestrator', event='All work complete', indent_level=0, pid=self.pid, ) break time.sleep(self.POLL_INTERVAL) except KeyboardInterrupt: if progress_layout: progress_layout.log_event("Interrupted by user", style="red") print(file=sys.stderr) # Newline after ^C self.on_shutdown(error=KeyboardInterrupt()) except BaseException as e: if progress_layout: progress_layout.log_event(f"Error: {e}", style="red") self.on_shutdown(error=e) raise else: self.on_shutdown() def start(self) -> int: """ Fork orchestrator as a background process. Returns the PID of the new process. """ # Use module-level function to avoid pickle errors with local functions proc = MPProcess( target=_run_orchestrator_process, args=(self.exit_on_idle,), name='orchestrator' ) proc.start() assert proc.pid is not None log_worker_event( worker_type='Orchestrator', event='Started in background', indent_level=0, pid=proc.pid, ) return proc.pid @classmethod def get_or_start(cls, exit_on_idle: bool = True) -> 'Orchestrator': """ Get running orchestrator or start a new one. Used by commands like 'add' to ensure orchestrator is running. """ if cls.is_running(): print('[grey53]👨‍✈️ Orchestrator already running[/grey53]', file=sys.stderr) # Return a placeholder - actual orchestrator is in another process return cls(exit_on_idle=exit_on_idle) orchestrator = cls(exit_on_idle=exit_on_idle) return orchestrator