# ArchiveBox/archivebox/workers/orchestrator.py
"""
Orchestrator for managing worker processes.
The Orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
and lazily spawns worker processes when there is work to be done.
Architecture:
Orchestrator (main loop, polls queues)
├── CrawlWorker subprocess(es)
├── SnapshotWorker subprocess(es)
└── ArchiveResultWorker subprocess(es)
└── Each worker spawns task subprocesses via CLI
Usage:
# Embedded in other commands (exits when done)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
# Daemon mode (runs forever)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.start() # fork and return
# Or run via CLI
archivebox orchestrator [--daemon]
"""

__package__ = 'archivebox.workers'

import os
import time

from typing import Type
from multiprocessing import Process

from django.utils import timezone
from rich import print

from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker
from .pid_utils import (
    write_pid_file,
    remove_pid_file,
    get_all_worker_pids,
    cleanup_stale_pid_files,
)


class Orchestrator:
"""
Manages worker processes by polling queues and spawning workers as needed.
The orchestrator:
1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
2. If items exist and fewer than MAX_CONCURRENT workers are running, spawns workers
3. Monitors worker health and cleans up stale PIDs
4. Exits when all queues are empty (unless daemon mode)
"""

    WORKER_TYPES: list[Type[Worker]] = [CrawlWorker, SnapshotWorker, ArchiveResultWorker]

    # Configuration
    POLL_INTERVAL: float = 1.0
    IDLE_TIMEOUT: int = 3            # Exit after N idle ticks (0 = never exit)
    MAX_WORKERS_PER_TYPE: int = 4    # Max workers per model type
    MAX_TOTAL_WORKERS: int = 12      # Max workers across all types

    def __init__(self, exit_on_idle: bool = True):
        self.exit_on_idle = exit_on_idle
        self.pid: int = os.getpid()
        self.pid_file = None
        self.idle_count: int = 0

    def __repr__(self) -> str:
        return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'

    @classmethod
    def is_running(cls) -> bool:
        """Check if an orchestrator is already running."""
        workers = get_all_worker_pids('orchestrator')
        return len(workers) > 0

    def on_startup(self) -> None:
        """Called when orchestrator starts."""
        self.pid = os.getpid()
        self.pid_file = write_pid_file('orchestrator', worker_id=0)
        print(f'[green]👨‍✈️ {self} STARTED[/green]')

        # Clean up any stale PID files from previous runs
        stale_count = cleanup_stale_pid_files()
        if stale_count:
            print(f'[yellow]👨‍✈️ {self} cleaned up {stale_count} stale PID files[/yellow]')

    def on_shutdown(self, error: BaseException | None = None) -> None:
        """Called when orchestrator shuts down."""
        if self.pid_file:
            remove_pid_file(self.pid_file)
        if error and not isinstance(error, KeyboardInterrupt):
            print(f'[red]👨‍✈️ {self} SHUTDOWN with error:[/red] {type(error).__name__}: {error}')
        else:
            print(f'[grey53]👨‍✈️ {self} SHUTDOWN[/grey53]')

    def get_total_worker_count(self) -> int:
        """Get total count of running workers across all types."""
        cleanup_stale_pid_files()
        return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)

    def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
        """Determine if we should spawn a new worker of the given type."""
        if queue_count == 0:
            return False

        # Check per-type limit
        running_workers = WorkerClass.get_running_workers()
        if len(running_workers) >= self.MAX_WORKERS_PER_TYPE:
            return False

        # Check total limit
        if self.get_total_worker_count() >= self.MAX_TOTAL_WORKERS:
            return False

        # Check if we already have enough workers for the queue size.
        # Spawn more gradually - don't flood with workers.
        if len(running_workers) > 0 and queue_count <= len(running_workers) * WorkerClass.MAX_CONCURRENT_TASKS:
            return False

        return True
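
    # Worked example of the heuristic above (MAX_CONCURRENT_TASKS=3 is a hypothetical
    # value here; the real number comes from each Worker class):
    #   - 0 workers running, queue_count=1   -> spawn (any queued work starts a first worker)
    #   - 2 workers running, queue_count=5   -> no spawn (2 * 3 = 6 slots already cover the queue)
    #   - 2 workers running, queue_count=10  -> spawn (queue exceeds the 6 slots, and the limits of
    #     MAX_WORKERS_PER_TYPE=4 and MAX_TOTAL_WORKERS=12 have not been reached yet)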

    def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
        """Spawn a new worker process. Returns PID or None if spawn failed."""
        try:
            pid = WorkerClass.start(daemon=False)
            print(f'[blue]👨‍✈️ {self} spawned {WorkerClass.name} worker[/blue] pid={pid}')
            return pid
        except Exception as e:
            print(f'[red]👨‍✈️ {self} failed to spawn {WorkerClass.name} worker:[/red] {e}')
            return None

    def check_queues_and_spawn_workers(self) -> dict[str, int]:
        """
        Check all queues and spawn workers as needed.
        Returns dict of queue sizes by worker type.
        """
        queue_sizes = {}
        for WorkerClass in self.WORKER_TYPES:
            # Get queue for this worker type.
            # Need to instantiate worker to get queue (for model access).
            worker = WorkerClass(worker_id=-1)  # temp instance just for queue access
            queue = worker.get_queue()
            queue_count = queue.count()
            queue_sizes[WorkerClass.name] = queue_count

            # Spawn worker if needed
            if self.should_spawn_worker(WorkerClass, queue_count):
                self.spawn_worker(WorkerClass)

        return queue_sizes
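
    # The returned mapping is keyed by each Worker class's `name` attribute, e.g.
    # (key strings assumed for illustration): {'crawl': 3, 'snapshot': 12, 'archiveresult': 40}
    # would mean 3 queued Crawls, 12 queued Snapshots, and 40 queued ArchiveResults this tick.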

    def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
        """Check if any queue has pending work."""
        return any(count > 0 for count in queue_sizes.values())

    def has_running_workers(self) -> bool:
        """Check if any workers are still running."""
        return self.get_total_worker_count() > 0

    def has_future_work(self) -> bool:
        """Check if there's work scheduled for the future (retry_at > now)."""
        for WorkerClass in self.WORKER_TYPES:
            worker = WorkerClass(worker_id=-1)
            Model = worker.get_model()

            # Check for items not in final state with future retry_at
            future_count = Model.objects.filter(
                retry_at__gt=timezone.now()
            ).exclude(
                status__in=Model.FINAL_STATES
            ).count()
            if future_count > 0:
                return True

        return False
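
    # e.g. an item that hit a transient error and was rescheduled with retry_at a minute in
    # the future (and a non-final status) keeps the orchestrator alive even though the queues
    # look empty right now. (The timing here is illustrative only.)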

    def on_tick(self, queue_sizes: dict[str, int]) -> None:
        """Called each orchestrator tick. Override for custom behavior."""
        total_queued = sum(queue_sizes.values())
        total_workers = self.get_total_worker_count()

        if total_queued > 0 or total_workers > 0:
            # Build status line
            status_parts = []
            for WorkerClass in self.WORKER_TYPES:
                name = WorkerClass.name
                queued = queue_sizes.get(name, 0)
                workers = len(WorkerClass.get_running_workers())
                if queued > 0 or workers > 0:
                    status_parts.append(f'{name}={queued}q/{workers}w')

            if status_parts:
                print(f'[grey53]👨‍✈️ {self} tick:[/grey53] {" ".join(status_parts)}')

    def on_idle(self) -> None:
        """Called when orchestrator is idle (no work, no workers)."""
        if self.idle_count == 1:
            print(f'[grey53]👨‍✈️ {self} idle, waiting for work...[/grey53]')

    def should_exit(self, queue_sizes: dict[str, int]) -> bool:
        """Determine if orchestrator should exit."""
        if not self.exit_on_idle:
            return False
        if self.IDLE_TIMEOUT == 0:
            return False

        # Don't exit if there's pending or future work
        if self.has_pending_work(queue_sizes):
            return False
        if self.has_running_workers():
            return False
        if self.has_future_work():
            return False

        # Exit after idle timeout
        return self.idle_count >= self.IDLE_TIMEOUT
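
    # Example timeline with the defaults above (POLL_INTERVAL=1.0, IDLE_TIMEOUT=3):
    # once the last queue drains and the last worker exits, the orchestrator must observe
    # 3 consecutive idle ticks (~3 seconds) with no future retry_at scheduled before it
    # exits; any new work arriving during that window resets idle_count back to 0.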

    def runloop(self) -> None:
        """Main orchestrator loop."""
        self.on_startup()
        try:
            while True:
                # Check queues and spawn workers
                queue_sizes = self.check_queues_and_spawn_workers()

                # Track idle state
                if self.has_pending_work(queue_sizes) or self.has_running_workers():
                    self.idle_count = 0
                    self.on_tick(queue_sizes)
                else:
                    self.idle_count += 1
                    self.on_idle()

                # Check if we should exit
                if self.should_exit(queue_sizes):
                    print(f'[green]👨‍✈️ {self} all work complete, exiting[/green]')
                    break

                time.sleep(self.POLL_INTERVAL)
        except KeyboardInterrupt as e:
            print()  # Newline after ^C
            self.on_shutdown(error=e)  # still remove our PID file on Ctrl-C
        except BaseException as e:
            self.on_shutdown(error=e)
            raise
        else:
            self.on_shutdown()

    def start(self) -> int:
        """
        Fork orchestrator as a background process.
        Returns the PID of the new process.
        """
        def run_orchestrator():
            from archivebox.config.django import setup_django
            setup_django()
            self.runloop()

        proc = Process(target=run_orchestrator, name='orchestrator')
        proc.start()
        assert proc.pid is not None
        print(f'[green]👨‍✈️ Orchestrator started in background[/green] pid={proc.pid}')
        return proc.pid

    @classmethod
    def get_or_start(cls, exit_on_idle: bool = True) -> 'Orchestrator':
        """
        Get running orchestrator or start a new one.
        Used by commands like 'add' to ensure orchestrator is running.
        """
        if cls.is_running():
            print('[grey53]👨‍✈️ Orchestrator already running[/grey53]')
            # Return a placeholder - actual orchestrator is in another process
            return cls(exit_on_idle=exit_on_idle)

        orchestrator = cls(exit_on_idle=exit_on_idle)
        return orchestrator
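

# A minimal sketch of running this module standalone (e.g. `python -m archivebox.workers.orchestrator`);
# the supported entrypoint is the CLI shown in the module docstring (`archivebox orchestrator [--daemon]`),
# so treat this block as illustrative only.
if __name__ == '__main__':
    from archivebox.config.django import setup_django  # same setup used by start() above

    setup_django()
    Orchestrator(exit_on_idle=False).runloop()  # run forever in the foreground, like daemon mode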