Files
ArchiveBox/archivebox/workers/orchestrator.py
2026-03-15 22:09:56 -07:00

1307 lines
59 KiB
Python

"""
Orchestrator for managing worker processes.
The Orchestrator polls the Crawl queue and spawns CrawlWorkers as needed.
Orchestrator (takes list of specific crawls | polls for pending queued crawls forever) spawns:
└── CrawlWorker(s) (one per active Crawl)
└── SnapshotWorker(s) (one per Snapshot, up to limit)
└── Hook Processes (sequential, forked by SnapshotWorker)
e.g. on_Snapshot__23_save_pdf.js
on_Snapshot__24_save_screenshot.js
...
Usage:
# Default: runs forever (for use as subprocess of server)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.runloop()
# Exit when done (for embedded use in other commands)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
# Or run via CLI
archivebox manage orchestrator # runs forever
archivebox manage orchestrator --exit-on-idle # exits when done
"""
__package__ = 'archivebox.workers'
import os
import sys
import time
from typing import Type
from datetime import datetime, timedelta
from multiprocessing import Process as MPProcess
from pathlib import Path
from django.db import connections
from django.utils import timezone
from rich import print
from archivebox.misc.logging_util import log_worker_event
from .worker import Worker, BinaryWorker, CrawlWorker
def _run_orchestrator_process(exit_on_idle: bool) -> None:
    """Top-level function for multiprocessing (must be picklable).

    Marks the process via the ARCHIVEBOX_ORCHESTRATOR_PROCESS env var,
    boots Django, then runs an Orchestrator until it decides to exit.

    Args:
        exit_on_idle: if True, the orchestrator exits once all queues are
            drained; if False it polls forever (daemon mode).
    """
    # `os` is already imported at module level; the previous redundant
    # function-local `import os` has been removed.
    os.environ['ARCHIVEBOX_ORCHESTRATOR_PROCESS'] = '1'
    from archivebox.config.django import setup_django
    setup_django()
    orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
    orchestrator.runloop()
class Orchestrator:
"""
Manages worker processes by polling queues and spawning workers as needed.
The orchestrator:
1. Polls Crawl queue
2. If crawls exist and fewer than MAX_CRAWL_WORKERS are running, spawns CrawlWorkers
3. Monitors worker health and cleans up stale PIDs
4. Exits when queue is empty (unless daemon mode)
Architecture:
- Orchestrator spawns CrawlWorkers (one per active Crawl)
- Each CrawlWorker spawns SnapshotWorkers (one per Snapshot, up to limit)
- Each SnapshotWorker runs hooks sequentially for its snapshot
"""
# BinaryWorker (singleton daemon) and CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
WORKER_TYPES: list[Type[Worker]] = [BinaryWorker, CrawlWorker]
# Configuration (class-level defaults; __init__ overrides some of these in foreground mode)
POLL_INTERVAL: float = 2.0  # How often to check for new work (seconds)
IDLE_TIMEOUT: int = 3  # Exit after N idle ticks (0 = never exit)
MAX_CRAWL_WORKERS: int = 8  # Max crawls processing simultaneously
MAX_BINARY_WORKERS: int = 1  # Max binaries installing simultaneously (sequential only)
def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
    """Initialize the orchestrator.

    Args:
        exit_on_idle: if True (foreground/embedded mode), exit once all
            queues are drained; if False, run forever as a daemon.
        crawl_id: if set, only process work belonging to this crawl.
    """
    self.exit_on_idle = exit_on_idle
    self.crawl_id = crawl_id  # If set, only process work for this crawl
    self.pid: int = os.getpid()
    self.pid_file = None
    self.idle_count: int = 0  # consecutive idle ticks, compared against IDLE_TIMEOUT
    self._last_cleanup_time: float = 0.0  # For throttling cleanup_stale_running()
    self._last_hard_timeout_check: float = 0.0  # Throttle hard timeout enforcement
    # In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker
    if self.exit_on_idle:
        self.MAX_CRAWL_WORKERS = 1
        # Faster UI updates for interactive runs
        self.POLL_INTERVAL = 0.25
        # Exit quickly once idle in foreground mode
        self.IDLE_TIMEOUT = 1
def __repr__(self) -> str:
    """Rich-markup representation, rendered as an underlined name plus pid."""
    # '\\[' is rich markup for a literal '[' so the pid suffix isn't parsed as a tag
    pid_suffix = f'\\[pid={self.pid}]'
    return '[underline]Orchestrator[/underline]' + pid_suffix
@classmethod
def is_running(cls) -> bool:
    """Check if an orchestrator is already running."""
    from archivebox.machine.models import Process
    # Drop stale PID records first so dead orchestrators don't count
    Process.cleanup_stale_running()
    running = Process.get_running_count(process_type=Process.TypeChoices.ORCHESTRATOR)
    return running > 0
def on_startup(self) -> None:
    """Called when orchestrator starts.

    Registers this process in the DB as an ORCHESTRATOR, cleans up stale
    state from previous runs, and logs a startup event with metadata.
    """
    from archivebox.machine.models import Process
    self.pid = os.getpid()
    # Register orchestrator process in database with explicit type
    self.db_process = Process.current()
    # Ensure the process type is correctly set to ORCHESTRATOR
    if self.db_process.process_type != Process.TypeChoices.ORCHESTRATOR:
        self.db_process.process_type = Process.TypeChoices.ORCHESTRATOR
        self.db_process.save(update_fields=['process_type'])
    # Clean up any stale Process records from previous runs
    stale_count = Process.cleanup_stale_running()
    # Foreground runs should start fast; skip expensive orphan cleanup unless in daemon mode.
    chrome_count = 0
    orphaned_workers = 0
    if not self.exit_on_idle:
        # Clean up orphaned Chrome processes from previous crashes
        chrome_count = Process.cleanup_orphaned_chrome()
        # Clean up orphaned workers from previous crashes
        orphaned_workers = Process.cleanup_orphaned_workers()
    # Collect startup metadata
    metadata = {
        'max_crawl_workers': self.MAX_CRAWL_WORKERS,
        'poll_interval': self.POLL_INTERVAL,
    }
    # Only report cleanup counts when something was actually cleaned
    if stale_count:
        metadata['cleaned_stale_pids'] = stale_count
    if chrome_count:
        metadata['cleaned_orphaned_chrome'] = chrome_count
    if orphaned_workers:
        metadata['cleaned_orphaned_workers'] = orphaned_workers
    log_worker_event(
        worker_type='Orchestrator',
        event='Starting...',
        indent_level=0,
        pid=self.pid,
        metadata=metadata,
    )
def terminate_all_workers(self) -> None:
    """Terminate all running worker processes."""
    from archivebox.machine.models import Process
    # Prefer workers scoped to this orchestrator's process tree when registered;
    # otherwise fall back to every RUNNING worker on this machine.
    db_process = getattr(self, 'db_process', None)
    if db_process:
        workers = self._get_scoped_running_workers()
    else:
        workers = Process.objects.filter(
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
        )
    for worker in workers:
        try:
            # Graceful terminate updates the Process row's status too
            worker.terminate(graceful_timeout=5.0)
        except Exception:
            # Best-effort: the worker may have already exited
            pass
def on_shutdown(self, error: BaseException | None = None) -> None:
    """Called when orchestrator shuts down."""
    # Stop all child workers before sealing our own Process record
    self.terminate_all_workers()
    db_process = getattr(self, 'db_process', None)
    if db_process:
        # Ctrl-C (KeyboardInterrupt) counts as a clean shutdown, not a failure
        graceful = (not error) or isinstance(error, KeyboardInterrupt)
        db_process.exit_code = 0 if graceful else 1
        db_process.status = db_process.StatusChoices.EXITED
        db_process.ended_at = timezone.now()
        db_process.save()
    # Only surface genuine errors in the shutdown log line
    reportable = None
    if isinstance(error, Exception) and not isinstance(error, KeyboardInterrupt):
        reportable = error
    log_worker_event(
        worker_type='Orchestrator',
        event='Shutting down',
        indent_level=0,
        pid=self.pid,
        error=reportable,
    )
def get_total_worker_count(self) -> int:
    """Get total count of running workers across all types.

    Also opportunistically runs stale-PID cleanup, throttled to once every
    30 seconds because it is relatively expensive.
    """
    from archivebox.machine.models import Process
    # NOTE: uses the module-level `time` import; the previous redundant
    # function-local `import time` has been removed.
    CLEANUP_THROTTLE_SECONDS = 30
    now = time.time()
    if now - self._last_cleanup_time > CLEANUP_THROTTLE_SECONDS:
        Process.cleanup_stale_running()
        self._last_cleanup_time = now
    # When scoped to a single crawl, only count workers in our process tree
    if self.crawl_id and getattr(self, 'db_process', None):
        return self._get_scoped_running_workers().count()
    return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)
def get_running_workers_for_type(self, WorkerClass: Type[Worker]) -> int:
    """Get count of running workers for a specific worker type."""
    # Scoped mode: count only workers spawned under this orchestrator
    if self.crawl_id and getattr(self, 'db_process', None):
        scoped = self._get_scoped_running_workers()
        return scoped.filter(worker_type=WorkerClass.name).count()
    # Global mode: let the worker class report its own running instances
    return len(WorkerClass.get_running_workers())
def _get_scoped_running_workers(self):
    """Return RUNNING worker Processes that are descendants of this orchestrator."""
    from archivebox.machine.models import Process
    subtree = self.db_process.get_descendants(include_self=False)
    return subtree.filter(
        status=Process.StatusChoices.RUNNING,
        process_type=Process.TypeChoices.WORKER,
    )
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
    """Decide whether another worker of this type should be started."""
    # Nothing queued -> nothing to spawn
    if not queue_count:
        return False
    # Per-type concurrency caps (unknown worker types default to a single worker)
    limits = {
        'crawl': self.MAX_CRAWL_WORKERS,
        'binary': self.MAX_BINARY_WORKERS,  # binaries install strictly sequentially
    }
    max_workers = limits.get(WorkerClass.name, 1)
    # Count currently-running workers of this type (scoped to our tree when
    # processing a single crawl, except binaries which are machine-global)
    use_scoped = self.crawl_id and getattr(self, 'db_process', None) and WorkerClass.name != 'binary'
    if use_scoped:
        running_count = self._get_scoped_running_workers().filter(worker_type=WorkerClass.name).count()
    else:
        running_count = len(WorkerClass.get_running_workers())
    if running_count >= max_workers:
        return False
    # Spawn gradually: don't add workers when existing ones can already
    # cover the queue at their concurrency limit
    if running_count and queue_count <= running_count * WorkerClass.MAX_CONCURRENT_TASKS:
        return False
    return True
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
    """Spawn a new worker process and wait for it to register.

    Blocks (up to 5s) until the worker writes its own Process row. This
    prevents a race where the orchestrator spawns multiple workers before
    any of them finish on_startup() and register.

    Returns:
        The worker PID, or None if spawning or registration failed.
    """
    from archivebox.machine.models import Process
    try:
        pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id)
        timeout = 5.0        # seconds to wait for worker registration
        poll_interval = 0.1  # check every 100ms
        spawn_time = timezone.now()
        # Use a monotonic deadline instead of accumulating sleep intervals,
        # which under-counted real elapsed time (sleep can overshoot).
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            # Strict registration criteria:
            # 1. correct PID  2. WORKER type  3. RUNNING status
            # 4. parented to this orchestrator  5. started recently (last 10s)
            worker_process = Process.objects.filter(
                pid=pid,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                parent_id=self.db_process.id,
                started_at__gte=spawn_time - timedelta(seconds=10),
            ).first()
            if worker_process:
                # Worker successfully registered!
                return pid
            time.sleep(poll_interval)
        # Timeout - worker failed to register
        log_worker_event(
            worker_type='Orchestrator',
            event='Worker failed to register in time',
            indent_level=0,
            pid=self.pid,
            metadata={'worker_type': WorkerClass.name, 'worker_pid': pid, 'timeout': timeout},
        )
        return None
    except Exception as e:
        log_worker_event(
            worker_type='Orchestrator',
            event='Failed to spawn worker',
            indent_level=0,
            pid=self.pid,
            metadata={'worker_type': WorkerClass.name},
            error=e,
        )
        return None
def check_queues_and_spawn_workers(self) -> dict[str, int]:
    """
    Check Binary and Crawl queues and spawn workers as needed.
    Returns dict of queue sizes keyed by worker type ('binary', 'crawl').
    """
    from archivebox.crawls.models import Crawl
    from archivebox.machine.models import Binary, Machine
    queue_sizes = {}
    # Housekeeping before polling: kill over-limit work, enqueue due schedules
    self._enforce_hard_timeouts()
    materialized_schedule_count = self._materialize_due_schedules()
    # Check Binary queue (binaries queued for install on this machine, due now)
    machine = Machine.current()
    binary_queue = Binary.objects.filter(
        machine=machine,
        status=Binary.StatusChoices.QUEUED,
        retry_at__lte=timezone.now()
    ).order_by('retry_at')
    binary_count = binary_queue.count()
    queue_sizes['binary'] = binary_count
    # Spawn BinaryWorker if needed (singleton - max 1 BinaryWorker, processes ALL binaries)
    if binary_count > 0:
        running_binary_workers_list = BinaryWorker.get_running_workers()
        if len(running_binary_workers_list) == 0:
            BinaryWorker.start(parent=self.db_process)
    # Check if any BinaryWorkers are still running (used to gate crawl spawning below)
    running_binary_workers = len(BinaryWorker.get_running_workers())
    # Check Crawl queue: due, non-final crawls
    crawl_queue = Crawl.objects.filter(
        retry_at__lte=timezone.now()
    ).exclude(
        status__in=Crawl.FINAL_STATES
    )
    # Prevent duplicate CrawlWorkers for the same crawl (even across orchestrators):
    # each running CrawlWorker records its CRAWL_ID in its Process.env
    from archivebox.machine.models import Process
    running_crawl_ids: set[str] = set()
    running_crawl_workers = Process.objects.filter(
        process_type=Process.TypeChoices.WORKER,
        worker_type='crawl',
        status=Process.StatusChoices.RUNNING,
    ).values_list('env', flat=True)
    for env in running_crawl_workers:
        if isinstance(env, dict):
            crawl_id = env.get('CRAWL_ID')
            if crawl_id:
                running_crawl_ids.add(str(crawl_id))
    if running_crawl_ids:
        crawl_queue = crawl_queue.exclude(id__in=running_crawl_ids)
    # Apply crawl_id filter if set
    if self.crawl_id:
        crawl_queue = crawl_queue.filter(id=self.crawl_id)
    crawl_queue = crawl_queue.order_by('retry_at')
    crawl_count = crawl_queue.count()
    queue_sizes['crawl'] = crawl_count
    # CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running
    # This ensures all binaries are installed before snapshots start processing.
    # Also skip a tick right after materializing schedules so the new crawls settle.
    if binary_count == 0 and running_binary_workers == 0 and materialized_schedule_count == 0:
        # Spawn CrawlWorker if needed
        if self.should_spawn_worker(CrawlWorker, crawl_count):
            # Claim next crawl atomically so sibling orchestrators don't double-claim
            crawl = crawl_queue.first()
            if crawl and self._claim_crawl(crawl):
                CrawlWorker.start(parent=self.db_process, crawl_id=str(crawl.id))
    return queue_sizes
def _refresh_db_connections(self) -> None:
    """
    Drop long-lived DB connections before each poll tick.
    The daemon orchestrator must observe rows created by sibling processes
    (server requests, CLI helpers, docker-compose run invocations). With
    SQLite, reusing the same connection indefinitely can miss externally
    committed rows until the process reconnects.
    """
    # Django reopens connections lazily on the next query
    connections.close_all()
def _should_process_schedules(self) -> bool:
    """Only the long-running daemon (no single-crawl filter) materializes schedules."""
    if self.exit_on_idle:
        return False
    return self.crawl_id is None
def _materialize_due_schedules(self) -> int:
    """Enqueue a crawl for every enabled schedule that is due; return how many."""
    if not self._should_process_schedules():
        return 0
    from archivebox.crawls.models import CrawlSchedule
    now = timezone.now()
    enabled = CrawlSchedule.objects.filter(is_enabled=True).select_related('template', 'template__created_by')
    materialized = 0
    for schedule in enabled:
        # is_due() is evaluated per-schedule in Python, not in SQL
        if schedule.is_due(now):
            schedule.enqueue(queued_at=now)
            materialized += 1
    return materialized
def _enforce_hard_timeouts(self) -> None:
    """Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits.

    Throttled to run at most once every 30 seconds. Performs four passes:
      1. Kill hook processes running past MAX_HOOK_RUNTIME_SECONDS and fail their ArchiveResults.
      2. Kill workers/hooks of snapshots stuck STARTED past MAX_SNAPSHOT_RUNTIME_SECONDS, then seal them.
      3. Reconcile snapshot/crawl status with ArchiveResults that are still STARTED.
      4. Fail leftover results / clear leftover snapshots whose parents are already SEALED.
    """
    import time
    from datetime import timedelta
    from archivebox.config.constants import CONSTANTS
    from archivebox.machine.models import Process
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl
    # Throttle: called every poll tick but only does real work every 30s
    throttle_seconds = 30
    now_ts = time.time()
    if now_ts - self._last_hard_timeout_check < throttle_seconds:
        return
    self._last_hard_timeout_check = now_ts
    now = timezone.now()
    # Hard limit for hook processes / archiveresults
    hook_cutoff = now - timedelta(seconds=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS)
    overdue_hooks = Process.objects.filter(
        process_type=Process.TypeChoices.HOOK,
        status=Process.StatusChoices.RUNNING,
        started_at__lt=hook_cutoff,
    ).select_related('archiveresult')
    for proc in overdue_hooks:
        try:
            # No grace period: the hook is already past its hard limit
            proc.kill_tree(graceful_timeout=0.0)
        except Exception:
            # Best-effort: the process may have exited already
            pass
        ar = getattr(proc, 'archiveresult', None)
        if ar and ar.status == ArchiveResult.StatusChoices.STARTED:
            # Seal the result as FAILED; retry_at=None prevents re-queueing
            ar.status = ArchiveResult.StatusChoices.FAILED
            ar.end_ts = now
            ar.retry_at = None
            ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at'])
    # Hard limit for snapshots (STARTED but not modified recently enough)
    snapshot_cutoff = now - timedelta(seconds=CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS)
    overdue_snapshots = Snapshot.objects.filter(
        status=Snapshot.StatusChoices.STARTED,
        modified_at__lt=snapshot_cutoff,
    )
    overdue_snapshot_ids = {str(s.id) for s in overdue_snapshots}
    if overdue_snapshot_ids:
        # Terminate SnapshotWorkers whose SNAPSHOT_ID env entry points at an overdue snapshot
        running_snapshot_workers = Process.objects.filter(
            process_type=Process.TypeChoices.WORKER,
            worker_type='snapshot',
            status=Process.StatusChoices.RUNNING,
        )
        for proc in running_snapshot_workers:
            env = proc.env or {}
            if isinstance(env, dict) and str(env.get('SNAPSHOT_ID', '')) in overdue_snapshot_ids:
                try:
                    proc.terminate(graceful_timeout=1.0)
                except Exception:
                    pass
    for snapshot in overdue_snapshots:
        # Kill any hook processes still attached to this snapshot's results
        running_hooks = Process.objects.filter(
            archiveresult__snapshot=snapshot,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
        ).distinct()
        for process in running_hooks:
            try:
                process.kill_tree(graceful_timeout=0.0)
            except Exception:
                continue
        # Bulk-fail any results that never finished
        snapshot.archiveresult_set.filter(
            status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED],
        ).update(
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=now,
            retry_at=None,
            modified_at=now,
        )
        snapshot.cleanup()
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
        # Seal the parent crawl too once all of its snapshots are finished
        crawl = snapshot.crawl
        if crawl and crawl.is_finished():
            crawl.status = crawl.StatusChoices.SEALED
            crawl.retry_at = None
            crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
    # Reconcile snapshot/crawl state with running archiveresults
    started_snapshot_ids = list(
        ArchiveResult.objects.filter(
            status=ArchiveResult.StatusChoices.STARTED,
        ).values_list('snapshot_id', flat=True).distinct()
    )
    if started_snapshot_ids:
        # A snapshot with a STARTED result must itself be STARTED (unless already SEALED)
        Snapshot.objects.filter(
            id__in=started_snapshot_ids,
        ).exclude(
            status=Snapshot.StatusChoices.SEALED,
        ).exclude(
            status=Snapshot.StatusChoices.STARTED,
        ).update(
            status=Snapshot.StatusChoices.STARTED,
            retry_at=None,
            modified_at=now,
        )
        # ...and its crawl must be promoted out of QUEUED
        Crawl.objects.filter(
            snapshot_set__id__in=started_snapshot_ids,
            status=Crawl.StatusChoices.QUEUED,
        ).distinct().update(
            status=Crawl.StatusChoices.STARTED,
            retry_at=None,
            modified_at=now,
        )
    # If a snapshot is sealed, any still-started archiveresults should be failed
    sealed_snapshot_ids = list(
        Snapshot.objects.filter(status=Snapshot.StatusChoices.SEALED).values_list('id', flat=True)
    )
    if sealed_snapshot_ids:
        started_ars = ArchiveResult.objects.filter(
            snapshot_id__in=sealed_snapshot_ids,
            status=ArchiveResult.StatusChoices.STARTED,
        ).select_related('process')
        for ar in started_ars:
            process_id = getattr(ar, 'process_id', None)
            if process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
                try:
                    ar.process.kill_tree(graceful_timeout=0.0)
                except Exception:
                    pass
            ar.status = ArchiveResult.StatusChoices.FAILED
            ar.end_ts = now
            ar.retry_at = None
            ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at'])
    # Clear queued/started snapshots that belong to sealed crawls
    Snapshot.objects.filter(
        crawl__status=Crawl.StatusChoices.SEALED,
        status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
    ).update(
        status=Snapshot.StatusChoices.SEALED,
        retry_at=None,
        modified_at=now,
    )
def _claim_crawl(self, crawl) -> bool:
    """Atomically claim a due crawl using the shared retry_at lock lifecycle."""
    from archivebox.crawls.models import Crawl
    # Hold the claim lock for a full day; the worker releases it sooner on completion
    one_day_seconds = 24 * 60 * 60
    return Crawl.claim_for_worker(crawl, lock_seconds=one_day_seconds)
def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
    """Check if any queue has pending work."""
    for size in queue_sizes.values():
        if size > 0:
            return True
    return False
def has_running_workers(self) -> bool:
    """Check if any workers are still running."""
    return bool(self.get_total_worker_count())
def has_future_work(self) -> bool:
    """Check if there's work scheduled for the future (retry_at > now) in the Crawl queue.

    Used by should_exit() to keep the orchestrator alive when crawls are
    merely deferred rather than finished.
    """
    from archivebox.crawls.models import Crawl
    # Build filter for future work, respecting crawl_id if set
    qs = Crawl.objects.filter(
        retry_at__gt=timezone.now()
    ).exclude(
        status__in=Crawl.FINAL_STATES
    )
    # Apply crawl_id filter if set
    if self.crawl_id:
        qs = qs.filter(id=self.crawl_id)
    # exists() issues a cheap LIMIT 1 query instead of a full COUNT(*)
    return qs.exists()
def on_tick(self, queue_sizes: dict[str, int]) -> None:
    """Called each orchestrator tick with the latest queue sizes. Override for custom behavior."""
    # Tick logging suppressed to reduce noise
    pass
def on_idle(self) -> None:
    """Called when orchestrator is idle (no work, no workers). Override for custom behavior."""
    # Idle logging suppressed to reduce noise
    pass
def should_exit(self, queue_sizes: dict[str, int]) -> bool:
    """Determine if orchestrator should exit."""
    # Daemon mode never exits; IDLE_TIMEOUT == 0 disables idle-exit entirely
    if not self.exit_on_idle:
        return False
    if self.IDLE_TIMEOUT == 0:
        return False
    # Stay alive while anything is queued, running, or deferred to later
    if self.has_pending_work(queue_sizes):
        return False
    if self.has_running_workers():
        return False
    if self.has_future_work():
        return False
    # Truly idle: exit only after enough consecutive idle ticks
    return self.idle_count >= self.IDLE_TIMEOUT
def runloop(self) -> None:
    """Main orchestrator loop.

    Chooses between three display modes:
      - TTY + foreground: full-screen rich Live progress layout, with
        stdout/stderr redirected to /dev/null so worker output doesn't
        corrupt the display.
      - non-TTY: plain progress lines (stdout may carry JSONL output).
      - TTY daemon: no progress layout at all.
    """
    from rich.live import Live
    from archivebox.misc.progress_layout import ArchiveBoxProgressLayout
    import archivebox.misc.logging as logging_module
    is_tty = sys.stdout.isatty()
    # Enable progress layout only in TTY + foreground mode
    show_progress = is_tty and self.exit_on_idle
    # When stdout is not a TTY, it may be reserved for JSONL pipeline output.
    # Keep the plain progress view, but emit it to stderr instead of stdout.
    plain_output = not is_tty
    self.on_startup()
    if not show_progress:
        # No progress layout - optionally emit plain lines for non-TTY output
        progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id) if plain_output else None
        self._run_orchestrator_loop(progress_layout, plain_output=plain_output)
        return
    # Redirect worker subprocess output to /dev/null
    devnull_fd = os.open(os.devnull, os.O_WRONLY)
    # Save original stdout/stderr (make 2 copies - one for Console, one for restoring)
    original_stdout = sys.stdout.fileno()
    original_stderr = sys.stderr.fileno()
    stdout_for_console = os.dup(original_stdout)
    stdout_for_restore = os.dup(original_stdout)
    stderr_for_restore = os.dup(original_stderr)
    original_console = None  # set once we swap the global CONSOLE below
    try:
        # Redirect stdout/stderr to /dev/null (workers will inherit this)
        os.dup2(devnull_fd, original_stdout)
        os.dup2(devnull_fd, original_stderr)
        # Create Console using saved stdout (not the redirected one)
        from rich.console import Console
        orchestrator_console = Console(file=os.fdopen(stdout_for_console, 'w'), force_terminal=True)
        # Update global CONSOLE so orchestrator logs appear too
        original_console = logging_module.CONSOLE
        logging_module.CONSOLE = orchestrator_console
        # Create layout and run with Live display
        progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id)
        with Live(
            progress_layout.get_layout(),
            refresh_per_second=8,
            screen=True,
            console=orchestrator_console,
        ):
            self._run_orchestrator_loop(progress_layout, plain_output=False)
    finally:
        # BUGFIX: restore the global console even when the loop raises;
        # previously this happened inside the try block, so an exception
        # left CONSOLE pointing at the now-dead redirected console.
        if original_console is not None:
            logging_module.CONSOLE = original_console
        # Restore stdout/stderr
        os.dup2(stdout_for_restore, original_stdout)
        os.dup2(stderr_for_restore, original_stderr)
        # Cleanup
        try:
            os.close(devnull_fd)
            os.close(stdout_for_restore)
            os.close(stderr_for_restore)
        except OSError:
            pass
        # stdout_for_console is closed by orchestrator_console
def _run_orchestrator_loop(self, progress_layout, plain_output: bool = False):
"""Run the main orchestrator loop with optional progress display."""
last_snapshot_count = None
tick_count = 0
last_plain_lines: set[tuple[str, str]] = set()
# Track snapshot progress to detect changes
snapshot_progress = {} # snapshot_id -> (total, completed, current_plugin)
try:
while True:
tick_count += 1
# Refresh DB state before polling so this long-lived daemon sees
# work created by other processes using the same collection.
self._refresh_db_connections()
# Check queues and spawn workers
queue_sizes = self.check_queues_and_spawn_workers()
# Get worker counts for each type
worker_counts = {
WorkerClass.name: len(WorkerClass.get_running_workers())
for WorkerClass in self.WORKER_TYPES
}
# Update layout if enabled
if progress_layout:
# Get crawl queue and worker counts
crawl_queue_count = queue_sizes.get('crawl', 0)
crawl_workers_count = worker_counts.get('crawl', 0)
# Determine orchestrator status
if crawl_workers_count > 0:
status = "Working"
elif crawl_queue_count > 0:
status = "Spawning"
else:
status = "Idle"
binary_workers_count = worker_counts.get('binary', 0)
# Update orchestrator status
progress_layout.update_orchestrator_status(
status=status,
crawl_queue_count=crawl_queue_count,
crawl_workers_count=crawl_workers_count,
binary_queue_count=queue_sizes.get('binary', 0),
binary_workers_count=binary_workers_count,
max_crawl_workers=self.MAX_CRAWL_WORKERS,
)
# Update crawl queue tree (active + recently completed)
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
recent_cutoff = timezone.now() - timedelta(minutes=5)
pending_snapshot_candidates: list[Snapshot] = []
hooks_by_snapshot: dict[str, list] = {}
active_qs = Crawl.objects.exclude(status__in=Crawl.FINAL_STATES)
if self.crawl_id:
active_qs = active_qs.filter(id=self.crawl_id)
active_qs = active_qs.order_by('retry_at')
recent_done_qs = Crawl.objects.filter(
status__in=Crawl.FINAL_STATES,
modified_at__gte=recent_cutoff,
)
if self.crawl_id:
recent_done_qs = recent_done_qs.filter(id=self.crawl_id)
recent_done_qs = recent_done_qs.order_by('-modified_at')
crawls = list(active_qs)
active_ids = {c.id for c in crawls}
for crawl in recent_done_qs:
if crawl.id not in active_ids:
crawls.append(crawl)
def _abbrev(text: str, max_len: int = 80) -> str:
return text if len(text) <= max_len else f"{text[:max_len - 3]}..."
def _format_size(num_bytes: int | None) -> str:
if not num_bytes:
return ''
size = float(num_bytes)
for unit in ('b', 'kb', 'mb', 'gb', 'tb'):
if size < 1024 or unit == 'tb':
return f"{size:.1f}{unit}"
size /= 1024
return ''
def _format_seconds(total_seconds: float | None) -> str:
if total_seconds is None:
return ''
seconds = max(0.0, float(total_seconds))
return f"{seconds:.1f}s"
def _tail_stderr_line(proc) -> str:
try:
path = getattr(proc, 'stderr_file', None)
if not path or not path.exists():
return ''
with open(path, 'rb') as f:
f.seek(0, os.SEEK_END)
size = f.tell()
f.seek(max(0, size - 4096))
data = f.read().decode('utf-8', errors='ignore')
lines = [ln.strip() for ln in data.splitlines() if ln.strip()]
return lines[-1] if lines else ''
except Exception:
return ''
tree_data: list[dict] = []
for crawl in crawls:
urls = crawl.get_urls_list()
url_count = len(urls)
label = f"{url_count} url" + ("s" if url_count != 1 else "")
label = _abbrev(label)
snapshots = []
snap_qs = Snapshot.objects.filter(crawl_id=crawl.id)
active_snaps = list(
snap_qs.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED])
.order_by('created_at')[:16]
)
recent_snaps = list(
snap_qs.filter(status__in=Snapshot.FINAL_STATES)
.order_by('-modified_at')[:8]
)
snap_ids = {s.id for s in active_snaps}
for s in recent_snaps:
if s.id not in snap_ids:
active_snaps.append(s)
for snap in active_snaps:
try:
from archivebox.config.configset import get_config
from archivebox.hooks import discover_hooks
snap_config = get_config(snapshot=snap)
hooks_list = discover_hooks('Snapshot', config=snap_config)
hooks_by_snapshot[str(snap.id)] = hooks_list
from archivebox.hooks import get_plugin_special_config
hook_timeouts = {}
for hook_path in hooks_list:
plugin_name = hook_path.parent.name
try:
hook_timeouts[hook_path.name] = int(get_plugin_special_config(plugin_name, snap_config)['timeout'])
except Exception:
pass
except Exception:
hooks_list = []
hook_timeouts = {}
try:
from archivebox import DATA_DIR
data_dir = Path(DATA_DIR)
snap_path = snap.output_dir
try:
rel = Path(snap_path)
if rel.is_absolute():
rel = rel.relative_to(data_dir)
snap_path = f"./{rel}" if not str(rel).startswith("./") else str(rel)
except Exception:
snap_path = str(snap_path)
ars = list(
snap.archiveresult_set.select_related('process').order_by('start_ts')
)
ar_by_hook = {ar.hook_name: ar for ar in ars if ar.hook_name}
except Exception:
snap_path = ''
ar_by_hook = {}
plugin_hooks: dict[str, list[dict]] = {}
now = timezone.now()
for hook_path in hooks_list:
hook_name = hook_path.name
is_bg = '.bg.' in hook_name
ar = ar_by_hook.get(hook_name)
status = 'pending'
is_running = False
is_pending = True
elapsed = ''
timeout = ''
size = ''
stderr_tail = ''
if ar:
process_id = getattr(ar, 'process_id', None)
if process_id and ar.process:
stderr_tail = _tail_stderr_line(ar.process)
if ar.status == ArchiveResult.StatusChoices.STARTED:
status = 'started'
is_running = True
is_pending = False
start_ts = ar.start_ts or (ar.process.started_at if process_id and ar.process else None)
if start_ts:
elapsed = _format_seconds((now - start_ts).total_seconds())
hook_timeout = None
if process_id and ar.process and ar.process.timeout:
hook_timeout = ar.process.timeout
hook_timeout = hook_timeout or hook_timeouts.get(hook_name)
if hook_timeout:
timeout = _format_seconds(hook_timeout)
else:
status = ar.status
if process_id and ar.process and ar.process.exit_code == 137:
status = 'failed'
is_pending = False
start_ts = ar.start_ts or (ar.process.started_at if process_id and ar.process else None)
end_ts = ar.end_ts or (ar.process.ended_at if process_id and ar.process else None)
if start_ts and end_ts:
elapsed = _format_seconds((end_ts - start_ts).total_seconds())
size = _format_size(getattr(ar, 'output_size', None))
else:
hook_timeout = hook_timeouts.get(hook_name)
if hook_timeout:
timeout = _format_seconds(hook_timeout)
elapsed = _format_seconds(0)
plugin_name = hook_path.parent.name
if plugin_name in ('plugins', '.'):
plugin_name = hook_name.split('__')[-1].split('.')[0]
plugin_hooks.setdefault(plugin_name, []).append({
'status': status,
'size': size,
'elapsed': elapsed,
'timeout': timeout,
'is_bg': is_bg,
'is_running': is_running,
'is_pending': is_pending,
'hook_name': hook_name,
'stderr': stderr_tail,
})
hooks = []
for plugin_name, hook_entries in plugin_hooks.items():
running = next((h for h in hook_entries if h['is_running']), None)
pending = next((h for h in hook_entries if h['is_pending']), None)
any_failed = any(h['status'] == ArchiveResult.StatusChoices.FAILED for h in hook_entries)
any_succeeded = any(h['status'] == ArchiveResult.StatusChoices.SUCCEEDED for h in hook_entries)
any_skipped = any(h['status'] == ArchiveResult.StatusChoices.SKIPPED for h in hook_entries)
stderr_tail = ''
if running:
status = 'started'
is_running = True
is_pending = False
is_bg = running['is_bg']
elapsed = running.get('elapsed', '')
timeout = running.get('timeout', '')
stderr_tail = running.get('stderr', '')
size = ''
elif pending:
status = 'pending'
is_running = False
is_pending = True
is_bg = pending['is_bg']
elapsed = pending.get('elapsed', '') or _format_seconds(0)
timeout = pending.get('timeout', '')
stderr_tail = pending.get('stderr', '')
size = ''
else:
is_running = False
is_pending = False
is_bg = any(h['is_bg'] for h in hook_entries)
if any_failed:
status = 'failed'
elif any_succeeded:
status = 'succeeded'
elif any_skipped:
status = 'skipped'
else:
status = 'skipped'
for h in hook_entries:
if h.get('stderr'):
stderr_tail = h['stderr']
break
total_elapsed = 0.0
has_elapsed = False
for h in hook_entries:
if h.get('elapsed'):
try:
total_elapsed += float(h['elapsed'].rstrip('s'))
has_elapsed = True
except Exception:
pass
elapsed = _format_seconds(total_elapsed) if has_elapsed else ''
max_output = 0
# Use the largest output_size we already computed on ArchiveResult
ar_sizes = [
ar_by_hook[h['hook_name']].output_size
for h in hook_entries
if h.get('hook_name') in ar_by_hook and getattr(ar_by_hook[h['hook_name']], 'output_size', 0)
]
if ar_sizes:
max_output = max(ar_sizes)
size = _format_size(max_output) if max_output else ''
timeout = ''
hooks.append({
'status': status,
'path': f"./{plugin_name}",
'size': size,
'elapsed': elapsed,
'timeout': timeout,
'is_bg': is_bg,
'is_running': is_running,
'is_pending': is_pending,
'stderr': stderr_tail,
})
snap_label = _abbrev(f"{str(snap.id)[-8:]} {snap.url or ''}".strip(), max_len=80)
snapshots.append({
'id': str(snap.id),
'status': snap.status,
'label': snap_label,
'output_path': snap_path,
'hooks': hooks,
})
pending_snapshot_candidates.append(snap)
tree_data.append({
'id': str(crawl.id),
'status': crawl.status,
'label': label,
'snapshots': snapshots,
})
progress_layout.update_crawl_tree(tree_data)
# Update running process panels (tail stdout/stderr for each running process)
from archivebox.machine.models import Process
if self.crawl_id and getattr(self, 'db_process', None):
process_qs = self.db_process.get_descendants(include_self=False)
process_qs = process_qs.filter(status=Process.StatusChoices.RUNNING)
else:
process_qs = Process.objects.filter(
status=Process.StatusChoices.RUNNING,
).exclude(process_type=Process.TypeChoices.ORCHESTRATOR)
running_processes = [
proc for proc in process_qs.order_by('process_type', 'worker_type', 'started_at')
if proc.is_running
]
pending_processes = []
try:
from types import SimpleNamespace
for snap in pending_snapshot_candidates:
hooks_list = hooks_by_snapshot.get(str(snap.id), [])
if not hooks_list:
continue
existing = set(
snap.archiveresult_set.exclude(hook_name='').values_list('hook_name', flat=True)
)
for hook_path in hooks_list:
if hook_path.name in existing:
continue
pending_processes.append(SimpleNamespace(
process_type='hook',
worker_type='',
pid=None,
cmd=['', str(hook_path)],
url=snap.url,
status='queued',
started_at=None,
timeout=None,
pwd=None,
))
except Exception:
pending_processes = []
progress_layout.update_process_panels(running_processes, pending=pending_processes)
# Update snapshot progress
from archivebox.core.models import Snapshot
# Get all started snapshots (optionally filtered by crawl_id)
snapshot_filter: dict[str, str | datetime] = {'status': 'started'}
if self.crawl_id:
snapshot_filter['crawl_id'] = self.crawl_id
else:
# Only if processing all crawls, filter by recent modified_at to avoid stale snapshots
recent_cutoff = timezone.now() - timedelta(minutes=5)
snapshot_filter['modified_at__gte'] = recent_cutoff
active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
# Log snapshot count changes and details
if len(active_snapshots) != last_snapshot_count:
if last_snapshot_count is not None:
if len(active_snapshots) > last_snapshot_count:
progress_layout.log_event(
f"Active snapshots: {last_snapshot_count}{len(active_snapshots)}",
style="cyan"
)
# Log which snapshots started
for snapshot in active_snapshots[-1:]: # Just show the newest one
progress_layout.log_event(
f"Started: {snapshot.url[:60]}",
style="green"
)
# Log SnapshotWorker count
from archivebox.machine.models import Process
all_workers = Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
status__in=['running', 'started']
).count()
progress_layout.log_event(
f"Workers running: {all_workers} ({crawl_workers_count} CrawlWorkers)",
style="grey53"
)
else:
progress_layout.log_event(
f"Active snapshots: {last_snapshot_count}{len(active_snapshots)}",
style="blue"
)
last_snapshot_count = len(active_snapshots)
# Track which snapshots are still active
active_ids = set()
for snapshot in active_snapshots:
active_ids.add(snapshot.id)
total = snapshot.archiveresult_set.count()
completed = snapshot.archiveresult_set.filter(
status__in=['succeeded', 'skipped', 'failed']
).count()
# Count hooks by status for debugging
queued = snapshot.archiveresult_set.filter(status='queued').count()
# Find currently running hook (ordered by hook_name to get lowest step number)
current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first()
if not current_ar:
# If nothing running, show next queued item (ordered to get next in sequence)
current_ar = snapshot.archiveresult_set.filter(status='queued').order_by('hook_name').first()
current_plugin = ''
if current_ar:
# Use hook_name if available, otherwise plugin name
hook_name = current_ar.hook_name or current_ar.plugin or ''
# Extract just the hook name without path (e.g., "on_Snapshot__50_wget.py" -> "wget")
if hook_name:
# Clean up the name: remove prefix and extension
clean_name = hook_name.split('__')[-1] if '__' in hook_name else hook_name
clean_name = clean_name.replace('.py', '').replace('.sh', '').replace('.bg', '')
current_plugin = clean_name
elif total == 0:
# Snapshot just started, hooks not created yet
current_plugin = "initializing"
elif queued > 0:
# Hooks created but none started yet
current_plugin = "waiting"
# Debug: Log first time we see this snapshot
if snapshot.id not in snapshot_progress:
progress_layout.log_event(
f"Tracking snapshot: {snapshot.url[:50]}",
style="grey53"
)
# Track progress changes
prev_progress = snapshot_progress.get(snapshot.id, (0, 0, ''))
curr_progress = (total, completed, current_plugin)
if prev_progress != curr_progress:
prev_total, prev_completed, prev_plugin = prev_progress
# Log hook completion
if completed > prev_completed:
completed_ar = snapshot.archiveresult_set.filter(
status__in=['succeeded', 'skipped', 'failed']
).order_by('-end_ts', '-modified_at').first()
hook_label = ''
if completed_ar:
hook_name = completed_ar.hook_name or completed_ar.plugin or ''
if hook_name:
hook_label = hook_name.split('__')[-1] if '__' in hook_name else hook_name
hook_label = hook_label.replace('.py', '').replace('.js', '').replace('.sh', '').replace('.bg', '')
if not hook_label:
hook_label = f"{completed}/{total}"
progress_layout.log_event(
f"Hook completed: {hook_label}",
style="green"
)
# Log plugin change
if current_plugin and current_plugin != prev_plugin:
progress_layout.log_event(
f"Running: {current_plugin} ({snapshot.url[:40]})",
style="yellow"
)
snapshot_progress[snapshot.id] = curr_progress
# Debug: Every 10 ticks, log detailed status if stuck at initializing
if tick_count % 10 == 0 and total == 0 and current_plugin == "initializing":
progress_layout.log_event(
f"DEBUG: Snapshot stuck at initializing (status={snapshot.status})",
style="red"
)
# No per-snapshot panels; logs only
# Cleanup progress tracking for completed snapshots
for snapshot_id in list(snapshot_progress.keys()):
if snapshot_id not in active_ids:
progress_layout.log_event(
"Snapshot completed/removed",
style="blue"
)
if snapshot_id in snapshot_progress:
del snapshot_progress[snapshot_id]
if plain_output:
plain_lines = progress_layout.plain_lines()
new_lines = [line for line in plain_lines if line not in last_plain_lines]
if new_lines:
ts = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
for panel, line in new_lines:
if line:
print(f"[{ts}] [{panel}] {line}", file=sys.stderr)
last_plain_lines = set(plain_lines)
# Track idle state
has_pending = self.has_pending_work(queue_sizes)
has_running = self.has_running_workers()
if has_pending or has_running:
self.idle_count = 0
self.on_tick(queue_sizes)
else:
self.idle_count += 1
self.on_idle()
# Check if we should exit
if self.should_exit(queue_sizes):
if progress_layout:
progress_layout.log_event("All work complete", style="green")
log_worker_event(
worker_type='Orchestrator',
event='All work complete',
indent_level=0,
pid=self.pid,
)
break
time.sleep(self.POLL_INTERVAL)
except KeyboardInterrupt:
if progress_layout:
progress_layout.log_event("Interrupted by user", style="red")
print(file=sys.stderr) # Newline after ^C
self.on_shutdown(error=KeyboardInterrupt())
except BaseException as e:
if progress_layout:
progress_layout.log_event(f"Error: {e}", style="red")
self.on_shutdown(error=e)
raise
else:
self.on_shutdown()
def start(self) -> int:
    """
    Launch the orchestrator as a separate background process.

    The target is the module-level ``_run_orchestrator_process`` function
    because multiprocessing must be able to pickle it (local functions
    cannot be pickled on spawn-based platforms).

    Returns:
        int: PID of the newly forked orchestrator process.
    """
    child = MPProcess(
        target=_run_orchestrator_process,
        args=(self.exit_on_idle,),
        name='orchestrator',
    )
    child.start()
    assert child.pid is not None
    log_worker_event(
        worker_type='Orchestrator',
        event='Started in background',
        indent_level=0,
        pid=child.pid,
    )
    return child.pid
@classmethod
def get_or_start(cls, exit_on_idle: bool = True) -> 'Orchestrator':
    """
    Return an Orchestrator handle, announcing if one is already running.

    Used by commands like 'add' to ensure an orchestrator is available.
    A fresh local instance is returned in either case: when another
    orchestrator process already exists, the returned instance is only a
    placeholder handle (the real work happens in the other process).

    NOTE(review): despite the name, this never calls .start() on the new
    instance — callers appear responsible for running it; confirm.
    """
    if cls.is_running():
        print('[grey53]👨‍✈️ Orchestrator already running[/grey53]', file=sys.stderr)
    # Original code returned cls(exit_on_idle=exit_on_idle) from both
    # branches, so the construction is shared here.
    return cls(exit_on_idle=exit_on_idle)