mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip
This commit is contained in:
79
archivebox/workers/management/commands/orchestrator_watch.py
Normal file
79
archivebox/workers/management/commands/orchestrator_watch.py
Normal file
@@ -0,0 +1,79 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Watch the runserver autoreload PID file and restart the orchestrator on reloads.

    Polls a pidfile written by `runserver`; whenever the recorded PID changes
    (i.e. the dev server autoreloaded), all orchestrator/worker/hook processes
    on this machine are torn down and a fresh orchestrator is started.
    """

    help = "Watch the runserver autoreload PID file and restart orchestrator on reloads."

    def add_arguments(self, parser):
        parser.add_argument(
            "--pidfile",
            default=None,
            help="Path to runserver pidfile to watch",
        )
        parser.add_argument(
            "--interval",
            type=float,
            default=1.0,
            help="Polling interval in seconds",
        )

    def handle(self, *args, **kwargs):
        # Imports are deferred so the Django app registry is fully set up
        # before any model access happens.
        import os
        import time
        from archivebox.config.common import STORAGE_CONFIG
        from archivebox.machine.models import Process, Machine
        from archivebox.workers.orchestrator import Orchestrator

        # Mark this process so other components can recognize the watcher.
        os.environ['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1'

        # Pidfile resolution order: CLI flag > env var > default under TMP_DIR.
        pidfile = kwargs.get("pidfile") or os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
        if not pidfile:
            pidfile = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")

        # Clamp to a 0.2s floor so a bad flag value cannot busy-poll the disk.
        interval = max(0.2, float(kwargs.get("interval", 1.0)))

        last_pid = None

        def restart_orchestrator():
            """Kill every running orchestrator/worker/hook on this machine, then start a fresh orchestrator."""
            Process.cleanup_stale_running()
            machine = Machine.current()

            running = Process.objects.filter(
                machine=machine,
                status=Process.StatusChoices.RUNNING,
                process_type__in=[
                    Process.TypeChoices.ORCHESTRATOR,
                    Process.TypeChoices.WORKER,
                    Process.TypeChoices.HOOK,
                ],
            )
            for proc in running:
                try:
                    # Hooks get a near-immediate tree kill; workers/orchestrators
                    # get a slightly longer graceful terminate.
                    if proc.process_type == Process.TypeChoices.HOOK:
                        proc.kill_tree(graceful_timeout=0.5)
                    else:
                        proc.terminate(graceful_timeout=1.0)
                except Exception:
                    # Best-effort sweep: an already-dead process must not abort it.
                    continue

            if not Orchestrator.is_running():
                Orchestrator(exit_on_idle=False).start()

        while True:
            try:
                if os.path.exists(pidfile):
                    with open(pidfile, "r") as handle:
                        pid = handle.read().strip() or None
                else:
                    pid = None

                # A new PID in the file means runserver reloaded: restart everything.
                if pid and pid != last_pid:
                    restart_orchestrator()
                    last_pid = pid
                elif not Orchestrator.is_running():
                    # Orchestrator died on its own; bring it back without a full sweep.
                    Orchestrator(exit_on_idle=False).start()

            except Exception as err:
                # Keep the watcher alive, but surface the error instead of
                # silently swallowing it (previously a bare `pass`).
                self.stderr.write(f'orchestrator_watch error: {err}')

            time.sleep(interval)
|
||||
@@ -42,6 +42,8 @@ from .worker import Worker, BinaryWorker, CrawlWorker
|
||||
|
||||
def _run_orchestrator_process(exit_on_idle: bool) -> None:
|
||||
"""Top-level function for multiprocessing (must be picklable)."""
|
||||
import os
|
||||
os.environ['ARCHIVEBOX_ORCHESTRATOR_PROCESS'] = '1'
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
|
||||
@@ -80,6 +82,7 @@ class Orchestrator:
|
||||
self.pid_file = None
|
||||
self.idle_count: int = 0
|
||||
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
|
||||
self._last_hard_timeout_check: float = 0.0 # Throttle hard timeout enforcement
|
||||
|
||||
# In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker
|
||||
if self.exit_on_idle:
|
||||
@@ -255,10 +258,6 @@ class Orchestrator:
|
||||
pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id)
|
||||
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
|
||||
|
||||
if self.exit_on_idle:
|
||||
# Foreground runs have MAX_CRAWL_WORKERS=1; avoid blocking startup on registration.
|
||||
return pid
|
||||
|
||||
# CRITICAL: Block until worker registers itself in Process table
|
||||
# This prevents race condition where orchestrator spawns multiple workers
|
||||
# before any of them finish on_startup() and register
|
||||
@@ -333,6 +332,8 @@ class Orchestrator:
|
||||
|
||||
queue_sizes = {}
|
||||
|
||||
self._enforce_hard_timeouts()
|
||||
|
||||
# Check Binary queue
|
||||
machine = Machine.current()
|
||||
binary_queue = Binary.objects.filter(
|
||||
@@ -359,6 +360,22 @@ class Orchestrator:
|
||||
status__in=Crawl.FINAL_STATES
|
||||
)
|
||||
|
||||
# Prevent duplicate CrawlWorkers for the same crawl (even across orchestrators)
|
||||
from archivebox.machine.models import Process
|
||||
running_crawl_ids: set[str] = set()
|
||||
running_crawl_workers = Process.objects.filter(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
worker_type='crawl',
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
).values_list('env', flat=True)
|
||||
for env in running_crawl_workers:
|
||||
if isinstance(env, dict):
|
||||
crawl_id = env.get('CRAWL_ID')
|
||||
if crawl_id:
|
||||
running_crawl_ids.add(str(crawl_id))
|
||||
if running_crawl_ids:
|
||||
crawl_queue = crawl_queue.exclude(id__in=running_crawl_ids)
|
||||
|
||||
# Apply crawl_id filter if set
|
||||
if self.crawl_id:
|
||||
crawl_queue = crawl_queue.filter(id=self.crawl_id)
|
||||
@@ -379,6 +396,156 @@ class Orchestrator:
|
||||
|
||||
return queue_sizes
|
||||
|
||||
def _enforce_hard_timeouts(self) -> None:
    """Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits.

    Runs at most once per 30 seconds (throttled via self._last_hard_timeout_check).
    Order matters: overdue hooks are killed first, then overdue snapshots and
    their workers, then snapshot/crawl statuses are reconciled with reality.
    """
    import time
    from datetime import timedelta
    from archivebox.config.constants import CONSTANTS
    from archivebox.machine.models import Process
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl

    # Throttle: skip entirely if we ran within the last 30s.
    throttle_seconds = 30
    now_ts = time.time()
    if now_ts - self._last_hard_timeout_check < throttle_seconds:
        return
    self._last_hard_timeout_check = now_ts

    now = timezone.now()

    # Hard limit for hook processes / archiveresults
    hook_cutoff = now - timedelta(seconds=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS)
    overdue_hooks = Process.objects.filter(
        process_type=Process.TypeChoices.HOOK,
        status=Process.StatusChoices.RUNNING,
        started_at__lt=hook_cutoff,
    ).select_related('archiveresult')

    for proc in overdue_hooks:
        try:
            # Immediate kill (no grace period) — the hook already blew its budget.
            proc.kill_tree(graceful_timeout=0.0)
        except Exception:
            pass

        # Seal the hook's archiveresult as FAILED if it was still in-flight.
        ar = getattr(proc, 'archiveresult', None)
        if ar and ar.status == ArchiveResult.StatusChoices.STARTED:
            ar.status = ArchiveResult.StatusChoices.FAILED
            ar.end_ts = now
            ar.retry_at = None
            ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at'])

    # Hard limit for snapshots
    # NOTE(review): uses modified_at as a proxy for "last progress" — any save
    # bumps it, so this measures staleness rather than total runtime; confirm intended.
    snapshot_cutoff = now - timedelta(seconds=CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS)
    overdue_snapshots = Snapshot.objects.filter(
        status=Snapshot.StatusChoices.STARTED,
        modified_at__lt=snapshot_cutoff,
    )

    # Terminate any SnapshotWorker whose env SNAPSHOT_ID matches an overdue snapshot.
    overdue_snapshot_ids = {str(s.id) for s in overdue_snapshots}
    if overdue_snapshot_ids:
        running_snapshot_workers = Process.objects.filter(
            process_type=Process.TypeChoices.WORKER,
            worker_type='snapshot',
            status=Process.StatusChoices.RUNNING,
        )
        for proc in running_snapshot_workers:
            env = proc.env or {}
            if isinstance(env, dict) and str(env.get('SNAPSHOT_ID', '')) in overdue_snapshot_ids:
                try:
                    proc.terminate(graceful_timeout=1.0)
                except Exception:
                    pass

    for snapshot in overdue_snapshots:
        # Kill every hook still running under this snapshot before sealing it.
        running_hooks = Process.objects.filter(
            archiveresult__snapshot=snapshot,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
        ).distinct()
        for process in running_hooks:
            try:
                process.kill_tree(graceful_timeout=0.0)
            except Exception:
                continue

        # Bulk-fail any archiveresults that never finished.
        snapshot.archiveresult_set.filter(
            status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED],
        ).update(
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=now,
            retry_at=None,
            modified_at=now,
        )

        # Direct status write (bypasses the state machine) after cleanup.
        snapshot.cleanup()
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])

        # If this was the crawl's last unfinished snapshot, seal the crawl too.
        crawl = snapshot.crawl
        if crawl and crawl.is_finished():
            crawl.status = crawl.StatusChoices.SEALED
            crawl.retry_at = None
            crawl.save(update_fields=['status', 'retry_at', 'modified_at'])

    # Reconcile snapshot/crawl state with running archiveresults
    started_snapshot_ids = list(
        ArchiveResult.objects.filter(
            status=ArchiveResult.StatusChoices.STARTED,
        ).values_list('snapshot_id', flat=True).distinct()
    )
    if started_snapshot_ids:
        # Snapshots with in-flight work should read STARTED (unless already sealed).
        Snapshot.objects.filter(
            id__in=started_snapshot_ids,
        ).exclude(
            status=Snapshot.StatusChoices.SEALED,
        ).exclude(
            status=Snapshot.StatusChoices.STARTED,
        ).update(
            status=Snapshot.StatusChoices.STARTED,
            retry_at=None,
            modified_at=now,
        )

        # Likewise promote QUEUED crawls whose snapshots are actively running.
        Crawl.objects.filter(
            snapshot_set__id__in=started_snapshot_ids,
            status=Crawl.StatusChoices.QUEUED,
        ).distinct().update(
            status=Crawl.StatusChoices.STARTED,
            retry_at=None,
            modified_at=now,
        )

    # If a snapshot is sealed, any still-started archiveresults should be failed
    sealed_snapshot_ids = list(
        Snapshot.objects.filter(status=Snapshot.StatusChoices.SEALED).values_list('id', flat=True)
    )
    if sealed_snapshot_ids:
        started_ars = ArchiveResult.objects.filter(
            snapshot_id__in=sealed_snapshot_ids,
            status=ArchiveResult.StatusChoices.STARTED,
        ).select_related('process')
        for ar in started_ars:
            # Kill the orphaned hook process first (if still alive), then fail the AR.
            if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
                try:
                    ar.process.kill_tree(graceful_timeout=0.0)
                except Exception:
                    pass
            ar.status = ArchiveResult.StatusChoices.FAILED
            ar.end_ts = now
            ar.retry_at = None
            ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at'])

    # Clear queued/started snapshots that belong to sealed crawls
    Snapshot.objects.filter(
        crawl__status=Crawl.StatusChoices.SEALED,
        status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
    ).update(
        status=Snapshot.StatusChoices.SEALED,
        retry_at=None,
        modified_at=now,
    )
|
||||
|
||||
def _claim_crawl(self, crawl) -> bool:
|
||||
"""Atomically claim a crawl using optimistic locking."""
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
@@ -32,7 +32,8 @@ _supervisord_proc = None
|
||||
|
||||
ORCHESTRATOR_WORKER = {
|
||||
"name": "worker_orchestrator",
|
||||
"command": "archivebox run", # runs forever by default
|
||||
# Use Django management command to avoid stdin/TTY ambiguity in `archivebox run`.
|
||||
"command": "archivebox manage orchestrator",
|
||||
"autostart": "true",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": "logs/worker_orchestrator.log",
|
||||
|
||||
@@ -436,6 +436,7 @@ class CrawlWorker(Worker):
|
||||
super().__init__(**kwargs)
|
||||
self.crawl_id = crawl_id
|
||||
self.crawl = None
|
||||
self.crawl_config = None
|
||||
|
||||
def get_model(self):
|
||||
from archivebox.crawls.models import Crawl
|
||||
@@ -446,7 +447,9 @@ class CrawlWorker(Worker):
|
||||
super().on_startup()
|
||||
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.config.configset import get_config
|
||||
self.crawl = Crawl.objects.get(id=self.crawl_id)
|
||||
self.crawl_config = get_config(crawl=self.crawl)
|
||||
|
||||
def runloop(self) -> None:
|
||||
"""Run crawl state machine, spawn SnapshotWorkers."""
|
||||
@@ -484,6 +487,12 @@ class CrawlWorker(Worker):
|
||||
|
||||
# Now spawn SnapshotWorkers and monitor progress
|
||||
while True:
|
||||
self.crawl.refresh_from_db()
|
||||
if self.crawl.status == Crawl.StatusChoices.SEALED:
|
||||
print(f'🛑 Crawl {self.crawl_id} was sealed, stopping workers', file=sys.stderr)
|
||||
self._terminate_running_snapshot_workers()
|
||||
break
|
||||
|
||||
# Check if crawl is done
|
||||
if self._is_crawl_finished():
|
||||
print(f'🔄 Crawl finished, sealing...', file=sys.stderr)
|
||||
@@ -589,6 +598,22 @@ class CrawlWorker(Worker):
|
||||
thread = threading.Thread(target=pipe_worker_stderr, daemon=True)
|
||||
thread.start()
|
||||
|
||||
def _terminate_running_snapshot_workers(self) -> None:
    """Ask every still-running SnapshotWorker spawned by this crawl to shut down."""
    from archivebox.machine.models import Process

    # Only children of this CrawlWorker's own DB process record are targeted.
    snapshot_workers = Process.objects.filter(
        process_type=Process.TypeChoices.WORKER,
        worker_type='snapshot',
        status=Process.StatusChoices.RUNNING,
        parent_id=self.db_process.id,
    )
    for worker_proc in snapshot_workers:
        try:
            worker_proc.terminate(graceful_timeout=1.0)
        except Exception:
            # Best-effort: a worker that already exited must not stop the sweep.
            continue
|
||||
|
||||
def _is_crawl_finished(self) -> bool:
|
||||
"""Check if all snapshots are sealed."""
|
||||
from pathlib import Path
|
||||
@@ -684,19 +709,29 @@ class SnapshotWorker(Worker):
|
||||
from archivebox.core.models import Snapshot
|
||||
self.snapshot = Snapshot.objects.get(id=self.snapshot_id)
|
||||
|
||||
if self.snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
return
|
||||
|
||||
# Use state machine to transition queued -> started (triggers enter_started())
|
||||
self.snapshot.sm.tick()
|
||||
self.snapshot.refresh_from_db()
|
||||
self.snapshot_started_at = self.snapshot.modified_at or self.snapshot.created_at
|
||||
|
||||
def runloop(self) -> None:
|
||||
"""Execute all hooks sequentially."""
|
||||
from archivebox.hooks import discover_hooks, is_background_hook
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
self.on_startup()
|
||||
|
||||
try:
|
||||
if self.snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
return
|
||||
if self._snapshot_exceeded_hard_timeout():
|
||||
self._seal_snapshot_due_to_timeout()
|
||||
return
|
||||
|
||||
# Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.)
|
||||
config = get_config(snapshot=self.snapshot, crawl=self.snapshot.crawl)
|
||||
|
||||
@@ -706,6 +741,13 @@ class SnapshotWorker(Worker):
|
||||
|
||||
# Execute each hook sequentially
|
||||
for hook_path in hooks:
|
||||
self.snapshot.refresh_from_db()
|
||||
if self.snapshot.status == Snapshot.StatusChoices.SEALED:
|
||||
break
|
||||
if self._snapshot_exceeded_hard_timeout():
|
||||
self._seal_snapshot_due_to_timeout()
|
||||
return
|
||||
|
||||
hook_name = hook_path.name
|
||||
plugin = self._extract_plugin_name(hook_path, hook_name)
|
||||
is_background = is_background_hook(hook_name)
|
||||
@@ -756,9 +798,10 @@ class SnapshotWorker(Worker):
|
||||
|
||||
# All hooks launched (or completed) - terminate bg hooks and seal
|
||||
self._finalize_background_hooks()
|
||||
# This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing
|
||||
self.snapshot.sm.seal()
|
||||
self.snapshot.refresh_from_db()
|
||||
if self.snapshot.status != Snapshot.StatusChoices.SEALED:
|
||||
# This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing
|
||||
self.snapshot.sm.seal()
|
||||
self.snapshot.refresh_from_db()
|
||||
|
||||
except Exception as e:
|
||||
# Mark snapshot as sealed even on error (still triggers cleanup)
|
||||
@@ -771,17 +814,34 @@ class SnapshotWorker(Worker):
|
||||
|
||||
def _run_hook(self, hook_path: Path, ar: Any, config: dict) -> Any:
|
||||
"""Fork and run a hook using Process model, return Process."""
|
||||
from archivebox.hooks import run_hook
|
||||
from archivebox.hooks import run_hook, get_plugin_special_config
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
# Create output directory
|
||||
output_dir = ar.create_output_dir()
|
||||
|
||||
timeout = None
|
||||
try:
|
||||
plugin_name = hook_path.parent.name
|
||||
plugin_config = get_plugin_special_config(plugin_name, config)
|
||||
timeout = plugin_config.get('timeout')
|
||||
except Exception:
|
||||
timeout = None
|
||||
|
||||
if getattr(self, 'snapshot_started_at', None):
|
||||
remaining = max(1, int(CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS - (timezone.now() - self.snapshot_started_at).total_seconds()))
|
||||
if timeout:
|
||||
timeout = min(int(timeout), remaining)
|
||||
else:
|
||||
timeout = remaining
|
||||
|
||||
# Run hook using Process.launch() - returns Process model directly
|
||||
# Pass self.db_process as parent to track SnapshotWorker -> Hook hierarchy
|
||||
process = run_hook(
|
||||
script=hook_path,
|
||||
output_dir=output_dir,
|
||||
config=config,
|
||||
timeout=timeout,
|
||||
parent=self.db_process,
|
||||
url=str(self.snapshot.url),
|
||||
snapshot_id=str(self.snapshot.id),
|
||||
@@ -872,6 +932,44 @@ class SnapshotWorker(Worker):
|
||||
# Remove completed hook from tracking
|
||||
self.background_processes.pop(hook_name, None)
|
||||
|
||||
def _snapshot_exceeded_hard_timeout(self) -> bool:
    """Return True when this snapshot has run longer than the hard runtime cap."""
    from archivebox.config.constants import CONSTANTS

    started_at = getattr(self, 'snapshot_started_at', None)
    if not started_at:
        # No recorded start time yet -> nothing to measure against.
        return False
    elapsed = (timezone.now() - started_at).total_seconds()
    return elapsed > CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS
|
||||
|
||||
def _seal_snapshot_due_to_timeout(self) -> None:
    """Hard-stop this snapshot: kill its running hooks, fail its pending
    archiveresults, run cleanup, and mark the snapshot SEALED.

    Called when _snapshot_exceeded_hard_timeout() reports the snapshot blew
    its runtime budget. Ordering matters: processes are killed before the
    DB rows are sealed so no hook can race in a late status write.
    """
    from archivebox.core.models import ArchiveResult
    from archivebox.machine.models import Process

    now = timezone.now()

    # Kill every hook process still attached to this snapshot's archiveresults.
    running_hooks = Process.objects.filter(
        archiveresult__snapshot=self.snapshot,
        process_type=Process.TypeChoices.HOOK,
        status=Process.StatusChoices.RUNNING,
    ).distinct()
    for process in running_hooks:
        try:
            # No grace period — the snapshot already exceeded its hard limit.
            process.kill_tree(graceful_timeout=0.0)
        except Exception:
            # Best-effort: an already-dead process must not abort the sweep.
            continue

    # Bulk-fail everything that never got to finish.
    self.snapshot.archiveresult_set.filter(
        status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED],
    ).update(
        status=ArchiveResult.StatusChoices.FAILED,
        end_ts=now,
        retry_at=None,
        modified_at=now,
    )

    # Direct status write (bypasses the state machine's seal() transition) —
    # cleanup() is invoked explicitly since enter_sealed() won't fire here.
    self.snapshot.cleanup()
    self.snapshot.status = self.snapshot.StatusChoices.SEALED
    self.snapshot.retry_at = None
    self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
|
||||
def on_shutdown(self, error: BaseException | None = None) -> None:
|
||||
"""
|
||||
Terminate all background Snapshot hooks when snapshot finishes.
|
||||
|
||||
Reference in New Issue
Block a user