This commit is contained in:
Nick Sweeting
2026-01-21 03:19:56 -08:00
parent f3f55d3395
commit ec4b27056e
113 changed files with 6929 additions and 2396 deletions

View File

@@ -0,0 +1,79 @@
from django.core.management.base import BaseCommand
class Command(BaseCommand):
    """Poll the runserver autoreload pidfile and keep the orchestrator alive.

    Whenever the PID recorded in the pidfile changes (i.e. Django's autoreloader
    restarted runserver), all running orchestrator/worker/hook processes on this
    machine are torn down and a fresh Orchestrator is started.  If the pidfile is
    missing, the watcher still ensures an Orchestrator is running.
    """

    help = "Watch the runserver autoreload PID file and restart orchestrator on reloads."

    def add_arguments(self, parser):
        parser.add_argument(
            "--pidfile",
            default=None,
            help="Path to runserver pidfile to watch",
        )
        parser.add_argument(
            "--interval",
            type=float,
            default=1.0,
            help="Polling interval in seconds",
        )

    def handle(self, *args, **kwargs):
        # Imports are deferred so Django is fully set up before models load.
        import os
        import time

        from archivebox.config.common import STORAGE_CONFIG
        from archivebox.machine.models import Process, Machine
        from archivebox.workers.orchestrator import Orchestrator

        # Mark this process so children can tell they were spawned by the watcher.
        os.environ['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1'

        # Resolve pidfile: CLI flag > env var > default under the tmp dir.
        pidfile = kwargs.get("pidfile") or os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
        if not pidfile:
            pidfile = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")

        # Clamp the polling interval to a sane floor so a bad flag can't busy-loop.
        interval = max(0.2, float(kwargs.get("interval", 1.0)))

        last_pid = None

        def restart_orchestrator():
            """Kill all running orchestrator/worker/hook processes, then start fresh."""
            # Reap stale Process rows first so we don't try to kill dead PIDs.
            Process.cleanup_stale_running()
            machine = Machine.current()
            running = Process.objects.filter(
                machine=machine,
                status=Process.StatusChoices.RUNNING,
                process_type__in=[
                    Process.TypeChoices.ORCHESTRATOR,
                    Process.TypeChoices.WORKER,
                    Process.TypeChoices.HOOK,
                ],
            )
            for proc in running:
                try:
                    if proc.process_type == Process.TypeChoices.HOOK:
                        # Hooks get a hard kill of their whole tree, short grace.
                        proc.kill_tree(graceful_timeout=0.5)
                    else:
                        proc.terminate(graceful_timeout=1.0)
                except Exception:
                    # Best-effort teardown: one unkillable process must not
                    # prevent the rest from being stopped.
                    continue
            if not Orchestrator.is_running():
                Orchestrator(exit_on_idle=False).start()

        # Main watch loop: runs until the management command itself is killed.
        while True:
            try:
                if os.path.exists(pidfile):
                    with open(pidfile, "r") as handle:
                        pid = handle.read().strip() or None
                else:
                    pid = None

                if pid and pid != last_pid:
                    # runserver (re)started: cycle the orchestrator stack.
                    restart_orchestrator()
                    last_pid = pid
                elif not Orchestrator.is_running():
                    # No reload detected but the orchestrator died: revive it.
                    Orchestrator(exit_on_idle=False).start()
            except Exception as exc:
                # Keep watching, but surface the failure instead of hiding it —
                # the previous bare `pass` made persistent errors (e.g. a broken
                # DB connection) completely invisible to operators.
                self.stderr.write(f"orchestrator watcher error: {exc}")
            time.sleep(interval)

View File

@@ -42,6 +42,8 @@ from .worker import Worker, BinaryWorker, CrawlWorker
def _run_orchestrator_process(exit_on_idle: bool) -> None:
"""Top-level function for multiprocessing (must be picklable)."""
import os
os.environ['ARCHIVEBOX_ORCHESTRATOR_PROCESS'] = '1'
from archivebox.config.django import setup_django
setup_django()
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
@@ -80,6 +82,7 @@ class Orchestrator:
self.pid_file = None
self.idle_count: int = 0
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
self._last_hard_timeout_check: float = 0.0 # Throttle hard timeout enforcement
# In foreground mode (exit_on_idle=True), limit to 1 CrawlWorker
if self.exit_on_idle:
@@ -255,10 +258,6 @@ class Orchestrator:
pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id)
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
if self.exit_on_idle:
# Foreground runs have MAX_CRAWL_WORKERS=1; avoid blocking startup on registration.
return pid
# CRITICAL: Block until worker registers itself in Process table
# This prevents race condition where orchestrator spawns multiple workers
# before any of them finish on_startup() and register
@@ -333,6 +332,8 @@ class Orchestrator:
queue_sizes = {}
self._enforce_hard_timeouts()
# Check Binary queue
machine = Machine.current()
binary_queue = Binary.objects.filter(
@@ -359,6 +360,22 @@ class Orchestrator:
status__in=Crawl.FINAL_STATES
)
# Prevent duplicate CrawlWorkers for the same crawl (even across orchestrators)
from archivebox.machine.models import Process
running_crawl_ids: set[str] = set()
running_crawl_workers = Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
worker_type='crawl',
status=Process.StatusChoices.RUNNING,
).values_list('env', flat=True)
for env in running_crawl_workers:
if isinstance(env, dict):
crawl_id = env.get('CRAWL_ID')
if crawl_id:
running_crawl_ids.add(str(crawl_id))
if running_crawl_ids:
crawl_queue = crawl_queue.exclude(id__in=running_crawl_ids)
# Apply crawl_id filter if set
if self.crawl_id:
crawl_queue = crawl_queue.filter(id=self.crawl_id)
@@ -379,6 +396,156 @@ class Orchestrator:
return queue_sizes
def _enforce_hard_timeouts(self) -> None:
    """Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits.

    Runs at most once every ``throttle_seconds`` (tracked via
    ``self._last_hard_timeout_check``).  Performs, in order:
      1. Kill hook processes running past MAX_HOOK_RUNTIME_SECONDS and fail
         their ArchiveResults.
      2. Kill workers/hooks for snapshots past MAX_SNAPSHOT_RUNTIME_SECONDS,
         fail their pending ArchiveResults, and seal the snapshots (and their
         crawl if it is now finished).
      3. Reconcile state: snapshots/crawls with STARTED archiveresults are
         promoted to STARTED; sealed snapshots have lingering STARTED
         archiveresults failed; snapshots of sealed crawls are sealed.
    """
    import time
    from datetime import timedelta
    from archivebox.config.constants import CONSTANTS
    from archivebox.machine.models import Process
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl

    # Throttle: this method is called from the main queue-check loop, so bail
    # out unless at least 30s have passed since the last enforcement pass.
    throttle_seconds = 30
    now_ts = time.time()
    if now_ts - self._last_hard_timeout_check < throttle_seconds:
        return
    self._last_hard_timeout_check = now_ts

    now = timezone.now()

    # Hard limit for hook processes / archiveresults
    hook_cutoff = now - timedelta(seconds=CONSTANTS.MAX_HOOK_RUNTIME_SECONDS)
    overdue_hooks = Process.objects.filter(
        process_type=Process.TypeChoices.HOOK,
        status=Process.StatusChoices.RUNNING,
        started_at__lt=hook_cutoff,
    ).select_related('archiveresult')
    for proc in overdue_hooks:
        try:
            # No grace period: the hook already blew past its hard limit.
            proc.kill_tree(graceful_timeout=0.0)
        except Exception:
            pass  # best-effort kill; still fail the archiveresult below
        ar = getattr(proc, 'archiveresult', None)
        if ar and ar.status == ArchiveResult.StatusChoices.STARTED:
            ar.status = ArchiveResult.StatusChoices.FAILED
            ar.end_ts = now
            ar.retry_at = None
            ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at'])

    # Hard limit for snapshots
    # NOTE(review): uses modified_at as a proxy for "last progress" — assumes
    # snapshot workers touch the row while active; verify against worker code.
    snapshot_cutoff = now - timedelta(seconds=CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS)
    overdue_snapshots = Snapshot.objects.filter(
        status=Snapshot.StatusChoices.STARTED,
        modified_at__lt=snapshot_cutoff,
    )
    overdue_snapshot_ids = {str(s.id) for s in overdue_snapshots}
    if overdue_snapshot_ids:
        # Terminate SnapshotWorkers whose env SNAPSHOT_ID matches an overdue
        # snapshot (the worker row itself has no FK to the snapshot).
        running_snapshot_workers = Process.objects.filter(
            process_type=Process.TypeChoices.WORKER,
            worker_type='snapshot',
            status=Process.StatusChoices.RUNNING,
        )
        for proc in running_snapshot_workers:
            env = proc.env or {}
            if isinstance(env, dict) and str(env.get('SNAPSHOT_ID', '')) in overdue_snapshot_ids:
                try:
                    proc.terminate(graceful_timeout=1.0)
                except Exception:
                    pass
    for snapshot in overdue_snapshots:
        # Kill any hook processes still attached to this snapshot's results.
        running_hooks = Process.objects.filter(
            archiveresult__snapshot=snapshot,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
        ).distinct()
        for process in running_hooks:
            try:
                process.kill_tree(graceful_timeout=0.0)
            except Exception:
                continue
        # Fail everything still pending, then seal the snapshot directly
        # (bypassing the state machine) since the hard limit was exceeded.
        snapshot.archiveresult_set.filter(
            status__in=[ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED],
        ).update(
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=now,
            retry_at=None,
            modified_at=now,
        )
        snapshot.cleanup()
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
        # Sealing this snapshot may have been the last thing its crawl was
        # waiting on — seal the crawl too if it is now finished.
        crawl = snapshot.crawl
        if crawl and crawl.is_finished():
            crawl.status = crawl.StatusChoices.SEALED
            crawl.retry_at = None
            crawl.save(update_fields=['status', 'retry_at', 'modified_at'])

    # Reconcile snapshot/crawl state with running archiveresults
    started_snapshot_ids = list(
        ArchiveResult.objects.filter(
            status=ArchiveResult.StatusChoices.STARTED,
        ).values_list('snapshot_id', flat=True).distinct()
    )
    if started_snapshot_ids:
        # Any non-sealed snapshot with a STARTED archiveresult should itself
        # be STARTED (catches rows left behind in QUEUED or other states).
        Snapshot.objects.filter(
            id__in=started_snapshot_ids,
        ).exclude(
            status=Snapshot.StatusChoices.SEALED,
        ).exclude(
            status=Snapshot.StatusChoices.STARTED,
        ).update(
            status=Snapshot.StatusChoices.STARTED,
            retry_at=None,
            modified_at=now,
        )
        # Likewise promote QUEUED crawls that have actively-running snapshots.
        Crawl.objects.filter(
            snapshot_set__id__in=started_snapshot_ids,
            status=Crawl.StatusChoices.QUEUED,
        ).distinct().update(
            status=Crawl.StatusChoices.STARTED,
            retry_at=None,
            modified_at=now,
        )

    # If a snapshot is sealed, any still-started archiveresults should be failed
    sealed_snapshot_ids = list(
        Snapshot.objects.filter(status=Snapshot.StatusChoices.SEALED).values_list('id', flat=True)
    )
    if sealed_snapshot_ids:
        started_ars = ArchiveResult.objects.filter(
            snapshot_id__in=sealed_snapshot_ids,
            status=ArchiveResult.StatusChoices.STARTED,
        ).select_related('process')
        for ar in started_ars:
            # Kill the backing hook process first (if it is still alive),
            # then mark the archiveresult failed.
            if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
                try:
                    ar.process.kill_tree(graceful_timeout=0.0)
                except Exception:
                    pass
            ar.status = ArchiveResult.StatusChoices.FAILED
            ar.end_ts = now
            ar.retry_at = None
            ar.save(update_fields=['status', 'end_ts', 'retry_at', 'modified_at'])

    # Clear queued/started snapshots that belong to sealed crawls
    Snapshot.objects.filter(
        crawl__status=Crawl.StatusChoices.SEALED,
        status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED],
    ).update(
        status=Snapshot.StatusChoices.SEALED,
        retry_at=None,
        modified_at=now,
    )
def _claim_crawl(self, crawl) -> bool:
"""Atomically claim a crawl using optimistic locking."""
from archivebox.crawls.models import Crawl

View File

@@ -32,7 +32,8 @@ _supervisord_proc = None
ORCHESTRATOR_WORKER = {
"name": "worker_orchestrator",
"command": "archivebox run", # runs forever by default
# Use Django management command to avoid stdin/TTY ambiguity in `archivebox run`.
"command": "archivebox manage orchestrator",
"autostart": "true",
"autorestart": "true",
"stdout_logfile": "logs/worker_orchestrator.log",

View File

@@ -436,6 +436,7 @@ class CrawlWorker(Worker):
super().__init__(**kwargs)
self.crawl_id = crawl_id
self.crawl = None
self.crawl_config = None
def get_model(self):
from archivebox.crawls.models import Crawl
@@ -446,7 +447,9 @@ class CrawlWorker(Worker):
super().on_startup()
from archivebox.crawls.models import Crawl
from archivebox.config.configset import get_config
self.crawl = Crawl.objects.get(id=self.crawl_id)
self.crawl_config = get_config(crawl=self.crawl)
def runloop(self) -> None:
"""Run crawl state machine, spawn SnapshotWorkers."""
@@ -484,6 +487,12 @@ class CrawlWorker(Worker):
# Now spawn SnapshotWorkers and monitor progress
while True:
self.crawl.refresh_from_db()
if self.crawl.status == Crawl.StatusChoices.SEALED:
print(f'🛑 Crawl {self.crawl_id} was sealed, stopping workers', file=sys.stderr)
self._terminate_running_snapshot_workers()
break
# Check if crawl is done
if self._is_crawl_finished():
print(f'🔄 Crawl finished, sealing...', file=sys.stderr)
@@ -589,6 +598,22 @@ class CrawlWorker(Worker):
thread = threading.Thread(target=pipe_worker_stderr, daemon=True)
thread.start()
def _terminate_running_snapshot_workers(self) -> None:
    """Gracefully stop every SnapshotWorker process spawned by this crawl worker."""
    from archivebox.machine.models import Process

    # Children of this worker are linked via parent_id on the Process row.
    worker_filter = dict(
        process_type=Process.TypeChoices.WORKER,
        worker_type='snapshot',
        parent_id=self.db_process.id,
        status=Process.StatusChoices.RUNNING,
    )
    for worker_proc in Process.objects.filter(**worker_filter):
        try:
            worker_proc.terminate(graceful_timeout=1.0)
        except Exception:
            # Best effort: an unkillable worker must not stop the sweep.
            continue
def _is_crawl_finished(self) -> bool:
"""Check if all snapshots are sealed."""
from pathlib import Path
@@ -684,19 +709,29 @@ class SnapshotWorker(Worker):
from archivebox.core.models import Snapshot
self.snapshot = Snapshot.objects.get(id=self.snapshot_id)
if self.snapshot.status == Snapshot.StatusChoices.SEALED:
return
# Use state machine to transition queued -> started (triggers enter_started())
self.snapshot.sm.tick()
self.snapshot.refresh_from_db()
self.snapshot_started_at = self.snapshot.modified_at or self.snapshot.created_at
def runloop(self) -> None:
"""Execute all hooks sequentially."""
from archivebox.hooks import discover_hooks, is_background_hook
from archivebox.core.models import ArchiveResult
from archivebox.core.models import ArchiveResult, Snapshot
from archivebox.config.configset import get_config
self.on_startup()
try:
if self.snapshot.status == Snapshot.StatusChoices.SEALED:
return
if self._snapshot_exceeded_hard_timeout():
self._seal_snapshot_due_to_timeout()
return
# Get merged config (includes env vars passed via Process.env, snapshot.config, defaults, etc.)
config = get_config(snapshot=self.snapshot, crawl=self.snapshot.crawl)
@@ -706,6 +741,13 @@ class SnapshotWorker(Worker):
# Execute each hook sequentially
for hook_path in hooks:
self.snapshot.refresh_from_db()
if self.snapshot.status == Snapshot.StatusChoices.SEALED:
break
if self._snapshot_exceeded_hard_timeout():
self._seal_snapshot_due_to_timeout()
return
hook_name = hook_path.name
plugin = self._extract_plugin_name(hook_path, hook_name)
is_background = is_background_hook(hook_name)
@@ -756,9 +798,10 @@ class SnapshotWorker(Worker):
# All hooks launched (or completed) - terminate bg hooks and seal
self._finalize_background_hooks()
# This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing
self.snapshot.sm.seal()
self.snapshot.refresh_from_db()
if self.snapshot.status != Snapshot.StatusChoices.SEALED:
# This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing
self.snapshot.sm.seal()
self.snapshot.refresh_from_db()
except Exception as e:
# Mark snapshot as sealed even on error (still triggers cleanup)
@@ -771,17 +814,34 @@ class SnapshotWorker(Worker):
def _run_hook(self, hook_path: Path, ar: Any, config: dict) -> Any:
"""Fork and run a hook using Process model, return Process."""
from archivebox.hooks import run_hook
from archivebox.hooks import run_hook, get_plugin_special_config
from archivebox.config.constants import CONSTANTS
# Create output directory
output_dir = ar.create_output_dir()
timeout = None
try:
plugin_name = hook_path.parent.name
plugin_config = get_plugin_special_config(plugin_name, config)
timeout = plugin_config.get('timeout')
except Exception:
timeout = None
if getattr(self, 'snapshot_started_at', None):
remaining = max(1, int(CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS - (timezone.now() - self.snapshot_started_at).total_seconds()))
if timeout:
timeout = min(int(timeout), remaining)
else:
timeout = remaining
# Run hook using Process.launch() - returns Process model directly
# Pass self.db_process as parent to track SnapshotWorker -> Hook hierarchy
process = run_hook(
script=hook_path,
output_dir=output_dir,
config=config,
timeout=timeout,
parent=self.db_process,
url=str(self.snapshot.url),
snapshot_id=str(self.snapshot.id),
@@ -872,6 +932,44 @@ class SnapshotWorker(Worker):
# Remove completed hook from tracking
self.background_processes.pop(hook_name, None)
def _snapshot_exceeded_hard_timeout(self) -> bool:
    """Return True once this snapshot has been running past the hard runtime cap."""
    from archivebox.config.constants import CONSTANTS

    started_at = getattr(self, 'snapshot_started_at', None)
    if not started_at:
        # No recorded start time yet, so there is nothing to time out against.
        return False
    elapsed_seconds = (timezone.now() - started_at).total_seconds()
    return elapsed_seconds > CONSTANTS.MAX_SNAPSHOT_RUNTIME_SECONDS
def _seal_snapshot_due_to_timeout(self) -> None:
    """Kill this snapshot's hooks, fail its pending results, and seal it.

    Invoked when the snapshot has exceeded the hard runtime limit; seals the
    row directly rather than going through the state machine.
    """
    from archivebox.core.models import ArchiveResult
    from archivebox.machine.models import Process

    sealed_at = timezone.now()

    # Step 1: hard-kill every hook process still attached to this snapshot.
    hook_procs = Process.objects.filter(
        archiveresult__snapshot=self.snapshot,
        process_type=Process.TypeChoices.HOOK,
        status=Process.StatusChoices.RUNNING,
    ).distinct()
    for hook_proc in hook_procs:
        try:
            hook_proc.kill_tree(graceful_timeout=0.0)
        except Exception:
            continue

    # Step 2: fail every archiveresult that never got to finish.
    pending_statuses = [ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED]
    self.snapshot.archiveresult_set.filter(status__in=pending_statuses).update(
        status=ArchiveResult.StatusChoices.FAILED,
        end_ts=sealed_at,
        retry_at=None,
        modified_at=sealed_at,
    )

    # Step 3: run cleanup, then persist the sealed state.
    self.snapshot.cleanup()
    self.snapshot.status = self.snapshot.StatusChoices.SEALED
    self.snapshot.retry_at = None
    self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
def on_shutdown(self, error: BaseException | None = None) -> None:
"""
Terminate all background Snapshot hooks when snapshot finishes.