move tests into subfolder, add missing install hooks

Nick Sweeting
2026-01-02 00:22:07 -08:00
parent c2afb40350
commit 65ee09ceab
80 changed files with 2659 additions and 859 deletions


@@ -76,11 +76,11 @@ class Orchestrator:
         self.idle_count: int = 0
         self._last_cleanup_time: float = 0.0  # For throttling cleanup_stale_running()
 
-        # CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker
-        # to keep execution strictly sequential and deterministic
+        # In foreground mode (exit_on_idle=True), limit workers but allow enough
+        # for crawl progression: 1 CrawlWorker + 1 SnapshotWorker + 1 ArchiveResultWorker
         if self.exit_on_idle:
             self.MAX_WORKERS_PER_TYPE = 1
-            self.MAX_TOTAL_WORKERS = 1
+            self.MAX_TOTAL_WORKERS = 3  # Allow one worker of each type to run concurrently
 
     def __repr__(self) -> str:
         return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
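
The new limits above mean foreground mode is no longer strictly single-worker: one worker of each of the three types named in the comment can run at once. A minimal sketch of how the two caps interact (the helper below is illustrative, not the actual Orchestrator API; the real checks live in should_spawn_worker() in the next hunk):

    # Illustrative only: mirrors the per-type and total caps from the diff above.
    MAX_WORKERS_PER_TYPE = 1
    MAX_TOTAL_WORKERS = 3

    def can_spawn(running_of_type: int, running_total: int) -> bool:
        # Hypothetical condensation of the spawn gating.
        return running_of_type < MAX_WORKERS_PER_TYPE and running_total < MAX_TOTAL_WORKERS

    assert can_spawn(0, 2)       # a third worker type can still start
    assert not can_spawn(1, 2)   # per-type cap: one of this type already running
    assert not can_spawn(0, 3)   # total cap of 3 reached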
@@ -157,32 +157,41 @@ class Orchestrator:
         self._last_cleanup_time = now
         return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)
 
+    def get_running_workers_for_type(self, WorkerClass: Type[Worker]) -> int:
+        """Get count of running workers for a specific worker type."""
+        return len(WorkerClass.get_running_workers())
+
     def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
         """Determine if we should spawn a new worker of the given type."""
         if queue_count == 0:
             return False
 
         # Check per-type limit
         running_workers = WorkerClass.get_running_workers()
-        if len(running_workers) >= self.MAX_WORKERS_PER_TYPE:
+        running_count = len(running_workers)
+        if running_count >= self.MAX_WORKERS_PER_TYPE:
             return False
 
         # Check total limit
-        if self.get_total_worker_count() >= self.MAX_TOTAL_WORKERS:
+        total_workers = self.get_total_worker_count()
+        if total_workers >= self.MAX_TOTAL_WORKERS:
             return False
 
         # Check if we already have enough workers for the queue size
         # Spawn more gradually - don't flood with workers
-        if len(running_workers) > 0 and queue_count <= len(running_workers) * WorkerClass.MAX_CONCURRENT_TASKS:
+        if running_count > 0 and queue_count <= running_count * WorkerClass.MAX_CONCURRENT_TASKS:
             return False
 
         return True
 
     def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
         """Spawn a new worker process. Returns PID or None if spawn failed."""
         try:
+            print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
             pid = WorkerClass.start(daemon=False, crawl_id=self.crawl_id)
+            print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
 
             # CRITICAL: Block until worker registers itself in Process table
             # This prevents race condition where orchestrator spawns multiple workers
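
The gradual-spawn check rewritten above only adds a worker once the queue outgrows what the running workers can already cover. A standalone sketch of that heuristic (the MAX_CONCURRENT_TASKS value of 4 is assumed for illustration only):

    MAX_CONCURRENT_TASKS = 4  # assumed per-worker capacity, for illustration

    def needs_more_workers(queue_count: int, running_count: int) -> bool:
        # Mirrors the heuristic in should_spawn_worker(): spawn the first worker
        # whenever there is work, then only spawn more once the queue exceeds
        # the combined capacity of the workers already running.
        if queue_count == 0:
            return False
        if running_count == 0:
            return True
        return queue_count > running_count * MAX_CONCURRENT_TASKS

    assert needs_more_workers(queue_count=1, running_count=0)      # first worker
    assert not needs_more_workers(queue_count=4, running_count=1)  # capacity covers queue
    assert needs_more_workers(queue_count=5, running_count=1)      # queue outgrew capacity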
@@ -202,6 +211,15 @@ class Orchestrator:
             # 3. RUNNING status
             # 4. Parent is this orchestrator
             # 5. Started recently (within last 10 seconds)
+            # Debug: Check all processes with this PID first
+            if elapsed < 0.5:
+                all_procs = list(Process.objects.filter(pid=pid))
+                print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
+                print(f'[yellow]  Found {len(all_procs)} Process records for pid={pid}[/yellow]')
+                for p in all_procs:
+                    print(f'[yellow]    -> type={p.process_type} status={p.status} parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]')
+
             worker_process = Process.objects.filter(
                 pid=pid,
                 process_type=Process.TypeChoices.WORKER,
@@ -212,6 +230,7 @@ class Orchestrator:
 
             if worker_process:
                 # Worker successfully registered!
+                print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]')
                 return pid
 
             time.sleep(poll_interval)
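
The "block until worker registers" logic across the two hunks above is a poll-with-timeout loop around a Process table query. A generic sketch of the same pattern (the check function, timeout, and interval names are assumptions, not the real query):

    import time

    def wait_until(check, timeout: float = 10.0, poll_interval: float = 0.1):
        # Poll check() until it returns a truthy value or the timeout elapses;
        # spawn_worker() does the same around Process.objects.filter(...).
        start = time.monotonic()
        while time.monotonic() - start < timeout:
            result = check()
            if result:
                return result
            time.sleep(poll_interval)
        return None  # caller treats None as a failed spawn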
@@ -244,7 +263,7 @@ class Orchestrator:
         Returns dict of queue sizes by worker type.
         """
         queue_sizes = {}
 
         for WorkerClass in self.WORKER_TYPES:
             # Get queue for this worker type
             # Need to instantiate worker to get queue (for model access)
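
This hunk cuts off before the loop body, but the comments suggest its shape: instantiate each worker class and read its queue size into the dict. A hedged sketch of that shape (get_queue() is an assumed accessor name, not confirmed by the diff):

    def get_queue_sizes(worker_types) -> dict[str, int]:
        # Hypothetical reconstruction of the truncated loop body, inferred
        # from the comments above; get_queue() is an assumed name.
        queue_sizes = {}
        for WorkerClass in worker_types:
            worker = WorkerClass()  # instantiate for model access
            queue_sizes[WorkerClass.name] = worker.get_queue().count()
        return queue_sizes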
@@ -392,11 +411,18 @@ class Orchestrator:
     def _run_orchestrator_loop(self, progress, task_ids):
         """Run the main orchestrator loop with optional progress display."""
+        last_queue_sizes = {}
+        last_snapshot_count = None
 
         try:
             while True:
                 # Check queues and spawn workers
                 queue_sizes = self.check_queues_and_spawn_workers()
 
+                # Debug queue sizes (only when changed)
+                if progress and queue_sizes != last_queue_sizes:
+                    progress.console.print(f'[yellow]DEBUG: Queue sizes: {queue_sizes}[/yellow]')
+                    last_queue_sizes = queue_sizes.copy()
+
                 # Update progress bars
                 if progress:
                     from archivebox.core.models import Snapshot
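
The only-when-changed guard keeps this debug output from repeating on every loop iteration; the same pattern is reused below for the snapshot count. Distilled into a reusable closure (illustrative, not part of the actual change):

    def make_change_logger(printer=print):
        # Remembers the last value and prints only when it differs, the same
        # shape as the queue-size and snapshot-count debug guards in this loop.
        last = None
        def log_if_changed(label: str, value):
            nonlocal last
            if value != last:
                printer(f'DEBUG: {label}: {value}')
                last = value
        return log_if_changed

    log = make_change_logger()
    log('Queue sizes', {'crawl': 2})   # prints
    log('Queue sizes', {'crawl': 2})   # silent: unchanged
    log('Queue sizes', {'crawl': 0})   # prints again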
@@ -412,6 +438,11 @@ class Orchestrator:
                     active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
 
+                    # Debug snapshot count (only when changed)
+                    if len(active_snapshots) != last_snapshot_count:
+                        progress.console.print(f'[yellow]DEBUG: Found {len(active_snapshots)} active snapshots (crawl_id={self.crawl_id})[/yellow]')
+                        last_snapshot_count = len(active_snapshots)
+
                     # Track which snapshots are still active
                     active_ids = set()
@@ -461,7 +492,9 @@ class Orchestrator:
                         del task_ids[snapshot_id]
 
                 # Track idle state
-                if self.has_pending_work(queue_sizes) or self.has_running_workers():
+                has_pending = self.has_pending_work(queue_sizes)
+                has_running = self.has_running_workers()
+                if has_pending or has_running:
                     self.idle_count = 0
                     self.on_tick(queue_sizes)
                 else:
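
The diff ends mid-branch at the else, but together with exit_on_idle the shape of the idle exit is clear enough to sketch. The threshold value and the break condition below are assumptions, since the actual else body is truncated here:

    def run_loop(ticks, exit_on_idle=True, idle_exit_threshold=3):
        # Hypothetical condensed tail of _run_orchestrator_loop(); each tick
        # is a (has_pending, has_running) pair as computed in the hunk above.
        idle_count = 0
        for has_pending, has_running in ticks:
            if has_pending or has_running:
                idle_count = 0
            else:
                idle_count += 1
                if exit_on_idle and idle_count >= idle_exit_threshold:
                    return 'exited on idle'  # assumed exit condition
        return 'still running'

    assert run_loop([(False, False)] * 3) == 'exited on idle'
    assert run_loop([(True, False), (False, False)]) == 'still running'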