cleanup archivebox tests

This commit is contained in:
Nick Sweeting
2026-03-15 22:09:56 -07:00
parent 9de084da65
commit 57e11879ec
23 changed files with 487 additions and 1495 deletions

View File

@@ -29,6 +29,7 @@ Usage:
__package__ = 'archivebox.workers'
import os
import sys
import time
from typing import Type
from datetime import datetime, timedelta
@@ -258,9 +259,7 @@ class Orchestrator:
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
"""Spawn a new worker process. Returns PID or None if spawn failed."""
try:
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id)
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
# CRITICAL: Block until worker registers itself in Process table
# This prevents race condition where orchestrator spawns multiple workers
@@ -281,17 +280,6 @@ class Orchestrator:
# 4. Parent is this orchestrator
# 5. Started recently (within last 10 seconds)
# Debug: Check all processes with this PID first
if elapsed < 0.5:
all_procs = list(Process.objects.filter(pid=pid))
print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]')
for p in all_procs:
print(
f'[yellow] -> type={p.process_type} status={p.status} '
f'parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]'
)
worker_process = Process.objects.filter(
pid=pid,
process_type=Process.TypeChoices.WORKER,
@@ -302,7 +290,6 @@ class Orchestrator:
if worker_process:
# Worker successfully registered!
print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]')
return pid
time.sleep(poll_interval)
@@ -653,14 +640,15 @@ class Orchestrator:
def runloop(self) -> None:
"""Main orchestrator loop."""
from rich.live import Live
from archivebox.misc.logging import IS_TTY
from archivebox.misc.progress_layout import ArchiveBoxProgressLayout
import sys
import os
is_tty = sys.stdout.isatty()
# Enable progress layout only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
plain_output = not IS_TTY
show_progress = is_tty and self.exit_on_idle
# When stdout is not a TTY, it may be reserved for JSONL pipeline output.
# Keep the plain progress view, but emit it to stderr instead of stdout.
plain_output = not is_tty
self.on_startup()
if not show_progress:
@@ -1241,7 +1229,7 @@ class Orchestrator:
ts = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
for panel, line in new_lines:
if line:
print(f"[{ts}] [{panel}] {line}")
print(f"[{ts}] [{panel}] {line}", file=sys.stderr)
last_plain_lines = set(plain_lines)
# Track idle state
@@ -1271,7 +1259,7 @@ class Orchestrator:
except KeyboardInterrupt:
if progress_layout:
progress_layout.log_event("Interrupted by user", style="red")
print() # Newline after ^C
print(file=sys.stderr) # Newline after ^C
self.on_shutdown(error=KeyboardInterrupt())
except BaseException as e:
if progress_layout:
@@ -1310,7 +1298,7 @@ class Orchestrator:
Used by commands like 'add' to ensure orchestrator is running.
"""
if cls.is_running():
print('[grey53]👨‍✈️ Orchestrator already running[/grey53]')
print('[grey53]👨‍✈️ Orchestrator already running[/grey53]', file=sys.stderr)
# Return a placeholder - actual orchestrator is in another process
return cls(exit_on_idle=exit_on_idle)

View File

@@ -1,484 +0,0 @@
"""
Unit tests for the Orchestrator and Worker classes.
Tests cover:
1. Orchestrator lifecycle (startup, shutdown)
2. Queue polling and worker spawning
3. Idle detection and exit logic
4. Worker registration and management
5. Process model methods (replacing old pid_utils)
"""
import os
import time
from datetime import datetime, timedelta
from unittest.mock import patch
from typing import ClassVar
import pytest
from django.test import TestCase
from django.utils import timezone
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import Worker
class FakeWorker(Worker):
    """Minimal Worker stand-in whose running-worker list is set directly by tests."""

    name: ClassVar[str] = 'crawl'
    MAX_CONCURRENT_TASKS: ClassVar[int] = 5

    # Tests assign this class attribute to simulate any number of live workers.
    running_workers: ClassVar[list[dict[str, object]]] = []

    @classmethod
    def get_running_workers(cls) -> list[dict[str, object]]:
        # Serve the shared class-level list instead of querying the Process table.
        return cls.running_workers
class TestOrchestratorUnit(TestCase):
    """Pure unit tests for the Orchestrator class; collaborators are mocked or faked."""

    def test_orchestrator_creation(self):
        """A fresh Orchestrator starts idle, with no pid file recorded."""
        orch = Orchestrator(exit_on_idle=True)
        self.assertTrue(orch.exit_on_idle)
        self.assertEqual(orch.idle_count, 0)
        self.assertIsNone(orch.pid_file)

    def test_orchestrator_repr(self):
        """repr() names the class and embeds the current process PID."""
        text = repr(Orchestrator())
        self.assertIn('Orchestrator', text)
        self.assertIn(str(os.getpid()), text)

    def test_has_pending_work(self):
        """has_pending_work is True iff at least one queue count is non-zero."""
        orch = Orchestrator()
        self.assertFalse(orch.has_pending_work({'crawl': 0, 'snapshot': 0}))
        self.assertTrue(orch.has_pending_work({'crawl': 0, 'snapshot': 5}))
        self.assertTrue(orch.has_pending_work({'crawl': 10, 'snapshot': 0}))

    def test_should_exit_not_exit_on_idle(self):
        """A long-running orchestrator (exit_on_idle=False) never asks to exit."""
        orch = Orchestrator(exit_on_idle=False)
        orch.idle_count = 100
        self.assertFalse(orch.should_exit({'crawl': 0}))

    def test_should_exit_pending_work(self):
        """Pending queue items keep the orchestrator alive regardless of idle count."""
        orch = Orchestrator(exit_on_idle=True)
        orch.idle_count = 100
        self.assertFalse(orch.should_exit({'crawl': 5}))

    @patch.object(Orchestrator, 'has_running_workers')
    def test_should_exit_running_workers(self, mock_running):
        """Live workers keep the orchestrator alive regardless of idle count."""
        mock_running.return_value = True
        orch = Orchestrator(exit_on_idle=True)
        orch.idle_count = 100
        self.assertFalse(orch.should_exit({'crawl': 0}))

    @patch.object(Orchestrator, 'has_running_workers')
    @patch.object(Orchestrator, 'has_future_work')
    def test_should_exit_idle_timeout(self, mock_future, mock_running):
        """With nothing queued, running, or scheduled, exit once IDLE_TIMEOUT is hit."""
        mock_running.return_value = False
        mock_future.return_value = False
        orch = Orchestrator(exit_on_idle=True)
        orch.idle_count = orch.IDLE_TIMEOUT
        self.assertTrue(orch.should_exit({'crawl': 0, 'snapshot': 0}))

    @patch.object(Orchestrator, 'has_running_workers')
    @patch.object(Orchestrator, 'has_future_work')
    def test_should_exit_below_idle_timeout(self, mock_future, mock_running):
        """One tick short of IDLE_TIMEOUT the orchestrator must stay alive."""
        mock_running.return_value = False
        mock_future.return_value = False
        orch = Orchestrator(exit_on_idle=True)
        orch.idle_count = orch.IDLE_TIMEOUT - 1
        self.assertFalse(orch.should_exit({'crawl': 0}))

    def test_should_spawn_worker_no_queue(self):
        """An empty queue never justifies spawning a worker."""
        orch = Orchestrator()
        FakeWorker.running_workers = []
        self.assertFalse(orch.should_spawn_worker(FakeWorker, 0))

    def test_should_spawn_worker_at_limit(self):
        """No spawn once the per-type worker limit is already reached."""
        orch = Orchestrator()
        FakeWorker.running_workers = [
            {'worker_id': worker_id} for worker_id in range(orch.MAX_CRAWL_WORKERS)
        ]
        self.assertFalse(orch.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_at_total_limit(self, mock_total):
        """No spawn when the per-type limit is hit, even with a low global count."""
        orch = Orchestrator()
        mock_total.return_value = 0
        FakeWorker.running_workers = [
            {'worker_id': worker_id} for worker_id in range(orch.MAX_CRAWL_WORKERS)
        ]
        self.assertFalse(orch.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_success(self, mock_total):
        """Spawn when there is queued work and no limit has been reached."""
        orch = Orchestrator()
        mock_total.return_value = 0
        FakeWorker.running_workers = []
        self.assertTrue(orch.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_enough_workers(self, mock_total):
        """No spawn when existing workers already cover the queue depth."""
        orch = Orchestrator()
        mock_total.return_value = 2
        FakeWorker.running_workers = [{}]  # one worker already running
        self.assertFalse(orch.should_spawn_worker(FakeWorker, 3))
class TestOrchestratorWithProcess(TestCase):
    """Test Orchestrator using the Process model for liveness tracking."""

    def setUp(self):
        """Reset process cache."""
        # Machine/Process lookups are cached at module level; clear both so
        # every test observes a fresh database state.
        import archivebox.machine.models as models
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None

    def test_is_running_no_orchestrator(self):
        """is_running should return False when no orchestrator process exists."""
        from archivebox.machine.models import Process
        # Clean up any stale processes first
        Process.cleanup_stale_running()
        # Mark any running orchestrators as exited for clean test state
        Process.objects.filter(
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING
        ).update(status=Process.StatusChoices.EXITED)
        self.assertFalse(Orchestrator.is_running())

    def test_is_running_with_orchestrator_process(self):
        """is_running should return True when an orchestrator Process exists."""
        from archivebox.machine.models import Process, Machine
        import psutil
        machine = Machine.current()
        current_proc = psutil.Process(os.getpid())
        # Create an orchestrator Process record mirroring the real test process
        # so the liveness check sees a matching PID + start time.
        proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
            pid=os.getpid(),  # Use current PID so it appears alive
            started_at=datetime.fromtimestamp(current_proc.create_time(), tz=timezone.get_current_timezone()),
            cmd=current_proc.cmdline(),
        )
        try:
            # Should detect running orchestrator
            self.assertTrue(Orchestrator.is_running())
        finally:
            # Clean up so later tests don't see a phantom orchestrator
            proc.status = Process.StatusChoices.EXITED
            proc.save()

    def test_orchestrator_uses_process_for_is_running(self):
        """Orchestrator.is_running should use Process.get_running_count."""
        from archivebox.machine.models import Process
        # Verify is_running uses the Process model, not pid files
        with patch.object(Process, 'get_running_count') as mock_count:
            mock_count.return_value = 1
            result = Orchestrator.is_running()
            # Should have called Process.get_running_count with orchestrator type
            mock_count.assert_called()
            self.assertTrue(result)

    def test_orchestrator_scoped_worker_count(self):
        """Orchestrator with crawl_id should count only descendant workers."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        orchestrator = Orchestrator(exit_on_idle=True, crawl_id='test-crawl')
        orchestrator.db_process = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
            pid=12345,
            started_at=timezone.now(),
        )
        # Prevent cleanup from marking fake PIDs as exited
        orchestrator._last_cleanup_time = time.time()
        # Worker parented to this orchestrator — should be counted.
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            worker_type='crawl',
            status=Process.StatusChoices.RUNNING,
            pid=12346,
            parent=orchestrator.db_process,
            started_at=timezone.now(),
        )
        # Unparented worker — must be excluded from the scoped count.
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            worker_type='crawl',
            status=Process.StatusChoices.RUNNING,
            pid=12347,
            started_at=timezone.now(),
        )
        self.assertEqual(orchestrator.get_total_worker_count(), 1)
class TestProcessBasedWorkerTracking(TestCase):
    """Test Process model methods that replace the old pid_utils functionality."""

    def setUp(self):
        """Reset caches."""
        # Clear module-level Machine/Process caches between tests.
        import archivebox.machine.models as models
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None

    def test_process_current_creates_record(self):
        """Process.current() should create a Process record for the current PID."""
        from archivebox.machine.models import Process
        proc = Process.current()
        self.assertIsNotNone(proc)
        self.assertEqual(proc.pid, os.getpid())
        self.assertEqual(proc.status, Process.StatusChoices.RUNNING)
        self.assertIsNotNone(proc.machine)
        self.assertIsNotNone(proc.started_at)

    def test_process_current_caches_result(self):
        """Process.current() should return the cached Process within its interval."""
        from archivebox.machine.models import Process
        proc1 = Process.current()
        proc2 = Process.current()
        # Same DB row both times — no duplicate record was created.
        self.assertEqual(proc1.id, proc2.id)

    def test_process_get_running_count(self):
        """Process.get_running_count should count running processes by type."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        # Create some worker processes
        for i in range(3):
            Process.objects.create(
                machine=machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99990 + i,  # Fake PIDs
                started_at=timezone.now(),
            )
        count = Process.get_running_count(process_type=Process.TypeChoices.WORKER)
        # >= rather than == because other records may also exist in the test DB.
        self.assertGreaterEqual(count, 3)

    def test_process_get_next_worker_id(self):
        """Process.get_next_worker_id should return the count of running workers."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        # Create 2 worker processes
        for i in range(2):
            Process.objects.create(
                machine=machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99980 + i,
                started_at=timezone.now(),
            )
        next_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
        self.assertGreaterEqual(next_id, 2)

    def test_process_cleanup_stale_running(self):
        """Process.cleanup_stale_running should mark stale processes as exited."""
        from archivebox.machine.models import Process, Machine, PID_REUSE_WINDOW
        machine = Machine.current()
        # Create a stale process: started before the PID-reuse window AND with a
        # PID that no live OS process owns, so cleanup must flag it.
        stale_proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=999999,  # Fake PID that doesn't exist
            started_at=timezone.now() - PID_REUSE_WINDOW - timedelta(hours=1),
        )
        cleaned = Process.cleanup_stale_running()
        self.assertGreaterEqual(cleaned, 1)
        stale_proc.refresh_from_db()
        self.assertEqual(stale_proc.status, Process.StatusChoices.EXITED)

    def test_process_get_running(self):
        """Process.get_running should return a queryset of running processes."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        # Create a running process
        proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=99970,
            started_at=timezone.now(),
        )
        running = Process.get_running(process_type=Process.TypeChoices.HOOK)
        self.assertIn(proc, running)

    def test_process_type_detection(self):
        """Process._detect_process_type should detect the process type from argv."""
        from archivebox.machine.models import Process
        # Test detection logic for each recognised argv shape.
        with patch('sys.argv', ['archivebox', 'manage', 'orchestrator']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
        with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.CLI)
        with patch('sys.argv', ['supervisord', '-c', 'config.ini']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.SUPERVISORD)
class TestProcessLifecycle(TestCase):
    """Test Process model lifecycle methods (is_running / poll / terminate / tree)."""

    def setUp(self):
        """Reset caches and create a machine."""
        import archivebox.machine.models as models
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None
        self.machine = models.Machine.current()

    def test_process_is_running_property(self):
        """Process.is_running should check the actual OS process."""
        from archivebox.machine.models import Process
        proc = Process.current()
        # Should be running (current process exists)
        self.assertTrue(proc.is_running)
        # Create a process with fake PID
        fake_proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )
        # Should not be running (PID doesn't exist)
        self.assertFalse(fake_proc.is_running)

    def test_process_poll(self):
        """Process.poll should check and update exit status."""
        from archivebox.machine.models import Process
        # Create a process with fake PID (already exited)
        proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )
        # NOTE(review): poll() presumably mirrors subprocess.Popen.poll — confirm
        # the exact exit-code semantics against the Process model.
        exit_code = proc.poll()
        # Should have detected exit and updated status
        self.assertIsNotNone(exit_code)
        proc.refresh_from_db()
        self.assertEqual(proc.status, Process.StatusChoices.EXITED)

    def test_process_terminate_already_dead(self):
        """Process.terminate should handle already-dead processes."""
        from archivebox.machine.models import Process
        # Create a process with fake PID
        proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )
        result = proc.terminate()
        # Should return False (was already dead)
        self.assertFalse(result)
        proc.refresh_from_db()
        self.assertEqual(proc.status, Process.StatusChoices.EXITED)

    def test_process_tree_traversal(self):
        """Process parent/children relationships should work."""
        from archivebox.machine.models import Process
        # Create parent process
        parent = Process.objects.create(
            machine=self.machine,
            process_type=Process.TypeChoices.CLI,
            status=Process.StatusChoices.RUNNING,
            pid=1,
            started_at=timezone.now(),
        )
        # Create child process
        child = Process.objects.create(
            machine=self.machine,
            parent=parent,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=2,
            started_at=timezone.now(),
        )
        # Test relationships: parent/children FK, root ancestor, and depth levels.
        self.assertEqual(child.parent, parent)
        self.assertIn(child, parent.children.all())
        self.assertEqual(child.root, parent)
        self.assertEqual(child.depth, 1)
        self.assertEqual(parent.depth, 0)
if __name__ == '__main__':
    # Allow running this test module directly (outside the Django test runner).
    pytest.main([__file__, '-v'])

View File

@@ -1,84 +0,0 @@
from datetime import timedelta
from typing import cast
from unittest.mock import patch
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.test import TestCase
from django.utils import timezone
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import CrawlWorker
class TestScheduledCrawlMaterialization(TestCase):
    """Tests for how the Orchestrator turns due CrawlSchedules into queued Crawls."""

    def setUp(self):
        # cast() only narrows the manager type for the type checker.
        user_manager = cast(UserManager, get_user_model().objects)
        self.user = user_manager.create_user(
            username='schedule-user',
            password='password',
        )

    def _create_due_schedule(self) -> CrawlSchedule:
        """Create a daily CrawlSchedule whose template crawl is backdated 2 days (due)."""
        template = Crawl.objects.create(
            urls='https://example.com/feed.xml',
            max_depth=1,
            tags_str='scheduled',
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
        )
        schedule = CrawlSchedule.objects.create(
            template=template,
            schedule='daily',
            is_enabled=True,
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
        )
        # Backdate via queryset update() so auto-managed timestamp fields aren't
        # overwritten by a model save().
        past = timezone.now() - timedelta(days=2)
        Crawl.objects.filter(pk=template.pk).update(created_at=past, modified_at=past)
        template.refresh_from_db()
        schedule.refresh_from_db()
        return schedule

    def test_global_orchestrator_materializes_due_schedule(self):
        schedule = self._create_due_schedule()
        orchestrator = Orchestrator(exit_on_idle=False)
        orchestrator._materialize_due_schedules()
        # Two crawls now linked to the schedule (presumably template + new copy).
        scheduled_crawls = Crawl.objects.filter(schedule=schedule).order_by('created_at')
        self.assertEqual(scheduled_crawls.count(), 2)
        queued_crawl = scheduled_crawls.last()
        self.assertIsNotNone(queued_crawl)
        assert queued_crawl is not None  # narrow Optional for the type checker
        # The newest crawl is queued and copies the template's settings verbatim.
        self.assertEqual(queued_crawl.status, Crawl.StatusChoices.QUEUED)
        self.assertEqual(queued_crawl.urls, 'https://example.com/feed.xml')
        self.assertEqual(queued_crawl.max_depth, 1)
        self.assertEqual(queued_crawl.tags_str, 'scheduled')

    def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
        schedule = self._create_due_schedule()
        # One-shot (exit_on_idle) orchestrators must not materialize schedules.
        Orchestrator(exit_on_idle=True)._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
        # Crawl-scoped orchestrators must not materialize schedules either.
        Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template.id))._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

    @patch.object(CrawlWorker, 'start')
    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
        schedule = self._create_due_schedule()
        orchestrator = Orchestrator(exit_on_idle=False)
        with patch.object(orchestrator, '_claim_crawl', return_value=True):
            queue_sizes = orchestrator.check_queues_and_spawn_workers()
        # The materialized crawl appears in the queue counts...
        self.assertEqual(queue_sizes['crawl'], 1)
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 2)
        # ...but no worker is started within the same tick.
        mock_start.assert_not_called()

View File

@@ -1,76 +0,0 @@
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
from unittest.mock import patch
from django.test import SimpleTestCase
from archivebox.workers.worker import SnapshotWorker
class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
    """Behavior of SnapshotWorker._retry_failed_empty_foreground_hooks with stubbed hooks."""

    def _make_worker(self):
        """Build a SnapshotWorker without running __init__, all collaborators stubbed."""
        w = SnapshotWorker.__new__(SnapshotWorker)
        w.pid = 12345
        w._snapshot_exceeded_hard_timeout = lambda: False
        w._seal_snapshot_due_to_timeout = lambda: None
        w._run_hook = lambda *args, **kwargs: SimpleNamespace()
        w._wait_for_hook = lambda process, ar: None
        cast(Any, w).snapshot = SimpleNamespace(
            status='started',
            refresh_from_db=lambda: None,
        )
        return w

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_skips_successful_hook_with_only_inline_output(self, mock_log):
        """A succeeded hook whose only output is inline text must not be replayed."""
        worker = self._make_worker()
        ar = SimpleNamespace(
            status='succeeded',
            output_files={},
            output_str='scrolled 600px',
            output_json=None,
            refresh_from_db=lambda: None,
        )
        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__45_infiniscroll.js'), ar)],
            config={},
        )
        mock_log.assert_not_called()

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_replays_failed_hook_with_no_outputs(self, mock_log):
        """A failed hook with no output at all is replayed exactly once."""
        worker = self._make_worker()
        replayed = []
        completed = []

        def fake_run_hook(*args, **kwargs):
            replayed.append((args, kwargs))
            return SimpleNamespace()

        def fake_wait_for_hook(process, ar):
            completed.append((process, ar))
            # Simulate the retry succeeding and producing a real output file.
            ar.status = 'succeeded'
            ar.output_files = {'singlefile.html': {}}

        result = SimpleNamespace(
            status='failed',
            output_files={},
            output_str='',
            output_json=None,
            refresh_from_db=lambda: None,
        )
        worker._run_hook = fake_run_hook
        worker._wait_for_hook = fake_wait_for_hook
        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__50_singlefile.py'), result)],
            config={},
        )
        assert len(replayed) == 1
        assert len(completed) == 1
        mock_log.assert_called_once()