mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
cleanup archivebox tests
This commit is contained in:
@@ -29,6 +29,7 @@ Usage:
|
||||
__package__ = 'archivebox.workers'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import Type
|
||||
from datetime import datetime, timedelta
|
||||
@@ -258,9 +259,7 @@ class Orchestrator:
|
||||
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
|
||||
"""Spawn a new worker process. Returns PID or None if spawn failed."""
|
||||
try:
|
||||
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
|
||||
pid = WorkerClass.start(parent=self.db_process, crawl_id=self.crawl_id)
|
||||
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
|
||||
|
||||
# CRITICAL: Block until worker registers itself in Process table
|
||||
# This prevents race condition where orchestrator spawns multiple workers
|
||||
@@ -281,17 +280,6 @@ class Orchestrator:
|
||||
# 4. Parent is this orchestrator
|
||||
# 5. Started recently (within last 10 seconds)
|
||||
|
||||
# Debug: Check all processes with this PID first
|
||||
if elapsed < 0.5:
|
||||
all_procs = list(Process.objects.filter(pid=pid))
|
||||
print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
|
||||
print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]')
|
||||
for p in all_procs:
|
||||
print(
|
||||
f'[yellow] -> type={p.process_type} status={p.status} '
|
||||
f'parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]'
|
||||
)
|
||||
|
||||
worker_process = Process.objects.filter(
|
||||
pid=pid,
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
@@ -302,7 +290,6 @@ class Orchestrator:
|
||||
|
||||
if worker_process:
|
||||
# Worker successfully registered!
|
||||
print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]')
|
||||
return pid
|
||||
|
||||
time.sleep(poll_interval)
|
||||
@@ -653,14 +640,15 @@ class Orchestrator:
|
||||
def runloop(self) -> None:
|
||||
"""Main orchestrator loop."""
|
||||
from rich.live import Live
|
||||
from archivebox.misc.logging import IS_TTY
|
||||
from archivebox.misc.progress_layout import ArchiveBoxProgressLayout
|
||||
import sys
|
||||
import os
|
||||
|
||||
is_tty = sys.stdout.isatty()
|
||||
# Enable progress layout only in TTY + foreground mode
|
||||
show_progress = IS_TTY and self.exit_on_idle
|
||||
plain_output = not IS_TTY
|
||||
show_progress = is_tty and self.exit_on_idle
|
||||
# When stdout is not a TTY, it may be reserved for JSONL pipeline output.
|
||||
# Keep the plain progress view, but emit it to stderr instead of stdout.
|
||||
plain_output = not is_tty
|
||||
self.on_startup()
|
||||
|
||||
if not show_progress:
|
||||
@@ -1241,7 +1229,7 @@ class Orchestrator:
|
||||
ts = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
for panel, line in new_lines:
|
||||
if line:
|
||||
print(f"[{ts}] [{panel}] {line}")
|
||||
print(f"[{ts}] [{panel}] {line}", file=sys.stderr)
|
||||
last_plain_lines = set(plain_lines)
|
||||
|
||||
# Track idle state
|
||||
@@ -1271,7 +1259,7 @@ class Orchestrator:
|
||||
except KeyboardInterrupt:
|
||||
if progress_layout:
|
||||
progress_layout.log_event("Interrupted by user", style="red")
|
||||
print() # Newline after ^C
|
||||
print(file=sys.stderr) # Newline after ^C
|
||||
self.on_shutdown(error=KeyboardInterrupt())
|
||||
except BaseException as e:
|
||||
if progress_layout:
|
||||
@@ -1310,7 +1298,7 @@ class Orchestrator:
|
||||
Used by commands like 'add' to ensure orchestrator is running.
|
||||
"""
|
||||
if cls.is_running():
|
||||
print('[grey53]👨✈️ Orchestrator already running[/grey53]')
|
||||
print('[grey53]👨✈️ Orchestrator already running[/grey53]', file=sys.stderr)
|
||||
# Return a placeholder - actual orchestrator is in another process
|
||||
return cls(exit_on_idle=exit_on_idle)
|
||||
|
||||
|
||||
@@ -1,484 +0,0 @@
|
||||
"""
|
||||
Unit tests for the Orchestrator and Worker classes.
|
||||
|
||||
Tests cover:
|
||||
1. Orchestrator lifecycle (startup, shutdown)
|
||||
2. Queue polling and worker spawning
|
||||
3. Idle detection and exit logic
|
||||
4. Worker registration and management
|
||||
5. Process model methods (replacing old pid_utils)
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from unittest.mock import patch
|
||||
from typing import ClassVar
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.workers.worker import Worker
|
||||
|
||||
|
||||
class FakeWorker(Worker):
    """Worker stand-in whose list of running workers is assigned directly by tests."""

    # Tests overwrite this list to simulate an arbitrary number of live workers.
    running_workers: ClassVar[list[dict[str, object]]] = []
    MAX_CONCURRENT_TASKS: ClassVar[int] = 5
    name: ClassVar[str] = 'crawl'

    @classmethod
    def get_running_workers(cls) -> list[dict[str, object]]:
        """Return whatever list is currently stored in ``running_workers``."""
        simulated = cls.running_workers
        return simulated
|
||||
|
||||
|
||||
class TestOrchestratorUnit(TestCase):
    """Unit tests for Orchestrator decision logic (mocked dependencies).

    ``FakeWorker.running_workers`` is class-level state shared by every test,
    so it is reset before each test and again on cleanup; otherwise a value
    assigned by one test would leak into whichever test runs next.
    """

    def setUp(self):
        """Start each test with no simulated running workers."""
        super().setUp()
        FakeWorker.running_workers = []
        # Restore the empty pool even when the test body fails part-way through.
        self.addCleanup(setattr, FakeWorker, 'running_workers', [])

    def test_orchestrator_creation(self):
        """Orchestrator should initialize with correct defaults."""
        orchestrator = Orchestrator(exit_on_idle=True)

        self.assertTrue(orchestrator.exit_on_idle)
        self.assertEqual(orchestrator.idle_count, 0)
        self.assertIsNone(orchestrator.pid_file)

    def test_orchestrator_repr(self):
        """Orchestrator __repr__ should include the class name and the current PID."""
        orchestrator = Orchestrator()
        repr_str = repr(orchestrator)

        self.assertIn('Orchestrator', repr_str)
        self.assertIn(str(os.getpid()), repr_str)

    def test_has_pending_work(self):
        """has_pending_work should be True as soon as any queue size is non-zero."""
        orchestrator = Orchestrator()

        self.assertFalse(orchestrator.has_pending_work({'crawl': 0, 'snapshot': 0}))
        self.assertTrue(orchestrator.has_pending_work({'crawl': 0, 'snapshot': 5}))
        self.assertTrue(orchestrator.has_pending_work({'crawl': 10, 'snapshot': 0}))

    def test_should_exit_not_exit_on_idle(self):
        """should_exit should return False when exit_on_idle is False."""
        orchestrator = Orchestrator(exit_on_idle=False)
        # A large idle count must not matter when exit_on_idle is disabled.
        orchestrator.idle_count = 100

        self.assertFalse(orchestrator.should_exit({'crawl': 0}))

    def test_should_exit_pending_work(self):
        """should_exit should return False when there's pending work."""
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = 100

        self.assertFalse(orchestrator.should_exit({'crawl': 5}))

    @patch.object(Orchestrator, 'has_running_workers')
    def test_should_exit_running_workers(self, mock_has_workers):
        """should_exit should return False when workers are running."""
        mock_has_workers.return_value = True
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = 100

        self.assertFalse(orchestrator.should_exit({'crawl': 0}))

    @patch.object(Orchestrator, 'has_running_workers')
    @patch.object(Orchestrator, 'has_future_work')
    def test_should_exit_idle_timeout(self, mock_future, mock_workers):
        """should_exit should return True after idle timeout with no work."""
        mock_workers.return_value = False
        mock_future.return_value = False

        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = orchestrator.IDLE_TIMEOUT

        self.assertTrue(orchestrator.should_exit({'crawl': 0, 'snapshot': 0}))

    @patch.object(Orchestrator, 'has_running_workers')
    @patch.object(Orchestrator, 'has_future_work')
    def test_should_exit_below_idle_timeout(self, mock_future, mock_workers):
        """should_exit should return False below idle timeout."""
        mock_workers.return_value = False
        mock_future.return_value = False

        orchestrator = Orchestrator(exit_on_idle=True)
        # One tick short of the threshold.
        orchestrator.idle_count = orchestrator.IDLE_TIMEOUT - 1

        self.assertFalse(orchestrator.should_exit({'crawl': 0}))

    def test_should_spawn_worker_no_queue(self):
        """should_spawn_worker should return False when queue is empty."""
        orchestrator = Orchestrator()

        FakeWorker.running_workers = []
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 0))

    def test_should_spawn_worker_at_limit(self):
        """should_spawn_worker should return False when at per-type limit."""
        orchestrator = Orchestrator()

        # Simulate exactly MAX_CRAWL_WORKERS workers already running.
        running_workers: list[dict[str, object]] = [
            {'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)
        ]
        FakeWorker.running_workers = running_workers
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_at_total_limit(self, mock_total):
        """should_spawn_worker should return False when at total limit."""
        orchestrator = Orchestrator()
        # NOTE(review): the total count is mocked to 0 while the per-type pool
        # is filled to MAX_CRAWL_WORKERS, so this looks like it exercises the
        # per-type limit rather than the total limit -- confirm intent.
        mock_total.return_value = 0
        running_workers: list[dict[str, object]] = [
            {'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)
        ]
        FakeWorker.running_workers = running_workers
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_success(self, mock_total):
        """should_spawn_worker should return True when conditions are met."""
        orchestrator = Orchestrator()
        mock_total.return_value = 0

        FakeWorker.running_workers = []
        self.assertTrue(orchestrator.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_enough_workers(self, mock_total):
        """should_spawn_worker should return False when enough workers for queue."""
        orchestrator = Orchestrator()
        mock_total.return_value = 2

        FakeWorker.running_workers = [{}]  # 1 worker running
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 3))
|
||||
|
||||
|
||||
class TestOrchestratorWithProcess(TestCase):
    """Exercise the Orchestrator against real Process rows in the test database."""

    def setUp(self):
        """Clear the module-level machine/process caches before every test."""
        from archivebox.machine import models as machine_models

        machine_models._CURRENT_MACHINE = None
        machine_models._CURRENT_PROCESS = None

    def test_is_running_no_orchestrator(self):
        """is_running is False once no orchestrator row is marked RUNNING."""
        from archivebox.machine.models import Process

        # Let the model clear out stale rows first.
        Process.cleanup_stale_running()

        # Then force any remaining running orchestrator rows to EXITED.
        still_running = Process.objects.filter(
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
        )
        still_running.update(status=Process.StatusChoices.EXITED)

        self.assertFalse(Orchestrator.is_running())

    def test_is_running_with_orchestrator_process(self):
        """is_running is True while a RUNNING orchestrator row points at a live PID."""
        from archivebox.machine.models import Process, Machine
        import psutil

        this_machine = Machine.current()
        own_pid = os.getpid()
        os_proc = psutil.Process(own_pid)

        # Reuse this test process's PID, start time and cmdline so the row looks alive.
        record = Process.objects.create(
            machine=this_machine,
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
            pid=own_pid,
            started_at=datetime.fromtimestamp(os_proc.create_time(), tz=timezone.get_current_timezone()),
            cmd=os_proc.cmdline(),
        )

        try:
            self.assertTrue(Orchestrator.is_running())
        finally:
            # Mark the row as exited again regardless of the assertion outcome.
            record.status = Process.StatusChoices.EXITED
            record.save()

    def test_orchestrator_uses_process_for_is_running(self):
        """is_running consults Process.get_running_count."""
        from archivebox.machine.models import Process

        with patch.object(Process, 'get_running_count', return_value=1) as count_mock:
            is_up = Orchestrator.is_running()

            # The patched counter must have been consulted and its value honoured.
            count_mock.assert_called()
            self.assertTrue(is_up)

    def test_orchestrator_scoped_worker_count(self):
        """With a crawl_id set, only workers parented to this orchestrator are counted."""
        from archivebox.machine.models import Process, Machine

        this_machine = Machine.current()
        scoped = Orchestrator(exit_on_idle=True, crawl_id='test-crawl')

        scoped.db_process = Process.objects.create(
            machine=this_machine,
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
            pid=12345,
            started_at=timezone.now(),
        )

        # Pretend cleanup just ran so the fake PIDs are not marked as exited.
        scoped._last_cleanup_time = time.time()

        crawl_worker_fields = {
            'machine': this_machine,
            'process_type': Process.TypeChoices.WORKER,
            'worker_type': 'crawl',
            'status': Process.StatusChoices.RUNNING,
        }

        # A worker that belongs to this orchestrator ...
        Process.objects.create(
            pid=12346,
            parent=scoped.db_process,
            started_at=timezone.now(),
            **crawl_worker_fields,
        )
        # ... and one with no parent, which must not be counted.
        Process.objects.create(
            pid=12347,
            started_at=timezone.now(),
            **crawl_worker_fields,
        )

        self.assertEqual(scoped.get_total_worker_count(), 1)
|
||||
|
||||
|
||||
class TestProcessBasedWorkerTracking(TestCase):
    """Cover the Process model class methods used in place of the old pid_utils."""

    def setUp(self):
        """Clear the module-level machine/process caches before every test."""
        from archivebox.machine import models as machine_models

        machine_models._CURRENT_MACHINE = None
        machine_models._CURRENT_PROCESS = None

    def test_process_current_creates_record(self):
        """Process.current() yields a RUNNING record for this PID with machine and start time set."""
        from archivebox.machine.models import Process

        current = Process.current()

        self.assertIsNotNone(current)
        self.assertEqual(current.pid, os.getpid())
        self.assertEqual(current.status, Process.StatusChoices.RUNNING)
        self.assertIsNotNone(current.machine)
        self.assertIsNotNone(current.started_at)

    def test_process_current_caches_result(self):
        """Two back-to-back Process.current() calls return the same record id."""
        from archivebox.machine.models import Process

        first = Process.current()
        second = Process.current()

        self.assertEqual(first.id, second.id)

    def test_process_get_running_count(self):
        """get_running_count reports at least the three RUNNING workers created here."""
        from archivebox.machine.models import Process, Machine

        this_machine = Machine.current()

        for offset in range(3):
            Process.objects.create(
                machine=this_machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99990 + offset,  # fake PIDs
                started_at=timezone.now(),
            )

        running_total = Process.get_running_count(process_type=Process.TypeChoices.WORKER)
        self.assertGreaterEqual(running_total, 3)

    def test_process_get_next_worker_id(self):
        """get_next_worker_id is at least 2 after two RUNNING workers are created."""
        from archivebox.machine.models import Process, Machine

        this_machine = Machine.current()

        for offset in range(2):
            Process.objects.create(
                machine=this_machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99980 + offset,
                started_at=timezone.now(),
            )

        upcoming_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
        self.assertGreaterEqual(upcoming_id, 2)

    def test_process_cleanup_stale_running(self):
        """cleanup_stale_running flips an old RUNNING row with a fake PID to EXITED."""
        from archivebox.machine.models import Process, Machine, PID_REUSE_WINDOW

        this_machine = Machine.current()

        # Started one hour earlier than the PID reuse window reaches back.
        long_ago = timezone.now() - PID_REUSE_WINDOW - timedelta(hours=1)
        stale = Process.objects.create(
            machine=this_machine,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=999999,  # fake PID
            started_at=long_ago,
        )

        cleaned_count = Process.cleanup_stale_running()
        self.assertGreaterEqual(cleaned_count, 1)

        stale.refresh_from_db()
        self.assertEqual(stale.status, Process.StatusChoices.EXITED)

    def test_process_get_running(self):
        """get_running(process_type=HOOK) includes a freshly created RUNNING hook row."""
        from archivebox.machine.models import Process, Machine

        this_machine = Machine.current()

        hook_record = Process.objects.create(
            machine=this_machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=99970,
            started_at=timezone.now(),
        )

        running_hooks = Process.get_running(process_type=Process.TypeChoices.HOOK)

        self.assertIn(hook_record, running_hooks)

    def test_process_type_detection(self):
        """_detect_process_type maps representative sys.argv values to process types."""
        from archivebox.machine.models import Process

        argv_cases = (
            (['archivebox', 'manage', 'orchestrator'], Process.TypeChoices.ORCHESTRATOR),
            (['archivebox', 'add', 'http://example.com'], Process.TypeChoices.CLI),
            (['supervisord', '-c', 'config.ini'], Process.TypeChoices.SUPERVISORD),
        )

        for fake_argv, expected_type in argv_cases:
            with patch('sys.argv', fake_argv):
                detected = Process._detect_process_type()
                self.assertEqual(detected, expected_type)
|
||||
|
||||
|
||||
class TestProcessLifecycle(TestCase):
    """Cover Process liveness checks, polling, termination and parent/child links."""

    def setUp(self):
        """Clear the module-level caches, then look up the current machine."""
        from archivebox.machine import models as machine_models

        machine_models._CURRENT_MACHINE = None
        machine_models._CURRENT_PROCESS = None
        self.machine = machine_models.Machine.current()

    def _create_fake_pid_process(self):
        """Create a RUNNING Process row whose PID (999999) is a made-up value."""
        from archivebox.machine.models import Process

        return Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )

    def test_process_is_running_property(self):
        """is_running is True for the current process and False for a fake PID."""
        from archivebox.machine.models import Process

        live = Process.current()
        self.assertTrue(live.is_running)

        ghost = self._create_fake_pid_process()
        self.assertFalse(ghost.is_running)

    def test_process_poll(self):
        """poll() on a fake-PID row returns a non-None exit code and stores EXITED."""
        from archivebox.machine.models import Process

        ghost = self._create_fake_pid_process()

        returned_code = ghost.poll()
        self.assertIsNotNone(returned_code)

        ghost.refresh_from_db()
        self.assertEqual(ghost.status, Process.StatusChoices.EXITED)

    def test_process_terminate_already_dead(self):
        """terminate() on a fake-PID row returns a falsy value and stores EXITED."""
        from archivebox.machine.models import Process

        ghost = self._create_fake_pid_process()

        outcome = ghost.terminate()
        self.assertFalse(outcome)

        ghost.refresh_from_db()
        self.assertEqual(ghost.status, Process.StatusChoices.EXITED)

    def test_process_tree_traversal(self):
        """parent, children, root and depth agree for a two-level process tree."""
        from archivebox.machine.models import Process

        cli_parent = Process.objects.create(
            machine=self.machine,
            process_type=Process.TypeChoices.CLI,
            status=Process.StatusChoices.RUNNING,
            pid=1,
            started_at=timezone.now(),
        )
        worker_child = Process.objects.create(
            machine=self.machine,
            parent=cli_parent,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=2,
            started_at=timezone.now(),
        )

        # Upward and downward links.
        self.assertEqual(worker_child.parent, cli_parent)
        self.assertIn(worker_child, cli_parent.children.all())
        # Derived tree properties.
        self.assertEqual(worker_child.root, cli_parent)
        self.assertEqual(worker_child.depth, 1)
        self.assertEqual(cli_parent.depth, 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # When executed directly, hand this file to pytest in verbose mode.
    pytest_args = [__file__, '-v']
    pytest.main(pytest_args)
|
||||
@@ -1,84 +0,0 @@
|
||||
from datetime import timedelta
|
||||
from typing import cast
|
||||
from unittest.mock import patch
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.contrib.auth.models import UserManager
|
||||
from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.workers.worker import CrawlWorker
|
||||
|
||||
|
||||
class TestScheduledCrawlMaterialization(TestCase):
    """Check when the Orchestrator turns a CrawlSchedule into a new queued Crawl."""

    def setUp(self):
        """Create the user that owns the crawls and schedules built by these tests."""
        users = cast(UserManager, get_user_model().objects)
        self.user = users.create_user(username='schedule-user', password='password')

    def _create_due_schedule(self) -> CrawlSchedule:
        """Build a daily schedule whose sealed template crawl is back-dated by two days."""
        shared_meta = {
            'label': 'Scheduled Feed',
            'notes': 'template',
            'created_by': self.user,
        }
        template_crawl = Crawl.objects.create(
            urls='https://example.com/feed.xml',
            max_depth=1,
            tags_str='scheduled',
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
            **shared_meta,
        )
        due_schedule = CrawlSchedule.objects.create(
            template=template_crawl,
            schedule='daily',
            is_enabled=True,
            **shared_meta,
        )

        # Back-date the template with a queryset update, then reload both rows.
        two_days_ago = timezone.now() - timedelta(days=2)
        Crawl.objects.filter(pk=template_crawl.pk).update(
            created_at=two_days_ago,
            modified_at=two_days_ago,
        )
        template_crawl.refresh_from_db()
        due_schedule.refresh_from_db()
        return due_schedule

    def test_global_orchestrator_materializes_due_schedule(self):
        """An exit_on_idle=False orchestrator adds a second, QUEUED crawl copying the template fields."""
        due_schedule = self._create_due_schedule()

        Orchestrator(exit_on_idle=False)._materialize_due_schedules()

        crawls_for_schedule = Crawl.objects.filter(schedule=due_schedule).order_by('created_at')
        self.assertEqual(crawls_for_schedule.count(), 2)

        newest = crawls_for_schedule.last()
        self.assertIsNotNone(newest)
        assert newest is not None  # narrows the Optional for type checkers
        self.assertEqual(newest.status, Crawl.StatusChoices.QUEUED)
        self.assertEqual(newest.urls, 'https://example.com/feed.xml')
        self.assertEqual(newest.max_depth, 1)
        self.assertEqual(newest.tags_str, 'scheduled')

    def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
        """Neither an exit_on_idle=True nor a crawl_id-scoped orchestrator adds a crawl."""
        due_schedule = self._create_due_schedule()

        def crawl_count() -> int:
            return Crawl.objects.filter(schedule=due_schedule).count()

        Orchestrator(exit_on_idle=True)._materialize_due_schedules()
        self.assertEqual(crawl_count(), 1)

        scoped_id = str(due_schedule.template.id)
        Orchestrator(exit_on_idle=False, crawl_id=scoped_id)._materialize_due_schedules()
        self.assertEqual(crawl_count(), 1)

    @patch.object(CrawlWorker, 'start')
    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
        """The queue check that materializes a schedule reports it but does not start a CrawlWorker."""
        due_schedule = self._create_due_schedule()

        global_orchestrator = Orchestrator(exit_on_idle=False)
        with patch.object(global_orchestrator, '_claim_crawl', return_value=True):
            sizes = global_orchestrator.check_queues_and_spawn_workers()

        self.assertEqual(sizes['crawl'], 1)
        self.assertEqual(Crawl.objects.filter(schedule=due_schedule).count(), 2)
        mock_start.assert_not_called()
|
||||
@@ -1,76 +0,0 @@
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, cast
|
||||
from unittest.mock import patch
|
||||
|
||||
from django.test import SimpleTestCase
|
||||
|
||||
from archivebox.workers.worker import SnapshotWorker
|
||||
|
||||
|
||||
class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
    """Tests for SnapshotWorker._retry_failed_empty_foreground_hooks with stubbed collaborators."""

    def _make_worker(self):
        """Build a SnapshotWorker without running ``__init__`` and stub the methods the tests rely on.

        Returns:
            A bare SnapshotWorker whose snapshot, timeout checks, hook runner and
            hook waiter are all replaced with no-op stand-ins.
        """
        # __new__ bypasses __init__, so no real setup code runs.
        worker = SnapshotWorker.__new__(SnapshotWorker)
        worker.pid = 12345
        cast(Any, worker).snapshot = SimpleNamespace(
            status='started',
            refresh_from_db=lambda: None,
        )
        worker._snapshot_exceeded_hard_timeout = lambda: False
        worker._seal_snapshot_due_to_timeout = lambda: None
        worker._run_hook = lambda *args, **kwargs: SimpleNamespace()
        worker._wait_for_hook = lambda process, ar: None
        return worker

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_skips_successful_hook_with_only_inline_output(self, mock_log):
        """A succeeded result with only ``output_str`` set must not trigger any logged retry."""
        worker = self._make_worker()
        archive_result = SimpleNamespace(
            status='succeeded',
            output_files={},
            output_str='scrolled 600px',
            output_json=None,
            refresh_from_db=lambda: None,
        )

        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__45_infiniscroll.js'), archive_result)],
            config={},
        )

        mock_log.assert_not_called()

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_replays_failed_hook_with_no_outputs(self, mock_log):
        """A failed result with no outputs is run and waited on exactly once, with one log event."""
        worker = self._make_worker()
        run_calls = []
        wait_calls = []

        def run_hook(*args, **kwargs):
            # Record the invocation and return a placeholder process object.
            run_calls.append((args, kwargs))
            return SimpleNamespace()

        def wait_for_hook(process, ar):
            # Record the wait and simulate the retry succeeding with an output file.
            wait_calls.append((process, ar))
            ar.status = 'succeeded'
            ar.output_files = {'singlefile.html': {}}

        archive_result = SimpleNamespace(
            status='failed',
            output_files={},
            output_str='',
            output_json=None,
            refresh_from_db=lambda: None,
        )

        worker._run_hook = run_hook
        worker._wait_for_hook = wait_for_hook

        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__50_singlefile.py'), archive_result)],
            config={},
        )

        # unittest assertions (not bare ``assert``) so the checks still run under ``python -O``.
        self.assertEqual(len(run_calls), 1)
        self.assertEqual(len(wait_calls), 1)
        mock_log.assert_called_once()
|
||||
Reference in New Issue
Block a user