mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Fix code review issues in process management refactor
- Add pwd validation in Process.launch() to prevent crashes
- Fix psutil returncode handling (use wait() return value, not returncode attr)
- Add None check for proc.pid in cleanup_stale_running()
- Add stale process cleanup in Orchestrator.is_running()
- Ensure orchestrator process_type is correctly set to ORCHESTRATOR
- Fix KeyboardInterrupt handling (exit code 0 for graceful shutdown)
- Throttle cleanup_stale_running() to once per 30 seconds for performance
- Fix worker process_type to use TypeChoices.WORKER consistently
- Fix get_running_workers() API to return list of dicts (not Process objects)
- Only delete PID files after successful kill or confirmed stale
- Fix migration index names to match between SQL and Django state
- Remove db_index=True from process_type (index created manually)
- Update documentation to reflect actual implementation
- Add explanatory comments to empty except blocks
- Fix exit codes to use Unix convention (128 + signal number)

Co-authored-by: Nick Sweeting <pirate@users.noreply.github.com>
This commit is contained in:
@@ -72,6 +72,7 @@ class Orchestrator:
|
||||
self.pid: int = os.getpid()
|
||||
self.pid_file = None
|
||||
self.idle_count: int = 0
|
||||
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
|
||||
@@ -81,15 +82,21 @@ class Orchestrator:
|
||||
"""Check if an orchestrator is already running."""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
return Process.get_running_count(process_type='orchestrator') > 0
|
||||
# Clean up stale processes before counting
|
||||
Process.cleanup_stale_running()
|
||||
return Process.get_running_count(process_type=Process.TypeChoices.ORCHESTRATOR) > 0
|
||||
|
||||
def on_startup(self) -> None:
|
||||
"""Called when orchestrator starts."""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
self.pid = os.getpid()
|
||||
# Register orchestrator process in database
|
||||
# Register orchestrator process in database with explicit type
|
||||
self.db_process = Process.current()
|
||||
# Ensure the process type is correctly set to ORCHESTRATOR
|
||||
if self.db_process.process_type != Process.TypeChoices.ORCHESTRATOR:
|
||||
self.db_process.process_type = Process.TypeChoices.ORCHESTRATOR
|
||||
self.db_process.save(update_fields=['process_type'])
|
||||
|
||||
# Clean up any stale Process records from previous runs
|
||||
stale_count = Process.cleanup_stale_running()
|
||||
@@ -115,7 +122,8 @@ class Orchestrator:
|
||||
"""Called when orchestrator shuts down."""
|
||||
# Update Process record status
|
||||
if hasattr(self, 'db_process') and self.db_process:
|
||||
self.db_process.exit_code = 1 if error else 0
|
||||
# KeyboardInterrupt is a graceful shutdown, not an error
|
||||
self.db_process.exit_code = 1 if error and not isinstance(error, KeyboardInterrupt) else 0
|
||||
self.db_process.status = self.db_process.StatusChoices.EXITED
|
||||
self.db_process.ended_at = timezone.now()
|
||||
self.db_process.save()
|
||||
@@ -131,8 +139,15 @@ class Orchestrator:
|
||||
def get_total_worker_count(self) -> int:
|
||||
"""Get total count of running workers across all types."""
|
||||
from archivebox.machine.models import Process
|
||||
import time
|
||||
|
||||
# Throttle cleanup to once every 30 seconds to avoid performance issues
|
||||
CLEANUP_THROTTLE_SECONDS = 30
|
||||
now = time.time()
|
||||
if now - self._last_cleanup_time > CLEANUP_THROTTLE_SECONDS:
|
||||
Process.cleanup_stale_running()
|
||||
self._last_cleanup_time = now
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)
|
||||
|
||||
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
|
||||
|
||||
@@ -290,7 +290,7 @@ class Worker:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
if worker_id is None:
|
||||
worker_id = Process.get_next_worker_id(process_type=cls.name)
|
||||
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
|
||||
|
||||
# Use module-level function for pickling compatibility
|
||||
proc = MPProcess(
|
||||
@@ -310,14 +310,24 @@ class Worker:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
return list(Process.get_running(process_type=cls.name))
|
||||
# Convert Process objects to dicts to match the expected API contract
|
||||
processes = Process.get_running(process_type=Process.TypeChoices.WORKER)
|
||||
return [
|
||||
{
|
||||
'pid': p.pid,
|
||||
'worker_id': p.id,
|
||||
'started_at': p.started_at.isoformat() if p.started_at else None,
|
||||
'status': p.status,
|
||||
}
|
||||
for p in processes
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def get_worker_count(cls) -> int:
|
||||
"""Get count of running workers of this type."""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
return Process.get_running_count(process_type=cls.name)
|
||||
return Process.get_running_count(process_type=Process.TypeChoices.WORKER)
|
||||
|
||||
|
||||
class CrawlWorker(Worker):
|
||||
|
||||
Reference in New Issue
Block a user