Fix code review issues in process management refactor

- Add pwd validation in Process.launch() to prevent crashes
- Fix psutil returncode handling (use wait() return value, not returncode attr)
- Add None check for proc.pid in cleanup_stale_running()
- Add stale process cleanup in Orchestrator.is_running()
- Ensure orchestrator process_type is correctly set to ORCHESTRATOR
- Fix KeyboardInterrupt handling (exit code 0 for graceful shutdown)
- Throttle cleanup_stale_running() to once per 30 seconds for performance
- Fix worker process_type to use TypeChoices.WORKER consistently
- Fix get_running_workers() API to return list of dicts (not Process objects)
- Only delete PID files after successful kill or confirmed stale
- Fix migration index names to match between SQL and Django state
- Remove db_index=True from process_type (index created manually)
- Update documentation to reflect actual implementation
- Add explanatory comments to empty except blocks
- Fix exit codes to use Unix convention (128 + signal number)

Co-authored-by: Nick Sweeting <pirate@users.noreply.github.com>
This commit is contained in:
claude[bot]
2025-12-31 11:14:47 +00:00
parent b822352fc3
commit ee201a0f83
6 changed files with 60 additions and 23 deletions

View File

@@ -72,6 +72,7 @@ class Orchestrator:
self.pid: int = os.getpid()
self.pid_file = None
self.idle_count: int = 0
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
def __repr__(self) -> str:
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
@@ -81,15 +82,21 @@ class Orchestrator:
"""Check if an orchestrator is already running."""
from archivebox.machine.models import Process
return Process.get_running_count(process_type='orchestrator') > 0
# Clean up stale processes before counting
Process.cleanup_stale_running()
return Process.get_running_count(process_type=Process.TypeChoices.ORCHESTRATOR) > 0
def on_startup(self) -> None:
"""Called when orchestrator starts."""
from archivebox.machine.models import Process
self.pid = os.getpid()
# Register orchestrator process in database
# Register orchestrator process in database with explicit type
self.db_process = Process.current()
# Ensure the process type is correctly set to ORCHESTRATOR
if self.db_process.process_type != Process.TypeChoices.ORCHESTRATOR:
self.db_process.process_type = Process.TypeChoices.ORCHESTRATOR
self.db_process.save(update_fields=['process_type'])
# Clean up any stale Process records from previous runs
stale_count = Process.cleanup_stale_running()
@@ -115,7 +122,8 @@ class Orchestrator:
"""Called when orchestrator shuts down."""
# Update Process record status
if hasattr(self, 'db_process') and self.db_process:
self.db_process.exit_code = 1 if error else 0
# KeyboardInterrupt is a graceful shutdown, not an error
self.db_process.exit_code = 1 if error and not isinstance(error, KeyboardInterrupt) else 0
self.db_process.status = self.db_process.StatusChoices.EXITED
self.db_process.ended_at = timezone.now()
self.db_process.save()
@@ -131,8 +139,15 @@ class Orchestrator:
def get_total_worker_count(self) -> int:
"""Get total count of running workers across all types."""
from archivebox.machine.models import Process
import time
# Throttle cleanup to once every 30 seconds to avoid performance issues
CLEANUP_THROTTLE_SECONDS = 30
now = time.time()
if now - self._last_cleanup_time > CLEANUP_THROTTLE_SECONDS:
Process.cleanup_stale_running()
self._last_cleanup_time = now
Process.cleanup_stale_running()
return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:

View File

@@ -290,7 +290,7 @@ class Worker:
from archivebox.machine.models import Process
if worker_id is None:
worker_id = Process.get_next_worker_id(process_type=cls.name)
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
# Use module-level function for pickling compatibility
proc = MPProcess(
@@ -310,14 +310,24 @@ class Worker:
from archivebox.machine.models import Process
Process.cleanup_stale_running()
return list(Process.get_running(process_type=cls.name))
# Convert Process objects to dicts to match the expected API contract
processes = Process.get_running(process_type=Process.TypeChoices.WORKER)
return [
{
'pid': p.pid,
'worker_id': p.id,
'started_at': p.started_at.isoformat() if p.started_at else None,
'status': p.status,
}
for p in processes
]
@classmethod
def get_worker_count(cls) -> int:
"""Get count of running workers of this type."""
from archivebox.machine.models import Process
return Process.get_running_count(process_type=cls.name)
return Process.get_running_count(process_type=Process.TypeChoices.WORKER)
class CrawlWorker(Worker):