mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
fix lint
This commit is contained in:
@@ -94,10 +94,10 @@ class Orchestrator:
|
||||
self.POLL_INTERVAL = 0.25
|
||||
# Exit quickly once idle in foreground mode
|
||||
self.IDLE_TIMEOUT = 1
|
||||
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
|
||||
|
||||
|
||||
@classmethod
|
||||
def is_running(cls) -> bool:
|
||||
"""Check if an orchestrator is already running."""
|
||||
@@ -223,7 +223,7 @@ class Orchestrator:
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
)
|
||||
|
||||
|
||||
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
|
||||
"""Determine if we should spawn a new worker."""
|
||||
if queue_count == 0:
|
||||
@@ -253,7 +253,7 @@ class Orchestrator:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
|
||||
"""Spawn a new worker process. Returns PID or None if spawn failed."""
|
||||
try:
|
||||
@@ -286,7 +286,10 @@ class Orchestrator:
|
||||
print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
|
||||
print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]')
|
||||
for p in all_procs:
|
||||
print(f'[yellow] -> type={p.process_type} status={p.status} parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]')
|
||||
print(
|
||||
f'[yellow] -> type={p.process_type} status={p.status} '
|
||||
f'parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]'
|
||||
)
|
||||
|
||||
worker_process = Process.objects.filter(
|
||||
pid=pid,
|
||||
@@ -324,7 +327,7 @@ class Orchestrator:
|
||||
error=e,
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def check_queues_and_spawn_workers(self) -> dict[str, int]:
|
||||
"""
|
||||
Check Binary and Crawl queues and spawn workers as needed.
|
||||
@@ -584,11 +587,11 @@ class Orchestrator:
|
||||
def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
|
||||
"""Check if any queue has pending work."""
|
||||
return any(count > 0 for count in queue_sizes.values())
|
||||
|
||||
|
||||
def has_running_workers(self) -> bool:
|
||||
"""Check if any workers are still running."""
|
||||
return self.get_total_worker_count() > 0
|
||||
|
||||
|
||||
def has_future_work(self) -> bool:
|
||||
"""Check if there's work scheduled for the future (retry_at > now) in Crawl queue."""
|
||||
from archivebox.crawls.models import Crawl
|
||||
@@ -605,38 +608,38 @@ class Orchestrator:
|
||||
qs = qs.filter(id=self.crawl_id)
|
||||
|
||||
return qs.count() > 0
|
||||
|
||||
|
||||
def on_tick(self, queue_sizes: dict[str, int]) -> None:
|
||||
"""Called each orchestrator tick. Override for custom behavior."""
|
||||
# Tick logging suppressed to reduce noise
|
||||
pass
|
||||
|
||||
|
||||
def on_idle(self) -> None:
|
||||
"""Called when orchestrator is idle (no work, no workers)."""
|
||||
# Idle logging suppressed to reduce noise
|
||||
pass
|
||||
|
||||
|
||||
def should_exit(self, queue_sizes: dict[str, int]) -> bool:
|
||||
"""Determine if orchestrator should exit."""
|
||||
if not self.exit_on_idle:
|
||||
return False
|
||||
|
||||
|
||||
if self.IDLE_TIMEOUT == 0:
|
||||
return False
|
||||
|
||||
|
||||
# Don't exit if there's pending or future work
|
||||
if self.has_pending_work(queue_sizes):
|
||||
return False
|
||||
|
||||
|
||||
if self.has_running_workers():
|
||||
return False
|
||||
|
||||
|
||||
if self.has_future_work():
|
||||
return False
|
||||
|
||||
|
||||
# Exit after idle timeout
|
||||
return self.idle_count >= self.IDLE_TIMEOUT
|
||||
|
||||
|
||||
def runloop(self) -> None:
|
||||
"""Main orchestrator loop."""
|
||||
from rich.live import Live
|
||||
@@ -702,7 +705,7 @@ class Orchestrator:
|
||||
os.close(devnull_fd)
|
||||
os.close(stdout_for_restore)
|
||||
os.close(stderr_for_restore)
|
||||
except:
|
||||
except OSError:
|
||||
pass
|
||||
# stdout_for_console is closed by orchestrator_console
|
||||
|
||||
@@ -1132,7 +1135,6 @@ class Orchestrator:
|
||||
|
||||
# Count hooks by status for debugging
|
||||
queued = snapshot.archiveresult_set.filter(status='queued').count()
|
||||
started = snapshot.archiveresult_set.filter(status='started').count()
|
||||
|
||||
# Find currently running hook (ordered by hook_name to get lowest step number)
|
||||
current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first()
|
||||
@@ -1211,7 +1213,7 @@ class Orchestrator:
|
||||
for snapshot_id in list(snapshot_progress.keys()):
|
||||
if snapshot_id not in active_ids:
|
||||
progress_layout.log_event(
|
||||
f"Snapshot completed/removed",
|
||||
"Snapshot completed/removed",
|
||||
style="blue"
|
||||
)
|
||||
if snapshot_id in snapshot_progress:
|
||||
@@ -1263,7 +1265,7 @@ class Orchestrator:
|
||||
raise
|
||||
else:
|
||||
self.on_shutdown()
|
||||
|
||||
|
||||
def start(self) -> int:
|
||||
"""
|
||||
Fork orchestrator as a background process.
|
||||
@@ -1285,7 +1287,7 @@ class Orchestrator:
|
||||
pid=proc.pid,
|
||||
)
|
||||
return proc.pid
|
||||
|
||||
|
||||
@classmethod
|
||||
def get_or_start(cls, exit_on_idle: bool = True) -> 'Orchestrator':
|
||||
"""
|
||||
@@ -1296,6 +1298,6 @@ class Orchestrator:
|
||||
print('[grey53]👨✈️ Orchestrator already running[/grey53]')
|
||||
# Return a placeholder - actual orchestrator is in another process
|
||||
return cls(exit_on_idle=exit_on_idle)
|
||||
|
||||
|
||||
orchestrator = cls(exit_on_idle=exit_on_idle)
|
||||
return orchestrator
|
||||
|
||||
@@ -2,7 +2,6 @@ __package__ = 'archivebox.workers'
|
||||
|
||||
import sys
|
||||
import time
|
||||
import signal
|
||||
import socket
|
||||
import psutil
|
||||
import shutil
|
||||
@@ -42,7 +41,7 @@ ORCHESTRATOR_WORKER = {
|
||||
|
||||
SERVER_WORKER = lambda host, port: {
|
||||
"name": "worker_daphne",
|
||||
"command": f"daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application",
|
||||
"command": f"{sys.executable} -m daphne --bind={host} --port={port} --application-close-timeout=600 archivebox.core.asgi:application",
|
||||
"autostart": "false",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": "logs/worker_daphne.log",
|
||||
@@ -513,8 +512,6 @@ def watch_worker(supervisor, daemon_name, interval=5):
|
||||
|
||||
|
||||
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
||||
global _supervisord_proc
|
||||
|
||||
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
|
||||
|
||||
bg_workers = [
|
||||
@@ -551,8 +548,6 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
||||
|
||||
|
||||
def start_cli_workers(watch=False):
|
||||
global _supervisord_proc
|
||||
|
||||
supervisor = get_or_create_supervisord_process(daemonize=False)
|
||||
|
||||
start_worker(supervisor, ORCHESTRATOR_WORKER)
|
||||
|
||||
@@ -10,9 +10,7 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import timedelta
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
@@ -217,7 +215,6 @@ class TestOrchestratorWithProcess(TestCase):
|
||||
|
||||
def test_orchestrator_scoped_worker_count(self):
|
||||
"""Orchestrator with crawl_id should count only descendant workers."""
|
||||
import time
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
|
||||
@@ -13,13 +13,10 @@ __package__ = 'archivebox.workers'
|
||||
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
from typing import ClassVar, Any
|
||||
from datetime import timedelta
|
||||
from typing import TYPE_CHECKING, Any, ClassVar
|
||||
from pathlib import Path
|
||||
from multiprocessing import cpu_count
|
||||
|
||||
from django.db.models import QuerySet
|
||||
from django.utils import timezone
|
||||
from django.conf import settings
|
||||
|
||||
@@ -28,6 +25,9 @@ from rich import print
|
||||
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
|
||||
CPU_COUNT = cpu_count()
|
||||
|
||||
@@ -314,7 +314,10 @@ class Worker:
|
||||
process.kill(signal_num=signal.SIGKILL)
|
||||
log_worker_event(
|
||||
worker_type=worker_type,
|
||||
event=f'⚠ Sent SIGKILL to {hook_name} + {len(children_pids) if children_pids else 0} children (exceeded timeout)',
|
||||
event=(
|
||||
f'⚠ Sent SIGKILL to {hook_name} + '
|
||||
f'{len(children_pids) if children_pids else 0} children (exceeded timeout)'
|
||||
),
|
||||
indent_level=indent_level,
|
||||
pid=self.pid,
|
||||
)
|
||||
@@ -341,7 +344,6 @@ class Worker:
|
||||
from archivebox.machine.models import Process, Machine
|
||||
from archivebox.config.configset import get_config
|
||||
from pathlib import Path
|
||||
from django.conf import settings
|
||||
import sys
|
||||
|
||||
refresh_machine_config = bool(
|
||||
@@ -552,7 +554,7 @@ class CrawlWorker(Worker):
|
||||
|
||||
# Check if crawl is done
|
||||
if self._is_crawl_finished():
|
||||
print(f'🔄 Crawl finished, sealing...', file=sys.stderr)
|
||||
print('🔄 Crawl finished, sealing...', file=sys.stderr)
|
||||
self.crawl.sm.seal()
|
||||
break
|
||||
|
||||
@@ -813,7 +815,8 @@ class SnapshotWorker(Worker):
|
||||
is_background = is_background_hook(hook_name)
|
||||
|
||||
# Create ArchiveResult for THIS HOOK (not per plugin)
|
||||
# One plugin can have multiple hooks (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js)
|
||||
# One plugin can have multiple hooks
|
||||
# (e.g., chrome/on_Snapshot__20_launch_chrome.js, chrome/on_Snapshot__21_navigate_chrome.js)
|
||||
# Unique key = (snapshot, plugin, hook_name) for idempotency
|
||||
ar, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=self.snapshot,
|
||||
@@ -868,7 +871,7 @@ class SnapshotWorker(Worker):
|
||||
self.snapshot.sm.seal()
|
||||
self.snapshot.refresh_from_db()
|
||||
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
# Mark snapshot as sealed even on error (still triggers cleanup)
|
||||
self._finalize_background_hooks()
|
||||
self.snapshot.sm.seal()
|
||||
@@ -1019,7 +1022,6 @@ class SnapshotWorker(Worker):
|
||||
self.background_processes = {}
|
||||
|
||||
# Update background results now that hooks are done
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
bg_results = self.snapshot.archiveresult_set.filter(
|
||||
hook_name__contains='.bg.',
|
||||
@@ -1034,7 +1036,6 @@ class SnapshotWorker(Worker):
|
||||
if not self.background_processes:
|
||||
return
|
||||
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
for hook_name, process in list(self.background_processes.items()):
|
||||
exit_code = process.poll()
|
||||
@@ -1165,7 +1166,6 @@ class BinaryWorker(Worker):
|
||||
|
||||
def runloop(self) -> None:
|
||||
"""Install binary(ies)."""
|
||||
import sys
|
||||
|
||||
self.on_startup()
|
||||
|
||||
@@ -1216,7 +1216,7 @@ class BinaryWorker(Worker):
|
||||
except Exception as e:
|
||||
log_worker_event(
|
||||
worker_type='BinaryWorker',
|
||||
event=f'Failed to install binary',
|
||||
event='Failed to install binary',
|
||||
indent_level=1,
|
||||
pid=self.pid,
|
||||
error=e,
|
||||
|
||||
Reference in New Issue
Block a user