mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
logging and admin ui improvements
This commit is contained in:
@@ -7,9 +7,14 @@ class Command(BaseCommand):
|
||||
help = 'Run the archivebox orchestrator'
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")
|
||||
parser.add_argument(
|
||||
'--exit-on-idle',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Exit when all work is complete (default: run forever)"
|
||||
)
|
||||
|
||||
def handle(self, *args, **kwargs):
|
||||
daemon = kwargs.get('daemon', False)
|
||||
orchestrator = Orchestrator(exit_on_idle=not daemon)
|
||||
exit_on_idle = kwargs.get('exit_on_idle', False)
|
||||
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
|
||||
orchestrator.runloop()
|
||||
|
||||
@@ -12,16 +12,17 @@ Architecture:
|
||||
└── Each worker spawns task subprocesses via CLI
|
||||
|
||||
Usage:
|
||||
# Embedded in other commands (exits when done)
|
||||
# Default: runs forever (for use as subprocess of server)
|
||||
orchestrator = Orchestrator(exit_on_idle=False)
|
||||
orchestrator.runloop()
|
||||
|
||||
# Exit when done (for embedded use in other commands)
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.runloop()
|
||||
|
||||
# Daemon mode (runs forever)
|
||||
orchestrator = Orchestrator(exit_on_idle=False)
|
||||
orchestrator.start() # fork and return
|
||||
|
||||
|
||||
# Or run via CLI
|
||||
archivebox orchestrator [--daemon]
|
||||
archivebox manage orchestrator # runs forever
|
||||
archivebox manage orchestrator --exit-on-idle # exits when done
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.workers'
|
||||
@@ -45,6 +46,14 @@ from .pid_utils import (
|
||||
)
|
||||
|
||||
|
||||
def _run_orchestrator_process(exit_on_idle: bool) -> None:
|
||||
"""Top-level function for multiprocessing (must be picklable)."""
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
|
||||
orchestrator.runloop()
|
||||
|
||||
|
||||
class Orchestrator:
|
||||
"""
|
||||
Manages worker processes by polling queues and spawning workers as needed.
|
||||
@@ -277,12 +286,12 @@ class Orchestrator:
|
||||
Fork orchestrator as a background process.
|
||||
Returns the PID of the new process.
|
||||
"""
|
||||
def run_orchestrator():
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
self.runloop()
|
||||
|
||||
proc = Process(target=run_orchestrator, name='orchestrator')
|
||||
# Use module-level function to avoid pickle errors with local functions
|
||||
proc = Process(
|
||||
target=_run_orchestrator_process,
|
||||
args=(self.exit_on_idle,),
|
||||
name='orchestrator'
|
||||
)
|
||||
proc.start()
|
||||
|
||||
assert proc.pid is not None
|
||||
|
||||
@@ -28,7 +28,7 @@ WORKERS_DIR_NAME = "workers"
|
||||
|
||||
ORCHESTRATOR_WORKER = {
|
||||
"name": "worker_orchestrator",
|
||||
"command": "archivebox manage orchestrator",
|
||||
"command": "archivebox manage orchestrator", # runs forever by default
|
||||
"autostart": "true",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": "logs/worker_orchestrator.log",
|
||||
@@ -332,14 +332,14 @@ def stop_worker(supervisor, daemon_name):
|
||||
|
||||
def tail_worker_logs(log_path: str):
|
||||
get_or_create_supervisord_process(daemonize=False)
|
||||
|
||||
|
||||
from rich.live import Live
|
||||
from rich.table import Table
|
||||
|
||||
|
||||
table = Table()
|
||||
table.add_column("TS")
|
||||
table.add_column("URL")
|
||||
|
||||
|
||||
try:
|
||||
with Live(table, refresh_per_second=1) as live: # update 4 times a second to feel fluid
|
||||
with open(log_path, 'r') as f:
|
||||
@@ -352,6 +352,83 @@ def tail_worker_logs(log_path: str):
|
||||
except SystemExit:
|
||||
pass
|
||||
|
||||
|
||||
def tail_multiple_worker_logs(log_files: list[str], follow=True):
|
||||
"""Tail multiple log files simultaneously, interleaving their output."""
|
||||
import select
|
||||
from pathlib import Path
|
||||
|
||||
# Convert relative paths to absolute paths
|
||||
log_paths = []
|
||||
for log_file in log_files:
|
||||
log_path = Path(log_file)
|
||||
if not log_path.is_absolute():
|
||||
log_path = CONSTANTS.DATA_DIR / log_path
|
||||
|
||||
# Create log file if it doesn't exist
|
||||
if not log_path.exists():
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
log_path.touch()
|
||||
|
||||
log_paths.append(log_path)
|
||||
|
||||
# Open all log files
|
||||
file_handles = []
|
||||
for log_path in log_paths:
|
||||
try:
|
||||
f = open(log_path, 'r')
|
||||
# Seek to end of file if following
|
||||
if follow:
|
||||
f.seek(0, 2) # Seek to end
|
||||
file_handles.append((log_path.name, f))
|
||||
except Exception as e:
|
||||
print(f"[yellow]Warning: Could not open {log_path}: {e}[/yellow]")
|
||||
|
||||
if not file_handles:
|
||||
print("[red]No log files could be opened[/red]")
|
||||
return
|
||||
|
||||
# Print which logs we're tailing
|
||||
log_names = [name for name, _ in file_handles]
|
||||
print(f"[dim]Tailing: {', '.join(log_names)}[/dim]")
|
||||
print()
|
||||
|
||||
try:
|
||||
while follow:
|
||||
# Read available lines from all files
|
||||
for log_name, f in file_handles:
|
||||
line = f.readline()
|
||||
if line:
|
||||
# Colorize based on log source
|
||||
if 'orchestrator' in log_name.lower():
|
||||
color = 'cyan'
|
||||
elif 'daphne' in log_name.lower():
|
||||
color = 'green'
|
||||
else:
|
||||
color = 'white'
|
||||
|
||||
# Strip ANSI codes if present (supervisord does this but just in case)
|
||||
import re
|
||||
line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip())
|
||||
|
||||
if line_clean:
|
||||
print(f'[{color}][{log_name}][/{color}] {line_clean}')
|
||||
|
||||
# Small sleep to avoid busy-waiting
|
||||
time.sleep(0.1)
|
||||
|
||||
except (KeyboardInterrupt, BrokenPipeError, IOError):
|
||||
print("\n[yellow][i] Stopped tailing logs[/i][/yellow]")
|
||||
except SystemExit:
|
||||
pass
|
||||
finally:
|
||||
# Close all file handles
|
||||
for _, f in file_handles:
|
||||
try:
|
||||
f.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def watch_worker(supervisor, daemon_name, interval=5):
|
||||
"""loop continuously and monitor worker's health"""
|
||||
while True:
|
||||
|
||||
@@ -3,6 +3,9 @@ Background task functions for queuing work to the orchestrator.
|
||||
|
||||
These functions queue Snapshots/Crawls for processing by setting their status
|
||||
to QUEUED, which the orchestrator workers will pick up and process.
|
||||
|
||||
NOTE: These functions do NOT start the orchestrator - they assume it's already
|
||||
running via `archivebox server` (supervisord) or will be run inline by the CLI.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.workers'
|
||||
@@ -10,16 +13,6 @@ __package__ = 'archivebox.workers'
|
||||
from django.utils import timezone
|
||||
|
||||
|
||||
def ensure_orchestrator_running():
|
||||
"""Ensure the orchestrator is running to process queued items."""
|
||||
from .orchestrator import Orchestrator
|
||||
|
||||
if not Orchestrator.is_running():
|
||||
# Start orchestrator in background
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.start()
|
||||
|
||||
|
||||
def bg_add(add_kwargs: dict) -> int:
|
||||
"""
|
||||
Add URLs and queue them for archiving.
|
||||
@@ -36,9 +29,6 @@ def bg_add(add_kwargs: dict) -> int:
|
||||
|
||||
result = add(**add_kwargs)
|
||||
|
||||
# Ensure orchestrator is running to process the new snapshots
|
||||
ensure_orchestrator_running()
|
||||
|
||||
return len(result) if result else 0
|
||||
|
||||
|
||||
@@ -66,10 +56,6 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
|
||||
)
|
||||
queued_count += 1
|
||||
|
||||
# Ensure orchestrator is running to process the queued snapshots
|
||||
if queued_count > 0:
|
||||
ensure_orchestrator_running()
|
||||
|
||||
return queued_count
|
||||
|
||||
|
||||
@@ -90,9 +76,6 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
|
||||
# Ensure orchestrator is running to process the queued snapshot
|
||||
ensure_orchestrator_running()
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
@@ -67,8 +67,8 @@ class Worker:
|
||||
# Configuration (can be overridden by subclasses)
|
||||
MAX_TICK_TIME: ClassVar[int] = 60
|
||||
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
|
||||
POLL_INTERVAL: ClassVar[float] = 0.5
|
||||
IDLE_TIMEOUT: ClassVar[int] = 3 # Exit after N idle iterations (set to 0 to never exit)
|
||||
POLL_INTERVAL: ClassVar[float] = 1.0
|
||||
IDLE_TIMEOUT: ClassVar[int] = 10 # Exit after N idle iterations (10 sec at 1.0 poll interval)
|
||||
|
||||
def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
|
||||
self.worker_id = worker_id
|
||||
|
||||
Reference in New Issue
Block a user