logging and admin ui improvements

This commit is contained in:
Nick Sweeting
2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions

View File

@@ -7,9 +7,14 @@ class Command(BaseCommand):
help = 'Run the archivebox orchestrator'
def add_arguments(self, parser):
parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")
parser.add_argument(
'--exit-on-idle',
action='store_true',
default=False,
help="Exit when all work is complete (default: run forever)"
)
def handle(self, *args, **kwargs):
daemon = kwargs.get('daemon', False)
orchestrator = Orchestrator(exit_on_idle=not daemon)
exit_on_idle = kwargs.get('exit_on_idle', False)
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
orchestrator.runloop()

View File

@@ -12,16 +12,17 @@ Architecture:
└── Each worker spawns task subprocesses via CLI
Usage:
# Embedded in other commands (exits when done)
# Default: runs forever (for use as subprocess of server)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.runloop()
# Exit when done (for embedded use in other commands)
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
# Daemon mode (runs forever)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.start() # fork and return
# Or run via CLI
archivebox orchestrator [--daemon]
archivebox manage orchestrator # runs forever
archivebox manage orchestrator --exit-on-idle # exits when done
"""
__package__ = 'archivebox.workers'
@@ -45,6 +46,14 @@ from .pid_utils import (
)
def _run_orchestrator_process(exit_on_idle: bool) -> None:
    """Top-level function for multiprocessing (must be picklable)."""
    # Django must be initialized inside the child process before any ORM use.
    from archivebox.config.django import setup_django
    setup_django()

    # Build and run the orchestrator loop in this process.
    Orchestrator(exit_on_idle=exit_on_idle).runloop()
class Orchestrator:
"""
Manages worker processes by polling queues and spawning workers as needed.
@@ -277,12 +286,12 @@ class Orchestrator:
Fork orchestrator as a background process.
Returns the PID of the new process.
"""
def run_orchestrator():
from archivebox.config.django import setup_django
setup_django()
self.runloop()
proc = Process(target=run_orchestrator, name='orchestrator')
# Use module-level function to avoid pickle errors with local functions
proc = Process(
target=_run_orchestrator_process,
args=(self.exit_on_idle,),
name='orchestrator'
)
proc.start()
assert proc.pid is not None

View File

@@ -28,7 +28,7 @@ WORKERS_DIR_NAME = "workers"
ORCHESTRATOR_WORKER = {
"name": "worker_orchestrator",
"command": "archivebox manage orchestrator",
"command": "archivebox manage orchestrator", # runs forever by default
"autostart": "true",
"autorestart": "true",
"stdout_logfile": "logs/worker_orchestrator.log",
@@ -332,14 +332,14 @@ def stop_worker(supervisor, daemon_name):
def tail_worker_logs(log_path: str):
get_or_create_supervisord_process(daemonize=False)
from rich.live import Live
from rich.table import Table
table = Table()
table.add_column("TS")
table.add_column("URL")
try:
with Live(table, refresh_per_second=1) as live: # refresh once a second
with open(log_path, 'r') as f:
@@ -352,6 +352,83 @@ def tail_worker_logs(log_path: str):
except SystemExit:
pass
def tail_multiple_worker_logs(log_files: list[str], follow=True):
    """Tail multiple log files simultaneously, interleaving their output.

    Args:
        log_files: Paths to log files. Relative paths are resolved against
            CONSTANTS.DATA_DIR. Missing files are created empty so tailing
            can start before the worker has written anything.
        follow: If True, seek to the end of each file and stream new lines
            forever (until interrupted). If False, open/report and return.
    """
    import re
    from pathlib import Path

    # Compiled once, outside the read loop. Strips ANSI color codes
    # (supervisord usually does this already, but strip defensively).
    ansi_escape = re.compile(r'\x1b\[[0-9;]*m')

    # Convert relative paths to absolute paths and create missing files
    log_paths = []
    for log_file in log_files:
        log_path = Path(log_file)
        if not log_path.is_absolute():
            log_path = CONSTANTS.DATA_DIR / log_path
        if not log_path.exists():
            log_path.parent.mkdir(parents=True, exist_ok=True)
            log_path.touch()
        log_paths.append(log_path)

    # Open all log files; warn (but continue) on any that can't be opened
    file_handles = []
    for log_path in log_paths:
        try:
            f = open(log_path, 'r')
            if follow:
                f.seek(0, 2)  # start at EOF so only new lines are shown
            file_handles.append((log_path.name, f))
        except Exception as e:
            print(f"[yellow]Warning: Could not open {log_path}: {e}[/yellow]")

    if not file_handles:
        print("[red]No log files could be opened[/red]")
        return

    # Print which logs we're tailing
    log_names = [name for name, _ in file_handles]
    print(f"[dim]Tailing: {', '.join(log_names)}[/dim]")
    print()

    def _source_color(log_name: str) -> str:
        """Pick the output color for a line based on which log produced it."""
        lowered = log_name.lower()
        if 'orchestrator' in lowered:
            return 'cyan'
        if 'daphne' in lowered:
            return 'green'
        return 'white'

    # Color is constant per file — compute once instead of per line.
    colors = {log_name: _source_color(log_name) for log_name, _ in file_handles}

    try:
        while follow:
            # Round-robin: read at most one available line from each file
            for log_name, f in file_handles:
                line = f.readline()
                if not line:
                    continue
                line_clean = ansi_escape.sub('', line.rstrip())
                if line_clean:
                    color = colors[log_name]
                    print(f'[{color}][{log_name}][/{color}] {line_clean}')
            # Small sleep to avoid busy-waiting
            time.sleep(0.1)
    except (KeyboardInterrupt, BrokenPipeError, IOError):
        print("\n[yellow][i] Stopped tailing logs[/i][/yellow]")
    except SystemExit:
        pass
    finally:
        # Always close handles, even on interrupt or error
        for _, f in file_handles:
            try:
                f.close()
            except Exception:
                pass
def watch_worker(supervisor, daemon_name, interval=5):
"""loop continuously and monitor worker's health"""
while True:

View File

@@ -3,6 +3,9 @@ Background task functions for queuing work to the orchestrator.
These functions queue Snapshots/Crawls for processing by setting their status
to QUEUED, which the orchestrator workers will pick up and process.
NOTE: These functions do NOT start the orchestrator - they assume it's already
running via `archivebox server` (supervisord) or will be run inline by the CLI.
"""
__package__ = 'archivebox.workers'
@@ -10,16 +13,6 @@ __package__ = 'archivebox.workers'
from django.utils import timezone
def ensure_orchestrator_running():
    """Ensure the orchestrator is running to process queued items."""
    from .orchestrator import Orchestrator

    # Nothing to do if an orchestrator process is already alive.
    if Orchestrator.is_running():
        return

    # Otherwise fork a background orchestrator that exits once idle.
    Orchestrator(exit_on_idle=True).start()
def bg_add(add_kwargs: dict) -> int:
"""
Add URLs and queue them for archiving.
@@ -36,9 +29,6 @@ def bg_add(add_kwargs: dict) -> int:
result = add(**add_kwargs)
# Ensure orchestrator is running to process the new snapshots
ensure_orchestrator_running()
return len(result) if result else 0
@@ -66,10 +56,6 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
)
queued_count += 1
# Ensure orchestrator is running to process the queued snapshots
if queued_count > 0:
ensure_orchestrator_running()
return queued_count
@@ -90,9 +76,6 @@ def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
# Ensure orchestrator is running to process the queued snapshot
ensure_orchestrator_running()
return 1
return 0

View File

@@ -67,8 +67,8 @@ class Worker:
# Configuration (can be overridden by subclasses)
MAX_TICK_TIME: ClassVar[int] = 60
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
POLL_INTERVAL: ClassVar[float] = 0.5
IDLE_TIMEOUT: ClassVar[int] = 3 # Exit after N idle iterations (set to 0 to never exit)
POLL_INTERVAL: ClassVar[float] = 1.0
IDLE_TIMEOUT: ClassVar[int] = 10 # Exit after N idle iterations (10 sec at 1.0 poll interval)
def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
self.worker_id = worker_id