unified Process source of truth and better screenshot tests

This commit is contained in:
Nick Sweeting
2026-01-02 04:20:34 -08:00
parent 3672174dad
commit dd77511026
44 changed files with 3369 additions and 1919 deletions

View File

@@ -0,0 +1,345 @@
"""
Rich Layout-based live progress display for ArchiveBox orchestrator.
Shows a comprehensive dashboard with:
- Top: Crawl queue status (full width)
- Middle: 4-column grid of SnapshotWorker progress panels
- Bottom: Orchestrator/Daphne logs
"""
__package__ = 'archivebox.misc'
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from collections import deque
from rich import box
from rich.align import Align
from rich.console import Console, Group, RenderableType
from rich.layout import Layout
from rich.panel import Panel
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn
from rich.table import Table
from rich.text import Text
from archivebox.config import VERSION
# Maximum number of SnapshotWorker columns to display
MAX_WORKER_COLUMNS = 4
class CrawlQueuePanel:
"""Display crawl queue status across full width."""
def __init__(self):
self.orchestrator_status = "Idle"
self.crawl_queue_count = 0
self.crawl_workers_count = 0
self.max_crawl_workers = 8
self.crawl_id: Optional[str] = None
def __rich__(self) -> Panel:
grid = Table.grid(expand=True)
grid.add_column(justify="left", ratio=1)
grid.add_column(justify="center", ratio=1)
grid.add_column(justify="center", ratio=1)
grid.add_column(justify="right", ratio=1)
# Left: ArchiveBox version + timestamp
left_text = Text()
left_text.append("ArchiveBox ", style="bold cyan")
left_text.append(f"v{VERSION}", style="bold yellow")
left_text.append(f"{datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53")
# Center-left: Crawl queue status
queue_style = "yellow" if self.crawl_queue_count > 0 else "grey53"
center_left_text = Text()
center_left_text.append("Crawls: ", style="white")
center_left_text.append(str(self.crawl_queue_count), style=f"bold {queue_style}")
center_left_text.append(" queued", style="grey53")
# Center-right: CrawlWorker status
worker_style = "green" if self.crawl_workers_count > 0 else "grey53"
center_right_text = Text()
center_right_text.append("Workers: ", style="white")
center_right_text.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {worker_style}")
center_right_text.append(" active", style="grey53")
# Right: Orchestrator status
status_color = "green" if self.crawl_workers_count > 0 else "grey53"
right_text = Text()
right_text.append("Status: ", style="white")
right_text.append(self.orchestrator_status, style=f"bold {status_color}")
if self.crawl_id:
right_text.append(f" [{self.crawl_id[:8]}]", style="grey53")
grid.add_row(left_text, center_left_text, center_right_text, right_text)
return Panel(grid, style="white on blue", box=box.ROUNDED)
class SnapshotWorkerPanel:
"""Display progress for a single SnapshotWorker."""
def __init__(self, worker_num: int):
self.worker_num = worker_num
self.snapshot_id: Optional[str] = None
self.snapshot_url: Optional[str] = None
self.total_hooks: int = 0
self.completed_hooks: int = 0
self.current_plugin: Optional[str] = None
self.status: str = "idle" # idle, working, completed
self.recent_logs: deque = deque(maxlen=5)
def __rich__(self) -> Panel:
if self.status == "idle":
content = Align.center(
Text("Idle", style="grey53"),
vertical="middle",
)
border_style = "grey53"
title_style = "grey53"
else:
# Build progress display
lines = []
# URL (truncated)
if self.snapshot_url:
url_display = self.snapshot_url[:35] + "..." if len(self.snapshot_url) > 35 else self.snapshot_url
lines.append(Text(url_display, style="cyan"))
lines.append(Text()) # Spacing
# Progress bar
if self.total_hooks > 0:
pct = (self.completed_hooks / self.total_hooks) * 100
bar_width = 30
filled = int((pct / 100) * bar_width)
bar = "" * filled + "" * (bar_width - filled)
# Color based on progress
if pct < 30:
bar_style = "yellow"
elif pct < 100:
bar_style = "green"
else:
bar_style = "blue"
progress_text = Text()
progress_text.append(bar, style=bar_style)
progress_text.append(f" {pct:.0f}%", style="white")
lines.append(progress_text)
lines.append(Text()) # Spacing
# Stats
stats = Table.grid(padding=(0, 1))
stats.add_column(style="grey53", no_wrap=True)
stats.add_column(style="white")
stats.add_row("Hooks:", f"{self.completed_hooks}/{self.total_hooks}")
if self.current_plugin:
stats.add_row("Current:", Text(self.current_plugin, style="yellow"))
lines.append(stats)
lines.append(Text()) # Spacing
# Recent logs
if self.recent_logs:
lines.append(Text("Recent:", style="grey53"))
for log_msg, log_style in self.recent_logs:
log_text = Text(f"{log_msg[:30]}", style=log_style)
lines.append(log_text)
content = Group(*lines)
border_style = "green" if self.status == "working" else "blue"
title_style = "green" if self.status == "working" else "blue"
return Panel(
content,
title=f"[{title_style}]Worker {self.worker_num}",
border_style=border_style,
box=box.ROUNDED,
height=20,
)
def add_log(self, message: str, style: str = "white"):
"""Add a log message to this worker's recent logs."""
self.recent_logs.append((message, style))
class OrchestratorLogPanel:
"""Display orchestrator and system logs."""
def __init__(self, max_events: int = 15):
self.events: deque = deque(maxlen=max_events)
self.max_events = max_events
def add_event(self, message: str, style: str = "white"):
"""Add an event to the log."""
timestamp = datetime.now(timezone.utc).strftime("%H:%M:%S")
self.events.append((timestamp, message, style))
def __rich__(self) -> Panel:
if not self.events:
content = Text("No recent events", style="grey53", justify="center")
else:
lines = []
for timestamp, message, style in self.events:
line = Text()
line.append(f"[{timestamp}] ", style="grey53")
line.append(message, style=style)
lines.append(line)
content = Group(*lines)
return Panel(
content,
title="[bold white]Orchestrator / Daphne Logs",
border_style="white",
box=box.ROUNDED,
height=12,
)
class ArchiveBoxProgressLayout:
"""
Main layout manager for ArchiveBox orchestrator progress display.
Layout structure:
┌─────────────────────────────────────────────────────────────┐
│ Crawl Queue (full width) │
├───────────────┬───────────────┬───────────────┬─────────────┤
│ Snapshot │ Snapshot │ Snapshot │ Snapshot │
│ Worker 1 │ Worker 2 │ Worker 3 │ Worker 4 │
│ │ │ │ │
│ Progress + │ Progress + │ Progress + │ Progress + │
│ Stats + │ Stats + │ Stats + │ Stats + │
│ Logs │ Logs │ Logs │ Logs │
├───────────────┴───────────────┴───────────────┴─────────────┤
│ Orchestrator / Daphne Logs │
└─────────────────────────────────────────────────────────────┘
"""
def __init__(self, crawl_id: Optional[str] = None):
self.crawl_id = crawl_id
self.start_time = datetime.now(timezone.utc)
# Create components
self.crawl_queue = CrawlQueuePanel()
self.crawl_queue.crawl_id = crawl_id
# Create 4 worker panels
self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)]
self.orchestrator_log = OrchestratorLogPanel(max_events=12)
# Create layout
self.layout = self._make_layout()
# Track snapshot ID to worker panel mapping
self.snapshot_to_worker: Dict[str, int] = {} # snapshot_id -> worker_panel_index
def _make_layout(self) -> Layout:
"""Define the layout structure."""
layout = Layout(name="root")
# Top-level split: crawl_queue, workers, logs
layout.split(
Layout(name="crawl_queue", size=3),
Layout(name="workers", ratio=1),
Layout(name="logs", size=13),
)
# Split workers into 4 columns
layout["workers"].split_row(
Layout(name="worker1"),
Layout(name="worker2"),
Layout(name="worker3"),
Layout(name="worker4"),
)
# Assign components to layout sections
layout["crawl_queue"].update(self.crawl_queue)
layout["worker1"].update(self.worker_panels[0])
layout["worker2"].update(self.worker_panels[1])
layout["worker3"].update(self.worker_panels[2])
layout["worker4"].update(self.worker_panels[3])
layout["logs"].update(self.orchestrator_log)
return layout
def update_orchestrator_status(
self,
status: str,
crawl_queue_count: int = 0,
crawl_workers_count: int = 0,
max_crawl_workers: int = 8,
):
"""Update orchestrator status in the crawl queue panel."""
self.crawl_queue.orchestrator_status = status
self.crawl_queue.crawl_queue_count = crawl_queue_count
self.crawl_queue.crawl_workers_count = crawl_workers_count
self.crawl_queue.max_crawl_workers = max_crawl_workers
def update_snapshot_worker(
self,
snapshot_id: str,
url: str,
total: int,
completed: int,
current_plugin: str = "",
):
"""Update or assign a snapshot to a worker panel."""
# Find or assign worker panel for this snapshot
if snapshot_id not in self.snapshot_to_worker:
# Find first idle worker panel
worker_idx = None
for idx, panel in enumerate(self.worker_panels):
if panel.status == "idle":
worker_idx = idx
break
# If no idle worker, use round-robin (shouldn't happen often)
if worker_idx is None:
worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS
self.snapshot_to_worker[snapshot_id] = worker_idx
# Get assigned worker panel
worker_idx = self.snapshot_to_worker[snapshot_id]
panel = self.worker_panels[worker_idx]
# Update panel
panel.snapshot_id = snapshot_id
panel.snapshot_url = url
panel.total_hooks = total
panel.completed_hooks = completed
panel.current_plugin = current_plugin
panel.status = "working" if completed < total else "completed"
def remove_snapshot_worker(self, snapshot_id: str):
"""Mark a snapshot worker as idle after completion."""
if snapshot_id in self.snapshot_to_worker:
worker_idx = self.snapshot_to_worker[snapshot_id]
panel = self.worker_panels[worker_idx]
# Mark as idle
panel.status = "idle"
panel.snapshot_id = None
panel.snapshot_url = None
panel.total_hooks = 0
panel.completed_hooks = 0
panel.current_plugin = None
panel.recent_logs.clear()
# Remove mapping
del self.snapshot_to_worker[snapshot_id]
def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"):
"""Add a log message to a specific worker's panel."""
if snapshot_id in self.snapshot_to_worker:
worker_idx = self.snapshot_to_worker[snapshot_id]
self.worker_panels[worker_idx].add_log(message, style)
def log_event(self, message: str, style: str = "white"):
"""Add an event to the orchestrator log."""
self.orchestrator_log.add_event(message, style)
def get_layout(self) -> Layout:
"""Get the Rich Layout object for rendering."""
return self.layout