tons of fixes with codex

This commit is contained in:
Nick Sweeting
2026-01-19 01:00:53 -08:00
parent eaf7256345
commit c7b2217cd6
184 changed files with 3943 additions and 2420 deletions

View File

@@ -180,9 +180,11 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
return len(f'file://{socket_file}') <= 96
tmp_is_valid = False
allow_no_unix_sockets = os.environ.get('ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS', '').lower() in ('1', 'true', 'yes')
try:
tmp_is_valid = dir_is_writable(tmp_dir)
tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
if not allow_no_unix_sockets:
tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
return True

View File

@@ -3,30 +3,29 @@ Rich Layout-based live progress display for ArchiveBox orchestrator.
Shows a comprehensive dashboard with:
- Top: Crawl queue status (full width)
- Middle: 4-column grid of SnapshotWorker progress panels
- Middle: Running process logs (dynamic panels)
- Bottom: Orchestrator/Daphne logs
"""
__package__ = 'archivebox.misc'
from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from typing import List, Optional, Any
from collections import deque
from pathlib import Path
from rich import box
from rich.align import Align
from rich.console import Console, Group, RenderableType
from rich.console import Group
from rich.layout import Layout
from rich.columns import Columns
from rich.panel import Panel
from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn
from rich.table import Table
from rich.text import Text
from rich.table import Table
from rich.tree import Tree
from archivebox.config import VERSION
# Maximum number of SnapshotWorker columns to display
MAX_WORKER_COLUMNS = 4
class CrawlQueuePanel:
"""Display crawl queue status across full width."""
@@ -35,6 +34,8 @@ class CrawlQueuePanel:
self.orchestrator_status = "Idle"
self.crawl_queue_count = 0
self.crawl_workers_count = 0
self.binary_queue_count = 0
self.binary_workers_count = 0
self.max_crawl_workers = 8
self.crawl_id: Optional[str] = None
@@ -51,19 +52,27 @@ class CrawlQueuePanel:
left_text.append(f"v{VERSION}", style="bold yellow")
left_text.append(f"{datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53")
# Center-left: Crawl queue status
# Center-left: Crawl + Binary queue status
queue_style = "yellow" if self.crawl_queue_count > 0 else "grey53"
center_left_text = Text()
center_left_text.append("Crawls: ", style="white")
center_left_text.append(str(self.crawl_queue_count), style=f"bold {queue_style}")
center_left_text.append(" queued", style="grey53")
center_left_text.append(" • Binaries: ", style="white")
binary_queue_style = "yellow" if self.binary_queue_count > 0 else "grey53"
center_left_text.append(str(self.binary_queue_count), style=f"bold {binary_queue_style}")
center_left_text.append(" queued", style="grey53")
# Center-right: CrawlWorker status
# Center-right: Worker status
worker_style = "green" if self.crawl_workers_count > 0 else "grey53"
center_right_text = Text()
center_right_text.append("Workers: ", style="white")
center_right_text.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {worker_style}")
center_right_text.append(" active", style="grey53")
center_right_text.append(" crawl", style="grey53")
binary_worker_style = "green" if self.binary_workers_count > 0 else "grey53"
center_right_text.append("", style="grey53")
center_right_text.append(str(self.binary_workers_count), style=f"bold {binary_worker_style}")
center_right_text.append(" binary", style="grey53")
# Right: Orchestrator status
status_color = "green" if self.crawl_workers_count > 0 else "grey53"
@@ -74,151 +83,302 @@ class CrawlQueuePanel:
right_text.append(f" [{self.crawl_id[:8]}]", style="grey53")
grid.add_row(left_text, center_left_text, center_right_text, right_text)
return Panel(grid, style="white on blue", box=box.ROUNDED)
return Panel(grid, style="white on blue", box=box.HORIZONTALS)
class SnapshotWorkerPanel:
"""Display progress for a single SnapshotWorker."""
class ProcessLogPanel:
"""Display logs for a running Process."""
def __init__(self, worker_num: int):
self.worker_num = worker_num
self.snapshot_id: Optional[str] = None
self.snapshot_url: Optional[str] = None
self.total_hooks: int = 0
self.completed_hooks: int = 0
self.current_plugin: Optional[str] = None
self.status: str = "idle" # idle, working, completed
self.recent_logs: deque = deque(maxlen=5)
def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None):
    # process: Process-like object whose stdout/stderr this panel tails
    #   (also read for pid, cmd, status, pwd, etc. by the helper methods).
    # max_lines: maximum number of log lines rendered in the panel body.
    # compact: force compact rendering on/off; None means auto-detect
    #   (falls back to _is_background_hook() at render time).
    self.process = process
    self.max_lines = max_lines
    self.compact = compact
def __rich__(self) -> Panel:
if self.status == "idle":
content = Align.center(
Text("Idle", style="grey53"),
vertical="middle",
)
border_style = "grey53"
title_style = "grey53"
else:
# Build progress display
lines = []
is_pending = self._is_pending()
output_line = '' if is_pending else self._output_line()
stdout_lines = []
stderr_lines = []
try:
stdout_lines = list(self.process.tail_stdout(lines=self.max_lines, follow=False))
stderr_lines = list(self.process.tail_stderr(lines=self.max_lines, follow=False))
except Exception:
stdout_lines = []
stderr_lines = []
# URL (truncated)
if self.snapshot_url:
url_display = self.snapshot_url[:35] + "..." if len(self.snapshot_url) > 35 else self.snapshot_url
lines.append(Text(url_display, style="cyan"))
lines.append(Text()) # Spacing
header_lines = []
chrome_launch_line = self._chrome_launch_line(stderr_lines, stdout_lines)
if chrome_launch_line:
header_lines.append(Text(chrome_launch_line, style="grey53"))
if output_line:
header_lines.append(Text(output_line, style="grey53"))
log_lines = []
for line in stdout_lines:
if line:
log_lines.append(Text(line, style="white"))
for line in stderr_lines:
if line:
log_lines.append(Text(line, style="cyan"))
# Progress bar
if self.total_hooks > 0:
pct = (self.completed_hooks / self.total_hooks) * 100
bar_width = 30
filled = int((pct / 100) * bar_width)
bar = "" * filled + "" * (bar_width - filled)
compact = self.compact if self.compact is not None else self._is_background_hook()
max_body = max(1, self.max_lines - len(header_lines))
if not log_lines:
log_lines = []
# Color based on progress
if pct < 30:
bar_style = "yellow"
elif pct < 100:
bar_style = "green"
else:
bar_style = "blue"
lines = header_lines + log_lines[-max_body:]
progress_text = Text()
progress_text.append(bar, style=bar_style)
progress_text.append(f" {pct:.0f}%", style="white")
lines.append(progress_text)
lines.append(Text()) # Spacing
# Stats
stats = Table.grid(padding=(0, 1))
stats.add_column(style="grey53", no_wrap=True)
stats.add_column(style="white")
stats.add_row("Hooks:", f"{self.completed_hooks}/{self.total_hooks}")
if self.current_plugin:
stats.add_row("Current:", Text(self.current_plugin, style="yellow"))
lines.append(stats)
lines.append(Text()) # Spacing
# Recent logs
if self.recent_logs:
lines.append(Text("Recent:", style="grey53"))
for log_msg, log_style in self.recent_logs:
log_text = Text(f"{log_msg[:30]}", style=log_style)
lines.append(log_text)
content = Group(*lines)
border_style = "green" if self.status == "working" else "blue"
title_style = "green" if self.status == "working" else "blue"
content = Group(*lines) if lines else Text("")
title = self._title()
border_style = "grey53" if is_pending else "cyan"
height = 2 if is_pending else None
return Panel(
content,
title=f"[{title_style}]Worker {self.worker_num}",
title=title,
border_style=border_style,
box=box.ROUNDED,
height=20,
box=box.HORIZONTALS,
padding=(0, 1),
height=height,
)
def add_log(self, message: str, style: str = "white"):
"""Add a log message to this worker's recent logs."""
self.recent_logs.append((message, style))
def _title(self) -> str:
    """Build the Rich-markup title string for this process's panel.

    Shape: "<label> pid=<pid><worker/url/time suffixes>", where label is
    derived from the process/worker/hook type; suffixes are omitted when
    the underlying information is unavailable.
    """
    process_type = getattr(self.process, 'process_type', 'process')
    worker_type = getattr(self.process, 'worker_type', '')
    pid = getattr(self.process, 'pid', None)
    label = process_type
    if process_type == 'worker' and worker_type:
        label, worker_suffix = self._worker_label(worker_type)
    elif process_type == 'hook':
        # Hooks are labeled "<plugin>/<script>" from the path in cmd[1].
        try:
            cmd = getattr(self.process, 'cmd', [])
            hook_path = Path(cmd[1]) if len(cmd) > 1 else None
            hook_name = hook_path.name if hook_path else 'hook'
            plugin_name = hook_path.parent.name if hook_path and hook_path.parent.name else 'hook'
        except Exception:
            hook_name = 'hook'
            plugin_name = 'hook'
        label = f"{plugin_name}/{hook_name}"
        worker_suffix = ''
    else:
        worker_suffix = ''
    url = self._extract_url()
    url_suffix = f" url={self._abbrev_url(url)}" if url else ""
    time_suffix = self._elapsed_suffix()
    # Pending processes get a dimmed title; running ones are bold.
    title_style = "grey53" if self._is_pending() else "bold white"
    if pid:
        return f"[{title_style}]{label}[/{title_style}] [grey53]pid={pid}{worker_suffix}{url_suffix}{time_suffix}[/grey53]"
    # No PID yet: only emit the grey suffix group when there is something to show.
    return f"[{title_style}]{label}[/{title_style}]{f' [grey53]{worker_suffix.strip()} {url_suffix.strip()}{time_suffix}[/grey53]' if (worker_suffix or url_suffix or time_suffix) else ''}".rstrip()
def _is_background_hook(self) -> bool:
if getattr(self.process, 'process_type', '') != 'hook':
return False
try:
cmd = getattr(self.process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else ''
return '.bg.' in hook_name
except Exception:
return False
def _is_pending(self) -> bool:
status = getattr(self.process, 'status', '')
if status in ('queued', 'pending', 'backoff'):
return True
if getattr(self.process, 'process_type', '') == 'hook' and not getattr(self.process, 'pid', None):
return True
return False
def _worker_label(self, worker_type: str) -> tuple[str, str]:
cmd = getattr(self.process, 'cmd', []) or []
if worker_type == 'crawl':
crawl_id = self._extract_arg(cmd, '--crawl-id')
suffix = ''
if crawl_id:
suffix = f" id={str(crawl_id)[-8:]}"
try:
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.filter(id=crawl_id).first()
if crawl:
urls = crawl.get_urls_list()
if urls:
url_list = self._abbrev_urls(urls)
suffix += f" urls={url_list}"
except Exception:
pass
return 'crawl', suffix
if worker_type == 'snapshot':
snapshot_id = self._extract_arg(cmd, '--snapshot-id')
suffix = ''
if snapshot_id:
suffix = f" id={str(snapshot_id)[-8:]}"
try:
from archivebox.core.models import Snapshot
snap = Snapshot.objects.filter(id=snapshot_id).first()
if snap and snap.url:
suffix += f" url={self._abbrev_url(snap.url, max_len=48)}"
except Exception:
pass
return 'snapshot', suffix
return f"worker:{worker_type}", ''
@staticmethod
def _extract_arg(cmd: list[str], key: str) -> str | None:
for i, part in enumerate(cmd):
if part.startswith(f'{key}='):
return part.split('=', 1)[1]
if part == key and i + 1 < len(cmd):
return cmd[i + 1]
return None
def _abbrev_urls(self, urls: list[str], max_len: int = 48) -> str:
if not urls:
return ''
if len(urls) == 1:
return self._abbrev_url(urls[0], max_len=max_len)
first = self._abbrev_url(urls[0], max_len=max_len)
return f"{first},+{len(urls) - 1}"
def _extract_url(self) -> str:
url = getattr(self.process, 'url', None)
if url:
return str(url)
cmd = getattr(self.process, 'cmd', []) or []
for i, part in enumerate(cmd):
if part.startswith('--url='):
return part.split('=', 1)[1].strip()
if part == '--url' and i + 1 < len(cmd):
return str(cmd[i + 1]).strip()
return ''
def _abbrev_url(self, url: str, max_len: int = 48) -> str:
if not url:
return ''
if len(url) <= max_len:
return url
return f"{url[:max_len - 3]}..."
def _chrome_launch_line(self, stderr_lines: list[str], stdout_lines: list[str]) -> str:
    """For chrome_launch hooks only: summarize the launched Chrome's PID and CDP URL.

    First scrapes the hook's captured output for 'PID:' / 'CDP URL:' lines,
    then falls back to the chrome.pid / cdp_url.txt files written in the
    hook's working directory. Returns '' for any other hook or when nothing
    can be determined.
    """
    try:
        cmd = getattr(self.process, 'cmd', [])
        hook_path = Path(cmd[1]) if len(cmd) > 1 else None
        hook_name = hook_path.name if hook_path else ''
        if 'chrome_launch' not in hook_name:
            return ''
        # First pass: scrape PID / CDP URL from the hook's own output;
        # keep only the first occurrence of each marker.
        pid = ''
        ws = ''
        for line in stderr_lines + stdout_lines:
            if not ws and 'CDP URL:' in line:
                ws = line.split('CDP URL:', 1)[1].strip()
            if not pid and 'PID:' in line:
                pid = line.split('PID:', 1)[1].strip()
        if pid and ws:
            return f"Chrome pid={pid} {ws}"
        if ws:
            return f"Chrome {ws}"
        if pid:
            return f"Chrome pid={pid}"
        # Fallback: read the files the launch hook writes into its pwd
        # (pwd is resolved relative to DATA_DIR when not absolute).
        try:
            from archivebox import DATA_DIR
            base = Path(DATA_DIR)
            pwd = getattr(self.process, 'pwd', None)
            if pwd:
                chrome_dir = Path(pwd)
                if not chrome_dir.is_absolute():
                    chrome_dir = (base / chrome_dir).resolve()
                cdp_file = chrome_dir / 'cdp_url.txt'
                pid_file = chrome_dir / 'chrome.pid'
                if cdp_file.exists():
                    ws = cdp_file.read_text().strip()
                if pid_file.exists():
                    pid = pid_file.read_text().strip()
                if pid and ws:
                    return f"Chrome pid={pid} {ws}"
                if ws:
                    return f"Chrome {ws}"
                if pid:
                    return f"Chrome pid={pid}"
        except Exception:
            pass  # file fallback is best-effort only
    except Exception:
        return ''
    return ''
def _elapsed_suffix(self) -> str:
started_at = getattr(self.process, 'started_at', None)
timeout = getattr(self.process, 'timeout', None)
if not started_at or not timeout:
return ''
try:
now = datetime.now(timezone.utc) if started_at.tzinfo else datetime.now()
elapsed = int((now - started_at).total_seconds())
elapsed = max(elapsed, 0)
return f" [{elapsed}/{int(timeout)}s]"
except Exception:
return ''
def _output_line(self) -> str:
pwd = getattr(self.process, 'pwd', None)
if not pwd:
return ''
try:
from archivebox import DATA_DIR
rel = Path(pwd)
base = Path(DATA_DIR)
if rel.is_absolute():
try:
rel = rel.relative_to(base)
except Exception:
pass
rel_str = f"./{rel}" if not str(rel).startswith("./") else str(rel)
return f"{rel_str}"
except Exception:
return f"{pwd}"
class CrawlWorkerLogPanel:
"""Display CrawlWorker logs by tailing stdout/stderr from Process."""
class WorkerLogPanel:
"""Display worker logs by tailing stdout/stderr from Process."""
def __init__(self, max_lines: int = 8):
def __init__(self, title: str, empty_message: str, running_message: str, max_lines: int = 8):
self.title = title
self.empty_message = empty_message
self.running_message = running_message
self.log_lines: deque = deque(maxlen=max_lines * 2) # Allow more buffer
self.max_lines = max_lines
self.last_stdout_pos = 0 # Track file position for efficient tailing
self.last_stderr_pos = 0
self.last_process_running = False
def update_from_process(self, process: Any):
"""Update logs by tailing the Process stdout/stderr files."""
from pathlib import Path
if not process:
self.last_process_running = False
return
# Read new stdout lines since last read
# Use Process tail helpers for consistency
try:
stdout_path = Path(process.stdout)
if stdout_path.exists():
with open(stdout_path, 'r') as f:
# Seek to last read position
f.seek(self.last_stdout_pos)
new_lines = f.readlines()
# Update position
self.last_stdout_pos = f.tell()
# Add new lines (up to max_lines to avoid overflow)
for line in new_lines[-self.max_lines:]:
line = line.rstrip('\n')
if line and not line.startswith('['): # Skip Rich markup lines
self.log_lines.append(('stdout', line))
self.last_process_running = bool(getattr(process, 'is_running', False))
stdout_lines = list(process.tail_stdout(lines=self.max_lines, follow=False))
stderr_lines = list(process.tail_stderr(lines=self.max_lines, follow=False))
except Exception:
pass
return
# Read new stderr lines since last read
try:
stderr_path = Path(process.stderr)
if stderr_path.exists():
with open(stderr_path, 'r') as f:
f.seek(self.last_stderr_pos)
new_lines = f.readlines()
self.log_lines.clear()
self.last_stderr_pos = f.tell()
for line in new_lines[-self.max_lines:]:
line = line.rstrip('\n')
if line and not line.startswith('['): # Skip Rich markup lines
self.log_lines.append(('stderr', line))
except Exception:
pass
# Preserve ordering by showing stdout then stderr
for line in stdout_lines:
if line:
self.log_lines.append(('stdout', line))
for line in stderr_lines:
if line:
self.log_lines.append(('stderr', line))
def __rich__(self) -> Panel:
if not self.log_lines:
content = Text("No CrawlWorker logs yet", style="grey53", justify="center")
message = self.running_message if self.last_process_running else self.empty_message
content = Text(message, style="grey53", justify="center")
else:
# Get the last max_lines for display
display_lines = list(self.log_lines)[-self.max_lines:]
@@ -236,9 +396,9 @@ class CrawlWorkerLogPanel:
return Panel(
content,
title="[bold cyan]CrawlWorker Logs (stdout/stderr)",
title=f"[bold cyan]{self.title}",
border_style="cyan",
box=box.ROUNDED,
box=box.HORIZONTALS,
)
@@ -270,10 +430,71 @@ class OrchestratorLogPanel:
content,
title="[bold white]Orchestrator / Daphne Logs",
border_style="white",
box=box.ROUNDED,
box=box.HORIZONTALS,
)
class CrawlQueueTreePanel:
    """Display crawl queue with snapshots + hook summary in a tree view."""

    def __init__(self, max_crawls: int = 8, max_snapshots: int = 16):
        # Crawl dicts to render; expected keys per crawl: 'status', 'label',
        # 'id', 'snapshots' (each snapshot: 'status', 'label', 'hooks' with
        # 'completed'/'running'/'pending' counts).
        self.crawls: list[dict[str, Any]] = []
        # Caps on how many crawls / snapshots-per-crawl get rendered.
        self.max_crawls = max_crawls
        self.max_snapshots = max_snapshots

    def update_crawls(self, crawls: list[dict[str, Any]]) -> None:
        """Update crawl tree data."""
        self.crawls = crawls[:self.max_crawls]

    def __rich__(self) -> Panel:
        # Build one Tree per crawl, each with up to max_snapshots child
        # nodes, plus a per-snapshot hook-count summary leaf.
        if not self.crawls:
            content = Text("No active crawls", style="grey53", justify="center")
        else:
            trees = []
            for crawl in self.crawls:
                crawl_status = crawl.get('status', '')
                crawl_label = crawl.get('label', '')
                crawl_id = crawl.get('id', '')[:8]
                crawl_text = Text(f"{self._status_icon(crawl_status)} {crawl_id} {crawl_label}", style="white")
                crawl_tree = Tree(crawl_text, guide_style="grey53")
                snapshots = crawl.get('snapshots', [])[:self.max_snapshots]
                for snap in snapshots:
                    snap_status = snap.get('status', '')
                    snap_label = snap.get('label', '')
                    snap_text = Text(f"{self._status_icon(snap_status)} {snap_label}", style="white")
                    snap_node = crawl_tree.add(snap_text)
                    hooks = snap.get('hooks', {})
                    if hooks:
                        completed = hooks.get('completed', 0)
                        running = hooks.get('running', 0)
                        pending = hooks.get('pending', 0)
                        summary = f"{completed} | ▶️ {running} | ⌛️ {pending}"
                        snap_node.add(Text(summary, style="grey53"))
                trees.append(crawl_tree)
            content = Group(*trees)
        return Panel(
            content,
            title="[bold white]Crawl Queue",
            border_style="white",
            box=box.HORIZONTALS,
        )

    @staticmethod
    def _status_icon(status: str) -> str:
        # NOTE(review): every icon string below is empty in this copy — the
        # glyphs (likely emoji) appear lost in transcription; confirm the
        # intended icons against version-control history before relying on
        # this rendering.
        if status in ('queued', 'pending'):
            return ''
        if status in ('started', 'running'):
            return ''
        if status in ('sealed', 'done', 'completed'):
            return ''
        if status in ('failed', 'error'):
            return ''
        return ''
class ArchiveBoxProgressLayout:
"""
Main layout manager for ArchiveBox orchestrator progress display.
@@ -281,15 +502,8 @@ class ArchiveBoxProgressLayout:
Layout structure:
┌─────────────────────────────────────────────────────────────┐
│ Crawl Queue (full width) │
├──────────────────────────────────────────────────────────┤
Snapshot Snapshot │ Snapshot Snapshot
│ Worker 1 │ Worker 2 │ Worker 3 │ Worker 4 │
│ │ │ │ │
│ Progress + │ Progress + │ Progress + │ Progress + │
│ Stats + │ Stats + │ Stats + │ Stats + │
│ Logs │ Logs │ Logs │ Logs │
├───────────────┴───────────────┴───────────────┴─────────────┤
│ CrawlWorker Logs (stdout/stderr) │
├─────────────────────────────────────────────────────────────┤
Running Process Logs (dynamic panels)
├─────────────────────────────────────────────────────────────┤
│ Orchestrator / Daphne Logs │
└─────────────────────────────────────────────────────────────┘
@@ -303,51 +517,33 @@ class ArchiveBoxProgressLayout:
self.crawl_queue = CrawlQueuePanel()
self.crawl_queue.crawl_id = crawl_id
# Create 4 worker panels
self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)]
self.crawl_worker_log = CrawlWorkerLogPanel(max_lines=8)
self.process_panels: List[ProcessLogPanel] = []
self.orchestrator_log = OrchestratorLogPanel(max_events=8)
self.crawl_queue_tree = CrawlQueueTreePanel(max_crawls=8, max_snapshots=16)
# Create layout
self.layout = self._make_layout()
# Track snapshot ID to worker panel mapping
self.snapshot_to_worker: Dict[str, int] = {} # snapshot_id -> worker_panel_index
def _make_layout(self) -> Layout:
"""Define the layout structure."""
layout = Layout(name="root")
# Top-level split: crawl_queue, workers, logs
# Top-level split: crawl_queue, workers, bottom
layout.split(
Layout(name="crawl_queue", size=3),
Layout(name="workers", ratio=1),
Layout(name="logs", size=20),
)
# Split workers into 4 columns
layout["workers"].split_row(
Layout(name="worker1"),
Layout(name="worker2"),
Layout(name="worker3"),
Layout(name="worker4"),
)
# Split logs into crawl_worker_logs and orchestrator_logs
layout["logs"].split(
Layout(name="crawl_worker_logs", size=10),
Layout(name="orchestrator_logs", size=10),
Layout(name="processes", ratio=1),
Layout(name="bottom", size=12),
)
# Assign components to layout sections
layout["crawl_queue"].update(self.crawl_queue)
layout["worker1"].update(self.worker_panels[0])
layout["worker2"].update(self.worker_panels[1])
layout["worker3"].update(self.worker_panels[2])
layout["worker4"].update(self.worker_panels[3])
layout["crawl_worker_logs"].update(self.crawl_worker_log)
layout["processes"].update(Columns([]))
layout["bottom"].split_row(
Layout(name="orchestrator_logs", ratio=2),
Layout(name="crawl_tree", ratio=1),
)
layout["orchestrator_logs"].update(self.orchestrator_log)
layout["crawl_tree"].update(self.crawl_queue_tree)
return layout
@@ -356,82 +552,53 @@ class ArchiveBoxProgressLayout:
status: str,
crawl_queue_count: int = 0,
crawl_workers_count: int = 0,
binary_queue_count: int = 0,
binary_workers_count: int = 0,
max_crawl_workers: int = 8,
):
"""Update orchestrator status in the crawl queue panel."""
self.crawl_queue.orchestrator_status = status
self.crawl_queue.crawl_queue_count = crawl_queue_count
self.crawl_queue.crawl_workers_count = crawl_workers_count
self.crawl_queue.binary_queue_count = binary_queue_count
self.crawl_queue.binary_workers_count = binary_workers_count
self.crawl_queue.max_crawl_workers = max_crawl_workers
def update_snapshot_worker(
self,
snapshot_id: str,
url: str,
total: int,
completed: int,
current_plugin: str = "",
):
"""Update or assign a snapshot to a worker panel."""
# Find or assign worker panel for this snapshot
if snapshot_id not in self.snapshot_to_worker:
# Find first idle worker panel
worker_idx = None
for idx, panel in enumerate(self.worker_panels):
if panel.status == "idle":
worker_idx = idx
break
def update_process_panels(self, processes: List[Any], pending: Optional[List[Any]] = None) -> None:
"""Update process panels to show all running processes."""
panels = []
all_processes = list(processes) + list(pending or [])
for process in all_processes:
is_hook = getattr(process, 'process_type', '') == 'hook'
is_bg = False
if is_hook:
try:
cmd = getattr(process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else ''
is_bg = '.bg.' in hook_name
except Exception:
is_bg = False
is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
max_lines = 2 if is_pending else (4 if is_bg else 7)
panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg))
if not panels:
self.layout["processes"].size = 0
self.layout["processes"].update(Text(""))
return
# If no idle worker, use round-robin (shouldn't happen often)
if worker_idx is None:
worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS
self.layout["processes"].size = None
self.layout["processes"].ratio = 1
self.layout["processes"].update(Columns(panels, equal=True, expand=True))
self.snapshot_to_worker[snapshot_id] = worker_idx
def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None:
"""Update the crawl queue tree panel."""
self.crawl_queue_tree.update_crawls(crawls)
# Get assigned worker panel
worker_idx = self.snapshot_to_worker[snapshot_id]
panel = self.worker_panels[worker_idx]
# Update panel
panel.snapshot_id = snapshot_id
panel.snapshot_url = url
panel.total_hooks = total
panel.completed_hooks = completed
panel.current_plugin = current_plugin
panel.status = "working" if completed < total else "completed"
def remove_snapshot_worker(self, snapshot_id: str):
"""Mark a snapshot worker as idle after completion."""
if snapshot_id in self.snapshot_to_worker:
worker_idx = self.snapshot_to_worker[snapshot_id]
panel = self.worker_panels[worker_idx]
# Mark as idle
panel.status = "idle"
panel.snapshot_id = None
panel.snapshot_url = None
panel.total_hooks = 0
panel.completed_hooks = 0
panel.current_plugin = None
panel.recent_logs.clear()
# Remove mapping
del self.snapshot_to_worker[snapshot_id]
def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"):
"""Add a log message to a specific worker's panel."""
if snapshot_id in self.snapshot_to_worker:
worker_idx = self.snapshot_to_worker[snapshot_id]
self.worker_panels[worker_idx].add_log(message, style)
def log_event(self, message: str, style: str = "white"):
def log_event(self, message: str, style: str = "white") -> None:
"""Add an event to the orchestrator log."""
self.orchestrator_log.add_event(message, style)
def update_crawl_worker_logs(self, process: Any):
"""Update CrawlWorker logs by tailing the Process stdout/stderr files."""
self.crawl_worker_log.update_from_process(process)
def get_layout(self) -> Layout:
"""Get the Rich Layout object for rendering."""
return self.layout