Improve TUI: crawl-queue tree with per-hook status, process panels with completion styling, and SingleFile extension-based capture with CLI fallback

This commit is contained in:
Nick Sweeting
2026-01-19 01:53:32 -08:00
parent 1cb2d5070e
commit b5bbc3b549
9 changed files with 690 additions and 109 deletions

View File

@@ -3,19 +3,19 @@ Rich Layout-based live progress display for ArchiveBox orchestrator.
Shows a comprehensive dashboard with:
- Top: Crawl queue status (full width)
- Middle: Running process logs (dynamic panels)
- Bottom: Orchestrator/Daphne logs
- Middle: Crawl queue tree with hook outputs
- Bottom: Running process logs (dynamic panels)
"""
__package__ = 'archivebox.misc'
from datetime import datetime, timezone
import re
from typing import List, Optional, Any
from collections import deque
from pathlib import Path
from rich import box
from rich.align import Align
from rich.console import Group
from rich.layout import Layout
from rich.columns import Columns
@@ -27,6 +27,13 @@ from rich.tree import Tree
from archivebox.config import VERSION
_RICH_TAG_RE = re.compile(r'\[/?[^\]]+\]')
def _strip_rich(text: str) -> str:
return _RICH_TAG_RE.sub('', text or '').strip()
class CrawlQueuePanel:
"""Display crawl queue status across full width."""
@@ -89,12 +96,18 @@ class CrawlQueuePanel:
class ProcessLogPanel:
"""Display logs for a running Process."""
def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None):
def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None, bg_terminating: bool = False):
self.process = process
self.max_lines = max_lines
self.compact = compact
self.bg_terminating = bg_terminating
def __rich__(self) -> Panel:
completed_line = self._completed_output_line()
if completed_line:
style = "green" if self._completed_ok() else "yellow"
return Text(completed_line, style=style)
is_pending = self._is_pending()
output_line = '' if is_pending else self._output_line()
stdout_lines = []
@@ -130,7 +143,7 @@ class ProcessLogPanel:
content = Group(*lines) if lines else Text("")
title = self._title()
border_style = "grey53" if is_pending else "cyan"
border_style = self._border_style(is_pending=is_pending)
height = 2 if is_pending else None
return Panel(
content,
@@ -141,6 +154,32 @@ class ProcessLogPanel:
height=height,
)
def plain_lines(self) -> list[str]:
    """Return the panel content as plain-text lines (no Rich markup).

    Mirrors __rich__: a finished process that kept its output collapses
    to a single summary line; otherwise the output line (when not
    pending) plus the stdout/stderr tails are returned.
    """
    summary = self._completed_output_line()
    if summary:
        return [summary]
    result: list[str] = []
    if not self._is_pending():
        header = self._output_line()
        if header:
            result.append(header)
    try:
        out_tail = list(self.process.tail_stdout(lines=self.max_lines, follow=False))
        err_tail = list(self.process.tail_stderr(lines=self.max_lines, follow=False))
    except Exception:
        # Best-effort: the process dir may disappear or be unreadable mid-run.
        out_tail, err_tail = [], []
    result.extend(line for line in out_tail if line)
    result.extend(line for line in err_tail if line)
    return result
def _title(self) -> str:
process_type = getattr(self.process, 'process_type', 'process')
worker_type = getattr(self.process, 'worker_type', '')
@@ -189,6 +228,51 @@ class ProcessLogPanel:
return True
return False
def _completed_ok(self) -> bool:
    """True when the process finished cleanly.

    NOTE(review): a missing/None exit_code is treated as success here —
    confirm that is intentional for processes reaped without a code.
    """
    code = getattr(self.process, 'exit_code', None)
    return code == 0 or code is None
def _completed_output_line(self) -> str:
    """Return the one-line summary for an exited process, or '' when the
    panel should keep rendering the full log view instead.

    A summary is shown only when the process has exited, produced an
    output line, and left real output files behind.
    """
    if getattr(self.process, 'status', '') != 'exited':
        return ''
    line = self._output_line()
    if not line or not self._has_output_files():
        return ''
    return line
def _has_output_files(self) -> bool:
    """True if the process working dir contains any real output file.

    Bookkeeping files (logs, pid files, cmd.sh) are ignored; any other
    regular file anywhere under pwd counts as output.
    """
    pwd = getattr(self.process, 'pwd', None)
    if not pwd:
        return False
    bookkeeping = {'stdout.log', 'stderr.log', 'cmd.sh', 'process.pid', 'hook.pid', 'listener.pid'}
    try:
        root = Path(pwd)
        if not root.exists():
            return False
        # rglob may race with files being created/deleted; any() stops at
        # the first genuine output file.
        return any(
            entry.is_file() and entry.name not in bookkeeping
            for entry in root.rglob('*')
        )
    except Exception:
        return False
def _border_style(self, is_pending: bool) -> str:
    """Pick the panel border color from process state.

    grey = queued, green = exited-ok or running foreground hook,
    yellow = exited with error, red = background hook while only
    background hooks remain (being terminated), cyan = other running.
    """
    if is_pending:
        return "grey53"
    if getattr(self.process, 'status', '') == 'exited':
        ok = getattr(self.process, 'exit_code', None) in (0, None)
        return "green" if ok else "yellow"
    if getattr(self.process, 'process_type', '') == 'hook':
        if not self._is_background_hook():
            return "green"
        if self.bg_terminating:
            return "red"
    return "cyan"
def _worker_label(self, worker_type: str) -> tuple[str, str]:
cmd = getattr(self.process, 'cmd', []) or []
if worker_type == 'crawl':
@@ -402,38 +486,6 @@ class WorkerLogPanel:
)
class OrchestratorLogPanel:
    """Display orchestrator and system logs."""

    def __init__(self, max_events: int = 8):
        # Ring buffer of (timestamp, message, style) tuples; once maxlen
        # is reached the oldest events are dropped automatically.
        self.events: deque = deque(maxlen=max_events)
        self.max_events = max_events

    def add_event(self, message: str, style: str = "white"):
        """Add an event to the log."""
        stamp = datetime.now(timezone.utc).strftime("%H:%M:%S")
        self.events.append((stamp, message, style))

    def __rich__(self) -> Panel:
        if self.events:
            rows = []
            for stamp, message, style in self.events:
                row = Text()
                row.append(f"[{stamp}] ", style="grey53")
                row.append(message, style=style)
                rows.append(row)
            body = Group(*rows)
        else:
            body = Text("No recent events", style="grey53", justify="center")
        return Panel(
            body,
            title="[bold white]Orchestrator / Daphne Logs",
            border_style="white",
            box=box.HORIZONTALS,
        )
class CrawlQueueTreePanel:
"""Display crawl queue with snapshots + hook summary in a tree view."""
@@ -465,13 +517,23 @@ class CrawlQueueTreePanel:
snap_text = Text(f"{self._status_icon(snap_status)} {snap_label}", style="white")
snap_node = crawl_tree.add(snap_text)
hooks = snap.get('hooks', {})
if hooks:
completed = hooks.get('completed', 0)
running = hooks.get('running', 0)
pending = hooks.get('pending', 0)
summary = f"{completed} | ▶️ {running} | ⌛️ {pending}"
snap_node.add(Text(summary, style="grey53"))
output_path = snap.get('output_path', '')
if output_path:
snap_node.add(Text(output_path, style="grey53"))
hooks = snap.get('hooks', []) or []
for hook in hooks:
status = hook.get('status', '')
path = hook.get('path', '')
size = hook.get('size', '')
elapsed = hook.get('elapsed', '')
timeout = hook.get('timeout', '')
is_bg = hook.get('is_bg', False)
is_running = hook.get('is_running', False)
is_pending = hook.get('is_pending', False)
icon, color = self._hook_style(status, is_bg=is_bg, is_running=is_running, is_pending=is_pending)
stats = self._hook_stats(size=size, elapsed=elapsed, timeout=timeout, status=status)
snap_node.add(Text(f"{icon} {path}{stats}", style=color))
trees.append(crawl_tree)
content = Group(*trees)
@@ -494,6 +556,45 @@ class CrawlQueueTreePanel:
return ''
return ''
@staticmethod
def _hook_style(status: str, is_bg: bool = False, is_running: bool = False, is_pending: bool = False) -> tuple[str, str]:
if status == 'succeeded':
return '', 'green'
if status == 'failed':
return '⚠️', 'yellow'
if status == 'skipped':
return '', 'grey53'
if is_pending:
return '⌛️', 'grey53'
if is_running and is_bg:
return '', 'cyan'
if is_running:
return '▶️', 'cyan'
if status == 'started':
return '▶️', 'cyan'
return '', 'grey53'
@staticmethod
def _hook_stats(size: str = '', elapsed: str = '', timeout: str = '', status: str = '') -> str:
if status in ('succeeded', 'failed', 'skipped'):
parts = []
if size:
parts.append(size)
if elapsed:
parts.append(elapsed)
if not parts:
return ''
return f" ({' | '.join(parts)})"
if elapsed or timeout:
size_part = '...' if elapsed or timeout else ''
time_part = ''
if elapsed and timeout:
time_part = f"{elapsed}/{timeout}"
elif elapsed:
time_part = f"{elapsed}"
return f" ({size_part} | {time_part})" if time_part else f" ({size_part})"
return ''
class ArchiveBoxProgressLayout:
"""
@@ -503,9 +604,9 @@ class ArchiveBoxProgressLayout:
┌─────────────────────────────────────────────────────────────┐
│ Crawl Queue (full width) │
├─────────────────────────────────────────────────────────────┤
Running Process Logs (dynamic panels)
Crawl Queue Tree (hooks + outputs)
├─────────────────────────────────────────────────────────────┤
Orchestrator / Daphne Logs
Running Process Logs (dynamic panels)
└─────────────────────────────────────────────────────────────┘
"""
@@ -518,7 +619,6 @@ class ArchiveBoxProgressLayout:
self.crawl_queue.crawl_id = crawl_id
self.process_panels: List[ProcessLogPanel] = []
self.orchestrator_log = OrchestratorLogPanel(max_events=8)
self.crawl_queue_tree = CrawlQueueTreePanel(max_crawls=8, max_snapshots=16)
# Create layout
@@ -528,22 +628,17 @@ class ArchiveBoxProgressLayout:
"""Define the layout structure."""
layout = Layout(name="root")
# Top-level split: crawl_queue, workers, bottom
# Top-level split: crawl_queue, crawl_tree, processes
layout.split(
Layout(name="crawl_queue", size=3),
Layout(name="crawl_tree", size=14),
Layout(name="processes", ratio=1),
Layout(name="bottom", size=12),
)
# Assign components to layout sections
layout["crawl_queue"].update(self.crawl_queue)
layout["processes"].update(Columns([]))
layout["bottom"].split_row(
Layout(name="orchestrator_logs", ratio=2),
Layout(name="crawl_tree", ratio=1),
)
layout["orchestrator_logs"].update(self.orchestrator_log)
layout["crawl_tree"].update(self.crawl_queue_tree)
layout["processes"].update(Columns([]))
return layout
@@ -568,6 +663,33 @@ class ArchiveBoxProgressLayout:
"""Update process panels to show all running processes."""
panels = []
all_processes = list(processes) + list(pending or [])
fg_running = False
for process in processes:
if getattr(process, 'process_type', '') != 'hook':
continue
try:
cmd = getattr(process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else ''
if '.bg.' not in hook_name:
fg_running = True
break
except Exception:
continue
fg_pending = False
for process in (pending or []):
if getattr(process, 'process_type', '') != 'hook':
continue
try:
cmd = getattr(process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else ''
if '.bg.' not in hook_name:
fg_pending = True
break
except Exception:
continue
bg_terminating = bool(processes) and not fg_running and not fg_pending
for process in all_processes:
is_hook = getattr(process, 'process_type', '') == 'hook'
is_bg = False
@@ -581,12 +703,14 @@ class ArchiveBoxProgressLayout:
is_bg = False
is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
max_lines = 2 if is_pending else (4 if is_bg else 7)
panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg))
panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg, bg_terminating=bg_terminating))
if not panels:
self.layout["processes"].size = 0
self.layout["processes"].update(Text(""))
self.process_panels = []
return
self.process_panels = panels
self.layout["processes"].size = None
self.layout["processes"].ratio = 1
self.layout["processes"].update(Columns(panels, equal=True, expand=True))
@@ -597,8 +721,54 @@ class ArchiveBoxProgressLayout:
def log_event(self, message: str, style: str = "white") -> None:
"""Add an event to the orchestrator log."""
self.orchestrator_log.add_event(message, style)
return
def get_layout(self) -> Layout:
"""Get the Rich Layout object for rendering."""
return self.layout
def plain_lines(self) -> list[tuple[str, str]]:
lines: list[tuple[str, str]] = []
queue = self.crawl_queue
queue_line = (
f"Status: {queue.orchestrator_status} | Crawls: {queue.crawl_queue_count} queued | "
f"Binaries: {queue.binary_queue_count} queued | Workers: {queue.crawl_workers_count}/{queue.max_crawl_workers} "
f"crawl, {queue.binary_workers_count} binary"
)
lines.append(("crawl_queue", queue_line))
for panel in self.process_panels:
title = _strip_rich(panel._title())
for line in panel.plain_lines():
if line:
lines.append((title or "process", line))
for crawl in self.crawl_queue_tree.crawls:
crawl_line = f"{self.crawl_queue_tree._status_icon(crawl.get('status', ''))} {crawl.get('id', '')[:8]} {crawl.get('label', '')}".strip()
lines.append(("crawl_tree", crawl_line))
for snap in crawl.get('snapshots', []):
snap_line = f" {self.crawl_queue_tree._status_icon(snap.get('status', ''))} {snap.get('label', '')}".rstrip()
lines.append(("crawl_tree", snap_line))
output_path = snap.get('output_path', '')
if output_path:
lines.append(("crawl_tree", f" {output_path}"))
for hook in snap.get('hooks', []) or []:
status = hook.get('status', '')
path = hook.get('path', '')
icon, _ = self.crawl_queue_tree._hook_style(
status,
is_bg=hook.get('is_bg', False),
is_running=hook.get('is_running', False),
is_pending=hook.get('is_pending', False),
)
stats = self.crawl_queue_tree._hook_stats(
size=hook.get('size', ''),
elapsed=hook.get('elapsed', ''),
timeout=hook.get('timeout', ''),
status=status,
)
hook_line = f" {icon} {path}{stats}".strip()
if hook_line:
lines.append(("crawl_tree", hook_line))
return lines

View File

@@ -32,6 +32,7 @@ class Persona(ModelWithConfig):
Each persona provides:
- CHROME_USER_DATA_DIR: Chrome profile directory
- CHROME_EXTENSIONS_DIR: Installed extensions directory
- CHROME_DOWNLOADS_DIR: Chrome downloads directory
- COOKIES_FILE: Cookies file for wget/curl
- config: JSON field with persona-specific config overrides
@@ -72,6 +73,11 @@ class Persona(ModelWithConfig):
"""Derived path to Chrome extensions directory for this persona."""
return str(self.path / 'chrome_extensions')
@property
def CHROME_DOWNLOADS_DIR(self) -> str:
    """Derived path to Chrome downloads directory for this persona."""
    # Mirrors CHROME_USER_DATA_DIR / CHROME_EXTENSIONS_DIR: a fixed
    # subdirectory under the persona's root path, returned as str.
    # The directory itself is created elsewhere (see the mkdir calls
    # alongside chrome_user_data / chrome_extensions).
    return str(self.path / 'chrome_downloads')
@property
def COOKIES_FILE(self) -> str:
"""Derived path to cookies.txt file for this persona (if exists)."""
@@ -86,6 +92,7 @@ class Persona(ModelWithConfig):
- All values from self.config JSONField
- CHROME_USER_DATA_DIR (derived from persona path)
- CHROME_EXTENSIONS_DIR (derived from persona path)
- CHROME_DOWNLOADS_DIR (derived from persona path)
- COOKIES_FILE (derived from persona path, if file exists)
- ACTIVE_PERSONA (set to this persona's name)
"""
@@ -96,6 +103,8 @@ class Persona(ModelWithConfig):
derived['CHROME_USER_DATA_DIR'] = self.CHROME_USER_DATA_DIR
if 'CHROME_EXTENSIONS_DIR' not in derived:
derived['CHROME_EXTENSIONS_DIR'] = self.CHROME_EXTENSIONS_DIR
if 'CHROME_DOWNLOADS_DIR' not in derived:
derived['CHROME_DOWNLOADS_DIR'] = self.CHROME_DOWNLOADS_DIR
if 'COOKIES_FILE' not in derived and self.COOKIES_FILE:
derived['COOKIES_FILE'] = self.COOKIES_FILE
@@ -109,6 +118,7 @@ class Persona(ModelWithConfig):
self.path.mkdir(parents=True, exist_ok=True)
(self.path / 'chrome_user_data').mkdir(parents=True, exist_ok=True)
(self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True)
(self.path / 'chrome_downloads').mkdir(parents=True, exist_ok=True)
def cleanup_chrome(self) -> bool:
"""

View File

@@ -384,6 +384,8 @@ async function launchChromium(options = {}) {
return { success: false, error: 'Chrome binary not found' };
}
const downloadsDir = getEnv('CHROME_DOWNLOADS_DIR');
// Kill zombies first
if (killZombies) {
killZombieChrome();
@@ -412,6 +414,28 @@ async function launchChromium(options = {}) {
console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
}
}
if (downloadsDir) {
try {
const defaultProfileDir = path.join(userDataDir, 'Default');
const prefsPath = path.join(defaultProfileDir, 'Preferences');
fs.mkdirSync(defaultProfileDir, { recursive: true });
let prefs = {};
if (fs.existsSync(prefsPath)) {
try {
prefs = JSON.parse(fs.readFileSync(prefsPath, 'utf-8'));
} catch (e) {
prefs = {};
}
}
prefs.download = prefs.download || {};
prefs.download.default_directory = downloadsDir;
prefs.download.prompt_for_download = false;
fs.writeFileSync(prefsPath, JSON.stringify(prefs));
console.error(`[*] Set Chrome download directory: ${downloadsDir}`);
} catch (e) {
console.error(`[!] Failed to set Chrome download directory: ${e.message}`);
}
}
}
// Find a free port
@@ -455,6 +479,11 @@ async function launchChromium(options = {}) {
// Dynamic args come after base so they can override if needed
const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs];
// Ensure keychain prompts are disabled on macOS
if (!chromiumArgs.includes('--use-mock-keychain')) {
chromiumArgs.push('--use-mock-keychain');
}
// Add extension loading flags
if (extensionPaths.length > 0) {
const extPathsArg = extensionPaths.join(',');

View File

@@ -84,6 +84,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
}
const url = await page.url();
console.error(`[singlefile] Triggering extension for: ${url}`);
// Check for unsupported URL schemes
const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
@@ -93,24 +94,28 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
return null;
}
const downloadsDir = options.downloadsDir || CHROME_DOWNLOADS_DIR;
console.error(`[singlefile] Watching downloads dir: ${downloadsDir}`);
// Ensure downloads directory exists
await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
await fs.promises.mkdir(downloadsDir, { recursive: true });
// Get list of existing files to ignore
const files_before = new Set(
(await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
.filter(fn => fn.endsWith('.html'))
(await fs.promises.readdir(downloadsDir))
.filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'))
);
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
console.error(`[singlefile] Saving via extension (${extension.id})...`);
// Bring page to front (extension action button acts on foreground tab)
await page.bringToFront();
// Trigger the extension's action (toolbar button click)
console.error('[singlefile] Dispatching extension action...');
await extension.dispatchAction();
// Wait for file to appear in downloads directory
@@ -118,34 +123,90 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
const max_tries = 10;
let files_new = [];
console.error(`[singlefile] Waiting up to ${(check_delay * max_tries) / 1000}s for download...`);
for (let attempt = 0; attempt < max_tries; attempt++) {
await wait(check_delay);
const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
.filter(fn => fn.endsWith('.html'));
const files_after = (await fs.promises.readdir(downloadsDir))
.filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'));
files_new = files_after.filter(file => !files_before.has(file));
if (files_new.length === 0) {
console.error(`[singlefile] No new downloads yet (${attempt + 1}/${max_tries})`);
continue;
}
// Find the matching file by checking if it contains the URL in the HTML header
for (const file of files_new) {
const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
const dl_header = dl_text.split('meta charset')[0];
console.error(`[singlefile] New download(s) detected: ${files_new.join(', ')}`);
if (dl_header.includes(`url: ${url}`)) {
console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
await fs.promises.rename(dl_path, out_path);
// Prefer files that match the URL or have SingleFile markers
const url_variants = new Set([url]);
if (url.endsWith('/')) {
url_variants.add(url.slice(0, -1));
} else {
url_variants.add(`${url}/`);
}
const scored = [];
for (const file of files_new) {
const dl_path = path.join(downloadsDir, file);
let header = '';
try {
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
header = dl_text.slice(0, 200000);
const stat = await fs.promises.stat(dl_path);
console.error(`[singlefile] Download ${file} size=${stat.size} bytes`);
} catch (err) {
// Skip unreadable files
continue;
}
const header_lower = header.toLowerCase();
const has_url = Array.from(url_variants).some(v => header.includes(v));
const has_singlefile_marker = header_lower.includes('singlefile') || header_lower.includes('single-file');
const score = (has_url ? 2 : 0) + (has_singlefile_marker ? 1 : 0);
scored.push({ file, dl_path, score });
}
scored.sort((a, b) => b.score - a.score);
if (scored.length > 0) {
const best = scored[0];
if (best.score > 0 || files_new.length === 1) {
console.error(`[singlefile] Moving download from ${best.file} -> ${out_path}`);
await fs.promises.rename(best.dl_path, out_path);
const out_stat = await fs.promises.stat(out_path);
console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
return out_path;
}
}
if (files_new.length > 0) {
// Fallback: move the newest file if no clear match found
let newest = null;
let newest_mtime = -1;
for (const file of files_new) {
const dl_path = path.join(downloadsDir, file);
try {
const stat = await fs.promises.stat(dl_path);
if (stat.mtimeMs > newest_mtime) {
newest_mtime = stat.mtimeMs;
newest = { file, dl_path };
}
} catch (err) {}
}
if (newest) {
console.error(`[singlefile] Moving newest download from ${newest.file} -> ${out_path}`);
await fs.promises.rename(newest.dl_path, out_path);
const out_stat = await fs.promises.stat(out_path);
console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
return out_path;
}
}
}
console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
console.error(`[singlefile] Failed to find SingleFile HTML in ${downloadsDir} after ${(check_delay * max_tries) / 1000}s`);
console.error(`[singlefile] New files seen: ${files_new.join(', ')}`);
return null;
}

View File

@@ -37,6 +37,7 @@ BIN_NAME = 'single-file'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'singlefile.html'
EXTENSION_SAVE_SCRIPT = Path(__file__).parent / 'singlefile_extension_save.js'
def get_env(name: str, default: str = '') -> str:
@@ -255,6 +256,42 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
return False, None, f'{type(e).__name__}: {e}'
def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]:
    """Save using the SingleFile Chrome extension via existing Chrome session.

    Returns (success, output_path_or_None, error_message). Only attempts
    the capture when a Chrome CDP session is reachable and the Node
    helper script exists.
    """
    # Only attempt if chrome session exists
    cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
    if not cdp_url:
        return False, None, 'No Chrome session available'
    if not EXTENSION_SAVE_SCRIPT.exists():
        return False, None, 'SingleFile extension helper script missing'

    node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node')
    try:
        result = subprocess.run(
            [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}'],
            capture_output=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'

    stdout = result.stdout.decode('utf-8', errors='replace').strip()
    stderr = result.stderr.decode('utf-8', errors='replace').strip()

    if result.returncode == 0:
        # Prefer explicit stdout path, fallback to local output file
        if stdout and Path(stdout).exists():
            return True, stdout, ''
        output_path = Path(OUTPUT_DIR) / OUTPUT_FILE
        if output_path.exists() and output_path.stat().st_size > 0:
            return True, str(output_path), ''
        return False, None, 'SingleFile extension completed but no output file found'

    return False, None, (stderr or stdout) or 'SingleFile extension failed'
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
@@ -278,11 +315,14 @@ def main(url: str, snapshot_id: str):
print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
sys.exit(0)
# Get binary from environment
binary = get_env('SINGLEFILE_BINARY', 'single-file')
# Prefer SingleFile extension via existing Chrome session
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
success, output, error = save_singlefile_with_extension(url, timeout)
# Run extraction
success, output, error = save_singlefile(url, binary)
# Fallback to single-file-cli if extension path failed
if not success:
binary = get_env('SINGLEFILE_BINARY', 'single-file')
success, output, error = save_singlefile(url, binary)
status = 'succeeded' if success else 'failed'
except Exception as e:

View File

@@ -30,6 +30,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
PLUGIN_DIR = get_plugin_dir(__file__)
SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__82_singlefile_install.js'
TEST_URL = "https://example.com"
@@ -142,6 +143,95 @@ def test_singlefile_with_chrome_session():
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
def test_singlefile_with_extension_uses_existing_chrome():
    """Test SingleFile uses the Chrome extension via existing session (CLI fallback disabled)."""
    # Integration test: builds an isolated persona dir layout, installs the
    # extension, launches a real Chrome session, then runs the snapshot hook
    # with a bogus SINGLEFILE_BINARY so only the extension path can succeed.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        data_dir = tmpdir / 'data'
        extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
        downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads'
        user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data'
        extensions_dir.mkdir(parents=True, exist_ok=True)
        downloads_dir.mkdir(parents=True, exist_ok=True)
        user_data_dir.mkdir(parents=True, exist_ok=True)
        env_install = os.environ.copy()
        env_install.update({
            'DATA_DIR': str(data_dir),
            'CHROME_EXTENSIONS_DIR': str(extensions_dir),
            'CHROME_DOWNLOADS_DIR': str(downloads_dir),
        })
        # Install SingleFile extension cache before launching Chrome
        result = subprocess.run(
            ['node', str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env_install,
            timeout=120
        )
        assert result.returncode == 0, f"Extension install failed: {result.stderr}"
        # Launch Chrome session with extensions loaded
        # (os.environ is mutated because chrome_session presumably reads these
        # vars at launch time — restored in the finally block below)
        old_env = os.environ.copy()
        os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir)
        os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
        os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
        try:
            with chrome_session(
                tmpdir=tmpdir,
                crawl_id='singlefile-ext-crawl',
                snapshot_id='singlefile-ext-snap',
                test_url=TEST_URL,
                navigate=True,
                timeout=30,
            ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
                singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
                singlefile_output_dir.mkdir(parents=True, exist_ok=True)
                # Ensure ../chrome points to snapshot chrome session (contains target_id.txt)
                chrome_dir = singlefile_output_dir.parent / 'chrome'
                if not chrome_dir.exists():
                    chrome_dir.symlink_to(snapshot_chrome_dir)
                env['SINGLEFILE_ENABLED'] = 'true'
                env['SINGLEFILE_BINARY'] = '/nonexistent/single-file'  # force extension path
                env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
                env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
                env['CHROME_HEADLESS'] = 'false'
                # Track downloads dir state before run to ensure file is created then moved out
                downloads_before = set(downloads_dir.glob('*.html'))
                downloads_mtime_before = downloads_dir.stat().st_mtime_ns
                result = subprocess.run(
                    [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-ext-snap'],
                    cwd=str(singlefile_output_dir),
                    capture_output=True,
                    text=True,
                    env=env,
                    timeout=120
                )
                assert result.returncode == 0, f"SingleFile extension run failed: {result.stderr}"
                output_file = singlefile_output_dir / 'singlefile.html'
                assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
                html_content = output_file.read_text(errors='ignore')
                assert 'Example Domain' in html_content, "Output should contain example.com content"
                # Verify download moved out of downloads dir
                downloads_after = set(downloads_dir.glob('*.html'))
                new_downloads = downloads_after - downloads_before
                downloads_mtime_after = downloads_dir.stat().st_mtime_ns
                assert downloads_mtime_after != downloads_mtime_before, "Downloads dir should be modified during extension save"
                assert not new_downloads, f"SingleFile download should be moved out of downloads dir, found: {new_downloads}"
        finally:
            # Restore whatever environment the test started with.
            os.environ.clear()
            os.environ.update(old_env)
def test_singlefile_disabled_skips():
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -30,6 +30,7 @@ import time
from typing import Type
from datetime import timedelta
from multiprocessing import Process as MPProcess
from pathlib import Path
from django.utils import timezone
@@ -457,12 +458,14 @@ class Orchestrator:
# Enable progress layout only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
plain_output = not IS_TTY
self.on_startup()
if not show_progress:
# No progress layout - just run normally
self._run_orchestrator_loop(None)
# No progress layout - optionally emit plain lines for non-TTY output
progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id) if plain_output else None
self._run_orchestrator_loop(progress_layout, plain_output=plain_output)
else:
# Redirect worker subprocess output to /dev/null
devnull_fd = os.open(os.devnull, os.O_WRONLY)
@@ -497,7 +500,7 @@ class Orchestrator:
screen=True,
console=orchestrator_console,
):
self._run_orchestrator_loop(progress_layout)
self._run_orchestrator_loop(progress_layout, plain_output=False)
# Restore original console
logging_module.CONSOLE = original_console
@@ -515,11 +518,12 @@ class Orchestrator:
pass
# stdout_for_console is closed by orchestrator_console
def _run_orchestrator_loop(self, progress_layout):
def _run_orchestrator_loop(self, progress_layout, plain_output: bool = False):
"""Run the main orchestrator loop with optional progress display."""
last_queue_sizes = {}
last_snapshot_count = None
tick_count = 0
last_plain_lines: set[tuple[str, str]] = set()
# Track snapshot progress to detect changes
snapshot_progress = {} # snapshot_id -> (total, completed, current_plugin)
@@ -591,6 +595,22 @@ class Orchestrator:
def _abbrev(text: str, max_len: int = 80) -> str:
return text if len(text) <= max_len else f"{text[:max_len - 3]}..."
def _format_size(num_bytes: int | None) -> str:
if not num_bytes:
return ''
size = float(num_bytes)
for unit in ('b', 'kb', 'mb', 'gb', 'tb'):
if size < 1024 or unit == 'tb':
return f"{size:.1f}{unit}"
size /= 1024
return ''
def _format_seconds(total_seconds: float | None) -> str:
if total_seconds is None:
return ''
seconds = max(0.0, float(total_seconds))
return f"{seconds:.1f}s"
tree_data: list[dict] = []
for crawl in crawls:
urls = crawl.get_urls_list()
@@ -614,28 +634,174 @@ class Orchestrator:
active_snaps.append(s)
for snap in active_snaps:
total = snap.archiveresult_set.count()
completed = snap.archiveresult_set.filter(status__in=[
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.SKIPPED,
ArchiveResult.StatusChoices.FAILED,
]).count()
running = snap.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED).count()
try:
from archivebox.config.configset import get_config
from archivebox.hooks import discover_hooks
hooks_list = discover_hooks('Snapshot', config=get_config(snapshot=snap))
total_hooks = len(hooks_list)
snap_config = get_config(snapshot=snap)
hooks_list = discover_hooks('Snapshot', config=snap_config)
hooks_by_snapshot[str(snap.id)] = hooks_list
from archivebox.hooks import get_plugin_special_config
hook_timeouts = {}
for hook_path in hooks_list:
plugin_name = hook_path.parent.name
try:
hook_timeouts[hook_path.name] = int(get_plugin_special_config(plugin_name, snap_config)['timeout'])
except Exception:
pass
except Exception:
total_hooks = total
pending = max(total_hooks - completed - running, 0)
snap_label = _abbrev(snap.url or str(snap.id), max_len=60)
hooks_list = []
hook_timeouts = {}
try:
from archivebox import DATA_DIR
data_dir = Path(DATA_DIR)
snap_path = snap.output_dir
try:
rel = Path(snap_path)
if rel.is_absolute():
rel = rel.relative_to(data_dir)
snap_path = f"./{rel}" if not str(rel).startswith("./") else str(rel)
except Exception:
snap_path = str(snap_path)
ars = list(
snap.archiveresult_set.select_related('process').order_by('start_ts')
)
ar_by_hook = {ar.hook_name: ar for ar in ars if ar.hook_name}
except Exception:
snap_path = ''
ar_by_hook = {}
plugin_hooks: dict[str, list[dict]] = {}
now = timezone.now()
for hook_path in hooks_list:
hook_name = hook_path.name
is_bg = '.bg.' in hook_name
ar = ar_by_hook.get(hook_name)
status = 'pending'
is_running = False
is_pending = True
elapsed = ''
timeout = ''
size = ''
if ar:
if ar.status == ArchiveResult.StatusChoices.STARTED:
status = 'started'
is_running = True
is_pending = False
start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
if start_ts:
elapsed = _format_seconds((now - start_ts).total_seconds())
hook_timeout = None
if ar.process_id and ar.process and ar.process.timeout:
hook_timeout = ar.process.timeout
hook_timeout = hook_timeout or hook_timeouts.get(hook_name)
if hook_timeout:
timeout = _format_seconds(hook_timeout)
else:
status = ar.status
is_pending = False
start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
end_ts = ar.end_ts or (ar.process.ended_at if ar.process_id and ar.process else None)
if start_ts and end_ts:
elapsed = _format_seconds((end_ts - start_ts).total_seconds())
size = _format_size(getattr(ar, 'output_size', None))
else:
hook_timeout = hook_timeouts.get(hook_name)
if hook_timeout:
timeout = _format_seconds(hook_timeout)
elapsed = _format_seconds(0)
plugin_name = hook_path.parent.name
if plugin_name in ('plugins', '.'):
plugin_name = hook_name.split('__')[-1].split('.')[0]
plugin_hooks.setdefault(plugin_name, []).append({
'status': status,
'size': size,
'elapsed': elapsed,
'timeout': timeout,
'is_bg': is_bg,
'is_running': is_running,
'is_pending': is_pending,
'hook_name': hook_name,
})
hooks = []
for plugin_name, hook_entries in plugin_hooks.items():
running = next((h for h in hook_entries if h['is_running']), None)
pending = next((h for h in hook_entries if h['is_pending']), None)
any_failed = any(h['status'] == ArchiveResult.StatusChoices.FAILED for h in hook_entries)
any_succeeded = any(h['status'] == ArchiveResult.StatusChoices.SUCCEEDED for h in hook_entries)
any_skipped = any(h['status'] == ArchiveResult.StatusChoices.SKIPPED for h in hook_entries)
if running:
status = 'started'
is_running = True
is_pending = False
is_bg = running['is_bg']
elapsed = running.get('elapsed', '')
timeout = running.get('timeout', '')
size = ''
elif pending:
status = 'pending'
is_running = False
is_pending = True
is_bg = pending['is_bg']
elapsed = pending.get('elapsed', '') or _format_seconds(0)
timeout = pending.get('timeout', '')
size = ''
else:
is_running = False
is_pending = False
is_bg = any(h['is_bg'] for h in hook_entries)
if any_failed:
status = 'failed'
elif any_succeeded:
status = 'succeeded'
elif any_skipped:
status = 'skipped'
else:
status = 'skipped'
total_elapsed = 0.0
has_elapsed = False
for h in hook_entries:
if h.get('elapsed'):
try:
total_elapsed += float(h['elapsed'].rstrip('s'))
has_elapsed = True
except Exception:
pass
elapsed = _format_seconds(total_elapsed) if has_elapsed else ''
max_output = 0
# Use the largest output_size we already computed on ArchiveResult
ar_sizes = [
ar_by_hook[h['hook_name']].output_size
for h in hook_entries
if h.get('hook_name') in ar_by_hook and getattr(ar_by_hook[h['hook_name']], 'output_size', 0)
]
if ar_sizes:
max_output = max(ar_sizes)
size = _format_size(max_output) if max_output else ''
timeout = ''
hooks.append({
'status': status,
'path': f"./{plugin_name}",
'size': size,
'elapsed': elapsed,
'timeout': timeout,
'is_bg': is_bg,
'is_running': is_running,
'is_pending': is_pending,
})
snap_label = _abbrev(f"{str(snap.id)[-8:]} {snap.url or ''}".strip(), max_len=80)
snapshots.append({
'id': str(snap.id),
'status': snap.status,
'label': snap_label,
'hooks': {'completed': completed, 'running': running, 'pending': pending} if total else {},
'output_path': snap_path,
'hooks': hooks,
})
pending_snapshot_candidates.append(snap)
@@ -837,6 +1003,16 @@ class Orchestrator:
if snapshot_id in snapshot_progress:
del snapshot_progress[snapshot_id]
if plain_output:
plain_lines = progress_layout.plain_lines()
new_lines = [line for line in plain_lines if line not in last_plain_lines]
if new_lines:
ts = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
for panel, line in new_lines:
if line:
print(f"[{ts}] [{panel}] {line}")
last_plain_lines = set(plain_lines)
# Track idle state
has_pending = self.has_pending_work(queue_sizes)
has_running = self.has_running_workers()

View File

@@ -254,8 +254,7 @@ def start_new_supervisord_process(daemonize=False):
shell=True,
start_new_session=True,
)
time.sleep(2)
return get_existing_supervisord_process()
return wait_for_supervisord_ready()
else:
# Start supervisord in FOREGROUND - this will block until supervisord exits
# supervisord with nodaemon=true will run in foreground and handle signals properly
@@ -273,10 +272,19 @@ def start_new_supervisord_process(daemonize=False):
global _supervisord_proc
_supervisord_proc = proc
# Wait a bit for supervisord to start up
time.sleep(2)
return wait_for_supervisord_ready()
return get_existing_supervisord_process()
def wait_for_supervisord_ready(max_wait_sec: float = 5.0, interval_sec: float = 0.1):
    """Poll for supervisord readiness without a fixed startup sleep.

    Re-checks every *interval_sec* seconds until supervisord answers or
    *max_wait_sec* elapses.  Returns the supervisor handle, or None if it
    never came up within the deadline.
    """
    started = time.monotonic()
    supervisor = None
    while time.monotonic() - started < max_wait_sec:
        supervisor = get_existing_supervisord_process()
        if supervisor is not None:
            break
        time.sleep(interval_sec)
    return supervisor
def get_or_create_supervisord_process(daemonize=False):
@@ -287,17 +295,16 @@ def get_or_create_supervisord_process(daemonize=False):
if supervisor is None:
stop_existing_supervisord_process()
supervisor = start_new_supervisord_process(daemonize=daemonize)
time.sleep(0.5)
# wait up to 5s in case supervisord is slow to start
if not supervisor:
for _ in range(10):
for _ in range(50):
if supervisor is not None:
print()
break
sys.stdout.write('.')
sys.stdout.flush()
time.sleep(0.5)
time.sleep(0.1)
supervisor = get_existing_supervisord_process()
else:
print()
@@ -328,9 +335,7 @@ def start_worker(supervisor, daemon, lazy=False):
for added in added:
supervisor.addProcessGroup(added)
time.sleep(1)
for _ in range(10):
for _ in range(25):
procs = supervisor.getAllProcessInfo()
for proc in procs:
if proc['name'] == daemon["name"]:
@@ -345,8 +350,8 @@ def start_worker(supervisor, daemon, lazy=False):
print(f" - Worker {daemon['name']}: started {proc['statename']} ({proc['description']})")
return proc
# retry in a second in case it's slow to launch
time.sleep(0.5)
# retry in a moment in case it's slow to launch
time.sleep(0.2)
raise Exception(f"Failed to start worker {daemon['name']}! Only found: {procs}")

4
uv.lock generated
View File

@@ -1,5 +1,5 @@
version = 1
revision = 3
revision = 2
requires-python = ">=3.13"
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'darwin'",
@@ -60,7 +60,7 @@ wheels = [
[[package]]
name = "archivebox"
version = "0.9.2"
version = "0.9.3"
source = { editable = "." }
dependencies = [
{ name = "abx-pkg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },