mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
better tui
This commit is contained in:
@@ -3,19 +3,19 @@ Rich Layout-based live progress display for ArchiveBox orchestrator.
|
||||
|
||||
Shows a comprehensive dashboard with:
|
||||
- Top: Crawl queue status (full width)
|
||||
- Middle: Running process logs (dynamic panels)
|
||||
- Bottom: Orchestrator/Daphne logs
|
||||
- Middle: Crawl queue tree with hook outputs
|
||||
- Bottom: Running process logs (dynamic panels)
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
|
||||
from datetime import datetime, timezone
|
||||
import re
|
||||
from typing import List, Optional, Any
|
||||
from collections import deque
|
||||
from pathlib import Path
|
||||
|
||||
from rich import box
|
||||
from rich.align import Align
|
||||
from rich.console import Group
|
||||
from rich.layout import Layout
|
||||
from rich.columns import Columns
|
||||
@@ -27,6 +27,13 @@ from rich.tree import Tree
|
||||
from archivebox.config import VERSION
|
||||
|
||||
|
||||
# Matches Rich console-markup tags only. Following Rich's own tag grammar, a
# tag body starts with a lowercase letter, '#', '/' or '@' (e.g. "[bold]",
# "[/bold]", "[/]", "[#ff0000]", "[@click=...]"). Bracketed text that is not
# markup, such as "[3/10]" or "[12:00:01]", is deliberately left alone.
_RICH_TAG_RE = re.compile(r'\[[a-z#/@][^\[\]]*\]')


def _strip_rich(text: str) -> str:
    """Return *text* with Rich markup tags removed and outer whitespace trimmed.

    A falsy *text* (``None`` or ``''``) yields ``''``.
    """
    return _RICH_TAG_RE.sub('', text or '').strip()
|
||||
|
||||
|
||||
class CrawlQueuePanel:
|
||||
"""Display crawl queue status across full width."""
|
||||
|
||||
@@ -89,12 +96,18 @@ class CrawlQueuePanel:
|
||||
class ProcessLogPanel:
|
||||
"""Display logs for a running Process."""
|
||||
|
||||
def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None):
|
||||
def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None, bg_terminating: bool = False):
|
||||
self.process = process
|
||||
self.max_lines = max_lines
|
||||
self.compact = compact
|
||||
self.bg_terminating = bg_terminating
|
||||
|
||||
def __rich__(self) -> Panel:
|
||||
completed_line = self._completed_output_line()
|
||||
if completed_line:
|
||||
style = "green" if self._completed_ok() else "yellow"
|
||||
return Text(completed_line, style=style)
|
||||
|
||||
is_pending = self._is_pending()
|
||||
output_line = '' if is_pending else self._output_line()
|
||||
stdout_lines = []
|
||||
@@ -130,7 +143,7 @@ class ProcessLogPanel:
|
||||
content = Group(*lines) if lines else Text("")
|
||||
|
||||
title = self._title()
|
||||
border_style = "grey53" if is_pending else "cyan"
|
||||
border_style = self._border_style(is_pending=is_pending)
|
||||
height = 2 if is_pending else None
|
||||
return Panel(
|
||||
content,
|
||||
@@ -141,6 +154,32 @@ class ProcessLogPanel:
|
||||
height=height,
|
||||
)
|
||||
|
||||
def plain_lines(self) -> list[str]:
    """Return this panel's content as a list of plain-text lines.

    If the process has a completed output line, that single line is the
    whole result. Otherwise the result is the current output line (skipped
    while the process is pending) followed by whatever
    ``process.tail_stdout`` and ``process.tail_stderr`` return for
    ``self.max_lines`` lines, with empty entries dropped.
    """
    completed_line = self._completed_output_line()
    if completed_line:
        return [completed_line]

    lines: list[str] = []
    if not self._is_pending():
        output_line = self._output_line()
        if output_line:
            lines.append(output_line)

    # Read each stream on its own so that a failure while tailing one of
    # them does not throw away the lines already read from the other.
    for tail_name in ('tail_stdout', 'tail_stderr'):
        try:
            tail = list(getattr(self.process, tail_name)(lines=self.max_lines, follow=False))
        except Exception:
            # Log tailing is best-effort; a missing/unreadable log must not
            # break the progress display.
            tail = []
        lines.extend(line for line in tail if line)

    return lines
|
||||
|
||||
def _title(self) -> str:
|
||||
process_type = getattr(self.process, 'process_type', 'process')
|
||||
worker_type = getattr(self.process, 'worker_type', '')
|
||||
@@ -189,6 +228,51 @@ class ProcessLogPanel:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _completed_ok(self) -> bool:
    """Return True when the process's ``exit_code`` is 0 or not set (None)."""
    return getattr(self.process, 'exit_code', None) in (0, None)
|
||||
|
||||
def _completed_output_line(self) -> str:
    """Return the one-line summary shown for a finished process, or ''.

    A summary is only returned when all of these hold: the process status
    is ``'exited'``, ``_output_line()`` is non-empty, and
    ``_has_output_files()`` is true. The file check runs last and only
    when the earlier conditions pass.
    """
    if getattr(self.process, 'status', '') != 'exited':
        return ''
    output_line = self._output_line()
    if output_line and self._has_output_files():
        return output_line
    return ''
|
||||
|
||||
def _has_output_files(self) -> bool:
    """Return True if the process's ``pwd`` contains any real output file.

    The directory is searched recursively. Files named like the
    bookkeeping files listed in ``ignore`` below do not count. A missing
    ``pwd``, a path that does not exist, or any error while scanning all
    yield False.
    """
    pwd = getattr(self.process, 'pwd', None)
    if not pwd:
        return False

    ignore = {'stdout.log', 'stderr.log', 'cmd.sh', 'process.pid', 'hook.pid', 'listener.pid'}
    try:
        base = Path(pwd)
        if not base.exists():
            return False
        # any() stops at the first qualifying file, so the walk is cut short.
        return any(
            path.is_file() and path.name not in ignore
            for path in base.rglob('*')
        )
    except Exception:
        # Best-effort check used while rendering; never let it raise.
        return False
|
||||
|
||||
def _border_style(self, is_pending: bool) -> str:
    """Pick the panel border colour for the process's current state.

    - pending: ``"grey53"``
    - exited: ``"green"`` if ``_completed_ok()`` else ``"yellow"``
    - running foreground hook: ``"green"``
    - running background hook while ``self.bg_terminating``: ``"red"``
    - anything else: ``"cyan"``
    """
    if is_pending:
        return "grey53"

    if getattr(self.process, 'status', '') == 'exited':
        # Same success rule as the completed-line styling.
        return "green" if self._completed_ok() else "yellow"

    if getattr(self.process, 'process_type', '') == 'hook':
        if not self._is_background_hook():
            return "green"
        if self.bg_terminating:
            return "red"

    return "cyan"
|
||||
|
||||
def _worker_label(self, worker_type: str) -> tuple[str, str]:
|
||||
cmd = getattr(self.process, 'cmd', []) or []
|
||||
if worker_type == 'crawl':
|
||||
@@ -402,38 +486,6 @@ class WorkerLogPanel:
|
||||
)
|
||||
|
||||
|
||||
class OrchestratorLogPanel:
|
||||
"""Display orchestrator and system logs."""
|
||||
|
||||
def __init__(self, max_events: int = 8):
|
||||
self.events: deque = deque(maxlen=max_events)
|
||||
self.max_events = max_events
|
||||
|
||||
def add_event(self, message: str, style: str = "white"):
|
||||
"""Add an event to the log."""
|
||||
timestamp = datetime.now(timezone.utc).strftime("%H:%M:%S")
|
||||
self.events.append((timestamp, message, style))
|
||||
|
||||
def __rich__(self) -> Panel:
|
||||
if not self.events:
|
||||
content = Text("No recent events", style="grey53", justify="center")
|
||||
else:
|
||||
lines = []
|
||||
for timestamp, message, style in self.events:
|
||||
line = Text()
|
||||
line.append(f"[{timestamp}] ", style="grey53")
|
||||
line.append(message, style=style)
|
||||
lines.append(line)
|
||||
content = Group(*lines)
|
||||
|
||||
return Panel(
|
||||
content,
|
||||
title="[bold white]Orchestrator / Daphne Logs",
|
||||
border_style="white",
|
||||
box=box.HORIZONTALS,
|
||||
)
|
||||
|
||||
|
||||
class CrawlQueueTreePanel:
|
||||
"""Display crawl queue with snapshots + hook summary in a tree view."""
|
||||
|
||||
@@ -465,13 +517,23 @@ class CrawlQueueTreePanel:
|
||||
snap_text = Text(f"{self._status_icon(snap_status)} {snap_label}", style="white")
|
||||
snap_node = crawl_tree.add(snap_text)
|
||||
|
||||
hooks = snap.get('hooks', {})
|
||||
if hooks:
|
||||
completed = hooks.get('completed', 0)
|
||||
running = hooks.get('running', 0)
|
||||
pending = hooks.get('pending', 0)
|
||||
summary = f"✅ {completed} | ▶️ {running} | ⌛️ {pending}"
|
||||
snap_node.add(Text(summary, style="grey53"))
|
||||
output_path = snap.get('output_path', '')
|
||||
if output_path:
|
||||
snap_node.add(Text(output_path, style="grey53"))
|
||||
|
||||
hooks = snap.get('hooks', []) or []
|
||||
for hook in hooks:
|
||||
status = hook.get('status', '')
|
||||
path = hook.get('path', '')
|
||||
size = hook.get('size', '')
|
||||
elapsed = hook.get('elapsed', '')
|
||||
timeout = hook.get('timeout', '')
|
||||
is_bg = hook.get('is_bg', False)
|
||||
is_running = hook.get('is_running', False)
|
||||
is_pending = hook.get('is_pending', False)
|
||||
icon, color = self._hook_style(status, is_bg=is_bg, is_running=is_running, is_pending=is_pending)
|
||||
stats = self._hook_stats(size=size, elapsed=elapsed, timeout=timeout, status=status)
|
||||
snap_node.add(Text(f"{icon} {path}{stats}", style=color))
|
||||
trees.append(crawl_tree)
|
||||
content = Group(*trees)
|
||||
|
||||
@@ -494,6 +556,45 @@ class CrawlQueueTreePanel:
|
||||
return '✖'
|
||||
return '•'
|
||||
|
||||
@staticmethod
def _hook_style(status: str, is_bg: bool = False, is_running: bool = False, is_pending: bool = False) -> tuple[str, str]:
    """Return the ``(icon, colour)`` pair used to render one hook entry.

    A terminal *status* (succeeded / failed / skipped) takes precedence
    over the flags. Otherwise the order is: pending, running in the
    background, running (or status ``'started'``), and finally a neutral
    bullet for anything unrecognised.
    """
    terminal_styles = {
        'succeeded': ('✅', 'green'),
        'failed': ('⚠️', 'yellow'),
        'skipped': ('⏭', 'grey53'),
    }
    if status in terminal_styles:
        return terminal_styles[status]
    if is_pending:
        return '⌛️', 'grey53'
    if is_running and is_bg:
        return '᠁', 'cyan'
    if is_running or status == 'started':
        return '▶️', 'cyan'
    return '•', 'grey53'
|
||||
|
||||
@staticmethod
def _hook_stats(size: str = '', elapsed: str = '', timeout: str = '', status: str = '') -> str:
    """Return the parenthesised stats suffix for a hook line, or ''.

    For a finished hook (succeeded / failed / skipped) the suffix lists
    whichever of *size* and *elapsed* are non-empty, e.g. ``" (1KB | 2s)"``.
    For any other status the size is shown as ``...`` and the time as
    ``elapsed/timeout`` (or just ``elapsed``); with only a *timeout* the
    suffix is ``" (...)"``. With neither *elapsed* nor *timeout* the result
    is ``''``.
    """
    if status in ('succeeded', 'failed', 'skipped'):
        parts = [part for part in (size, elapsed) if part]
        return f" ({' | '.join(parts)})" if parts else ''

    if not (elapsed or timeout):
        return ''
    if not elapsed:
        # Only a timeout is known: there is no time figure to show yet.
        return " (...)"
    time_part = f"{elapsed}/{timeout}" if timeout else f"{elapsed}"
    return f" (... | {time_part})"
|
||||
|
||||
|
||||
class ArchiveBoxProgressLayout:
|
||||
"""
|
||||
@@ -503,9 +604,9 @@ class ArchiveBoxProgressLayout:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Crawl Queue (full width) │
|
||||
├─────────────────────────────────────────────────────────────┤
|
||||
│ Running Process Logs (dynamic panels) │
|
||||
│ Crawl Queue Tree (hooks + outputs) │
|
||||
├─────────────────────────────────────────────────────────────┤
|
||||
│ Orchestrator / Daphne Logs │
|
||||
│ Running Process Logs (dynamic panels) │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
"""
|
||||
|
||||
@@ -518,7 +619,6 @@ class ArchiveBoxProgressLayout:
|
||||
self.crawl_queue.crawl_id = crawl_id
|
||||
|
||||
self.process_panels: List[ProcessLogPanel] = []
|
||||
self.orchestrator_log = OrchestratorLogPanel(max_events=8)
|
||||
self.crawl_queue_tree = CrawlQueueTreePanel(max_crawls=8, max_snapshots=16)
|
||||
|
||||
# Create layout
|
||||
@@ -528,22 +628,17 @@ class ArchiveBoxProgressLayout:
|
||||
"""Define the layout structure."""
|
||||
layout = Layout(name="root")
|
||||
|
||||
# Top-level split: crawl_queue, workers, bottom
|
||||
# Top-level split: crawl_queue, crawl_tree, processes
|
||||
layout.split(
|
||||
Layout(name="crawl_queue", size=3),
|
||||
Layout(name="crawl_tree", size=14),
|
||||
Layout(name="processes", ratio=1),
|
||||
Layout(name="bottom", size=12),
|
||||
)
|
||||
|
||||
# Assign components to layout sections
|
||||
layout["crawl_queue"].update(self.crawl_queue)
|
||||
layout["processes"].update(Columns([]))
|
||||
layout["bottom"].split_row(
|
||||
Layout(name="orchestrator_logs", ratio=2),
|
||||
Layout(name="crawl_tree", ratio=1),
|
||||
)
|
||||
layout["orchestrator_logs"].update(self.orchestrator_log)
|
||||
layout["crawl_tree"].update(self.crawl_queue_tree)
|
||||
layout["processes"].update(Columns([]))
|
||||
|
||||
return layout
|
||||
|
||||
@@ -568,6 +663,33 @@ class ArchiveBoxProgressLayout:
|
||||
"""Update process panels to show all running processes."""
|
||||
panels = []
|
||||
all_processes = list(processes) + list(pending or [])
|
||||
fg_running = False
|
||||
for process in processes:
|
||||
if getattr(process, 'process_type', '') != 'hook':
|
||||
continue
|
||||
try:
|
||||
cmd = getattr(process, 'cmd', [])
|
||||
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
|
||||
hook_name = hook_path.name if hook_path else ''
|
||||
if '.bg.' not in hook_name:
|
||||
fg_running = True
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
fg_pending = False
|
||||
for process in (pending or []):
|
||||
if getattr(process, 'process_type', '') != 'hook':
|
||||
continue
|
||||
try:
|
||||
cmd = getattr(process, 'cmd', [])
|
||||
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
|
||||
hook_name = hook_path.name if hook_path else ''
|
||||
if '.bg.' not in hook_name:
|
||||
fg_pending = True
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
bg_terminating = bool(processes) and not fg_running and not fg_pending
|
||||
for process in all_processes:
|
||||
is_hook = getattr(process, 'process_type', '') == 'hook'
|
||||
is_bg = False
|
||||
@@ -581,12 +703,14 @@ class ArchiveBoxProgressLayout:
|
||||
is_bg = False
|
||||
is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
|
||||
max_lines = 2 if is_pending else (4 if is_bg else 7)
|
||||
panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg))
|
||||
panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg, bg_terminating=bg_terminating))
|
||||
if not panels:
|
||||
self.layout["processes"].size = 0
|
||||
self.layout["processes"].update(Text(""))
|
||||
self.process_panels = []
|
||||
return
|
||||
|
||||
self.process_panels = panels
|
||||
self.layout["processes"].size = None
|
||||
self.layout["processes"].ratio = 1
|
||||
self.layout["processes"].update(Columns(panels, equal=True, expand=True))
|
||||
@@ -597,8 +721,54 @@ class ArchiveBoxProgressLayout:
|
||||
|
||||
def log_event(self, message: str, style: str = "white") -> None:
|
||||
"""Add an event to the orchestrator log."""
|
||||
self.orchestrator_log.add_event(message, style)
|
||||
return
|
||||
|
||||
def get_layout(self) -> Layout:
|
||||
"""Get the Rich Layout object for rendering."""
|
||||
return self.layout
|
||||
|
||||
def plain_lines(self) -> list[tuple[str, str]]:
    """Flatten the dashboard into ``(section, text)`` pairs of plain text.

    Sections are emitted in this order:

    - ``"crawl_queue"``: one status line built from ``self.crawl_queue``.
    - one entry per line of every panel in ``self.process_panels``, keyed
      by the panel title with Rich markup stripped (``"process"`` if the
      title is empty).
    - ``"crawl_tree"``: one line per crawl, then per snapshot, its output
      path (if any) and each of its hooks. Snapshot, path and hook lines
      keep their leading indent.
    """
    lines: list[tuple[str, str]] = []

    queue = self.crawl_queue
    queue_line = (
        f"Status: {queue.orchestrator_status} | Crawls: {queue.crawl_queue_count} queued | "
        f"Binaries: {queue.binary_queue_count} queued | Workers: {queue.crawl_workers_count}/{queue.max_crawl_workers} "
        f"crawl, {queue.binary_workers_count} binary"
    )
    lines.append(("crawl_queue", queue_line))

    for panel in self.process_panels:
        title = _strip_rich(panel._title())
        for line in panel.plain_lines():
            if line:
                lines.append((title or "process", line))

    tree = self.crawl_queue_tree
    for crawl in tree.crawls:
        # The id may be missing, None or a non-string (e.g. a UUID object);
        # normalise it before slicing so one odd record cannot raise here.
        crawl_id = str(crawl.get('id') or '')[:8]
        crawl_line = f"{tree._status_icon(crawl.get('status', ''))} {crawl_id} {crawl.get('label', '')}".strip()
        lines.append(("crawl_tree", crawl_line))

        for snap in crawl.get('snapshots') or []:
            snap_line = f" {tree._status_icon(snap.get('status', ''))} {snap.get('label', '')}".rstrip()
            lines.append(("crawl_tree", snap_line))

            output_path = snap.get('output_path', '')
            if output_path:
                lines.append(("crawl_tree", f" {output_path}"))

            for hook in snap.get('hooks', []) or []:
                status = hook.get('status', '')
                path = hook.get('path', '')
                icon, _ = tree._hook_style(
                    status,
                    is_bg=hook.get('is_bg', False),
                    is_running=hook.get('is_running', False),
                    is_pending=hook.get('is_pending', False),
                )
                stats = tree._hook_stats(
                    size=hook.get('size', ''),
                    elapsed=hook.get('elapsed', ''),
                    timeout=hook.get('timeout', ''),
                    status=status,
                )
                # rstrip (not strip): keep the leading indent so hook lines
                # stay nested like the snapshot and output-path lines.
                hook_line = f" {icon} {path}{stats}".rstrip()
                if hook_line:
                    lines.append(("crawl_tree", hook_line))

    return lines
|
||||
|
||||
@@ -32,6 +32,7 @@ class Persona(ModelWithConfig):
|
||||
Each persona provides:
|
||||
- CHROME_USER_DATA_DIR: Chrome profile directory
|
||||
- CHROME_EXTENSIONS_DIR: Installed extensions directory
|
||||
- CHROME_DOWNLOADS_DIR: Chrome downloads directory
|
||||
- COOKIES_FILE: Cookies file for wget/curl
|
||||
- config: JSON field with persona-specific config overrides
|
||||
|
||||
@@ -72,6 +73,11 @@ class Persona(ModelWithConfig):
|
||||
"""Derived path to Chrome extensions directory for this persona."""
|
||||
return str(self.path / 'chrome_extensions')
|
||||
|
||||
@property
def CHROME_DOWNLOADS_DIR(self) -> str:
    """Derived path to Chrome downloads directory for this persona.

    Always ``<self.path>/chrome_downloads``, returned as a string.
    """
    return str(self.path / 'chrome_downloads')
|
||||
|
||||
@property
|
||||
def COOKIES_FILE(self) -> str:
|
||||
"""Derived path to cookies.txt file for this persona (if exists)."""
|
||||
@@ -86,6 +92,7 @@ class Persona(ModelWithConfig):
|
||||
- All values from self.config JSONField
|
||||
- CHROME_USER_DATA_DIR (derived from persona path)
|
||||
- CHROME_EXTENSIONS_DIR (derived from persona path)
|
||||
- CHROME_DOWNLOADS_DIR (derived from persona path)
|
||||
- COOKIES_FILE (derived from persona path, if file exists)
|
||||
- ACTIVE_PERSONA (set to this persona's name)
|
||||
"""
|
||||
@@ -96,6 +103,8 @@ class Persona(ModelWithConfig):
|
||||
derived['CHROME_USER_DATA_DIR'] = self.CHROME_USER_DATA_DIR
|
||||
if 'CHROME_EXTENSIONS_DIR' not in derived:
|
||||
derived['CHROME_EXTENSIONS_DIR'] = self.CHROME_EXTENSIONS_DIR
|
||||
if 'CHROME_DOWNLOADS_DIR' not in derived:
|
||||
derived['CHROME_DOWNLOADS_DIR'] = self.CHROME_DOWNLOADS_DIR
|
||||
if 'COOKIES_FILE' not in derived and self.COOKIES_FILE:
|
||||
derived['COOKIES_FILE'] = self.COOKIES_FILE
|
||||
|
||||
@@ -109,6 +118,7 @@ class Persona(ModelWithConfig):
|
||||
self.path.mkdir(parents=True, exist_ok=True)
|
||||
(self.path / 'chrome_user_data').mkdir(parents=True, exist_ok=True)
|
||||
(self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True)
|
||||
(self.path / 'chrome_downloads').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def cleanup_chrome(self) -> bool:
|
||||
"""
|
||||
|
||||
@@ -384,6 +384,8 @@ async function launchChromium(options = {}) {
|
||||
return { success: false, error: 'Chrome binary not found' };
|
||||
}
|
||||
|
||||
const downloadsDir = getEnv('CHROME_DOWNLOADS_DIR');
|
||||
|
||||
// Kill zombies first
|
||||
if (killZombies) {
|
||||
killZombieChrome();
|
||||
@@ -412,6 +414,28 @@ async function launchChromium(options = {}) {
|
||||
console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
|
||||
}
|
||||
}
|
||||
if (downloadsDir) {
|
||||
try {
|
||||
const defaultProfileDir = path.join(userDataDir, 'Default');
|
||||
const prefsPath = path.join(defaultProfileDir, 'Preferences');
|
||||
fs.mkdirSync(defaultProfileDir, { recursive: true });
|
||||
let prefs = {};
|
||||
if (fs.existsSync(prefsPath)) {
|
||||
try {
|
||||
prefs = JSON.parse(fs.readFileSync(prefsPath, 'utf-8'));
|
||||
} catch (e) {
|
||||
prefs = {};
|
||||
}
|
||||
}
|
||||
prefs.download = prefs.download || {};
|
||||
prefs.download.default_directory = downloadsDir;
|
||||
prefs.download.prompt_for_download = false;
|
||||
fs.writeFileSync(prefsPath, JSON.stringify(prefs));
|
||||
console.error(`[*] Set Chrome download directory: ${downloadsDir}`);
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to set Chrome download directory: ${e.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find a free port
|
||||
@@ -455,6 +479,11 @@ async function launchChromium(options = {}) {
|
||||
// Dynamic args come after base so they can override if needed
|
||||
const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs];
|
||||
|
||||
// Ensure keychain prompts are disabled on macOS
|
||||
if (!chromiumArgs.includes('--use-mock-keychain')) {
|
||||
chromiumArgs.push('--use-mock-keychain');
|
||||
}
|
||||
|
||||
// Add extension loading flags
|
||||
if (extensionPaths.length > 0) {
|
||||
const extPathsArg = extensionPaths.join(',');
|
||||
|
||||
@@ -84,6 +84,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
}
|
||||
|
||||
const url = await page.url();
|
||||
console.error(`[singlefile] Triggering extension for: ${url}`);
|
||||
|
||||
// Check for unsupported URL schemes
|
||||
const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
|
||||
@@ -93,24 +94,28 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const downloadsDir = options.downloadsDir || CHROME_DOWNLOADS_DIR;
|
||||
console.error(`[singlefile] Watching downloads dir: ${downloadsDir}`);
|
||||
|
||||
// Ensure downloads directory exists
|
||||
await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
|
||||
await fs.promises.mkdir(downloadsDir, { recursive: true });
|
||||
|
||||
// Get list of existing files to ignore
|
||||
const files_before = new Set(
|
||||
(await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
.filter(fn => fn.endsWith('.html'))
|
||||
(await fs.promises.readdir(downloadsDir))
|
||||
.filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'))
|
||||
);
|
||||
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
|
||||
console.error(`[singlefile] Saving via extension (${extension.id})...`);
|
||||
|
||||
// Bring page to front (extension action button acts on foreground tab)
|
||||
await page.bringToFront();
|
||||
|
||||
// Trigger the extension's action (toolbar button click)
|
||||
console.error('[singlefile] Dispatching extension action...');
|
||||
await extension.dispatchAction();
|
||||
|
||||
// Wait for file to appear in downloads directory
|
||||
@@ -118,34 +123,90 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
const max_tries = 10;
|
||||
let files_new = [];
|
||||
|
||||
console.error(`[singlefile] Waiting up to ${(check_delay * max_tries) / 1000}s for download...`);
|
||||
for (let attempt = 0; attempt < max_tries; attempt++) {
|
||||
await wait(check_delay);
|
||||
|
||||
const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
.filter(fn => fn.endsWith('.html'));
|
||||
const files_after = (await fs.promises.readdir(downloadsDir))
|
||||
.filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'));
|
||||
|
||||
files_new = files_after.filter(file => !files_before.has(file));
|
||||
|
||||
if (files_new.length === 0) {
|
||||
console.error(`[singlefile] No new downloads yet (${attempt + 1}/${max_tries})`);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find the matching file by checking if it contains the URL in the HTML header
|
||||
for (const file of files_new) {
|
||||
const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
|
||||
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
||||
const dl_header = dl_text.split('meta charset')[0];
|
||||
console.error(`[singlefile] New download(s) detected: ${files_new.join(', ')}`);
|
||||
|
||||
if (dl_header.includes(`url: ${url}`)) {
|
||||
console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
|
||||
await fs.promises.rename(dl_path, out_path);
|
||||
// Prefer files that match the URL or have SingleFile markers
|
||||
const url_variants = new Set([url]);
|
||||
if (url.endsWith('/')) {
|
||||
url_variants.add(url.slice(0, -1));
|
||||
} else {
|
||||
url_variants.add(`${url}/`);
|
||||
}
|
||||
|
||||
const scored = [];
|
||||
for (const file of files_new) {
|
||||
const dl_path = path.join(downloadsDir, file);
|
||||
let header = '';
|
||||
try {
|
||||
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
||||
header = dl_text.slice(0, 200000);
|
||||
const stat = await fs.promises.stat(dl_path);
|
||||
console.error(`[singlefile] Download ${file} size=${stat.size} bytes`);
|
||||
} catch (err) {
|
||||
// Skip unreadable files
|
||||
continue;
|
||||
}
|
||||
|
||||
const header_lower = header.toLowerCase();
|
||||
const has_url = Array.from(url_variants).some(v => header.includes(v));
|
||||
const has_singlefile_marker = header_lower.includes('singlefile') || header_lower.includes('single-file');
|
||||
const score = (has_url ? 2 : 0) + (has_singlefile_marker ? 1 : 0);
|
||||
scored.push({ file, dl_path, score });
|
||||
}
|
||||
|
||||
scored.sort((a, b) => b.score - a.score);
|
||||
|
||||
if (scored.length > 0) {
|
||||
const best = scored[0];
|
||||
if (best.score > 0 || files_new.length === 1) {
|
||||
console.error(`[singlefile] Moving download from ${best.file} -> ${out_path}`);
|
||||
await fs.promises.rename(best.dl_path, out_path);
|
||||
const out_stat = await fs.promises.stat(out_path);
|
||||
console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
|
||||
return out_path;
|
||||
}
|
||||
}
|
||||
|
||||
if (files_new.length > 0) {
|
||||
// Fallback: move the newest file if no clear match found
|
||||
let newest = null;
|
||||
let newest_mtime = -1;
|
||||
for (const file of files_new) {
|
||||
const dl_path = path.join(downloadsDir, file);
|
||||
try {
|
||||
const stat = await fs.promises.stat(dl_path);
|
||||
if (stat.mtimeMs > newest_mtime) {
|
||||
newest_mtime = stat.mtimeMs;
|
||||
newest = { file, dl_path };
|
||||
}
|
||||
} catch (err) {}
|
||||
}
|
||||
if (newest) {
|
||||
console.error(`[singlefile] Moving newest download from ${newest.file} -> ${out_path}`);
|
||||
await fs.promises.rename(newest.dl_path, out_path);
|
||||
const out_stat = await fs.promises.stat(out_path);
|
||||
console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
|
||||
return out_path;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
|
||||
console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
|
||||
console.error(`[singlefile] Failed to find SingleFile HTML in ${downloadsDir} after ${(check_delay * max_tries) / 1000}s`);
|
||||
console.error(`[singlefile] New files seen: ${files_new.join(', ')}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ BIN_NAME = 'single-file'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'singlefile.html'
|
||||
EXTENSION_SAVE_SCRIPT = Path(__file__).parent / 'singlefile_extension_save.js'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -255,6 +256,42 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]:
    """Save using the SingleFile Chrome extension via existing Chrome session.

    Runs the Node helper script ``EXTENSION_SAVE_SCRIPT`` with ``--url``.

    Returns a ``(success, output_path, error)`` tuple. On success
    ``output_path`` is the path printed by the helper on stdout if that
    path exists, otherwise the local ``OUTPUT_DIR/OUTPUT_FILE`` when it
    exists and is non-empty. On failure ``output_path`` is None and
    ``error`` describes the reason. Errors from launching the helper are
    reported through the tuple rather than raised.
    """
    # Only attempt if chrome session exists
    cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
    if not cdp_url:
        return False, None, 'No Chrome session available'

    if not EXTENSION_SAVE_SCRIPT.exists():
        return False, None, 'SingleFile extension helper script missing'

    node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node')
    cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}']

    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'

    stdout = result.stdout.decode('utf-8', errors='replace').strip()

    if result.returncode == 0:
        # Prefer explicit stdout path, fallback to local output file.
        # Only the last non-empty stdout line is treated as the path, so
        # any extra output printed before it does not hide the result.
        reported = stdout.splitlines()[-1].strip() if stdout else ''
        try:
            reported_exists = bool(reported) and Path(reported).exists()
        except (OSError, ValueError):
            # Not a usable path (e.g. far too long); fall through to the
            # local output file instead of raising.
            reported_exists = False
        if reported_exists:
            return True, reported, ''

        output_path = Path(OUTPUT_DIR) / OUTPUT_FILE
        if output_path.exists() and output_path.stat().st_size > 0:
            return True, str(output_path), ''
        return False, None, 'SingleFile extension completed but no output file found'

    stderr = result.stderr.decode('utf-8', errors='replace').strip()
    detail = stderr or stdout
    return False, None, detail or 'SingleFile extension failed'
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL to archive')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
@@ -278,11 +315,14 @@ def main(url: str, snapshot_id: str):
|
||||
print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
|
||||
sys.exit(0)
|
||||
|
||||
# Get binary from environment
|
||||
binary = get_env('SINGLEFILE_BINARY', 'single-file')
|
||||
# Prefer SingleFile extension via existing Chrome session
|
||||
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
||||
success, output, error = save_singlefile_with_extension(url, timeout)
|
||||
|
||||
# Run extraction
|
||||
success, output, error = save_singlefile(url, binary)
|
||||
# Fallback to single-file-cli if extension path failed
|
||||
if not success:
|
||||
binary = get_env('SINGLEFILE_BINARY', 'single-file')
|
||||
success, output, error = save_singlefile(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -30,6 +30,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__82_singlefile_install.js'
|
||||
TEST_URL = "https://example.com"
|
||||
|
||||
|
||||
@@ -142,6 +143,95 @@ def test_singlefile_with_chrome_session():
|
||||
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
|
||||
|
||||
|
||||
def test_singlefile_with_extension_uses_existing_chrome():
|
||||
"""Test SingleFile uses the Chrome extension via existing session (CLI fallback disabled)."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
data_dir = tmpdir / 'data'
|
||||
extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads'
|
||||
user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data'
|
||||
extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
downloads_dir.mkdir(parents=True, exist_ok=True)
|
||||
user_data_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
env_install = os.environ.copy()
|
||||
env_install.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(extensions_dir),
|
||||
'CHROME_DOWNLOADS_DIR': str(downloads_dir),
|
||||
})
|
||||
|
||||
# Install SingleFile extension cache before launching Chrome
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env_install,
|
||||
timeout=120
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
# Launch Chrome session with extensions loaded
|
||||
old_env = os.environ.copy()
|
||||
os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir)
|
||||
os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
|
||||
os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
|
||||
try:
|
||||
with chrome_session(
|
||||
tmpdir=tmpdir,
|
||||
crawl_id='singlefile-ext-crawl',
|
||||
snapshot_id='singlefile-ext-snap',
|
||||
test_url=TEST_URL,
|
||||
navigate=True,
|
||||
timeout=30,
|
||||
) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
|
||||
singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
|
||||
singlefile_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Ensure ../chrome points to snapshot chrome session (contains target_id.txt)
|
||||
chrome_dir = singlefile_output_dir.parent / 'chrome'
|
||||
if not chrome_dir.exists():
|
||||
chrome_dir.symlink_to(snapshot_chrome_dir)
|
||||
|
||||
env['SINGLEFILE_ENABLED'] = 'true'
|
||||
env['SINGLEFILE_BINARY'] = '/nonexistent/single-file' # force extension path
|
||||
env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
|
||||
env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
|
||||
env['CHROME_HEADLESS'] = 'false'
|
||||
|
||||
# Track downloads dir state before run to ensure file is created then moved out
|
||||
downloads_before = set(downloads_dir.glob('*.html'))
|
||||
downloads_mtime_before = downloads_dir.stat().st_mtime_ns
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-ext-snap'],
|
||||
cwd=str(singlefile_output_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=120
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"SingleFile extension run failed: {result.stderr}"
|
||||
|
||||
output_file = singlefile_output_dir / 'singlefile.html'
|
||||
assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
|
||||
html_content = output_file.read_text(errors='ignore')
|
||||
assert 'Example Domain' in html_content, "Output should contain example.com content"
|
||||
|
||||
# Verify download moved out of downloads dir
|
||||
downloads_after = set(downloads_dir.glob('*.html'))
|
||||
new_downloads = downloads_after - downloads_before
|
||||
downloads_mtime_after = downloads_dir.stat().st_mtime_ns
|
||||
assert downloads_mtime_after != downloads_mtime_before, "Downloads dir should be modified during extension save"
|
||||
assert not new_downloads, f"SingleFile download should be moved out of downloads dir, found: {new_downloads}"
|
||||
finally:
|
||||
os.environ.clear()
|
||||
os.environ.update(old_env)
|
||||
|
||||
|
||||
def test_singlefile_disabled_skips():
|
||||
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
@@ -30,6 +30,7 @@ import time
|
||||
from typing import Type
|
||||
from datetime import timedelta
|
||||
from multiprocessing import Process as MPProcess
|
||||
from pathlib import Path
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
@@ -457,12 +458,14 @@ class Orchestrator:
|
||||
|
||||
# Enable progress layout only in TTY + foreground mode
|
||||
show_progress = IS_TTY and self.exit_on_idle
|
||||
plain_output = not IS_TTY
|
||||
|
||||
self.on_startup()
|
||||
|
||||
if not show_progress:
|
||||
# No progress layout - just run normally
|
||||
self._run_orchestrator_loop(None)
|
||||
# No progress layout - optionally emit plain lines for non-TTY output
|
||||
progress_layout = ArchiveBoxProgressLayout(crawl_id=self.crawl_id) if plain_output else None
|
||||
self._run_orchestrator_loop(progress_layout, plain_output=plain_output)
|
||||
else:
|
||||
# Redirect worker subprocess output to /dev/null
|
||||
devnull_fd = os.open(os.devnull, os.O_WRONLY)
|
||||
@@ -497,7 +500,7 @@ class Orchestrator:
|
||||
screen=True,
|
||||
console=orchestrator_console,
|
||||
):
|
||||
self._run_orchestrator_loop(progress_layout)
|
||||
self._run_orchestrator_loop(progress_layout, plain_output=False)
|
||||
|
||||
# Restore original console
|
||||
logging_module.CONSOLE = original_console
|
||||
@@ -515,11 +518,12 @@ class Orchestrator:
|
||||
pass
|
||||
# stdout_for_console is closed by orchestrator_console
|
||||
|
||||
def _run_orchestrator_loop(self, progress_layout):
|
||||
def _run_orchestrator_loop(self, progress_layout, plain_output: bool = False):
|
||||
"""Run the main orchestrator loop with optional progress display."""
|
||||
last_queue_sizes = {}
|
||||
last_snapshot_count = None
|
||||
tick_count = 0
|
||||
last_plain_lines: set[tuple[str, str]] = set()
|
||||
|
||||
# Track snapshot progress to detect changes
|
||||
snapshot_progress = {} # snapshot_id -> (total, completed, current_plugin)
|
||||
@@ -591,6 +595,22 @@ class Orchestrator:
|
||||
def _abbrev(text: str, max_len: int = 80) -> str:
|
||||
return text if len(text) <= max_len else f"{text[:max_len - 3]}..."
|
||||
|
||||
def _format_size(num_bytes: int | None) -> str:
|
||||
if not num_bytes:
|
||||
return ''
|
||||
size = float(num_bytes)
|
||||
for unit in ('b', 'kb', 'mb', 'gb', 'tb'):
|
||||
if size < 1024 or unit == 'tb':
|
||||
return f"{size:.1f}{unit}"
|
||||
size /= 1024
|
||||
return ''
|
||||
|
||||
def _format_seconds(total_seconds: float | None) -> str:
|
||||
if total_seconds is None:
|
||||
return ''
|
||||
seconds = max(0.0, float(total_seconds))
|
||||
return f"{seconds:.1f}s"
|
||||
|
||||
tree_data: list[dict] = []
|
||||
for crawl in crawls:
|
||||
urls = crawl.get_urls_list()
|
||||
@@ -614,28 +634,174 @@ class Orchestrator:
|
||||
active_snaps.append(s)
|
||||
|
||||
for snap in active_snaps:
|
||||
total = snap.archiveresult_set.count()
|
||||
completed = snap.archiveresult_set.filter(status__in=[
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
]).count()
|
||||
running = snap.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED).count()
|
||||
try:
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.hooks import discover_hooks
|
||||
hooks_list = discover_hooks('Snapshot', config=get_config(snapshot=snap))
|
||||
total_hooks = len(hooks_list)
|
||||
snap_config = get_config(snapshot=snap)
|
||||
hooks_list = discover_hooks('Snapshot', config=snap_config)
|
||||
hooks_by_snapshot[str(snap.id)] = hooks_list
|
||||
from archivebox.hooks import get_plugin_special_config
|
||||
hook_timeouts = {}
|
||||
for hook_path in hooks_list:
|
||||
plugin_name = hook_path.parent.name
|
||||
try:
|
||||
hook_timeouts[hook_path.name] = int(get_plugin_special_config(plugin_name, snap_config)['timeout'])
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
total_hooks = total
|
||||
pending = max(total_hooks - completed - running, 0)
|
||||
snap_label = _abbrev(snap.url or str(snap.id), max_len=60)
|
||||
hooks_list = []
|
||||
hook_timeouts = {}
|
||||
|
||||
try:
|
||||
from archivebox import DATA_DIR
|
||||
data_dir = Path(DATA_DIR)
|
||||
snap_path = snap.output_dir
|
||||
try:
|
||||
rel = Path(snap_path)
|
||||
if rel.is_absolute():
|
||||
rel = rel.relative_to(data_dir)
|
||||
snap_path = f"./{rel}" if not str(rel).startswith("./") else str(rel)
|
||||
except Exception:
|
||||
snap_path = str(snap_path)
|
||||
|
||||
ars = list(
|
||||
snap.archiveresult_set.select_related('process').order_by('start_ts')
|
||||
)
|
||||
ar_by_hook = {ar.hook_name: ar for ar in ars if ar.hook_name}
|
||||
except Exception:
|
||||
snap_path = ''
|
||||
ar_by_hook = {}
|
||||
|
||||
plugin_hooks: dict[str, list[dict]] = {}
|
||||
now = timezone.now()
|
||||
for hook_path in hooks_list:
|
||||
hook_name = hook_path.name
|
||||
is_bg = '.bg.' in hook_name
|
||||
ar = ar_by_hook.get(hook_name)
|
||||
status = 'pending'
|
||||
is_running = False
|
||||
is_pending = True
|
||||
elapsed = ''
|
||||
timeout = ''
|
||||
size = ''
|
||||
if ar:
|
||||
if ar.status == ArchiveResult.StatusChoices.STARTED:
|
||||
status = 'started'
|
||||
is_running = True
|
||||
is_pending = False
|
||||
start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
|
||||
if start_ts:
|
||||
elapsed = _format_seconds((now - start_ts).total_seconds())
|
||||
hook_timeout = None
|
||||
if ar.process_id and ar.process and ar.process.timeout:
|
||||
hook_timeout = ar.process.timeout
|
||||
hook_timeout = hook_timeout or hook_timeouts.get(hook_name)
|
||||
if hook_timeout:
|
||||
timeout = _format_seconds(hook_timeout)
|
||||
else:
|
||||
status = ar.status
|
||||
is_pending = False
|
||||
start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
|
||||
end_ts = ar.end_ts or (ar.process.ended_at if ar.process_id and ar.process else None)
|
||||
if start_ts and end_ts:
|
||||
elapsed = _format_seconds((end_ts - start_ts).total_seconds())
|
||||
size = _format_size(getattr(ar, 'output_size', None))
|
||||
else:
|
||||
hook_timeout = hook_timeouts.get(hook_name)
|
||||
if hook_timeout:
|
||||
timeout = _format_seconds(hook_timeout)
|
||||
elapsed = _format_seconds(0)
|
||||
|
||||
plugin_name = hook_path.parent.name
|
||||
if plugin_name in ('plugins', '.'):
|
||||
plugin_name = hook_name.split('__')[-1].split('.')[0]
|
||||
plugin_hooks.setdefault(plugin_name, []).append({
|
||||
'status': status,
|
||||
'size': size,
|
||||
'elapsed': elapsed,
|
||||
'timeout': timeout,
|
||||
'is_bg': is_bg,
|
||||
'is_running': is_running,
|
||||
'is_pending': is_pending,
|
||||
'hook_name': hook_name,
|
||||
})
|
||||
|
||||
hooks = []
|
||||
for plugin_name, hook_entries in plugin_hooks.items():
|
||||
running = next((h for h in hook_entries if h['is_running']), None)
|
||||
pending = next((h for h in hook_entries if h['is_pending']), None)
|
||||
any_failed = any(h['status'] == ArchiveResult.StatusChoices.FAILED for h in hook_entries)
|
||||
any_succeeded = any(h['status'] == ArchiveResult.StatusChoices.SUCCEEDED for h in hook_entries)
|
||||
any_skipped = any(h['status'] == ArchiveResult.StatusChoices.SKIPPED for h in hook_entries)
|
||||
|
||||
if running:
|
||||
status = 'started'
|
||||
is_running = True
|
||||
is_pending = False
|
||||
is_bg = running['is_bg']
|
||||
elapsed = running.get('elapsed', '')
|
||||
timeout = running.get('timeout', '')
|
||||
size = ''
|
||||
elif pending:
|
||||
status = 'pending'
|
||||
is_running = False
|
||||
is_pending = True
|
||||
is_bg = pending['is_bg']
|
||||
elapsed = pending.get('elapsed', '') or _format_seconds(0)
|
||||
timeout = pending.get('timeout', '')
|
||||
size = ''
|
||||
else:
|
||||
is_running = False
|
||||
is_pending = False
|
||||
is_bg = any(h['is_bg'] for h in hook_entries)
|
||||
if any_failed:
|
||||
status = 'failed'
|
||||
elif any_succeeded:
|
||||
status = 'succeeded'
|
||||
elif any_skipped:
|
||||
status = 'skipped'
|
||||
else:
|
||||
status = 'skipped'
|
||||
total_elapsed = 0.0
|
||||
has_elapsed = False
|
||||
for h in hook_entries:
|
||||
if h.get('elapsed'):
|
||||
try:
|
||||
total_elapsed += float(h['elapsed'].rstrip('s'))
|
||||
has_elapsed = True
|
||||
except Exception:
|
||||
pass
|
||||
elapsed = _format_seconds(total_elapsed) if has_elapsed else ''
|
||||
max_output = 0
|
||||
# Use the largest output_size we already computed on ArchiveResult
|
||||
ar_sizes = [
|
||||
ar_by_hook[h['hook_name']].output_size
|
||||
for h in hook_entries
|
||||
if h.get('hook_name') in ar_by_hook and getattr(ar_by_hook[h['hook_name']], 'output_size', 0)
|
||||
]
|
||||
if ar_sizes:
|
||||
max_output = max(ar_sizes)
|
||||
size = _format_size(max_output) if max_output else ''
|
||||
timeout = ''
|
||||
|
||||
hooks.append({
|
||||
'status': status,
|
||||
'path': f"./{plugin_name}",
|
||||
'size': size,
|
||||
'elapsed': elapsed,
|
||||
'timeout': timeout,
|
||||
'is_bg': is_bg,
|
||||
'is_running': is_running,
|
||||
'is_pending': is_pending,
|
||||
})
|
||||
|
||||
snap_label = _abbrev(f"{str(snap.id)[-8:]} {snap.url or ''}".strip(), max_len=80)
|
||||
snapshots.append({
|
||||
'id': str(snap.id),
|
||||
'status': snap.status,
|
||||
'label': snap_label,
|
||||
'hooks': {'completed': completed, 'running': running, 'pending': pending} if total else {},
|
||||
'output_path': snap_path,
|
||||
'hooks': hooks,
|
||||
})
|
||||
pending_snapshot_candidates.append(snap)
|
||||
|
||||
@@ -837,6 +1003,16 @@ class Orchestrator:
|
||||
if snapshot_id in snapshot_progress:
|
||||
del snapshot_progress[snapshot_id]
|
||||
|
||||
if plain_output:
|
||||
plain_lines = progress_layout.plain_lines()
|
||||
new_lines = [line for line in plain_lines if line not in last_plain_lines]
|
||||
if new_lines:
|
||||
ts = timezone.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
for panel, line in new_lines:
|
||||
if line:
|
||||
print(f"[{ts}] [{panel}] {line}")
|
||||
last_plain_lines = set(plain_lines)
|
||||
|
||||
# Track idle state
|
||||
has_pending = self.has_pending_work(queue_sizes)
|
||||
has_running = self.has_running_workers()
|
||||
|
||||
@@ -254,8 +254,7 @@ def start_new_supervisord_process(daemonize=False):
|
||||
shell=True,
|
||||
start_new_session=True,
|
||||
)
|
||||
time.sleep(2)
|
||||
return get_existing_supervisord_process()
|
||||
return wait_for_supervisord_ready()
|
||||
else:
|
||||
# Start supervisord in FOREGROUND - this will block until supervisord exits
|
||||
# supervisord with nodaemon=true will run in foreground and handle signals properly
|
||||
@@ -273,10 +272,19 @@ def start_new_supervisord_process(daemonize=False):
|
||||
global _supervisord_proc
|
||||
_supervisord_proc = proc
|
||||
|
||||
# Wait a bit for supervisord to start up
|
||||
time.sleep(2)
|
||||
return wait_for_supervisord_ready()
|
||||
|
||||
return get_existing_supervisord_process()
|
||||
|
||||
def wait_for_supervisord_ready(max_wait_sec: float = 5.0, interval_sec: float = 0.1):
    """Poll for supervisord readiness without a fixed startup sleep.

    Calls get_existing_supervisord_process() repeatedly until it returns
    something other than None or until max_wait_sec has elapsed.

    Args:
        max_wait_sec: Upper bound on the total time spent waiting. The
            process is always probed at least once, even when this is zero
            or negative.
        interval_sec: Pause between probes. The final pause is shortened so
            the overall wait does not overshoot max_wait_sec.

    Returns:
        The first non-None result of get_existing_supervisord_process(), or
        None if nothing was found before the deadline.
    """
    deadline = time.monotonic() + max_wait_sec
    while True:
        supervisor = get_existing_supervisord_process()
        if supervisor is not None:
            return supervisor
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return None
        # Never sleep past the deadline on the last iteration.
        time.sleep(min(interval_sec, remaining))
|
||||
|
||||
|
||||
def get_or_create_supervisord_process(daemonize=False):
|
||||
@@ -287,17 +295,16 @@ def get_or_create_supervisord_process(daemonize=False):
|
||||
if supervisor is None:
|
||||
stop_existing_supervisord_process()
|
||||
supervisor = start_new_supervisord_process(daemonize=daemonize)
|
||||
time.sleep(0.5)
|
||||
|
||||
# wait up to 5s in case supervisord is slow to start
|
||||
if not supervisor:
|
||||
for _ in range(10):
|
||||
for _ in range(50):
|
||||
if supervisor is not None:
|
||||
print()
|
||||
break
|
||||
sys.stdout.write('.')
|
||||
sys.stdout.flush()
|
||||
time.sleep(0.5)
|
||||
time.sleep(0.1)
|
||||
supervisor = get_existing_supervisord_process()
|
||||
else:
|
||||
print()
|
||||
@@ -328,9 +335,7 @@ def start_worker(supervisor, daemon, lazy=False):
|
||||
for added in added:
|
||||
supervisor.addProcessGroup(added)
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
for _ in range(10):
|
||||
for _ in range(25):
|
||||
procs = supervisor.getAllProcessInfo()
|
||||
for proc in procs:
|
||||
if proc['name'] == daemon["name"]:
|
||||
@@ -345,8 +350,8 @@ def start_worker(supervisor, daemon, lazy=False):
|
||||
print(f" - Worker {daemon['name']}: started {proc['statename']} ({proc['description']})")
|
||||
return proc
|
||||
|
||||
# retry in a second in case it's slow to launch
|
||||
time.sleep(0.5)
|
||||
# retry in a moment in case it's slow to launch
|
||||
time.sleep(0.2)
|
||||
|
||||
raise Exception(f"Failed to start worker {daemon['name']}! Only found: {procs}")
|
||||
|
||||
|
||||
4
uv.lock
generated
4
uv.lock
generated
@@ -1,5 +1,5 @@
|
||||
version = 1
|
||||
revision = 3
|
||||
revision = 2
|
||||
requires-python = ">=3.13"
|
||||
resolution-markers = [
|
||||
"python_full_version >= '3.14' and sys_platform == 'darwin'",
|
||||
@@ -60,7 +60,7 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "archivebox"
|
||||
version = "0.9.2"
|
||||
version = "0.9.3"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "abx-pkg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
|
||||
Reference in New Issue
Block a user