working singlefile

This commit is contained in:
Nick Sweeting
2026-01-19 03:05:49 -08:00
parent b5bbc3b549
commit bef67760db
17 changed files with 498 additions and 54 deletions

View File

@@ -1020,14 +1020,14 @@ class Process(models.Model):
# Debug logging # Debug logging
import sys import sys
print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr) # print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr)
# Get parent process start time from OS # Get parent process start time from OS
try: try:
os_parent = psutil.Process(ppid) os_parent = psutil.Process(ppid)
os_parent_start = os_parent.create_time() os_parent_start = os_parent.create_time()
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr) # print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr)
return None # Parent process doesn't exist return None # Parent process doesn't exist
# Find matching Process record # Find matching Process record
@@ -1038,18 +1038,18 @@ class Process(models.Model):
started_at__gte=timezone.now() - PID_REUSE_WINDOW, started_at__gte=timezone.now() - PID_REUSE_WINDOW,
).order_by('-started_at') ).order_by('-started_at')
print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr) # print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr)
for candidate in candidates: for candidate in candidates:
if candidate.started_at: if candidate.started_at:
db_start_time = candidate.started_at.timestamp() db_start_time = candidate.started_at.timestamp()
time_diff = abs(db_start_time - os_parent_start) time_diff = abs(db_start_time - os_parent_start)
print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr) # print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr)
if time_diff < START_TIME_TOLERANCE: if time_diff < START_TIME_TOLERANCE:
print(f"DEBUG _find_parent_process: MATCH! Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr) # print(f"DEBUG _find_parent_process: MATCH! Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr)
return candidate return candidate
print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr) # print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr)
return None # No matching ArchiveBox parent process return None # No matching ArchiveBox parent process
@classmethod @classmethod
@@ -1519,7 +1519,7 @@ class Process(models.Model):
stdout_path = self.stdout_file stdout_path = self.stdout_file
stderr_path = self.stderr_file stderr_path = self.stderr_file
with open(stdout_path, 'w') as out, open(stderr_path, 'w') as err: with open(stdout_path, 'a') as out, open(stderr_path, 'a') as err:
proc = subprocess.Popen( proc = subprocess.Popen(
self.cmd, self.cmd,
cwd=working_dir, cwd=working_dir,

View File

@@ -10,6 +10,7 @@ Shows a comprehensive dashboard with:
__package__ = 'archivebox.misc' __package__ = 'archivebox.misc'
from datetime import datetime, timezone from datetime import datetime, timezone
import os
import re import re
from typing import List, Optional, Any from typing import List, Optional, Any
from collections import deque from collections import deque
@@ -23,6 +24,7 @@ from rich.panel import Panel
from rich.text import Text from rich.text import Text
from rich.table import Table from rich.table import Table
from rich.tree import Tree from rich.tree import Tree
from rich.cells import cell_len
from archivebox.config import VERSION from archivebox.config import VERSION
@@ -533,7 +535,23 @@ class CrawlQueueTreePanel:
is_pending = hook.get('is_pending', False) is_pending = hook.get('is_pending', False)
icon, color = self._hook_style(status, is_bg=is_bg, is_running=is_running, is_pending=is_pending) icon, color = self._hook_style(status, is_bg=is_bg, is_running=is_running, is_pending=is_pending)
stats = self._hook_stats(size=size, elapsed=elapsed, timeout=timeout, status=status) stats = self._hook_stats(size=size, elapsed=elapsed, timeout=timeout, status=status)
snap_node.add(Text(f"{icon} {path}{stats}", style=color)) line = Text(f"{icon} {path}{stats}", style=color)
stderr_tail = hook.get('stderr', '')
if stderr_tail:
left_str = f"{icon} {path}{stats}"
avail = self._available_width(left_str, indent=16)
trunc = getattr(self, "_truncate_tail", self._truncate_to_width)
stderr_tail = trunc(stderr_tail, avail)
if not stderr_tail:
snap_node.add(line)
continue
row = Table.grid(expand=True)
row.add_column(justify="left", ratio=1)
row.add_column(justify="right")
row.add_row(line, Text(stderr_tail, style="grey70"))
snap_node.add(row)
else:
snap_node.add(line)
trees.append(crawl_tree) trees.append(crawl_tree)
content = Group(*trees) content = Group(*trees)
@@ -561,7 +579,7 @@ class CrawlQueueTreePanel:
if status == 'succeeded': if status == 'succeeded':
return '', 'green' return '', 'green'
if status == 'failed': if status == 'failed':
return '⚠️', 'yellow' return '', 'red'
if status == 'skipped': if status == 'skipped':
return '', 'grey53' return '', 'grey53'
if is_pending: if is_pending:
@@ -595,6 +613,37 @@ class CrawlQueueTreePanel:
return f" ({size_part} | {time_part})" if time_part else f" ({size_part})" return f" ({size_part} | {time_part})" if time_part else f" ({size_part})"
return '' return ''
@staticmethod
def _terminal_width() -> int:
try:
return os.get_terminal_size().columns
except OSError:
return 120
@staticmethod
def _truncate_to_width(text: str, max_width: int) -> str:
if not text or max_width <= 0:
return ''
t = Text(text)
t.truncate(max_width, overflow="ellipsis")
return t.plain
@staticmethod
def _truncate_tail(text: str, max_width: int) -> str:
if not text or max_width <= 0:
return ''
if cell_len(text) <= max_width:
return text
if max_width <= 1:
return ''
return f"{text[-(max_width - 1):]}"
def _available_width(self, left_text: str, indent: int = 0) -> int:
width = self._terminal_width()
base = max(0, width - cell_len(left_text) - indent - 6)
cap = max(0, (width * 2) // 5)
return max(0, min(base, cap))
class ArchiveBoxProgressLayout: class ArchiveBoxProgressLayout:
""" """
@@ -631,7 +680,7 @@ class ArchiveBoxProgressLayout:
# Top-level split: crawl_queue, crawl_tree, processes # Top-level split: crawl_queue, crawl_tree, processes
layout.split( layout.split(
Layout(name="crawl_queue", size=3), Layout(name="crawl_queue", size=3),
Layout(name="crawl_tree", size=14), Layout(name="crawl_tree", size=20),
Layout(name="processes", ratio=1), Layout(name="processes", ratio=1),
) )
@@ -671,6 +720,8 @@ class ArchiveBoxProgressLayout:
cmd = getattr(process, 'cmd', []) cmd = getattr(process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else '' hook_name = hook_path.name if hook_path else ''
if '.bg.' in hook_name:
continue
if '.bg.' not in hook_name: if '.bg.' not in hook_name:
fg_running = True fg_running = True
break break
@@ -684,6 +735,8 @@ class ArchiveBoxProgressLayout:
cmd = getattr(process, 'cmd', []) cmd = getattr(process, 'cmd', [])
hook_path = Path(cmd[1]) if len(cmd) > 1 else None hook_path = Path(cmd[1]) if len(cmd) > 1 else None
hook_name = hook_path.name if hook_path else '' hook_name = hook_path.name if hook_path else ''
if '.bg.' in hook_name:
continue
if '.bg.' not in hook_name: if '.bg.' not in hook_name:
fg_pending = True fg_pending = True
break break
@@ -701,6 +754,10 @@ class ArchiveBoxProgressLayout:
is_bg = '.bg.' in hook_name is_bg = '.bg.' in hook_name
except Exception: except Exception:
is_bg = False is_bg = False
if is_hook and is_bg:
continue
if not self._has_log_lines(process):
continue
is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None)) is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
max_lines = 2 if is_pending else (4 if is_bg else 7) max_lines = 2 if is_pending else (4 if is_bg else 7)
panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg, bg_terminating=bg_terminating)) panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg, bg_terminating=bg_terminating))
@@ -718,6 +775,17 @@ class ArchiveBoxProgressLayout:
def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None: def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None:
"""Update the crawl queue tree panel.""" """Update the crawl queue tree panel."""
self.crawl_queue_tree.update_crawls(crawls) self.crawl_queue_tree.update_crawls(crawls)
# Auto-size crawl tree panel to content
line_count = 0
for crawl in crawls:
line_count += 1
for snap in crawl.get('snapshots', []) or []:
line_count += 1
if snap.get('output_path'):
line_count += 1
for _ in snap.get('hooks', []) or []:
line_count += 1
self.layout["crawl_tree"].size = max(4, line_count + 2)
def log_event(self, message: str, style: str = "white") -> None: def log_event(self, message: str, style: str = "white") -> None:
"""Add an event to the orchestrator log.""" """Add an event to the orchestrator log."""
@@ -767,8 +835,28 @@ class ArchiveBoxProgressLayout:
timeout=hook.get('timeout', ''), timeout=hook.get('timeout', ''),
status=status, status=status,
) )
stderr_tail = hook.get('stderr', '')
hook_line = f" {icon} {path}{stats}".strip() hook_line = f" {icon} {path}{stats}".strip()
if stderr_tail:
avail = self.crawl_queue_tree._available_width(hook_line, indent=16)
trunc = getattr(self.crawl_queue_tree, "_truncate_tail", self.crawl_queue_tree._truncate_to_width)
stderr_tail = trunc(stderr_tail, avail)
if stderr_tail:
hook_line = f"{hook_line} {stderr_tail}"
if hook_line: if hook_line:
lines.append(("crawl_tree", hook_line)) lines.append(("crawl_tree", hook_line))
return lines return lines
@staticmethod
def _has_log_lines(process: Any) -> bool:
try:
stdout_lines = list(process.tail_stdout(lines=1, follow=False))
if any(line.strip() for line in stdout_lines):
return True
stderr_lines = list(process.tail_stderr(lines=1, follow=False))
if any(line.strip() for line in stderr_lines):
return True
except Exception:
return False
return False

View File

@@ -2,7 +2,7 @@
""" """
Submit a URL to archive.org for archiving. Submit a URL to archive.org for archiving.
Usage: on_Snapshot__archivedotorg.py --url=<url> --snapshot-id=<uuid> Usage: on_Snapshot__archivedotorg.bg.py --url=<url> --snapshot-id=<uuid>
Output: Writes archive.org.txt to $PWD with the archived URL Output: Writes archive.org.txt to $PWD with the archived URL
Environment variables: Environment variables:

View File

@@ -803,9 +803,16 @@ try {
* @returns {string} - 32-character extension ID * @returns {string} - 32-character extension ID
*/ */
function getExtensionId(unpacked_path) { function getExtensionId(unpacked_path) {
let resolved_path = unpacked_path;
try {
resolved_path = fs.realpathSync(unpacked_path);
} catch (err) {
// Use the provided path if realpath fails
resolved_path = unpacked_path;
}
// Chrome uses a SHA256 hash of the unpacked extension directory path // Chrome uses a SHA256 hash of the unpacked extension directory path
const hash = crypto.createHash('sha256'); const hash = crypto.createHash('sha256');
hash.update(Buffer.from(unpacked_path, 'utf-8')); hash.update(Buffer.from(resolved_path, 'utf-8'));
// Convert first 32 hex chars to characters in the range 'a'-'p' // Convert first 32 hex chars to characters in the range 'a'-'p'
const detected_extension_id = Array.from(hash.digest('hex')) const detected_extension_id = Array.from(hash.digest('hex'))
@@ -978,6 +985,8 @@ async function isTargetExtension(target) {
let extension_id = null; let extension_id = null;
let manifest_version = null; let manifest_version = null;
let manifest = null;
let manifest_name = null;
const target_is_extension = is_chrome_extension || target_is_bg; const target_is_extension = is_chrome_extension || target_is_bg;
if (target_is_extension) { if (target_is_extension) {
@@ -985,8 +994,9 @@ async function isTargetExtension(target) {
extension_id = target_url?.split('://')[1]?.split('/')[0] || null; extension_id = target_url?.split('://')[1]?.split('/')[0] || null;
if (target_ctx) { if (target_ctx) {
const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest()); manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
manifest_version = manifest?.manifest_version || null; manifest_version = manifest?.manifest_version || null;
manifest_name = manifest?.name || null;
} }
} catch (err) { } catch (err) {
// Failed to get extension metadata // Failed to get extension metadata
@@ -1001,6 +1011,8 @@ async function isTargetExtension(target) {
target_url, target_url,
extension_id, extension_id,
manifest_version, manifest_version,
manifest,
manifest_name,
}; };
} }
@@ -1053,14 +1065,23 @@ async function loadExtensionFromTarget(extensions, target) {
// Trigger extension toolbar button click // Trigger extension toolbar button click
dispatchAction: async (tab) => { dispatchAction: async (tab) => {
return await target_ctx.evaluate((tabId) => { return await target_ctx.evaluate(async (tab) => {
return new Promise((resolve) => { tab = tab || (await new Promise((resolve) =>
chrome.action.onClicked.addListener((tab) => { chrome.tabs.query({ currentWindow: true, active: true }, ([tab]) => resolve(tab))
resolve({ success: true, tab }); ));
});
chrome.action.openPopup(); // Manifest V3: chrome.action
}); if (chrome.action?.onClicked?.dispatch) {
}, tab?.id || null); return await chrome.action.onClicked.dispatch(tab);
}
// Manifest V2: chrome.browserAction
if (chrome.browserAction?.onClicked?.dispatch) {
return await chrome.browserAction.onClicked.dispatch(tab);
}
throw new Error('Extension action dispatch not available');
}, tab || null);
}, },
// Send message to extension // Send message to extension

View File

@@ -118,9 +118,7 @@ process.on('SIGTERM', () => cleanup('SIGTERM'));
process.on('SIGINT', () => cleanup('SIGINT')); process.on('SIGINT', () => cleanup('SIGINT'));
// Try to find the crawl's Chrome session // Try to find the crawl's Chrome session
function findCrawlChromeSession(crawlId) { function findCrawlChromeSession() {
if (!crawlId) return null;
// Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py // Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', ''); const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
if (!crawlOutputDir) return null; if (!crawlOutputDir) return null;
@@ -301,7 +299,7 @@ async function main() {
const args = parseArgs(); const args = parseArgs();
const url = args.url; const url = args.url;
const snapshotId = args.snapshot_id; const snapshotId = args.snapshot_id;
const crawlId = args.crawl_id; const crawlId = args.crawl_id || getEnv('CRAWL_ID', '');
if (!url || !snapshotId) { if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]'); console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
@@ -332,15 +330,14 @@ async function main() {
} }
// Try to use existing crawl Chrome session // Try to use existing crawl Chrome session
const crawlSession = findCrawlChromeSession(crawlId); const crawlSession = findCrawlChromeSession();
let result; let result;
if (crawlSession) { if (crawlSession) {
console.log(`[*] Found existing Chrome session from crawl ${crawlId}`); console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid); result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
} else { } else {
console.log(`[*] No crawl Chrome session found, launching new Chrome`); result = { success: false, error: 'No crawl Chrome session found (CRAWL_OUTPUT_DIR missing or chrome not running)' };
result = await launchNewChrome(url, binary);
} }
if (result.success) { if (result.success) {

View File

@@ -2,7 +2,7 @@
""" """
Extract favicon from a URL. Extract favicon from a URL.
Usage: on_Snapshot__favicon.py --url=<url> --snapshot-id=<uuid> Usage: on_Snapshot__favicon.bg.py --url=<url> --snapshot-id=<uuid>
Output: Writes favicon.ico to $PWD Output: Writes favicon.ico to $PWD
Environment variables: Environment variables:

View File

@@ -17,6 +17,7 @@ Environment variables:
import json import json
import os import os
import shutil
import subprocess import subprocess
import sys import sys
import threading import threading
@@ -87,6 +88,27 @@ def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
return default if default is not None else [] return default if default is not None else []
def get_binary_shebang(binary_path: str) -> str | None:
"""Return interpreter from shebang line if present (e.g., /path/to/python)."""
try:
with open(binary_path, 'r', encoding='utf-8') as f:
first_line = f.readline().strip()
if first_line.startswith('#!'):
return first_line[2:].strip().split(' ')[0]
except Exception:
pass
return None
def resolve_binary_path(binary: str) -> str | None:
"""Resolve binary to an absolute path if possible."""
if not binary:
return None
if Path(binary).is_file():
return binary
return shutil.which(binary)
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
""" """
@@ -118,10 +140,12 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
# Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary # Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary
wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py' wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py'
resolved_binary = resolve_binary_path(binary) or binary
if wrapper_path.exists(): if wrapper_path.exists():
cmd = [sys.executable, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)] forumdl_python = get_binary_shebang(resolved_binary) or sys.executable
cmd = [forumdl_python, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)]
else: else:
cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)] cmd = [resolved_binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
if not check_ssl: if not check_ssl:
cmd.append('--no-check-certificate') cmd.append('--no-check-certificate')
@@ -187,7 +211,7 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
if 'unable to extract' in stderr_lower: if 'unable to extract' in stderr_lower:
return False, None, 'Unable to extract forum info' return False, None, 'Unable to extract forum info'
return False, None, f'forum-dl error: {stderr[:200]}' return False, None, f'forum-dl error: {stderr}'
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
return False, None, f'Timed out after {timeout} seconds' return False, None, f'Timed out after {timeout} seconds'

View File

@@ -196,7 +196,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
if 'unable to extract' in stderr_lower: if 'unable to extract' in stderr_lower:
return False, None, 'Unable to extract gallery info' return False, None, 'Unable to extract gallery info'
return False, None, f'gallery-dl error: {stderr[:200]}' return False, None, f'gallery-dl error: {stderr}'
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
return False, None, f'Timed out after {timeout} seconds' return False, None, f'Timed out after {timeout} seconds'

View File

@@ -82,6 +82,9 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
# Get text version # Get text version
cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text'] cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text']
result_text = subprocess.run(cmd_text, stdout=subprocess.PIPE, timeout=timeout, text=True) result_text = subprocess.run(cmd_text, stdout=subprocess.PIPE, timeout=timeout, text=True)
if result_text.stdout:
sys.stderr.write(result_text.stdout)
sys.stderr.flush()
if result_text.returncode != 0: if result_text.returncode != 0:
return False, None, f'postlight-parser failed (exit={result_text.returncode})' return False, None, f'postlight-parser failed (exit={result_text.returncode})'
@@ -101,6 +104,9 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
# Get HTML version # Get HTML version
cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html'] cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html']
result_html = subprocess.run(cmd_html, stdout=subprocess.PIPE, timeout=timeout, text=True) result_html = subprocess.run(cmd_html, stdout=subprocess.PIPE, timeout=timeout, text=True)
if result_html.stdout:
sys.stderr.write(result_html.stdout)
sys.stderr.flush()
try: try:
html_json = json.loads(result_html.stdout) html_json = json.loads(result_html.stdout)

View File

@@ -109,6 +109,10 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
cmd = [binary, *readability_args, *readability_args_extra, html_source] cmd = [binary, *readability_args, *readability_args_extra, html_source]
result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True) result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True)
if result.stdout:
sys.stderr.write(result.stdout)
sys.stderr.flush()
if result.returncode != 0: if result.returncode != 0:
return False, None, f'readability-extractor failed (exit={result.returncode})' return False, None, f'readability-extractor failed (exit={result.returncode})'

View File

@@ -116,7 +116,19 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
// Trigger the extension's action (toolbar button click) // Trigger the extension's action (toolbar button click)
console.error('[singlefile] Dispatching extension action...'); console.error('[singlefile] Dispatching extension action...');
await extension.dispatchAction(); try {
const actionTimeoutMs = options.actionTimeoutMs || 5000;
const actionPromise = extension.dispatchAction();
const actionResult = await Promise.race([
actionPromise,
wait(actionTimeoutMs).then(() => 'timeout'),
]);
if (actionResult === 'timeout') {
console.error(`[singlefile] Extension action did not resolve within ${actionTimeoutMs}ms, continuing...`);
}
} catch (err) {
console.error(`[singlefile] Extension action error: ${err.message || err}`);
}
// Wait for file to appear in downloads directory // Wait for file to appear in downloads directory
const check_delay = 3000; // 3 seconds const check_delay = 3000; // 3 seconds

View File

@@ -27,6 +27,7 @@ import threading
import time import time
from urllib.request import urlopen from urllib.request import urlopen
from pathlib import Path from pathlib import Path
import shutil
import rich_click as click import rich_click as click
@@ -142,6 +143,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message) Returns: (success, output_path, error_message)
""" """
print(f'[singlefile] CLI mode start url={url}', file=sys.stderr)
# Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader) # Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader)
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120) timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '') user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
@@ -172,8 +174,10 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
cdp_remote_url = None cdp_remote_url = None
if cdp_remote_url: if cdp_remote_url:
print(f'[singlefile] Using existing Chrome session: {cdp_remote_url}', file=sys.stderr)
cmd.extend(['--browser-server', cdp_remote_url]) cmd.extend(['--browser-server', cdp_remote_url])
elif chrome: elif chrome:
print(f'[singlefile] Launching Chrome binary: {chrome}', file=sys.stderr)
cmd.extend(['--browser-executable-path', chrome]) cmd.extend(['--browser-executable-path', chrome])
# Pass Chrome arguments (only when launching a new browser) # Pass Chrome arguments (only when launching a new browser)
@@ -200,6 +204,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
output_path = output_dir / OUTPUT_FILE output_path = output_dir / OUTPUT_FILE
cmd.extend([url, str(output_path)]) cmd.extend([url, str(output_path)])
print(f'[singlefile] CLI command: {" ".join(cmd[:6])} ...', file=sys.stderr)
try: try:
output_lines: list[str] = [] output_lines: list[str] = []
@@ -258,36 +263,93 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]: def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]:
"""Save using the SingleFile Chrome extension via existing Chrome session.""" """Save using the SingleFile Chrome extension via existing Chrome session."""
print(f'[singlefile] Extension mode start url={url}', file=sys.stderr)
# Only attempt if chrome session exists # Only attempt if chrome session exists
cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10))) cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
if not cdp_url: if not cdp_url:
print('[singlefile] No chrome session (cdp_url.txt missing)', file=sys.stderr)
return False, None, 'No Chrome session available' return False, None, 'No Chrome session available'
if not EXTENSION_SAVE_SCRIPT.exists(): if not EXTENSION_SAVE_SCRIPT.exists():
print(f'[singlefile] Missing helper script: {EXTENSION_SAVE_SCRIPT}', file=sys.stderr)
return False, None, 'SingleFile extension helper script missing' return False, None, 'SingleFile extension helper script missing'
node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node') node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node')
downloads_dir = get_env('CHROME_DOWNLOADS_DIR', '')
extensions_dir = get_env('CHROME_EXTENSIONS_DIR', '')
cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}'] cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}']
print(f'[singlefile] cdp_url={cdp_url}', file=sys.stderr)
print(f'[singlefile] node={node_binary}', file=sys.stderr)
node_resolved = shutil.which(node_binary) if node_binary else None
print(f'[singlefile] node_resolved={node_resolved}', file=sys.stderr)
print(f'[singlefile] PATH={os.environ.get("PATH","")}', file=sys.stderr)
if downloads_dir:
print(f'[singlefile] CHROME_DOWNLOADS_DIR={downloads_dir}', file=sys.stderr)
if extensions_dir:
print(f'[singlefile] CHROME_EXTENSIONS_DIR={extensions_dir}', file=sys.stderr)
print(f'[singlefile] helper_cmd={" ".join(cmd)}', file=sys.stderr)
try: try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout) output_lines: list[str] = []
except subprocess.TimeoutExpired: error_lines: list[str] = []
return False, None, f'Timed out after {timeout} seconds' process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
bufsize=1,
)
def _read_stream(stream, sink, label: str) -> None:
if not stream:
return
for line in stream:
sink.append(line)
sys.stderr.write(line)
sys.stderr.flush()
stdout_thread = threading.Thread(target=_read_stream, args=(process.stdout, output_lines, 'stdout'), daemon=True)
stderr_thread = threading.Thread(target=_read_stream, args=(process.stderr, error_lines, 'stderr'), daemon=True)
stdout_thread.start()
stderr_thread.start()
try:
process.wait(timeout=timeout)
except subprocess.TimeoutExpired:
process.kill()
stdout_thread.join(timeout=1)
stderr_thread.join(timeout=1)
print(f'[singlefile] Extension helper timed out after {timeout}s', file=sys.stderr)
return False, None, f'Timed out after {timeout} seconds'
stdout_thread.join(timeout=1)
stderr_thread.join(timeout=1)
result_stdout = ''.join(output_lines).encode('utf-8', errors='replace')
result_stderr = ''.join(error_lines).encode('utf-8', errors='replace')
result_returncode = process.returncode
except Exception as e: except Exception as e:
print(f'[singlefile] Extension helper error: {type(e).__name__}: {e}', file=sys.stderr)
return False, None, f'{type(e).__name__}: {e}' return False, None, f'{type(e).__name__}: {e}'
if result.returncode == 0: print(f'[singlefile] helper_returncode={result_returncode}', file=sys.stderr)
print(f'[singlefile] helper_stdout_len={len(result_stdout or b"")}', file=sys.stderr)
print(f'[singlefile] helper_stderr_len={len(result_stderr or b"")}', file=sys.stderr)
if result_returncode == 0:
# Prefer explicit stdout path, fallback to local output file # Prefer explicit stdout path, fallback to local output file
out_text = result.stdout.decode('utf-8', errors='replace').strip() out_text = result_stdout.decode('utf-8', errors='replace').strip()
if out_text and Path(out_text).exists(): if out_text and Path(out_text).exists():
print(f'[singlefile] Extension output: {out_text}', file=sys.stderr)
return True, out_text, '' return True, out_text, ''
output_path = Path(OUTPUT_DIR) / OUTPUT_FILE output_path = Path(OUTPUT_DIR) / OUTPUT_FILE
if output_path.exists() and output_path.stat().st_size > 0: if output_path.exists() and output_path.stat().st_size > 0:
print(f'[singlefile] Extension output: {output_path}', file=sys.stderr)
return True, str(output_path), '' return True, str(output_path), ''
return False, None, 'SingleFile extension completed but no output file found' return False, None, 'SingleFile extension completed but no output file found'
stderr = result.stderr.decode('utf-8', errors='replace').strip() stderr = result_stderr.decode('utf-8', errors='replace').strip()
stdout = result.stdout.decode('utf-8', errors='replace').strip() stdout = result_stdout.decode('utf-8', errors='replace').strip()
detail = stderr or stdout detail = stderr or stdout
return False, None, detail or 'SingleFile extension failed' return False, None, detail or 'SingleFile extension failed'
@@ -298,6 +360,7 @@ def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str |
def main(url: str, snapshot_id: str): def main(url: str, snapshot_id: str):
"""Archive a URL using SingleFile.""" """Archive a URL using SingleFile."""
print(f'[singlefile] Hook starting pid={os.getpid()} url={url}', file=sys.stderr)
output = None output = None
status = 'failed' status = 'failed'
error = '' error = ''
@@ -318,11 +381,6 @@ def main(url: str, snapshot_id: str):
# Prefer SingleFile extension via existing Chrome session # Prefer SingleFile extension via existing Chrome session
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120) timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
success, output, error = save_singlefile_with_extension(url, timeout) success, output, error = save_singlefile_with_extension(url, timeout)
# Fallback to single-file-cli if extension path failed
if not success:
binary = get_env('SINGLEFILE_BINARY', 'single-file')
success, output, error = save_singlefile(url, binary)
status = 'succeeded' if success else 'failed' status = 'succeeded' if success else 'failed'
except Exception as e: except Exception as e:

View File

@@ -0,0 +1,207 @@
#!/usr/bin/env node
/**
* Save a page using the SingleFile Chrome extension via an existing Chrome session.
*
* Usage: singlefile_extension_save.js --url=<url>
* Output: prints saved file path on success
*/
const fs = require('fs');
const path = require('path');
// Chrome session dir is resolved relative to this hook's directory.
const CHROME_SESSION_DIR = '../chrome';
// Directory where the SingleFile extension writes saved pages.
// Honors an explicit CHROME_DOWNLOADS_DIR override, otherwise derived from
// DATA_DIR + active persona. Re-exported via the env var so child processes
// (and the chrome helper utilities) agree on the same path.
const DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
  path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
process.env.CHROME_DOWNLOADS_DIR = DOWNLOADS_DIR;
/**
 * Point Chrome's download behavior at `downloadDir` (best-effort).
 *
 * Creates the directory if needed, then tries the legacy
 * `Page.setDownloadBehavior` CDP command and falls back to
 * `Browser.setDownloadBehavior` on protocol versions that dropped the
 * Page-level command. Failures are logged to stderr, never thrown.
 */
async function setDownloadDir(page, downloadDir) {
  try {
    await fs.promises.mkdir(downloadDir, { recursive: true });
    const cdp = await page.target().createCDPSession();
    const params = { behavior: 'allow', downloadPath: downloadDir };
    try {
      await cdp.send('Page.setDownloadBehavior', params);
    } catch (_pageCmdErr) {
      // Fallback for newer protocol versions
      await cdp.send('Browser.setDownloadBehavior', params);
    }
  } catch (err) {
    console.error(`[⚠️] Failed to set download directory: ${err.message || err}`);
  }
}
/**
 * Parse `--key=value` CLI arguments into a plain object.
 *
 * Keys have their leading `--` stripped and dashes converted to underscores.
 * A bare flag (`--flag`, no `=`) maps to `true`; values containing `=` are
 * preserved intact (only the first `=` separates key from value).
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...valueChunks] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = valueChunks.join('=') || true;
  }
  return parsed;
}
/**
 * Entry point: save --url using the SingleFile Chrome extension running in an
 * already-open shared Chrome session.
 *
 * Flow: ensure the extension is installed/cached, connect to the session,
 * poll CDP targets for the extension's service worker, attach a
 * dispatchAction handle, set the download directory, trigger the save, and
 * print the resulting file path on stdout (all progress goes to stderr).
 *
 * Exit codes: 0 saved OK, 1 usage error, 2 extension not installed,
 * 3 save produced no output file, 4 unexpected error, 5 extension target
 * never appeared, 6 dispatchAction missing after attaching.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  if (!url) {
    console.error('Usage: singlefile_extension_save.js --url=<url>');
    process.exit(1);
  }
  console.error(`[singlefile] helper start url=${url}`);
  console.error(`[singlefile] downloads_dir=${DOWNLOADS_DIR}`);
  if (process.env.CHROME_EXTENSIONS_DIR) {
    console.error(`[singlefile] extensions_dir=${process.env.CHROME_EXTENSIONS_DIR}`);
  }
  try {
    // Heavy deps are required lazily so usage errors above stay fast.
    console.error('[singlefile] loading dependencies...');
    const puppeteer = require('puppeteer-core');
    const chromeUtils = require('../chrome/chrome_utils.js');
    const {
      EXTENSION,
      saveSinglefileWithExtension,
    } = require('./on_Crawl__82_singlefile_install.js');
    console.error('[singlefile] dependencies loaded');
    // Ensure extension is installed and metadata is cached
    console.error('[singlefile] ensuring extension cache...');
    const extension = await chromeUtils.installExtensionWithCache(
      EXTENSION,
      { extensionsDir: process.env.CHROME_EXTENSIONS_DIR }
    );
    if (!extension) {
      console.error('[❌] SingleFile extension not installed');
      process.exit(2);
    }
    // Prefer the runtime id derived from the unpacked path when resolvable;
    // the cached id may differ from what Chrome actually assigned.
    if (extension.unpacked_path) {
      const runtimeId = chromeUtils.getExtensionId(extension.unpacked_path);
      if (runtimeId) {
        extension.id = runtimeId;
      }
    }
    console.error(`[singlefile] extension ready id=${extension.id} version=${extension.version}`);
    // Connect to existing Chrome session
    console.error('[singlefile] connecting to chrome session...');
    const { browser, page } = await chromeUtils.connectToPage({
      chromeSessionDir: CHROME_SESSION_DIR,
      timeoutMs: 60000,
      puppeteer,
    });
    console.error('[singlefile] connected to chrome');
    try {
      // Ensure CDP target discovery is enabled so service_worker targets appear
      try {
        const client = await page.createCDPSession();
        await client.send('Target.setDiscoverTargets', { discover: true });
        await client.send('Target.setAutoAttach', { autoAttach: true, waitForDebuggerOnStart: false, flatten: true });
      } catch (err) {
        // Non-fatal: the polling loop below may still see the target.
        console.error(`[singlefile] failed to enable target discovery: ${err.message || err}`);
      }
      // Wait for extension target to be available, then attach dispatchAction
      console.error('[singlefile] waiting for extension target...');
      const deadline = Date.now() + 30000; // give the extension 30s to spin up
      let matchTarget = null;
      let matchInfo = null;
      let lastLog = 0; // throttles the "waiting..." diagnostics below
      const wantedName = (extension.name || 'singlefile').toLowerCase();
      while (Date.now() < deadline && !matchTarget) {
        const targets = browser.targets();
        for (const target of targets) {
          const info = await chromeUtils.isTargetExtension(target);
          if (!info?.target_is_extension || !info?.extension_id) {
            continue;
          }
          // Match loosely on either the manifest name or the target URL,
          // since either may identify the SingleFile extension.
          const manifestName = (info.manifest_name || '').toLowerCase();
          const targetUrl = (info.target_url || '').toLowerCase();
          const nameMatches = manifestName.includes(wantedName) || manifestName.includes('singlefile') || manifestName.includes('single-file');
          const urlMatches = targetUrl.includes('singlefile') || targetUrl.includes('single-file') || targetUrl.includes('single-file-extension');
          if (nameMatches || urlMatches) {
            matchTarget = target;
            matchInfo = info;
            break;
          }
        }
        if (!matchTarget) {
          // Log a summary of visible extension targets at most every 5s.
          if (Date.now() - lastLog > 5000) {
            const targetsSummary = [];
            for (const target of targets) {
              const info = await chromeUtils.isTargetExtension(target);
              if (!info?.target_is_extension) {
                continue;
              }
              targetsSummary.push({
                type: info.target_type,
                url: info.target_url,
                extensionId: info.extension_id,
                manifestName: info.manifest_name,
              });
            }
            console.error(`[singlefile] waiting... targets total=${targets.length} extensions=${targetsSummary.length} details=${JSON.stringify(targetsSummary)}`);
            lastLog = Date.now();
          }
          await new Promise(r => setTimeout(r, 500));
        }
      }
      if (!matchTarget || !matchInfo) {
        const targets = chromeUtils.getExtensionTargets(browser);
        console.error(`[singlefile] extension target not found (name=${extension.name})`);
        console.error(`[singlefile] available targets: ${JSON.stringify(targets)}`);
        await browser.disconnect();
        process.exit(5);
      }
      // Use the runtime extension id from the matched target
      extension.id = matchInfo.extension_id;
      console.error('[singlefile] loading extension from target...');
      await chromeUtils.loadExtensionFromTarget([extension], matchTarget);
      if (typeof extension.dispatchAction !== 'function') {
        const targets = chromeUtils.getExtensionTargets(browser);
        console.error(`[singlefile] extension dispatchAction missing for id=${extension.id}`);
        console.error(`[singlefile] available targets: ${JSON.stringify(targets)}`);
        await browser.disconnect();
        process.exit(6);
      }
      console.error('[singlefile] setting download dir...');
      await setDownloadDir(page, DOWNLOADS_DIR);
      console.error('[singlefile] triggering save via extension...');
      const output = await saveSinglefileWithExtension(page, extension, { downloadsDir: DOWNLOADS_DIR });
      if (output && fs.existsSync(output)) {
        // stdout carries ONLY the saved path so callers can capture it.
        console.error(`[singlefile] saved: ${output}`);
        console.log(output);
        await browser.disconnect();
        process.exit(0);
      }
      console.error('[❌] SingleFile extension did not produce output');
      await browser.disconnect();
      process.exit(3);
    } catch (err) {
      // Disconnect (never close) the shared browser before re-raising so the
      // outer handler prints the error and exits with code 4.
      await browser.disconnect();
      throw err;
    }
  } catch (err) {
    console.error(`[❌] ${err.message || err}`);
    process.exit(4);
  }
}
// Run only when executed directly as a CLI script, not when require()d.
if (require.main === module) {
  main();
}

View File

@@ -483,8 +483,7 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run( result = subprocess.run(
['node', str(script_path)], ['node', str(script_path)],
cwd=str(tmpdir, cwd=str(tmpdir),
env=get_test_env()),
capture_output=True, capture_output=True,
text=True, text=True,
env=env, env=env,

View File

@@ -144,6 +144,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
try: try:
result = subprocess.run( result = subprocess.run(
cmd, cmd,
capture_output=True,
text=True,
timeout=timeout * 2, # Allow extra time for large downloads timeout=timeout * 2, # Allow extra time for large downloads
) )
@@ -166,7 +168,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
output_path = str(html_files[0]) if html_files else str(downloaded_files[0]) output_path = str(html_files[0]) if html_files else str(downloaded_files[0])
# Parse download stats from wget output # Parse download stats from wget output
output_tail = result.stderr.decode('utf-8', errors='replace').strip().split('\n')[-3:] stderr_text = (result.stderr or '')
output_tail = stderr_text.strip().split('\n')[-3:] if stderr_text else []
files_count = len(downloaded_files) files_count = len(downloaded_files)
return True, output_path, '' return True, output_path, ''

View File

@@ -201,7 +201,7 @@ def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]:
if 'Unable to extract' in stderr: if 'Unable to extract' in stderr:
return False, None, 'Unable to extract media info' return False, None, 'Unable to extract media info'
return False, None, f'yt-dlp error: {stderr[:200]}' return False, None, f'yt-dlp error: {stderr}'
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
return False, None, f'Timed out after {timeout} seconds' return False, None, f'Timed out after {timeout} seconds'

View File

@@ -459,7 +459,6 @@ class Orchestrator:
# Enable progress layout only in TTY + foreground mode # Enable progress layout only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle show_progress = IS_TTY and self.exit_on_idle
plain_output = not IS_TTY plain_output = not IS_TTY
self.on_startup() self.on_startup()
if not show_progress: if not show_progress:
@@ -520,7 +519,6 @@ class Orchestrator:
def _run_orchestrator_loop(self, progress_layout, plain_output: bool = False): def _run_orchestrator_loop(self, progress_layout, plain_output: bool = False):
"""Run the main orchestrator loop with optional progress display.""" """Run the main orchestrator loop with optional progress display."""
last_queue_sizes = {}
last_snapshot_count = None last_snapshot_count = None
tick_count = 0 tick_count = 0
last_plain_lines: set[tuple[str, str]] = set() last_plain_lines: set[tuple[str, str]] = set()
@@ -611,6 +609,21 @@ class Orchestrator:
seconds = max(0.0, float(total_seconds)) seconds = max(0.0, float(total_seconds))
return f"{seconds:.1f}s" return f"{seconds:.1f}s"
def _tail_stderr_line(proc) -> str:
try:
path = getattr(proc, 'stderr_file', None)
if not path or not path.exists():
return ''
with open(path, 'rb') as f:
f.seek(0, os.SEEK_END)
size = f.tell()
f.seek(max(0, size - 4096))
data = f.read().decode('utf-8', errors='ignore')
lines = [ln.strip() for ln in data.splitlines() if ln.strip()]
return lines[-1] if lines else ''
except Exception:
return ''
tree_data: list[dict] = [] tree_data: list[dict] = []
for crawl in crawls: for crawl in crawls:
urls = crawl.get_urls_list() urls = crawl.get_urls_list()
@@ -684,7 +697,10 @@ class Orchestrator:
elapsed = '' elapsed = ''
timeout = '' timeout = ''
size = '' size = ''
stderr_tail = ''
if ar: if ar:
if ar.process_id and ar.process:
stderr_tail = _tail_stderr_line(ar.process)
if ar.status == ArchiveResult.StatusChoices.STARTED: if ar.status == ArchiveResult.StatusChoices.STARTED:
status = 'started' status = 'started'
is_running = True is_running = True
@@ -700,6 +716,8 @@ class Orchestrator:
timeout = _format_seconds(hook_timeout) timeout = _format_seconds(hook_timeout)
else: else:
status = ar.status status = ar.status
if ar.process_id and ar.process and ar.process.exit_code == 137:
status = 'failed'
is_pending = False is_pending = False
start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None) start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
end_ts = ar.end_ts or (ar.process.ended_at if ar.process_id and ar.process else None) end_ts = ar.end_ts or (ar.process.ended_at if ar.process_id and ar.process else None)
@@ -724,6 +742,7 @@ class Orchestrator:
'is_running': is_running, 'is_running': is_running,
'is_pending': is_pending, 'is_pending': is_pending,
'hook_name': hook_name, 'hook_name': hook_name,
'stderr': stderr_tail,
}) })
hooks = [] hooks = []
@@ -734,6 +753,7 @@ class Orchestrator:
any_succeeded = any(h['status'] == ArchiveResult.StatusChoices.SUCCEEDED for h in hook_entries) any_succeeded = any(h['status'] == ArchiveResult.StatusChoices.SUCCEEDED for h in hook_entries)
any_skipped = any(h['status'] == ArchiveResult.StatusChoices.SKIPPED for h in hook_entries) any_skipped = any(h['status'] == ArchiveResult.StatusChoices.SKIPPED for h in hook_entries)
stderr_tail = ''
if running: if running:
status = 'started' status = 'started'
is_running = True is_running = True
@@ -741,6 +761,7 @@ class Orchestrator:
is_bg = running['is_bg'] is_bg = running['is_bg']
elapsed = running.get('elapsed', '') elapsed = running.get('elapsed', '')
timeout = running.get('timeout', '') timeout = running.get('timeout', '')
stderr_tail = running.get('stderr', '')
size = '' size = ''
elif pending: elif pending:
status = 'pending' status = 'pending'
@@ -749,6 +770,7 @@ class Orchestrator:
is_bg = pending['is_bg'] is_bg = pending['is_bg']
elapsed = pending.get('elapsed', '') or _format_seconds(0) elapsed = pending.get('elapsed', '') or _format_seconds(0)
timeout = pending.get('timeout', '') timeout = pending.get('timeout', '')
stderr_tail = pending.get('stderr', '')
size = '' size = ''
else: else:
is_running = False is_running = False
@@ -762,6 +784,10 @@ class Orchestrator:
status = 'skipped' status = 'skipped'
else: else:
status = 'skipped' status = 'skipped'
for h in hook_entries:
if h.get('stderr'):
stderr_tail = h['stderr']
break
total_elapsed = 0.0 total_elapsed = 0.0
has_elapsed = False has_elapsed = False
for h in hook_entries: for h in hook_entries:
@@ -793,6 +819,7 @@ class Orchestrator:
'is_bg': is_bg, 'is_bg': is_bg,
'is_running': is_running, 'is_running': is_running,
'is_pending': is_pending, 'is_pending': is_pending,
'stderr': stderr_tail,
}) })
snap_label = _abbrev(f"{str(snap.id)[-8:]} {snap.url or ''}".strip(), max_len=80) snap_label = _abbrev(f"{str(snap.id)[-8:]} {snap.url or ''}".strip(), max_len=80)
@@ -857,8 +884,6 @@ class Orchestrator:
progress_layout.update_process_panels(running_processes, pending=pending_processes) progress_layout.update_process_panels(running_processes, pending=pending_processes)
last_queue_sizes = queue_sizes.copy()
# Update snapshot progress # Update snapshot progress
from archivebox.core.models import Snapshot from archivebox.core.models import Snapshot