mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 15:57:53 +10:00
working singlefile
This commit is contained in:
@@ -1020,14 +1020,14 @@ class Process(models.Model):
|
|||||||
|
|
||||||
# Debug logging
|
# Debug logging
|
||||||
import sys
|
import sys
|
||||||
print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr)
|
# print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr)
|
||||||
|
|
||||||
# Get parent process start time from OS
|
# Get parent process start time from OS
|
||||||
try:
|
try:
|
||||||
os_parent = psutil.Process(ppid)
|
os_parent = psutil.Process(ppid)
|
||||||
os_parent_start = os_parent.create_time()
|
os_parent_start = os_parent.create_time()
|
||||||
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
|
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
|
||||||
print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr)
|
# print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr)
|
||||||
return None # Parent process doesn't exist
|
return None # Parent process doesn't exist
|
||||||
|
|
||||||
# Find matching Process record
|
# Find matching Process record
|
||||||
@@ -1038,18 +1038,18 @@ class Process(models.Model):
|
|||||||
started_at__gte=timezone.now() - PID_REUSE_WINDOW,
|
started_at__gte=timezone.now() - PID_REUSE_WINDOW,
|
||||||
).order_by('-started_at')
|
).order_by('-started_at')
|
||||||
|
|
||||||
print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr)
|
# print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr)
|
||||||
|
|
||||||
for candidate in candidates:
|
for candidate in candidates:
|
||||||
if candidate.started_at:
|
if candidate.started_at:
|
||||||
db_start_time = candidate.started_at.timestamp()
|
db_start_time = candidate.started_at.timestamp()
|
||||||
time_diff = abs(db_start_time - os_parent_start)
|
time_diff = abs(db_start_time - os_parent_start)
|
||||||
print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr)
|
# print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr)
|
||||||
if time_diff < START_TIME_TOLERANCE:
|
if time_diff < START_TIME_TOLERANCE:
|
||||||
print(f"DEBUG _find_parent_process: MATCH! Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr)
|
# print(f"DEBUG _find_parent_process: MATCH! Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr)
|
||||||
return candidate
|
return candidate
|
||||||
|
|
||||||
print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr)
|
# print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr)
|
||||||
return None # No matching ArchiveBox parent process
|
return None # No matching ArchiveBox parent process
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -1519,7 +1519,7 @@ class Process(models.Model):
|
|||||||
stdout_path = self.stdout_file
|
stdout_path = self.stdout_file
|
||||||
stderr_path = self.stderr_file
|
stderr_path = self.stderr_file
|
||||||
|
|
||||||
with open(stdout_path, 'w') as out, open(stderr_path, 'w') as err:
|
with open(stdout_path, 'a') as out, open(stderr_path, 'a') as err:
|
||||||
proc = subprocess.Popen(
|
proc = subprocess.Popen(
|
||||||
self.cmd,
|
self.cmd,
|
||||||
cwd=working_dir,
|
cwd=working_dir,
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ Shows a comprehensive dashboard with:
|
|||||||
__package__ = 'archivebox.misc'
|
__package__ = 'archivebox.misc'
|
||||||
|
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
from typing import List, Optional, Any
|
from typing import List, Optional, Any
|
||||||
from collections import deque
|
from collections import deque
|
||||||
@@ -23,6 +24,7 @@ from rich.panel import Panel
|
|||||||
from rich.text import Text
|
from rich.text import Text
|
||||||
from rich.table import Table
|
from rich.table import Table
|
||||||
from rich.tree import Tree
|
from rich.tree import Tree
|
||||||
|
from rich.cells import cell_len
|
||||||
|
|
||||||
from archivebox.config import VERSION
|
from archivebox.config import VERSION
|
||||||
|
|
||||||
@@ -533,7 +535,23 @@ class CrawlQueueTreePanel:
|
|||||||
is_pending = hook.get('is_pending', False)
|
is_pending = hook.get('is_pending', False)
|
||||||
icon, color = self._hook_style(status, is_bg=is_bg, is_running=is_running, is_pending=is_pending)
|
icon, color = self._hook_style(status, is_bg=is_bg, is_running=is_running, is_pending=is_pending)
|
||||||
stats = self._hook_stats(size=size, elapsed=elapsed, timeout=timeout, status=status)
|
stats = self._hook_stats(size=size, elapsed=elapsed, timeout=timeout, status=status)
|
||||||
snap_node.add(Text(f"{icon} {path}{stats}", style=color))
|
line = Text(f"{icon} {path}{stats}", style=color)
|
||||||
|
stderr_tail = hook.get('stderr', '')
|
||||||
|
if stderr_tail:
|
||||||
|
left_str = f"{icon} {path}{stats}"
|
||||||
|
avail = self._available_width(left_str, indent=16)
|
||||||
|
trunc = getattr(self, "_truncate_tail", self._truncate_to_width)
|
||||||
|
stderr_tail = trunc(stderr_tail, avail)
|
||||||
|
if not stderr_tail:
|
||||||
|
snap_node.add(line)
|
||||||
|
continue
|
||||||
|
row = Table.grid(expand=True)
|
||||||
|
row.add_column(justify="left", ratio=1)
|
||||||
|
row.add_column(justify="right")
|
||||||
|
row.add_row(line, Text(stderr_tail, style="grey70"))
|
||||||
|
snap_node.add(row)
|
||||||
|
else:
|
||||||
|
snap_node.add(line)
|
||||||
trees.append(crawl_tree)
|
trees.append(crawl_tree)
|
||||||
content = Group(*trees)
|
content = Group(*trees)
|
||||||
|
|
||||||
@@ -561,7 +579,7 @@ class CrawlQueueTreePanel:
|
|||||||
if status == 'succeeded':
|
if status == 'succeeded':
|
||||||
return '✅', 'green'
|
return '✅', 'green'
|
||||||
if status == 'failed':
|
if status == 'failed':
|
||||||
return '⚠️', 'yellow'
|
return '✖', 'red'
|
||||||
if status == 'skipped':
|
if status == 'skipped':
|
||||||
return '⏭', 'grey53'
|
return '⏭', 'grey53'
|
||||||
if is_pending:
|
if is_pending:
|
||||||
@@ -595,6 +613,37 @@ class CrawlQueueTreePanel:
|
|||||||
return f" ({size_part} | {time_part})" if time_part else f" ({size_part})"
|
return f" ({size_part} | {time_part})" if time_part else f" ({size_part})"
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _terminal_width() -> int:
|
||||||
|
try:
|
||||||
|
return os.get_terminal_size().columns
|
||||||
|
except OSError:
|
||||||
|
return 120
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _truncate_to_width(text: str, max_width: int) -> str:
|
||||||
|
if not text or max_width <= 0:
|
||||||
|
return ''
|
||||||
|
t = Text(text)
|
||||||
|
t.truncate(max_width, overflow="ellipsis")
|
||||||
|
return t.plain
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _truncate_tail(text: str, max_width: int) -> str:
|
||||||
|
if not text or max_width <= 0:
|
||||||
|
return ''
|
||||||
|
if cell_len(text) <= max_width:
|
||||||
|
return text
|
||||||
|
if max_width <= 1:
|
||||||
|
return '…'
|
||||||
|
return f"…{text[-(max_width - 1):]}"
|
||||||
|
|
||||||
|
def _available_width(self, left_text: str, indent: int = 0) -> int:
|
||||||
|
width = self._terminal_width()
|
||||||
|
base = max(0, width - cell_len(left_text) - indent - 6)
|
||||||
|
cap = max(0, (width * 2) // 5)
|
||||||
|
return max(0, min(base, cap))
|
||||||
|
|
||||||
|
|
||||||
class ArchiveBoxProgressLayout:
|
class ArchiveBoxProgressLayout:
|
||||||
"""
|
"""
|
||||||
@@ -631,7 +680,7 @@ class ArchiveBoxProgressLayout:
|
|||||||
# Top-level split: crawl_queue, crawl_tree, processes
|
# Top-level split: crawl_queue, crawl_tree, processes
|
||||||
layout.split(
|
layout.split(
|
||||||
Layout(name="crawl_queue", size=3),
|
Layout(name="crawl_queue", size=3),
|
||||||
Layout(name="crawl_tree", size=14),
|
Layout(name="crawl_tree", size=20),
|
||||||
Layout(name="processes", ratio=1),
|
Layout(name="processes", ratio=1),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -671,6 +720,8 @@ class ArchiveBoxProgressLayout:
|
|||||||
cmd = getattr(process, 'cmd', [])
|
cmd = getattr(process, 'cmd', [])
|
||||||
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
|
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
|
||||||
hook_name = hook_path.name if hook_path else ''
|
hook_name = hook_path.name if hook_path else ''
|
||||||
|
if '.bg.' in hook_name:
|
||||||
|
continue
|
||||||
if '.bg.' not in hook_name:
|
if '.bg.' not in hook_name:
|
||||||
fg_running = True
|
fg_running = True
|
||||||
break
|
break
|
||||||
@@ -684,6 +735,8 @@ class ArchiveBoxProgressLayout:
|
|||||||
cmd = getattr(process, 'cmd', [])
|
cmd = getattr(process, 'cmd', [])
|
||||||
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
|
hook_path = Path(cmd[1]) if len(cmd) > 1 else None
|
||||||
hook_name = hook_path.name if hook_path else ''
|
hook_name = hook_path.name if hook_path else ''
|
||||||
|
if '.bg.' in hook_name:
|
||||||
|
continue
|
||||||
if '.bg.' not in hook_name:
|
if '.bg.' not in hook_name:
|
||||||
fg_pending = True
|
fg_pending = True
|
||||||
break
|
break
|
||||||
@@ -701,6 +754,10 @@ class ArchiveBoxProgressLayout:
|
|||||||
is_bg = '.bg.' in hook_name
|
is_bg = '.bg.' in hook_name
|
||||||
except Exception:
|
except Exception:
|
||||||
is_bg = False
|
is_bg = False
|
||||||
|
if is_hook and is_bg:
|
||||||
|
continue
|
||||||
|
if not self._has_log_lines(process):
|
||||||
|
continue
|
||||||
is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
|
is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
|
||||||
max_lines = 2 if is_pending else (4 if is_bg else 7)
|
max_lines = 2 if is_pending else (4 if is_bg else 7)
|
||||||
panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg, bg_terminating=bg_terminating))
|
panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg, bg_terminating=bg_terminating))
|
||||||
@@ -718,6 +775,17 @@ class ArchiveBoxProgressLayout:
|
|||||||
def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None:
|
def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None:
|
||||||
"""Update the crawl queue tree panel."""
|
"""Update the crawl queue tree panel."""
|
||||||
self.crawl_queue_tree.update_crawls(crawls)
|
self.crawl_queue_tree.update_crawls(crawls)
|
||||||
|
# Auto-size crawl tree panel to content
|
||||||
|
line_count = 0
|
||||||
|
for crawl in crawls:
|
||||||
|
line_count += 1
|
||||||
|
for snap in crawl.get('snapshots', []) or []:
|
||||||
|
line_count += 1
|
||||||
|
if snap.get('output_path'):
|
||||||
|
line_count += 1
|
||||||
|
for _ in snap.get('hooks', []) or []:
|
||||||
|
line_count += 1
|
||||||
|
self.layout["crawl_tree"].size = max(4, line_count + 2)
|
||||||
|
|
||||||
def log_event(self, message: str, style: str = "white") -> None:
|
def log_event(self, message: str, style: str = "white") -> None:
|
||||||
"""Add an event to the orchestrator log."""
|
"""Add an event to the orchestrator log."""
|
||||||
@@ -767,8 +835,28 @@ class ArchiveBoxProgressLayout:
|
|||||||
timeout=hook.get('timeout', ''),
|
timeout=hook.get('timeout', ''),
|
||||||
status=status,
|
status=status,
|
||||||
)
|
)
|
||||||
|
stderr_tail = hook.get('stderr', '')
|
||||||
hook_line = f" {icon} {path}{stats}".strip()
|
hook_line = f" {icon} {path}{stats}".strip()
|
||||||
|
if stderr_tail:
|
||||||
|
avail = self.crawl_queue_tree._available_width(hook_line, indent=16)
|
||||||
|
trunc = getattr(self.crawl_queue_tree, "_truncate_tail", self.crawl_queue_tree._truncate_to_width)
|
||||||
|
stderr_tail = trunc(stderr_tail, avail)
|
||||||
|
if stderr_tail:
|
||||||
|
hook_line = f"{hook_line} {stderr_tail}"
|
||||||
if hook_line:
|
if hook_line:
|
||||||
lines.append(("crawl_tree", hook_line))
|
lines.append(("crawl_tree", hook_line))
|
||||||
|
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _has_log_lines(process: Any) -> bool:
|
||||||
|
try:
|
||||||
|
stdout_lines = list(process.tail_stdout(lines=1, follow=False))
|
||||||
|
if any(line.strip() for line in stdout_lines):
|
||||||
|
return True
|
||||||
|
stderr_lines = list(process.tail_stderr(lines=1, follow=False))
|
||||||
|
if any(line.strip() for line in stderr_lines):
|
||||||
|
return True
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
return False
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
"""
|
"""
|
||||||
Submit a URL to archive.org for archiving.
|
Submit a URL to archive.org for archiving.
|
||||||
|
|
||||||
Usage: on_Snapshot__archivedotorg.py --url=<url> --snapshot-id=<uuid>
|
Usage: on_Snapshot__archivedotorg.bg.py --url=<url> --snapshot-id=<uuid>
|
||||||
Output: Writes archive.org.txt to $PWD with the archived URL
|
Output: Writes archive.org.txt to $PWD with the archived URL
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
@@ -803,9 +803,16 @@ try {
|
|||||||
* @returns {string} - 32-character extension ID
|
* @returns {string} - 32-character extension ID
|
||||||
*/
|
*/
|
||||||
function getExtensionId(unpacked_path) {
|
function getExtensionId(unpacked_path) {
|
||||||
|
let resolved_path = unpacked_path;
|
||||||
|
try {
|
||||||
|
resolved_path = fs.realpathSync(unpacked_path);
|
||||||
|
} catch (err) {
|
||||||
|
// Use the provided path if realpath fails
|
||||||
|
resolved_path = unpacked_path;
|
||||||
|
}
|
||||||
// Chrome uses a SHA256 hash of the unpacked extension directory path
|
// Chrome uses a SHA256 hash of the unpacked extension directory path
|
||||||
const hash = crypto.createHash('sha256');
|
const hash = crypto.createHash('sha256');
|
||||||
hash.update(Buffer.from(unpacked_path, 'utf-8'));
|
hash.update(Buffer.from(resolved_path, 'utf-8'));
|
||||||
|
|
||||||
// Convert first 32 hex chars to characters in the range 'a'-'p'
|
// Convert first 32 hex chars to characters in the range 'a'-'p'
|
||||||
const detected_extension_id = Array.from(hash.digest('hex'))
|
const detected_extension_id = Array.from(hash.digest('hex'))
|
||||||
@@ -978,6 +985,8 @@ async function isTargetExtension(target) {
|
|||||||
|
|
||||||
let extension_id = null;
|
let extension_id = null;
|
||||||
let manifest_version = null;
|
let manifest_version = null;
|
||||||
|
let manifest = null;
|
||||||
|
let manifest_name = null;
|
||||||
const target_is_extension = is_chrome_extension || target_is_bg;
|
const target_is_extension = is_chrome_extension || target_is_bg;
|
||||||
|
|
||||||
if (target_is_extension) {
|
if (target_is_extension) {
|
||||||
@@ -985,8 +994,9 @@ async function isTargetExtension(target) {
|
|||||||
extension_id = target_url?.split('://')[1]?.split('/')[0] || null;
|
extension_id = target_url?.split('://')[1]?.split('/')[0] || null;
|
||||||
|
|
||||||
if (target_ctx) {
|
if (target_ctx) {
|
||||||
const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
|
manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
|
||||||
manifest_version = manifest?.manifest_version || null;
|
manifest_version = manifest?.manifest_version || null;
|
||||||
|
manifest_name = manifest?.name || null;
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
// Failed to get extension metadata
|
// Failed to get extension metadata
|
||||||
@@ -1001,6 +1011,8 @@ async function isTargetExtension(target) {
|
|||||||
target_url,
|
target_url,
|
||||||
extension_id,
|
extension_id,
|
||||||
manifest_version,
|
manifest_version,
|
||||||
|
manifest,
|
||||||
|
manifest_name,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1053,14 +1065,23 @@ async function loadExtensionFromTarget(extensions, target) {
|
|||||||
|
|
||||||
// Trigger extension toolbar button click
|
// Trigger extension toolbar button click
|
||||||
dispatchAction: async (tab) => {
|
dispatchAction: async (tab) => {
|
||||||
return await target_ctx.evaluate((tabId) => {
|
return await target_ctx.evaluate(async (tab) => {
|
||||||
return new Promise((resolve) => {
|
tab = tab || (await new Promise((resolve) =>
|
||||||
chrome.action.onClicked.addListener((tab) => {
|
chrome.tabs.query({ currentWindow: true, active: true }, ([tab]) => resolve(tab))
|
||||||
resolve({ success: true, tab });
|
));
|
||||||
});
|
|
||||||
chrome.action.openPopup();
|
// Manifest V3: chrome.action
|
||||||
});
|
if (chrome.action?.onClicked?.dispatch) {
|
||||||
}, tab?.id || null);
|
return await chrome.action.onClicked.dispatch(tab);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Manifest V2: chrome.browserAction
|
||||||
|
if (chrome.browserAction?.onClicked?.dispatch) {
|
||||||
|
return await chrome.browserAction.onClicked.dispatch(tab);
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error('Extension action dispatch not available');
|
||||||
|
}, tab || null);
|
||||||
},
|
},
|
||||||
|
|
||||||
// Send message to extension
|
// Send message to extension
|
||||||
|
|||||||
@@ -118,9 +118,7 @@ process.on('SIGTERM', () => cleanup('SIGTERM'));
|
|||||||
process.on('SIGINT', () => cleanup('SIGINT'));
|
process.on('SIGINT', () => cleanup('SIGINT'));
|
||||||
|
|
||||||
// Try to find the crawl's Chrome session
|
// Try to find the crawl's Chrome session
|
||||||
function findCrawlChromeSession(crawlId) {
|
function findCrawlChromeSession() {
|
||||||
if (!crawlId) return null;
|
|
||||||
|
|
||||||
// Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
|
// Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
|
||||||
const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
|
const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
|
||||||
if (!crawlOutputDir) return null;
|
if (!crawlOutputDir) return null;
|
||||||
@@ -301,7 +299,7 @@ async function main() {
|
|||||||
const args = parseArgs();
|
const args = parseArgs();
|
||||||
const url = args.url;
|
const url = args.url;
|
||||||
const snapshotId = args.snapshot_id;
|
const snapshotId = args.snapshot_id;
|
||||||
const crawlId = args.crawl_id;
|
const crawlId = args.crawl_id || getEnv('CRAWL_ID', '');
|
||||||
|
|
||||||
if (!url || !snapshotId) {
|
if (!url || !snapshotId) {
|
||||||
console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
|
console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
|
||||||
@@ -332,15 +330,14 @@ async function main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Try to use existing crawl Chrome session
|
// Try to use existing crawl Chrome session
|
||||||
const crawlSession = findCrawlChromeSession(crawlId);
|
const crawlSession = findCrawlChromeSession();
|
||||||
let result;
|
let result;
|
||||||
|
|
||||||
if (crawlSession) {
|
if (crawlSession) {
|
||||||
console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
|
console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
|
||||||
result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
|
result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
|
||||||
} else {
|
} else {
|
||||||
console.log(`[*] No crawl Chrome session found, launching new Chrome`);
|
result = { success: false, error: 'No crawl Chrome session found (CRAWL_OUTPUT_DIR missing or chrome not running)' };
|
||||||
result = await launchNewChrome(url, binary);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (result.success) {
|
if (result.success) {
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
"""
|
"""
|
||||||
Extract favicon from a URL.
|
Extract favicon from a URL.
|
||||||
|
|
||||||
Usage: on_Snapshot__favicon.py --url=<url> --snapshot-id=<uuid>
|
Usage: on_Snapshot__favicon.bg.py --url=<url> --snapshot-id=<uuid>
|
||||||
Output: Writes favicon.ico to $PWD
|
Output: Writes favicon.ico to $PWD
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
@@ -17,6 +17,7 @@ Environment variables:
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import threading
|
import threading
|
||||||
@@ -87,6 +88,27 @@ def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
|||||||
return default if default is not None else []
|
return default if default is not None else []
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_shebang(binary_path: str) -> str | None:
|
||||||
|
"""Return interpreter from shebang line if present (e.g., /path/to/python)."""
|
||||||
|
try:
|
||||||
|
with open(binary_path, 'r', encoding='utf-8') as f:
|
||||||
|
first_line = f.readline().strip()
|
||||||
|
if first_line.startswith('#!'):
|
||||||
|
return first_line[2:].strip().split(' ')[0]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_binary_path(binary: str) -> str | None:
|
||||||
|
"""Resolve binary to an absolute path if possible."""
|
||||||
|
if not binary:
|
||||||
|
return None
|
||||||
|
if Path(binary).is_file():
|
||||||
|
return binary
|
||||||
|
return shutil.which(binary)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||||
"""
|
"""
|
||||||
@@ -118,10 +140,12 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
|
|
||||||
# Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary
|
# Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary
|
||||||
wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py'
|
wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py'
|
||||||
|
resolved_binary = resolve_binary_path(binary) or binary
|
||||||
if wrapper_path.exists():
|
if wrapper_path.exists():
|
||||||
cmd = [sys.executable, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)]
|
forumdl_python = get_binary_shebang(resolved_binary) or sys.executable
|
||||||
|
cmd = [forumdl_python, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)]
|
||||||
else:
|
else:
|
||||||
cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
|
cmd = [resolved_binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
|
||||||
|
|
||||||
if not check_ssl:
|
if not check_ssl:
|
||||||
cmd.append('--no-check-certificate')
|
cmd.append('--no-check-certificate')
|
||||||
@@ -187,7 +211,7 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
if 'unable to extract' in stderr_lower:
|
if 'unable to extract' in stderr_lower:
|
||||||
return False, None, 'Unable to extract forum info'
|
return False, None, 'Unable to extract forum info'
|
||||||
|
|
||||||
return False, None, f'forum-dl error: {stderr[:200]}'
|
return False, None, f'forum-dl error: {stderr}'
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
return False, None, f'Timed out after {timeout} seconds'
|
return False, None, f'Timed out after {timeout} seconds'
|
||||||
|
|||||||
@@ -196,7 +196,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
if 'unable to extract' in stderr_lower:
|
if 'unable to extract' in stderr_lower:
|
||||||
return False, None, 'Unable to extract gallery info'
|
return False, None, 'Unable to extract gallery info'
|
||||||
|
|
||||||
return False, None, f'gallery-dl error: {stderr[:200]}'
|
return False, None, f'gallery-dl error: {stderr}'
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
return False, None, f'Timed out after {timeout} seconds'
|
return False, None, f'Timed out after {timeout} seconds'
|
||||||
|
|||||||
@@ -82,6 +82,9 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
# Get text version
|
# Get text version
|
||||||
cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text']
|
cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text']
|
||||||
result_text = subprocess.run(cmd_text, stdout=subprocess.PIPE, timeout=timeout, text=True)
|
result_text = subprocess.run(cmd_text, stdout=subprocess.PIPE, timeout=timeout, text=True)
|
||||||
|
if result_text.stdout:
|
||||||
|
sys.stderr.write(result_text.stdout)
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
if result_text.returncode != 0:
|
if result_text.returncode != 0:
|
||||||
return False, None, f'postlight-parser failed (exit={result_text.returncode})'
|
return False, None, f'postlight-parser failed (exit={result_text.returncode})'
|
||||||
@@ -101,6 +104,9 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
# Get HTML version
|
# Get HTML version
|
||||||
cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html']
|
cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html']
|
||||||
result_html = subprocess.run(cmd_html, stdout=subprocess.PIPE, timeout=timeout, text=True)
|
result_html = subprocess.run(cmd_html, stdout=subprocess.PIPE, timeout=timeout, text=True)
|
||||||
|
if result_html.stdout:
|
||||||
|
sys.stderr.write(result_html.stdout)
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
html_json = json.loads(result_html.stdout)
|
html_json = json.loads(result_html.stdout)
|
||||||
|
|||||||
@@ -109,6 +109,10 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
cmd = [binary, *readability_args, *readability_args_extra, html_source]
|
cmd = [binary, *readability_args, *readability_args_extra, html_source]
|
||||||
result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True)
|
result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True)
|
||||||
|
|
||||||
|
if result.stdout:
|
||||||
|
sys.stderr.write(result.stdout)
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
return False, None, f'readability-extractor failed (exit={result.returncode})'
|
return False, None, f'readability-extractor failed (exit={result.returncode})'
|
||||||
|
|
||||||
|
|||||||
@@ -116,7 +116,19 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
|
|||||||
|
|
||||||
// Trigger the extension's action (toolbar button click)
|
// Trigger the extension's action (toolbar button click)
|
||||||
console.error('[singlefile] Dispatching extension action...');
|
console.error('[singlefile] Dispatching extension action...');
|
||||||
await extension.dispatchAction();
|
try {
|
||||||
|
const actionTimeoutMs = options.actionTimeoutMs || 5000;
|
||||||
|
const actionPromise = extension.dispatchAction();
|
||||||
|
const actionResult = await Promise.race([
|
||||||
|
actionPromise,
|
||||||
|
wait(actionTimeoutMs).then(() => 'timeout'),
|
||||||
|
]);
|
||||||
|
if (actionResult === 'timeout') {
|
||||||
|
console.error(`[singlefile] Extension action did not resolve within ${actionTimeoutMs}ms, continuing...`);
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error(`[singlefile] Extension action error: ${err.message || err}`);
|
||||||
|
}
|
||||||
|
|
||||||
// Wait for file to appear in downloads directory
|
// Wait for file to appear in downloads directory
|
||||||
const check_delay = 3000; // 3 seconds
|
const check_delay = 3000; // 3 seconds
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ import threading
|
|||||||
import time
|
import time
|
||||||
from urllib.request import urlopen
|
from urllib.request import urlopen
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import shutil
|
||||||
|
|
||||||
import rich_click as click
|
import rich_click as click
|
||||||
|
|
||||||
@@ -142,6 +143,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
|
|
||||||
Returns: (success, output_path, error_message)
|
Returns: (success, output_path, error_message)
|
||||||
"""
|
"""
|
||||||
|
print(f'[singlefile] CLI mode start url={url}', file=sys.stderr)
|
||||||
# Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader)
|
# Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader)
|
||||||
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
||||||
user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
|
user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
|
||||||
@@ -172,8 +174,10 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
cdp_remote_url = None
|
cdp_remote_url = None
|
||||||
|
|
||||||
if cdp_remote_url:
|
if cdp_remote_url:
|
||||||
|
print(f'[singlefile] Using existing Chrome session: {cdp_remote_url}', file=sys.stderr)
|
||||||
cmd.extend(['--browser-server', cdp_remote_url])
|
cmd.extend(['--browser-server', cdp_remote_url])
|
||||||
elif chrome:
|
elif chrome:
|
||||||
|
print(f'[singlefile] Launching Chrome binary: {chrome}', file=sys.stderr)
|
||||||
cmd.extend(['--browser-executable-path', chrome])
|
cmd.extend(['--browser-executable-path', chrome])
|
||||||
|
|
||||||
# Pass Chrome arguments (only when launching a new browser)
|
# Pass Chrome arguments (only when launching a new browser)
|
||||||
@@ -200,6 +204,7 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
output_path = output_dir / OUTPUT_FILE
|
output_path = output_dir / OUTPUT_FILE
|
||||||
|
|
||||||
cmd.extend([url, str(output_path)])
|
cmd.extend([url, str(output_path)])
|
||||||
|
print(f'[singlefile] CLI command: {" ".join(cmd[:6])} ...', file=sys.stderr)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
output_lines: list[str] = []
|
output_lines: list[str] = []
|
||||||
@@ -258,36 +263,93 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
|
|
||||||
def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]:
|
def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]:
|
||||||
"""Save using the SingleFile Chrome extension via existing Chrome session."""
|
"""Save using the SingleFile Chrome extension via existing Chrome session."""
|
||||||
|
print(f'[singlefile] Extension mode start url={url}', file=sys.stderr)
|
||||||
# Only attempt if chrome session exists
|
# Only attempt if chrome session exists
|
||||||
cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
|
cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
|
||||||
if not cdp_url:
|
if not cdp_url:
|
||||||
|
print('[singlefile] No chrome session (cdp_url.txt missing)', file=sys.stderr)
|
||||||
return False, None, 'No Chrome session available'
|
return False, None, 'No Chrome session available'
|
||||||
|
|
||||||
if not EXTENSION_SAVE_SCRIPT.exists():
|
if not EXTENSION_SAVE_SCRIPT.exists():
|
||||||
|
print(f'[singlefile] Missing helper script: {EXTENSION_SAVE_SCRIPT}', file=sys.stderr)
|
||||||
return False, None, 'SingleFile extension helper script missing'
|
return False, None, 'SingleFile extension helper script missing'
|
||||||
|
|
||||||
node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node')
|
node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node')
|
||||||
|
downloads_dir = get_env('CHROME_DOWNLOADS_DIR', '')
|
||||||
|
extensions_dir = get_env('CHROME_EXTENSIONS_DIR', '')
|
||||||
cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}']
|
cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}']
|
||||||
|
print(f'[singlefile] cdp_url={cdp_url}', file=sys.stderr)
|
||||||
|
print(f'[singlefile] node={node_binary}', file=sys.stderr)
|
||||||
|
node_resolved = shutil.which(node_binary) if node_binary else None
|
||||||
|
print(f'[singlefile] node_resolved={node_resolved}', file=sys.stderr)
|
||||||
|
print(f'[singlefile] PATH={os.environ.get("PATH","")}', file=sys.stderr)
|
||||||
|
if downloads_dir:
|
||||||
|
print(f'[singlefile] CHROME_DOWNLOADS_DIR={downloads_dir}', file=sys.stderr)
|
||||||
|
if extensions_dir:
|
||||||
|
print(f'[singlefile] CHROME_EXTENSIONS_DIR={extensions_dir}', file=sys.stderr)
|
||||||
|
print(f'[singlefile] helper_cmd={" ".join(cmd)}', file=sys.stderr)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
|
output_lines: list[str] = []
|
||||||
except subprocess.TimeoutExpired:
|
error_lines: list[str] = []
|
||||||
return False, None, f'Timed out after {timeout} seconds'
|
process = subprocess.Popen(
|
||||||
|
cmd,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True,
|
||||||
|
bufsize=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _read_stream(stream, sink, label: str) -> None:
|
||||||
|
if not stream:
|
||||||
|
return
|
||||||
|
for line in stream:
|
||||||
|
sink.append(line)
|
||||||
|
sys.stderr.write(line)
|
||||||
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
stdout_thread = threading.Thread(target=_read_stream, args=(process.stdout, output_lines, 'stdout'), daemon=True)
|
||||||
|
stderr_thread = threading.Thread(target=_read_stream, args=(process.stderr, error_lines, 'stderr'), daemon=True)
|
||||||
|
stdout_thread.start()
|
||||||
|
stderr_thread.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
process.wait(timeout=timeout)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
process.kill()
|
||||||
|
stdout_thread.join(timeout=1)
|
||||||
|
stderr_thread.join(timeout=1)
|
||||||
|
print(f'[singlefile] Extension helper timed out after {timeout}s', file=sys.stderr)
|
||||||
|
return False, None, f'Timed out after {timeout} seconds'
|
||||||
|
|
||||||
|
stdout_thread.join(timeout=1)
|
||||||
|
stderr_thread.join(timeout=1)
|
||||||
|
|
||||||
|
result_stdout = ''.join(output_lines).encode('utf-8', errors='replace')
|
||||||
|
result_stderr = ''.join(error_lines).encode('utf-8', errors='replace')
|
||||||
|
result_returncode = process.returncode
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
print(f'[singlefile] Extension helper error: {type(e).__name__}: {e}', file=sys.stderr)
|
||||||
return False, None, f'{type(e).__name__}: {e}'
|
return False, None, f'{type(e).__name__}: {e}'
|
||||||
|
|
||||||
if result.returncode == 0:
|
print(f'[singlefile] helper_returncode={result_returncode}', file=sys.stderr)
|
||||||
|
print(f'[singlefile] helper_stdout_len={len(result_stdout or b"")}', file=sys.stderr)
|
||||||
|
print(f'[singlefile] helper_stderr_len={len(result_stderr or b"")}', file=sys.stderr)
|
||||||
|
|
||||||
|
if result_returncode == 0:
|
||||||
# Prefer explicit stdout path, fallback to local output file
|
# Prefer explicit stdout path, fallback to local output file
|
||||||
out_text = result.stdout.decode('utf-8', errors='replace').strip()
|
out_text = result_stdout.decode('utf-8', errors='replace').strip()
|
||||||
if out_text and Path(out_text).exists():
|
if out_text and Path(out_text).exists():
|
||||||
|
print(f'[singlefile] Extension output: {out_text}', file=sys.stderr)
|
||||||
return True, out_text, ''
|
return True, out_text, ''
|
||||||
output_path = Path(OUTPUT_DIR) / OUTPUT_FILE
|
output_path = Path(OUTPUT_DIR) / OUTPUT_FILE
|
||||||
if output_path.exists() and output_path.stat().st_size > 0:
|
if output_path.exists() and output_path.stat().st_size > 0:
|
||||||
|
print(f'[singlefile] Extension output: {output_path}', file=sys.stderr)
|
||||||
return True, str(output_path), ''
|
return True, str(output_path), ''
|
||||||
return False, None, 'SingleFile extension completed but no output file found'
|
return False, None, 'SingleFile extension completed but no output file found'
|
||||||
|
|
||||||
stderr = result.stderr.decode('utf-8', errors='replace').strip()
|
stderr = result_stderr.decode('utf-8', errors='replace').strip()
|
||||||
stdout = result.stdout.decode('utf-8', errors='replace').strip()
|
stdout = result_stdout.decode('utf-8', errors='replace').strip()
|
||||||
detail = stderr or stdout
|
detail = stderr or stdout
|
||||||
return False, None, detail or 'SingleFile extension failed'
|
return False, None, detail or 'SingleFile extension failed'
|
||||||
|
|
||||||
@@ -298,6 +360,7 @@ def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str |
|
|||||||
def main(url: str, snapshot_id: str):
|
def main(url: str, snapshot_id: str):
|
||||||
"""Archive a URL using SingleFile."""
|
"""Archive a URL using SingleFile."""
|
||||||
|
|
||||||
|
print(f'[singlefile] Hook starting pid={os.getpid()} url={url}', file=sys.stderr)
|
||||||
output = None
|
output = None
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
error = ''
|
error = ''
|
||||||
@@ -318,11 +381,6 @@ def main(url: str, snapshot_id: str):
|
|||||||
# Prefer SingleFile extension via existing Chrome session
|
# Prefer SingleFile extension via existing Chrome session
|
||||||
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
||||||
success, output, error = save_singlefile_with_extension(url, timeout)
|
success, output, error = save_singlefile_with_extension(url, timeout)
|
||||||
|
|
||||||
# Fallback to single-file-cli if extension path failed
|
|
||||||
if not success:
|
|
||||||
binary = get_env('SINGLEFILE_BINARY', 'single-file')
|
|
||||||
success, output, error = save_singlefile(url, binary)
|
|
||||||
status = 'succeeded' if success else 'failed'
|
status = 'succeeded' if success else 'failed'
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
207
archivebox/plugins/singlefile/singlefile_extension_save.js
Normal file
207
archivebox/plugins/singlefile/singlefile_extension_save.js
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* Save a page using the SingleFile Chrome extension via an existing Chrome session.
|
||||||
|
*
|
||||||
|
* Usage: singlefile_extension_save.js --url=<url>
|
||||||
|
* Output: prints saved file path on success
|
||||||
|
*/
|
||||||
|
|
||||||
|
const fs = require('fs');
|
||||||
|
const path = require('path');
|
||||||
|
|
||||||
|
const CHROME_SESSION_DIR = '../chrome';
|
||||||
|
const DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
|
||||||
|
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
|
||||||
|
|
||||||
|
process.env.CHROME_DOWNLOADS_DIR = DOWNLOADS_DIR;
|
||||||
|
|
||||||
|
async function setDownloadDir(page, downloadDir) {
|
||||||
|
try {
|
||||||
|
await fs.promises.mkdir(downloadDir, { recursive: true });
|
||||||
|
const client = await page.target().createCDPSession();
|
||||||
|
try {
|
||||||
|
await client.send('Page.setDownloadBehavior', {
|
||||||
|
behavior: 'allow',
|
||||||
|
downloadPath: downloadDir,
|
||||||
|
});
|
||||||
|
} catch (err) {
|
||||||
|
// Fallback for newer protocol versions
|
||||||
|
await client.send('Browser.setDownloadBehavior', {
|
||||||
|
behavior: 'allow',
|
||||||
|
downloadPath: downloadDir,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error(`[⚠️] Failed to set download directory: ${err.message || err}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function parseArgs() {
|
||||||
|
const args = {};
|
||||||
|
process.argv.slice(2).forEach((arg) => {
|
||||||
|
if (arg.startsWith('--')) {
|
||||||
|
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||||
|
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return args;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
const args = parseArgs();
|
||||||
|
const url = args.url;
|
||||||
|
|
||||||
|
if (!url) {
|
||||||
|
console.error('Usage: singlefile_extension_save.js --url=<url>');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error(`[singlefile] helper start url=${url}`);
|
||||||
|
console.error(`[singlefile] downloads_dir=${DOWNLOADS_DIR}`);
|
||||||
|
if (process.env.CHROME_EXTENSIONS_DIR) {
|
||||||
|
console.error(`[singlefile] extensions_dir=${process.env.CHROME_EXTENSIONS_DIR}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
console.error('[singlefile] loading dependencies...');
|
||||||
|
const puppeteer = require('puppeteer-core');
|
||||||
|
const chromeUtils = require('../chrome/chrome_utils.js');
|
||||||
|
const {
|
||||||
|
EXTENSION,
|
||||||
|
saveSinglefileWithExtension,
|
||||||
|
} = require('./on_Crawl__82_singlefile_install.js');
|
||||||
|
console.error('[singlefile] dependencies loaded');
|
||||||
|
|
||||||
|
// Ensure extension is installed and metadata is cached
|
||||||
|
console.error('[singlefile] ensuring extension cache...');
|
||||||
|
const extension = await chromeUtils.installExtensionWithCache(
|
||||||
|
EXTENSION,
|
||||||
|
{ extensionsDir: process.env.CHROME_EXTENSIONS_DIR }
|
||||||
|
);
|
||||||
|
if (!extension) {
|
||||||
|
console.error('[❌] SingleFile extension not installed');
|
||||||
|
process.exit(2);
|
||||||
|
}
|
||||||
|
if (extension.unpacked_path) {
|
||||||
|
const runtimeId = chromeUtils.getExtensionId(extension.unpacked_path);
|
||||||
|
if (runtimeId) {
|
||||||
|
extension.id = runtimeId;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
console.error(`[singlefile] extension ready id=${extension.id} version=${extension.version}`);
|
||||||
|
|
||||||
|
// Connect to existing Chrome session
|
||||||
|
console.error('[singlefile] connecting to chrome session...');
|
||||||
|
const { browser, page } = await chromeUtils.connectToPage({
|
||||||
|
chromeSessionDir: CHROME_SESSION_DIR,
|
||||||
|
timeoutMs: 60000,
|
||||||
|
puppeteer,
|
||||||
|
});
|
||||||
|
console.error('[singlefile] connected to chrome');
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Ensure CDP target discovery is enabled so service_worker targets appear
|
||||||
|
try {
|
||||||
|
const client = await page.createCDPSession();
|
||||||
|
await client.send('Target.setDiscoverTargets', { discover: true });
|
||||||
|
await client.send('Target.setAutoAttach', { autoAttach: true, waitForDebuggerOnStart: false, flatten: true });
|
||||||
|
} catch (err) {
|
||||||
|
console.error(`[singlefile] failed to enable target discovery: ${err.message || err}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for extension target to be available, then attach dispatchAction
|
||||||
|
console.error('[singlefile] waiting for extension target...');
|
||||||
|
const deadline = Date.now() + 30000;
|
||||||
|
let matchTarget = null;
|
||||||
|
let matchInfo = null;
|
||||||
|
let lastLog = 0;
|
||||||
|
const wantedName = (extension.name || 'singlefile').toLowerCase();
|
||||||
|
|
||||||
|
while (Date.now() < deadline && !matchTarget) {
|
||||||
|
const targets = browser.targets();
|
||||||
|
for (const target of targets) {
|
||||||
|
const info = await chromeUtils.isTargetExtension(target);
|
||||||
|
if (!info?.target_is_extension || !info?.extension_id) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const manifestName = (info.manifest_name || '').toLowerCase();
|
||||||
|
const targetUrl = (info.target_url || '').toLowerCase();
|
||||||
|
const nameMatches = manifestName.includes(wantedName) || manifestName.includes('singlefile') || manifestName.includes('single-file');
|
||||||
|
const urlMatches = targetUrl.includes('singlefile') || targetUrl.includes('single-file') || targetUrl.includes('single-file-extension');
|
||||||
|
if (nameMatches || urlMatches) {
|
||||||
|
matchTarget = target;
|
||||||
|
matchInfo = info;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!matchTarget) {
|
||||||
|
if (Date.now() - lastLog > 5000) {
|
||||||
|
const targetsSummary = [];
|
||||||
|
for (const target of targets) {
|
||||||
|
const info = await chromeUtils.isTargetExtension(target);
|
||||||
|
if (!info?.target_is_extension) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
targetsSummary.push({
|
||||||
|
type: info.target_type,
|
||||||
|
url: info.target_url,
|
||||||
|
extensionId: info.extension_id,
|
||||||
|
manifestName: info.manifest_name,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
console.error(`[singlefile] waiting... targets total=${targets.length} extensions=${targetsSummary.length} details=${JSON.stringify(targetsSummary)}`);
|
||||||
|
lastLog = Date.now();
|
||||||
|
}
|
||||||
|
await new Promise(r => setTimeout(r, 500));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!matchTarget || !matchInfo) {
|
||||||
|
const targets = chromeUtils.getExtensionTargets(browser);
|
||||||
|
console.error(`[singlefile] extension target not found (name=${extension.name})`);
|
||||||
|
console.error(`[singlefile] available targets: ${JSON.stringify(targets)}`);
|
||||||
|
await browser.disconnect();
|
||||||
|
process.exit(5);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use the runtime extension id from the matched target
|
||||||
|
extension.id = matchInfo.extension_id;
|
||||||
|
|
||||||
|
console.error('[singlefile] loading extension from target...');
|
||||||
|
await chromeUtils.loadExtensionFromTarget([extension], matchTarget);
|
||||||
|
if (typeof extension.dispatchAction !== 'function') {
|
||||||
|
const targets = chromeUtils.getExtensionTargets(browser);
|
||||||
|
console.error(`[singlefile] extension dispatchAction missing for id=${extension.id}`);
|
||||||
|
console.error(`[singlefile] available targets: ${JSON.stringify(targets)}`);
|
||||||
|
await browser.disconnect();
|
||||||
|
process.exit(6);
|
||||||
|
}
|
||||||
|
console.error('[singlefile] setting download dir...');
|
||||||
|
await setDownloadDir(page, DOWNLOADS_DIR);
|
||||||
|
|
||||||
|
console.error('[singlefile] triggering save via extension...');
|
||||||
|
const output = await saveSinglefileWithExtension(page, extension, { downloadsDir: DOWNLOADS_DIR });
|
||||||
|
if (output && fs.existsSync(output)) {
|
||||||
|
console.error(`[singlefile] saved: ${output}`);
|
||||||
|
console.log(output);
|
||||||
|
await browser.disconnect();
|
||||||
|
process.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error('[❌] SingleFile extension did not produce output');
|
||||||
|
await browser.disconnect();
|
||||||
|
process.exit(3);
|
||||||
|
} catch (err) {
|
||||||
|
await browser.disconnect();
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error(`[❌] ${err.message || err}`);
|
||||||
|
process.exit(4);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (require.main === module) {
|
||||||
|
main();
|
||||||
|
}
|
||||||
@@ -483,8 +483,7 @@ const puppeteer = require('puppeteer-core');
|
|||||||
|
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['node', str(script_path)],
|
['node', str(script_path)],
|
||||||
cwd=str(tmpdir,
|
cwd=str(tmpdir),
|
||||||
env=get_test_env()),
|
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
env=env,
|
env=env,
|
||||||
|
|||||||
@@ -144,6 +144,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
cmd,
|
cmd,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
timeout=timeout * 2, # Allow extra time for large downloads
|
timeout=timeout * 2, # Allow extra time for large downloads
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -166,7 +168,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
output_path = str(html_files[0]) if html_files else str(downloaded_files[0])
|
output_path = str(html_files[0]) if html_files else str(downloaded_files[0])
|
||||||
|
|
||||||
# Parse download stats from wget output
|
# Parse download stats from wget output
|
||||||
output_tail = result.stderr.decode('utf-8', errors='replace').strip().split('\n')[-3:]
|
stderr_text = (result.stderr or '')
|
||||||
|
output_tail = stderr_text.strip().split('\n')[-3:] if stderr_text else []
|
||||||
files_count = len(downloaded_files)
|
files_count = len(downloaded_files)
|
||||||
|
|
||||||
return True, output_path, ''
|
return True, output_path, ''
|
||||||
|
|||||||
@@ -201,7 +201,7 @@ def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
if 'Unable to extract' in stderr:
|
if 'Unable to extract' in stderr:
|
||||||
return False, None, 'Unable to extract media info'
|
return False, None, 'Unable to extract media info'
|
||||||
|
|
||||||
return False, None, f'yt-dlp error: {stderr[:200]}'
|
return False, None, f'yt-dlp error: {stderr}'
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
return False, None, f'Timed out after {timeout} seconds'
|
return False, None, f'Timed out after {timeout} seconds'
|
||||||
|
|||||||
@@ -459,7 +459,6 @@ class Orchestrator:
|
|||||||
# Enable progress layout only in TTY + foreground mode
|
# Enable progress layout only in TTY + foreground mode
|
||||||
show_progress = IS_TTY and self.exit_on_idle
|
show_progress = IS_TTY and self.exit_on_idle
|
||||||
plain_output = not IS_TTY
|
plain_output = not IS_TTY
|
||||||
|
|
||||||
self.on_startup()
|
self.on_startup()
|
||||||
|
|
||||||
if not show_progress:
|
if not show_progress:
|
||||||
@@ -520,7 +519,6 @@ class Orchestrator:
|
|||||||
|
|
||||||
def _run_orchestrator_loop(self, progress_layout, plain_output: bool = False):
|
def _run_orchestrator_loop(self, progress_layout, plain_output: bool = False):
|
||||||
"""Run the main orchestrator loop with optional progress display."""
|
"""Run the main orchestrator loop with optional progress display."""
|
||||||
last_queue_sizes = {}
|
|
||||||
last_snapshot_count = None
|
last_snapshot_count = None
|
||||||
tick_count = 0
|
tick_count = 0
|
||||||
last_plain_lines: set[tuple[str, str]] = set()
|
last_plain_lines: set[tuple[str, str]] = set()
|
||||||
@@ -611,6 +609,21 @@ class Orchestrator:
|
|||||||
seconds = max(0.0, float(total_seconds))
|
seconds = max(0.0, float(total_seconds))
|
||||||
return f"{seconds:.1f}s"
|
return f"{seconds:.1f}s"
|
||||||
|
|
||||||
|
def _tail_stderr_line(proc) -> str:
|
||||||
|
try:
|
||||||
|
path = getattr(proc, 'stderr_file', None)
|
||||||
|
if not path or not path.exists():
|
||||||
|
return ''
|
||||||
|
with open(path, 'rb') as f:
|
||||||
|
f.seek(0, os.SEEK_END)
|
||||||
|
size = f.tell()
|
||||||
|
f.seek(max(0, size - 4096))
|
||||||
|
data = f.read().decode('utf-8', errors='ignore')
|
||||||
|
lines = [ln.strip() for ln in data.splitlines() if ln.strip()]
|
||||||
|
return lines[-1] if lines else ''
|
||||||
|
except Exception:
|
||||||
|
return ''
|
||||||
|
|
||||||
tree_data: list[dict] = []
|
tree_data: list[dict] = []
|
||||||
for crawl in crawls:
|
for crawl in crawls:
|
||||||
urls = crawl.get_urls_list()
|
urls = crawl.get_urls_list()
|
||||||
@@ -684,7 +697,10 @@ class Orchestrator:
|
|||||||
elapsed = ''
|
elapsed = ''
|
||||||
timeout = ''
|
timeout = ''
|
||||||
size = ''
|
size = ''
|
||||||
|
stderr_tail = ''
|
||||||
if ar:
|
if ar:
|
||||||
|
if ar.process_id and ar.process:
|
||||||
|
stderr_tail = _tail_stderr_line(ar.process)
|
||||||
if ar.status == ArchiveResult.StatusChoices.STARTED:
|
if ar.status == ArchiveResult.StatusChoices.STARTED:
|
||||||
status = 'started'
|
status = 'started'
|
||||||
is_running = True
|
is_running = True
|
||||||
@@ -700,6 +716,8 @@ class Orchestrator:
|
|||||||
timeout = _format_seconds(hook_timeout)
|
timeout = _format_seconds(hook_timeout)
|
||||||
else:
|
else:
|
||||||
status = ar.status
|
status = ar.status
|
||||||
|
if ar.process_id and ar.process and ar.process.exit_code == 137:
|
||||||
|
status = 'failed'
|
||||||
is_pending = False
|
is_pending = False
|
||||||
start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
|
start_ts = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
|
||||||
end_ts = ar.end_ts or (ar.process.ended_at if ar.process_id and ar.process else None)
|
end_ts = ar.end_ts or (ar.process.ended_at if ar.process_id and ar.process else None)
|
||||||
@@ -724,6 +742,7 @@ class Orchestrator:
|
|||||||
'is_running': is_running,
|
'is_running': is_running,
|
||||||
'is_pending': is_pending,
|
'is_pending': is_pending,
|
||||||
'hook_name': hook_name,
|
'hook_name': hook_name,
|
||||||
|
'stderr': stderr_tail,
|
||||||
})
|
})
|
||||||
|
|
||||||
hooks = []
|
hooks = []
|
||||||
@@ -734,6 +753,7 @@ class Orchestrator:
|
|||||||
any_succeeded = any(h['status'] == ArchiveResult.StatusChoices.SUCCEEDED for h in hook_entries)
|
any_succeeded = any(h['status'] == ArchiveResult.StatusChoices.SUCCEEDED for h in hook_entries)
|
||||||
any_skipped = any(h['status'] == ArchiveResult.StatusChoices.SKIPPED for h in hook_entries)
|
any_skipped = any(h['status'] == ArchiveResult.StatusChoices.SKIPPED for h in hook_entries)
|
||||||
|
|
||||||
|
stderr_tail = ''
|
||||||
if running:
|
if running:
|
||||||
status = 'started'
|
status = 'started'
|
||||||
is_running = True
|
is_running = True
|
||||||
@@ -741,6 +761,7 @@ class Orchestrator:
|
|||||||
is_bg = running['is_bg']
|
is_bg = running['is_bg']
|
||||||
elapsed = running.get('elapsed', '')
|
elapsed = running.get('elapsed', '')
|
||||||
timeout = running.get('timeout', '')
|
timeout = running.get('timeout', '')
|
||||||
|
stderr_tail = running.get('stderr', '')
|
||||||
size = ''
|
size = ''
|
||||||
elif pending:
|
elif pending:
|
||||||
status = 'pending'
|
status = 'pending'
|
||||||
@@ -749,6 +770,7 @@ class Orchestrator:
|
|||||||
is_bg = pending['is_bg']
|
is_bg = pending['is_bg']
|
||||||
elapsed = pending.get('elapsed', '') or _format_seconds(0)
|
elapsed = pending.get('elapsed', '') or _format_seconds(0)
|
||||||
timeout = pending.get('timeout', '')
|
timeout = pending.get('timeout', '')
|
||||||
|
stderr_tail = pending.get('stderr', '')
|
||||||
size = ''
|
size = ''
|
||||||
else:
|
else:
|
||||||
is_running = False
|
is_running = False
|
||||||
@@ -762,6 +784,10 @@ class Orchestrator:
|
|||||||
status = 'skipped'
|
status = 'skipped'
|
||||||
else:
|
else:
|
||||||
status = 'skipped'
|
status = 'skipped'
|
||||||
|
for h in hook_entries:
|
||||||
|
if h.get('stderr'):
|
||||||
|
stderr_tail = h['stderr']
|
||||||
|
break
|
||||||
total_elapsed = 0.0
|
total_elapsed = 0.0
|
||||||
has_elapsed = False
|
has_elapsed = False
|
||||||
for h in hook_entries:
|
for h in hook_entries:
|
||||||
@@ -793,6 +819,7 @@ class Orchestrator:
|
|||||||
'is_bg': is_bg,
|
'is_bg': is_bg,
|
||||||
'is_running': is_running,
|
'is_running': is_running,
|
||||||
'is_pending': is_pending,
|
'is_pending': is_pending,
|
||||||
|
'stderr': stderr_tail,
|
||||||
})
|
})
|
||||||
|
|
||||||
snap_label = _abbrev(f"{str(snap.id)[-8:]} {snap.url or ''}".strip(), max_len=80)
|
snap_label = _abbrev(f"{str(snap.id)[-8:]} {snap.url or ''}".strip(), max_len=80)
|
||||||
@@ -857,8 +884,6 @@ class Orchestrator:
|
|||||||
|
|
||||||
progress_layout.update_process_panels(running_processes, pending=pending_processes)
|
progress_layout.update_process_panels(running_processes, pending=pending_processes)
|
||||||
|
|
||||||
last_queue_sizes = queue_sizes.copy()
|
|
||||||
|
|
||||||
# Update snapshot progress
|
# Update snapshot progress
|
||||||
from archivebox.core.models import Snapshot
|
from archivebox.core.models import Snapshot
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user