better tui

This commit is contained in:
Nick Sweeting
2026-01-19 01:53:32 -08:00
parent 1cb2d5070e
commit b5bbc3b549
9 changed files with 690 additions and 109 deletions

View File

@@ -384,6 +384,8 @@ async function launchChromium(options = {}) {
return { success: false, error: 'Chrome binary not found' };
}
const downloadsDir = getEnv('CHROME_DOWNLOADS_DIR');
// Kill zombies first
if (killZombies) {
killZombieChrome();
@@ -412,6 +414,28 @@ async function launchChromium(options = {}) {
console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
}
}
if (downloadsDir) {
try {
const defaultProfileDir = path.join(userDataDir, 'Default');
const prefsPath = path.join(defaultProfileDir, 'Preferences');
fs.mkdirSync(defaultProfileDir, { recursive: true });
let prefs = {};
if (fs.existsSync(prefsPath)) {
try {
prefs = JSON.parse(fs.readFileSync(prefsPath, 'utf-8'));
} catch (e) {
prefs = {};
}
}
prefs.download = prefs.download || {};
prefs.download.default_directory = downloadsDir;
prefs.download.prompt_for_download = false;
fs.writeFileSync(prefsPath, JSON.stringify(prefs));
console.error(`[*] Set Chrome download directory: ${downloadsDir}`);
} catch (e) {
console.error(`[!] Failed to set Chrome download directory: ${e.message}`);
}
}
}
// Find a free port
@@ -455,6 +479,11 @@ async function launchChromium(options = {}) {
// Dynamic args come after base so they can override if needed
const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs];
// Ensure keychain prompts are disabled on macOS
if (!chromiumArgs.includes('--use-mock-keychain')) {
chromiumArgs.push('--use-mock-keychain');
}
// Add extension loading flags
if (extensionPaths.length > 0) {
const extPathsArg = extensionPaths.join(',');

View File

@@ -84,6 +84,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
}
const url = await page.url();
console.error(`[singlefile] Triggering extension for: ${url}`);
// Check for unsupported URL schemes
const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
@@ -93,24 +94,28 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
return null;
}
const downloadsDir = options.downloadsDir || CHROME_DOWNLOADS_DIR;
console.error(`[singlefile] Watching downloads dir: ${downloadsDir}`);
// Ensure downloads directory exists
await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
await fs.promises.mkdir(downloadsDir, { recursive: true });
// Get list of existing files to ignore
const files_before = new Set(
(await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
.filter(fn => fn.endsWith('.html'))
(await fs.promises.readdir(downloadsDir))
.filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'))
);
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
console.error(`[singlefile] Saving via extension (${extension.id})...`);
// Bring page to front (extension action button acts on foreground tab)
await page.bringToFront();
// Trigger the extension's action (toolbar button click)
console.error('[singlefile] Dispatching extension action...');
await extension.dispatchAction();
// Wait for file to appear in downloads directory
@@ -118,34 +123,90 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
const max_tries = 10;
let files_new = [];
console.error(`[singlefile] Waiting up to ${(check_delay * max_tries) / 1000}s for download...`);
for (let attempt = 0; attempt < max_tries; attempt++) {
await wait(check_delay);
const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
.filter(fn => fn.endsWith('.html'));
const files_after = (await fs.promises.readdir(downloadsDir))
.filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'));
files_new = files_after.filter(file => !files_before.has(file));
if (files_new.length === 0) {
console.error(`[singlefile] No new downloads yet (${attempt + 1}/${max_tries})`);
continue;
}
// Find the matching file by checking if it contains the URL in the HTML header
for (const file of files_new) {
const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
const dl_header = dl_text.split('meta charset')[0];
console.error(`[singlefile] New download(s) detected: ${files_new.join(', ')}`);
if (dl_header.includes(`url: ${url}`)) {
console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
await fs.promises.rename(dl_path, out_path);
// Prefer files that match the URL or have SingleFile markers
const url_variants = new Set([url]);
if (url.endsWith('/')) {
url_variants.add(url.slice(0, -1));
} else {
url_variants.add(`${url}/`);
}
const scored = [];
for (const file of files_new) {
const dl_path = path.join(downloadsDir, file);
let header = '';
try {
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
header = dl_text.slice(0, 200000);
const stat = await fs.promises.stat(dl_path);
console.error(`[singlefile] Download ${file} size=${stat.size} bytes`);
} catch (err) {
// Skip unreadable files
continue;
}
const header_lower = header.toLowerCase();
const has_url = Array.from(url_variants).some(v => header.includes(v));
const has_singlefile_marker = header_lower.includes('singlefile') || header_lower.includes('single-file');
const score = (has_url ? 2 : 0) + (has_singlefile_marker ? 1 : 0);
scored.push({ file, dl_path, score });
}
scored.sort((a, b) => b.score - a.score);
if (scored.length > 0) {
const best = scored[0];
if (best.score > 0 || files_new.length === 1) {
console.error(`[singlefile] Moving download from ${best.file} -> ${out_path}`);
await fs.promises.rename(best.dl_path, out_path);
const out_stat = await fs.promises.stat(out_path);
console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
return out_path;
}
}
if (files_new.length > 0) {
// Fallback: move the newest file if no clear match found
let newest = null;
let newest_mtime = -1;
for (const file of files_new) {
const dl_path = path.join(downloadsDir, file);
try {
const stat = await fs.promises.stat(dl_path);
if (stat.mtimeMs > newest_mtime) {
newest_mtime = stat.mtimeMs;
newest = { file, dl_path };
}
} catch (err) {}
}
if (newest) {
console.error(`[singlefile] Moving newest download from ${newest.file} -> ${out_path}`);
await fs.promises.rename(newest.dl_path, out_path);
const out_stat = await fs.promises.stat(out_path);
console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
return out_path;
}
}
}
console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
console.error(`[singlefile] Failed to find SingleFile HTML in ${downloadsDir} after ${(check_delay * max_tries) / 1000}s`);
console.error(`[singlefile] New files seen: ${files_new.join(', ')}`);
return null;
}

View File

@@ -37,6 +37,7 @@ BIN_NAME = 'single-file'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'singlefile.html'
EXTENSION_SAVE_SCRIPT = Path(__file__).parent / 'singlefile_extension_save.js'
def get_env(name: str, default: str = '') -> str:
@@ -255,6 +256,42 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
return False, None, f'{type(e).__name__}: {e}'
def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]:
    """Save using the SingleFile Chrome extension via existing Chrome session.

    Runs the Node helper script at EXTENSION_SAVE_SCRIPT with ``--url=<url>``
    and reports whether it produced an output file.

    Args:
        url: URL passed to the helper script.
        timeout: Maximum number of seconds the helper process may run. It is
            also used to derive how long to wait for a CDP URL (clamped to
            between 1 and 5 seconds).

    Returns:
        A ``(success, output_path, error)`` tuple. On success ``output_path``
        is either the path printed by the helper on stdout or the local
        ``OUTPUT_DIR / OUTPUT_FILE`` fallback, and ``error`` is ``''``. On
        failure ``output_path`` is ``None`` and ``error`` describes why.
    """
    # Only attempt if chrome session exists
    cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
    if not cdp_url:
        return False, None, 'No Chrome session available'

    if not EXTENSION_SAVE_SCRIPT.exists():
        return False, None, 'SingleFile extension helper script missing'

    node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node')
    cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}']

    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'

    stdout = result.stdout.decode('utf-8', errors='replace').strip()

    if result.returncode == 0:
        # Prefer explicit stdout path, fallback to local output file.
        # Use the last non-empty stdout line as the path candidate so that any
        # extra output printed before the path does not hide it. When stdout
        # is a single line this is identical to using the whole stripped text.
        stdout_lines = [line.strip() for line in stdout.splitlines() if line.strip()]
        candidate = stdout_lines[-1] if stdout_lines else ''
        if candidate:
            try:
                if Path(candidate).exists():
                    return True, candidate, ''
            except OSError:
                # The text is not usable as a filesystem path (for example it
                # is too long); deliberately fall through to the local file.
                pass

        output_path = Path(OUTPUT_DIR) / OUTPUT_FILE
        if output_path.exists() and output_path.stat().st_size > 0:
            return True, str(output_path), ''
        return False, None, 'SingleFile extension completed but no output file found'

    # Non-zero exit: surface stderr first, then stdout, then a generic message.
    stderr = result.stderr.decode('utf-8', errors='replace').strip()
    detail = stderr or stdout
    return False, None, detail or 'SingleFile extension failed'
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
@@ -278,11 +315,14 @@ def main(url: str, snapshot_id: str):
print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
sys.exit(0)
# Get binary from environment
binary = get_env('SINGLEFILE_BINARY', 'single-file')
# Prefer SingleFile extension via existing Chrome session
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
success, output, error = save_singlefile_with_extension(url, timeout)
# Run extraction
success, output, error = save_singlefile(url, binary)
# Fallback to single-file-cli if extension path failed
if not success:
binary = get_env('SINGLEFILE_BINARY', 'single-file')
success, output, error = save_singlefile(url, binary)
status = 'succeeded' if success else 'failed'
except Exception as e:

View File

@@ -30,6 +30,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
PLUGIN_DIR = get_plugin_dir(__file__)
SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__82_singlefile_install.js'
TEST_URL = "https://example.com"
@@ -142,6 +143,95 @@ def test_singlefile_with_chrome_session():
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
def test_singlefile_with_extension_uses_existing_chrome():
    """Test SingleFile uses the Chrome extension via existing session (CLI fallback disabled).

    Steps:
      1. Create a temporary data dir with per-persona extensions, downloads
         and Chrome user-data directories.
      2. Run the extension install script with ``node``.
      3. Start a Chrome session through the ``chrome_session`` helper and run
         the snapshot hook from a ``snapshot/singlefile`` working directory.
      4. Assert that ``singlefile.html`` exists there with the example.com
         content and that no new ``*.html`` file is left in the downloads dir.

    ``os.environ`` is modified for the duration of the Chrome session and is
    restored in the ``finally`` block.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        data_dir = tmpdir / 'data'
        extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
        downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads'
        user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data'
        extensions_dir.mkdir(parents=True, exist_ok=True)
        downloads_dir.mkdir(parents=True, exist_ok=True)
        user_data_dir.mkdir(parents=True, exist_ok=True)

        # Environment for the install step only; the process environment is
        # left untouched until the Chrome launch below.
        env_install = os.environ.copy()
        env_install.update({
            'DATA_DIR': str(data_dir),
            'CHROME_EXTENSIONS_DIR': str(extensions_dir),
            'CHROME_DOWNLOADS_DIR': str(downloads_dir),
        })

        # Install SingleFile extension cache before launching Chrome
        result = subprocess.run(
            ['node', str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env_install,
            timeout=120
        )
        assert result.returncode == 0, f"Extension install failed: {result.stderr}"

        # Launch Chrome session with extensions loaded
        # NOTE(review): these are set on os.environ, presumably because
        # chrome_session reads its configuration from the process environment
        # -- confirm against the helper.
        old_env = os.environ.copy()
        os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir)
        os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
        os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
        try:
            with chrome_session(
                tmpdir=tmpdir,
                crawl_id='singlefile-ext-crawl',
                snapshot_id='singlefile-ext-snap',
                test_url=TEST_URL,
                navigate=True,
                timeout=30,
            ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
                # The hook is run with this directory as its cwd (see below).
                singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
                singlefile_output_dir.mkdir(parents=True, exist_ok=True)

                # Ensure ../chrome points to snapshot chrome session (contains target_id.txt)
                chrome_dir = singlefile_output_dir.parent / 'chrome'
                if not chrome_dir.exists():
                    chrome_dir.symlink_to(snapshot_chrome_dir)

                env['SINGLEFILE_ENABLED'] = 'true'
                # A binary path that does not exist, so that only the
                # extension path can produce the output file.
                env['SINGLEFILE_BINARY'] = '/nonexistent/single-file'  # force extension path
                env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir)
                env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir)
                env['CHROME_HEADLESS'] = 'false'

                # Track downloads dir state before run to ensure file is created then moved out
                downloads_before = set(downloads_dir.glob('*.html'))
                downloads_mtime_before = downloads_dir.stat().st_mtime_ns

                result = subprocess.run(
                    [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-ext-snap'],
                    cwd=str(singlefile_output_dir),
                    capture_output=True,
                    text=True,
                    env=env,
                    timeout=120
                )
                assert result.returncode == 0, f"SingleFile extension run failed: {result.stderr}"

                output_file = singlefile_output_dir / 'singlefile.html'
                assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
                html_content = output_file.read_text(errors='ignore')
                assert 'Example Domain' in html_content, "Output should contain example.com content"

                # Verify download moved out of downloads dir
                downloads_after = set(downloads_dir.glob('*.html'))
                new_downloads = downloads_after - downloads_before
                downloads_mtime_after = downloads_dir.stat().st_mtime_ns
                # A changed directory mtime is used as evidence that a file was
                # created in (and then removed from) the downloads dir.
                assert downloads_mtime_after != downloads_mtime_before, "Downloads dir should be modified during extension save"
                assert not new_downloads, f"SingleFile download should be moved out of downloads dir, found: {new_downloads}"
        finally:
            # Restore the exact pre-test environment, dropping any variables
            # that were added above.
            os.environ.clear()
            os.environ.update(old_env)
def test_singlefile_disabled_skips():
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""
with tempfile.TemporaryDirectory() as tmpdir: