mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
better tui
This commit is contained in:
@@ -384,6 +384,8 @@ async function launchChromium(options = {}) {
|
||||
return { success: false, error: 'Chrome binary not found' };
|
||||
}
|
||||
|
||||
const downloadsDir = getEnv('CHROME_DOWNLOADS_DIR');
|
||||
|
||||
// Kill zombies first
|
||||
if (killZombies) {
|
||||
killZombieChrome();
|
||||
@@ -412,6 +414,28 @@ async function launchChromium(options = {}) {
|
||||
console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
|
||||
}
|
||||
}
|
||||
if (downloadsDir) {
|
||||
try {
|
||||
const defaultProfileDir = path.join(userDataDir, 'Default');
|
||||
const prefsPath = path.join(defaultProfileDir, 'Preferences');
|
||||
fs.mkdirSync(defaultProfileDir, { recursive: true });
|
||||
let prefs = {};
|
||||
if (fs.existsSync(prefsPath)) {
|
||||
try {
|
||||
prefs = JSON.parse(fs.readFileSync(prefsPath, 'utf-8'));
|
||||
} catch (e) {
|
||||
prefs = {};
|
||||
}
|
||||
}
|
||||
prefs.download = prefs.download || {};
|
||||
prefs.download.default_directory = downloadsDir;
|
||||
prefs.download.prompt_for_download = false;
|
||||
fs.writeFileSync(prefsPath, JSON.stringify(prefs));
|
||||
console.error(`[*] Set Chrome download directory: ${downloadsDir}`);
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to set Chrome download directory: ${e.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find a free port
|
||||
@@ -455,6 +479,11 @@ async function launchChromium(options = {}) {
|
||||
// Dynamic args come after base so they can override if needed
|
||||
const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs];
|
||||
|
||||
// Ensure keychain prompts are disabled on macOS
|
||||
if (!chromiumArgs.includes('--use-mock-keychain')) {
|
||||
chromiumArgs.push('--use-mock-keychain');
|
||||
}
|
||||
|
||||
// Add extension loading flags
|
||||
if (extensionPaths.length > 0) {
|
||||
const extPathsArg = extensionPaths.join(',');
|
||||
|
||||
@@ -84,6 +84,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
}
|
||||
|
||||
const url = await page.url();
|
||||
console.error(`[singlefile] Triggering extension for: ${url}`);
|
||||
|
||||
// Check for unsupported URL schemes
|
||||
const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
|
||||
@@ -93,24 +94,28 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const downloadsDir = options.downloadsDir || CHROME_DOWNLOADS_DIR;
|
||||
console.error(`[singlefile] Watching downloads dir: ${downloadsDir}`);
|
||||
|
||||
// Ensure downloads directory exists
|
||||
await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
|
||||
await fs.promises.mkdir(downloadsDir, { recursive: true });
|
||||
|
||||
// Get list of existing files to ignore
|
||||
const files_before = new Set(
|
||||
(await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
.filter(fn => fn.endsWith('.html'))
|
||||
(await fs.promises.readdir(downloadsDir))
|
||||
.filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'))
|
||||
);
|
||||
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
|
||||
console.error(`[singlefile] Saving via extension (${extension.id})...`);
|
||||
|
||||
// Bring page to front (extension action button acts on foreground tab)
|
||||
await page.bringToFront();
|
||||
|
||||
// Trigger the extension's action (toolbar button click)
|
||||
console.error('[singlefile] Dispatching extension action...');
|
||||
await extension.dispatchAction();
|
||||
|
||||
// Wait for file to appear in downloads directory
|
||||
@@ -118,34 +123,90 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
const max_tries = 10;
|
||||
let files_new = [];
|
||||
|
||||
console.error(`[singlefile] Waiting up to ${(check_delay * max_tries) / 1000}s for download...`);
|
||||
for (let attempt = 0; attempt < max_tries; attempt++) {
|
||||
await wait(check_delay);
|
||||
|
||||
const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
.filter(fn => fn.endsWith('.html'));
|
||||
const files_after = (await fs.promises.readdir(downloadsDir))
|
||||
.filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'));
|
||||
|
||||
files_new = files_after.filter(file => !files_before.has(file));
|
||||
|
||||
if (files_new.length === 0) {
|
||||
console.error(`[singlefile] No new downloads yet (${attempt + 1}/${max_tries})`);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find the matching file by checking if it contains the URL in the HTML header
|
||||
for (const file of files_new) {
|
||||
const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
|
||||
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
||||
const dl_header = dl_text.split('meta charset')[0];
|
||||
console.error(`[singlefile] New download(s) detected: ${files_new.join(', ')}`);
|
||||
|
||||
if (dl_header.includes(`url: ${url}`)) {
|
||||
console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
|
||||
await fs.promises.rename(dl_path, out_path);
|
||||
// Prefer files that match the URL or have SingleFile markers
|
||||
const url_variants = new Set([url]);
|
||||
if (url.endsWith('/')) {
|
||||
url_variants.add(url.slice(0, -1));
|
||||
} else {
|
||||
url_variants.add(`${url}/`);
|
||||
}
|
||||
|
||||
const scored = [];
|
||||
for (const file of files_new) {
|
||||
const dl_path = path.join(downloadsDir, file);
|
||||
let header = '';
|
||||
try {
|
||||
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
||||
header = dl_text.slice(0, 200000);
|
||||
const stat = await fs.promises.stat(dl_path);
|
||||
console.error(`[singlefile] Download ${file} size=${stat.size} bytes`);
|
||||
} catch (err) {
|
||||
// Skip unreadable files
|
||||
continue;
|
||||
}
|
||||
|
||||
const header_lower = header.toLowerCase();
|
||||
const has_url = Array.from(url_variants).some(v => header.includes(v));
|
||||
const has_singlefile_marker = header_lower.includes('singlefile') || header_lower.includes('single-file');
|
||||
const score = (has_url ? 2 : 0) + (has_singlefile_marker ? 1 : 0);
|
||||
scored.push({ file, dl_path, score });
|
||||
}
|
||||
|
||||
scored.sort((a, b) => b.score - a.score);
|
||||
|
||||
if (scored.length > 0) {
|
||||
const best = scored[0];
|
||||
if (best.score > 0 || files_new.length === 1) {
|
||||
console.error(`[singlefile] Moving download from ${best.file} -> ${out_path}`);
|
||||
await fs.promises.rename(best.dl_path, out_path);
|
||||
const out_stat = await fs.promises.stat(out_path);
|
||||
console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
|
||||
return out_path;
|
||||
}
|
||||
}
|
||||
|
||||
if (files_new.length > 0) {
|
||||
// Fallback: move the newest file if no clear match found
|
||||
let newest = null;
|
||||
let newest_mtime = -1;
|
||||
for (const file of files_new) {
|
||||
const dl_path = path.join(downloadsDir, file);
|
||||
try {
|
||||
const stat = await fs.promises.stat(dl_path);
|
||||
if (stat.mtimeMs > newest_mtime) {
|
||||
newest_mtime = stat.mtimeMs;
|
||||
newest = { file, dl_path };
|
||||
}
|
||||
} catch (err) {}
|
||||
}
|
||||
if (newest) {
|
||||
console.error(`[singlefile] Moving newest download from ${newest.file} -> ${out_path}`);
|
||||
await fs.promises.rename(newest.dl_path, out_path);
|
||||
const out_stat = await fs.promises.stat(out_path);
|
||||
console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
|
||||
return out_path;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
|
||||
console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
|
||||
console.error(`[singlefile] Failed to find SingleFile HTML in ${downloadsDir} after ${(check_delay * max_tries) / 1000}s`);
|
||||
console.error(`[singlefile] New files seen: ${files_new.join(', ')}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ BIN_NAME = 'single-file'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'singlefile.html'
|
||||
EXTENSION_SAVE_SCRIPT = Path(__file__).parent / 'singlefile_extension_save.js'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -255,6 +256,42 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]:
    """Save using the SingleFile Chrome extension via an existing Chrome session.

    Runs the Node helper script (EXTENSION_SAVE_SCRIPT) with ``--url=<url>`` and
    interprets its exit code and output.

    Args:
        url: URL forwarded to the helper script.
        timeout: Maximum number of seconds the helper may run. It is also used
            to derive the 1-5 second wait passed to get_cdp_url().

    Returns:
        A ``(success, output_path, error)`` tuple. ``output_path`` is None on
        failure and ``error`` is an empty string on success.
    """

    def _is_nonempty_file(candidate: Path) -> bool:
        # Probing arbitrary helper output as a path can raise (over-long
        # names, embedded NUL bytes), so treat those as "not a usable file".
        try:
            return candidate.is_file() and candidate.stat().st_size > 0
        except (OSError, ValueError):
            return False

    # Only attempt if a chrome session exists.
    # NOTE(review): assumes get_cdp_url() returns a falsy value when no session
    # shows up within wait_seconds - confirm against its definition.
    cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
    if not cdp_url:
        return False, None, 'No Chrome session available'

    if not EXTENSION_SAVE_SCRIPT.exists():
        return False, None, 'SingleFile extension helper script missing'

    node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node')
    cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}']

    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'

    stdout = result.stdout.decode('utf-8', errors='replace').strip()

    if result.returncode == 0:
        # Prefer the explicit path reported on stdout. Only the last non-empty
        # line is considered so that any extra output printed before the path
        # does not hide it.
        stdout_lines = [line.strip() for line in stdout.splitlines() if line.strip()]
        if stdout_lines and _is_nonempty_file(Path(stdout_lines[-1])):
            return True, stdout_lines[-1], ''

        # Fall back to the conventional output file location.
        output_path = Path(OUTPUT_DIR) / OUTPUT_FILE
        if _is_nonempty_file(output_path):
            return True, str(output_path), ''
        return False, None, 'SingleFile extension completed but no output file found'

    stderr = result.stderr.decode('utf-8', errors='replace').strip()
    detail = stderr or stdout
    return False, None, detail or 'SingleFile extension failed'
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL to archive')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
@@ -278,11 +315,14 @@ def main(url: str, snapshot_id: str):
|
||||
print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
|
||||
sys.exit(0)
|
||||
|
||||
# Get binary from environment
|
||||
binary = get_env('SINGLEFILE_BINARY', 'single-file')
|
||||
# Prefer SingleFile extension via existing Chrome session
|
||||
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
||||
success, output, error = save_singlefile_with_extension(url, timeout)
|
||||
|
||||
# Run extraction
|
||||
success, output, error = save_singlefile(url, binary)
|
||||
# Fallback to single-file-cli if extension path failed
|
||||
if not success:
|
||||
binary = get_env('SINGLEFILE_BINARY', 'single-file')
|
||||
success, output, error = save_singlefile(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -30,6 +30,7 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__82_singlefile_install.js'
|
||||
TEST_URL = "https://example.com"
|
||||
|
||||
|
||||
@@ -142,6 +143,95 @@ def test_singlefile_with_chrome_session():
|
||||
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
|
||||
|
||||
|
||||
def test_singlefile_with_extension_uses_existing_chrome():
    """SingleFile should save via the Chrome extension inside an existing session.

    SINGLEFILE_BINARY is pointed at a nonexistent path so that the CLI route
    cannot be what produces the output file.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Per-persona directory layout used by the install step and by Chrome.
        data_dir = tmpdir / 'data'
        persona_dir = data_dir / 'personas' / 'Default'
        extensions_dir = persona_dir / 'chrome_extensions'
        downloads_dir = persona_dir / 'chrome_downloads'
        user_data_dir = persona_dir / 'chrome_user_data'
        for directory in (extensions_dir, downloads_dir, user_data_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # Install SingleFile extension cache before launching Chrome
        env_install = {
            **os.environ,
            'DATA_DIR': str(data_dir),
            'CHROME_EXTENSIONS_DIR': str(extensions_dir),
            'CHROME_DOWNLOADS_DIR': str(downloads_dir),
        }
        install_proc = subprocess.run(
            ['node', str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env_install,
            timeout=120,
        )
        assert install_proc.returncode == 0, f"Extension install failed: {install_proc.stderr}"

        # Launch Chrome session with extensions loaded. The process environment
        # is snapshotted first so it can be restored in the finally block.
        old_env = os.environ.copy()
        os.environ.update({
            'CHROME_USER_DATA_DIR': str(user_data_dir),
            'CHROME_DOWNLOADS_DIR': str(downloads_dir),
            'CHROME_EXTENSIONS_DIR': str(extensions_dir),
        })
        try:
            with chrome_session(
                tmpdir=tmpdir,
                crawl_id='singlefile-ext-crawl',
                snapshot_id='singlefile-ext-snap',
                test_url=TEST_URL,
                navigate=True,
                timeout=30,
            ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env):
                singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
                singlefile_output_dir.mkdir(parents=True, exist_ok=True)

                # Ensure ../chrome points to snapshot chrome session (contains target_id.txt)
                chrome_dir = singlefile_output_dir.parent / 'chrome'
                if not chrome_dir.exists():
                    chrome_dir.symlink_to(snapshot_chrome_dir)

                env.update({
                    'SINGLEFILE_ENABLED': 'true',
                    'SINGLEFILE_BINARY': '/nonexistent/single-file',  # force extension path
                    'CHROME_EXTENSIONS_DIR': str(extensions_dir),
                    'CHROME_DOWNLOADS_DIR': str(downloads_dir),
                    'CHROME_HEADLESS': 'false',
                })

                # Track downloads dir state before run to ensure file is created then moved out
                downloads_before = set(downloads_dir.glob('*.html'))
                downloads_mtime_before = downloads_dir.stat().st_mtime_ns

                hook_cmd = [
                    sys.executable,
                    str(SNAPSHOT_HOOK),
                    f'--url={TEST_URL}',
                    '--snapshot-id=singlefile-ext-snap',
                ]
                hook_proc = subprocess.run(
                    hook_cmd,
                    cwd=str(singlefile_output_dir),
                    capture_output=True,
                    text=True,
                    env=env,
                    timeout=120,
                )
                assert hook_proc.returncode == 0, f"SingleFile extension run failed: {hook_proc.stderr}"

                output_file = singlefile_output_dir / 'singlefile.html'
                assert output_file.exists(), (
                    f"singlefile.html not created. stdout: {hook_proc.stdout}, stderr: {hook_proc.stderr}"
                )
                html_content = output_file.read_text(errors='ignore')
                assert 'Example Domain' in html_content, "Output should contain example.com content"

                # Verify download moved out of downloads dir
                new_downloads = set(downloads_dir.glob('*.html')) - downloads_before
                downloads_mtime_after = downloads_dir.stat().st_mtime_ns
                assert downloads_mtime_after != downloads_mtime_before, "Downloads dir should be modified during extension save"
                assert not new_downloads, f"SingleFile download should be moved out of downloads dir, found: {new_downloads}"
        finally:
            # Put the process environment back exactly as it was captured.
            os.environ.clear()
            os.environ.update(old_env)
|
||||
|
||||
|
||||
def test_singlefile_disabled_skips():
|
||||
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
Reference in New Issue
Block a user