#!/usr/bin/env node /** * Chrome Extension Management Utilities * * Handles downloading, installing, and managing Chrome extensions for browser automation. * Ported from the TypeScript implementation in archivebox.ts */ const fs = require('fs'); const path = require('path'); const crypto = require('crypto'); const http = require('http'); const net = require('net'); const { exec, spawn } = require('child_process'); const { promisify } = require('util'); const { Readable } = require('stream'); const { finished } = require('stream/promises'); const execAsync = promisify(exec); // ============================================================================ // Environment helpers // ============================================================================ /** * Get environment variable with default value. * @param {string} name - Environment variable name * @param {string} [defaultValue=''] - Default value if not set * @returns {string} - Trimmed environment variable value */ function getEnv(name, defaultValue = '') { return (process.env[name] || defaultValue).trim(); } /** * Get boolean environment variable. * @param {string} name - Environment variable name * @param {boolean} [defaultValue=false] - Default value if not set * @returns {boolean} - Boolean value */ function getEnvBool(name, defaultValue = false) { const val = getEnv(name, '').toLowerCase(); if (['true', '1', 'yes', 'on'].includes(val)) return true; if (['false', '0', 'no', 'off'].includes(val)) return false; return defaultValue; } /** * Get integer environment variable. * @param {string} name - Environment variable name * @param {number} [defaultValue=0] - Default value if not set * @returns {number} - Integer value */ function getEnvInt(name, defaultValue = 0) { const val = parseInt(getEnv(name, String(defaultValue)), 10); return isNaN(val) ? defaultValue : val; } /** * Get array environment variable (JSON array or comma-separated string). 
* * Parsing strategy: * - If value starts with '[', parse as JSON array * - Otherwise, parse as comma-separated values * * This prevents incorrect splitting of arguments that contain internal commas. * For arguments with commas, use JSON format: * CHROME_ARGS='["--user-data-dir=/path/with,comma", "--window-size=1440,900"]' * * @param {string} name - Environment variable name * @param {string[]} [defaultValue=[]] - Default value if not set * @returns {string[]} - Array of strings */ function getEnvArray(name, defaultValue = []) { const val = getEnv(name, ''); if (!val) return defaultValue; // If starts with '[', parse as JSON array if (val.startsWith('[')) { try { const parsed = JSON.parse(val); if (Array.isArray(parsed)) return parsed; } catch (e) { console.error(`[!] Failed to parse ${name} as JSON array: ${e.message}`); // Fall through to comma-separated parsing } } // Parse as comma-separated values return val.split(',').map(s => s.trim()).filter(Boolean); } /** * Parse resolution string into width/height. * @param {string} resolution - Resolution string like "1440,2000" * @returns {{width: number, height: number}} - Parsed dimensions */ function parseResolution(resolution) { const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); return { width: width || 1440, height: height || 2000 }; } // ============================================================================ // PID file management // ============================================================================ /** * Write PID file with specific mtime for process validation. 
* @param {string} filePath - Path to PID file * @param {number} pid - Process ID * @param {number} startTimeSeconds - Process start time in seconds */ function writePidWithMtime(filePath, pid, startTimeSeconds) { fs.writeFileSync(filePath, String(pid)); const startTimeMs = startTimeSeconds * 1000; fs.utimesSync(filePath, new Date(startTimeMs), new Date(startTimeMs)); } /** * Write a shell script that can re-run the Chrome command. * @param {string} filePath - Path to script file * @param {string} binary - Chrome binary path * @param {string[]} args - Chrome arguments */ function writeCmdScript(filePath, binary, args) { const escape = (arg) => arg.includes(' ') || arg.includes('"') || arg.includes('$') ? `"${arg.replace(/"/g, '\\"')}"` : arg; fs.writeFileSync( filePath, `#!/bin/bash\n${binary} ${args.map(escape).join(' ')}\n` ); fs.chmodSync(filePath, 0o755); } // ============================================================================ // Port management // ============================================================================ /** * Find a free port on localhost. * @returns {Promise} - Available port number */ function findFreePort() { return new Promise((resolve, reject) => { const server = net.createServer(); server.unref(); server.on('error', reject); server.listen(0, () => { const port = server.address().port; server.close(() => resolve(port)); }); }); } /** * Wait for Chrome's DevTools port to be ready. 
* @param {number} port - Debug port number * @param {number} [timeout=30000] - Timeout in milliseconds * @returns {Promise} - Chrome version info */ function waitForDebugPort(port, timeout = 30000) { const startTime = Date.now(); return new Promise((resolve, reject) => { const tryConnect = () => { if (Date.now() - startTime > timeout) { reject(new Error(`Timeout waiting for Chrome debug port ${port}`)); return; } const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => { let data = ''; res.on('data', (chunk) => (data += chunk)); res.on('end', () => { try { const info = JSON.parse(data); resolve(info); } catch (e) { setTimeout(tryConnect, 100); } }); }); req.on('error', () => { setTimeout(tryConnect, 100); }); req.setTimeout(1000, () => { req.destroy(); setTimeout(tryConnect, 100); }); }; tryConnect(); }); } // ============================================================================ // Zombie process cleanup // ============================================================================ /** * Kill zombie Chrome processes from stale crawls. * Recursively scans DATA_DIR for any .../chrome/...pid files from stale crawls. * Does not assume specific directory structure - works with nested paths. 
* @param {string} [dataDir] - Data directory (defaults to DATA_DIR env or '.') * @returns {number} - Number of zombies killed */ function killZombieChrome(dataDir = null) { dataDir = dataDir || getEnv('DATA_DIR', '.'); const now = Date.now(); const fiveMinutesAgo = now - 300000; let killed = 0; console.error('[*] Checking for zombie Chrome processes...'); if (!fs.existsSync(dataDir)) { console.error('[+] No data directory found'); return 0; } /** * Recursively find all chrome/.pid files in directory tree * @param {string} dir - Directory to search * @param {number} depth - Current recursion depth (limit to 10) * @returns {Array<{pidFile: string, crawlDir: string}>} - Array of PID file info */ function findChromePidFiles(dir, depth = 0) { if (depth > 10) return []; // Prevent infinite recursion const results = []; try { const entries = fs.readdirSync(dir, { withFileTypes: true }); for (const entry of entries) { if (!entry.isDirectory()) continue; const fullPath = path.join(dir, entry.name); // Found a chrome directory - check for .pid files if (entry.name === 'chrome') { try { const pidFiles = fs.readdirSync(fullPath).filter(f => f.endsWith('.pid')); const crawlDir = dir; // Parent of chrome/ is the crawl dir for (const pidFileName of pidFiles) { results.push({ pidFile: path.join(fullPath, pidFileName), crawlDir: crawlDir, }); } } catch (e) { // Skip if can't read chrome dir } } else { // Recurse into subdirectory (skip hidden dirs and node_modules) if (!entry.name.startsWith('.') && entry.name !== 'node_modules') { results.push(...findChromePidFiles(fullPath, depth + 1)); } } } } catch (e) { // Skip if can't read directory } return results; } try { const chromePids = findChromePidFiles(dataDir); for (const {pidFile, crawlDir} of chromePids) { // Check if crawl was modified recently (still active) try { const crawlStats = fs.statSync(crawlDir); if (crawlStats.mtimeMs > fiveMinutesAgo) { continue; // Crawl is active, skip } } catch (e) { continue; } // Crawl is 
stale, check PID try { const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10); if (isNaN(pid) || pid <= 0) continue; // Check if process exists try { process.kill(pid, 0); } catch (e) { // Process dead, remove stale PID file try { fs.unlinkSync(pidFile); } catch (e) {} continue; } // Process alive and crawl is stale - zombie! console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${path.basename(crawlDir)}`); try { try { process.kill(-pid, 'SIGKILL'); } catch (e) { process.kill(pid, 'SIGKILL'); } killed++; console.error(`[+] Killed zombie (PID ${pid})`); try { fs.unlinkSync(pidFile); } catch (e) {} } catch (e) { console.error(`[!] Failed to kill PID ${pid}: ${e.message}`); } } catch (e) { // Skip invalid PID files } } } catch (e) { console.error(`[!] Error scanning for Chrome processes: ${e.message}`); } if (killed > 0) { console.error(`[+] Killed ${killed} zombie process(es)`); } else { console.error('[+] No zombies found'); } // Clean up stale SingletonLock files from persona chrome_user_data directories const personasDir = path.join(dataDir, 'personas'); if (fs.existsSync(personasDir)) { try { const personas = fs.readdirSync(personasDir, { withFileTypes: true }); for (const persona of personas) { if (!persona.isDirectory()) continue; const userDataDir = path.join(personasDir, persona.name, 'chrome_user_data'); const singletonLock = path.join(userDataDir, 'SingletonLock'); if (fs.existsSync(singletonLock)) { try { fs.unlinkSync(singletonLock); console.error(`[+] Removed stale SingletonLock: ${singletonLock}`); } catch (e) { // Ignore - may be in use by active Chrome } } } } catch (e) { // Ignore errors scanning personas directory } } return killed; } // ============================================================================ // Chrome launching // ============================================================================ /** * Launch Chromium with extensions and return connection info. 
* * @param {Object} options - Launch options * @param {string} [options.binary] - Chrome binary path (auto-detected if not provided) * @param {string} [options.outputDir='chrome'] - Directory for output files * @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions * @param {string} [options.resolution='1440,2000'] - Window resolution * @param {boolean} [options.headless=true] - Run in headless mode * @param {boolean} [options.sandbox=true] - Enable Chrome sandbox * @param {boolean} [options.checkSsl=true] - Check SSL certificates * @param {string[]} [options.extensionPaths=[]] - Paths to unpacked extensions * @param {boolean} [options.killZombies=true] - Kill zombie processes first * @returns {Promise} - {success, cdpUrl, pid, port, process, error} */ async function launchChromium(options = {}) { const { binary = findChromium(), outputDir = 'chrome', userDataDir = getEnv('CHROME_USER_DATA_DIR'), resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'), headless = getEnvBool('CHROME_HEADLESS', true), sandbox = getEnvBool('CHROME_SANDBOX', true), checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)), extensionPaths = [], killZombies = true, } = options; if (!binary) { return { success: false, error: 'Chrome binary not found' }; } // Kill zombies first if (killZombies) { killZombieChrome(); } const { width, height } = parseResolution(resolution); // Create output directory if (!fs.existsSync(outputDir)) { fs.mkdirSync(outputDir, { recursive: true }); } // Create user data directory if specified and doesn't exist if (userDataDir) { if (!fs.existsSync(userDataDir)) { fs.mkdirSync(userDataDir, { recursive: true }); console.error(`[*] Created user data directory: ${userDataDir}`); } // Clean up any stale SingletonLock file from previous crashed sessions const singletonLock = path.join(userDataDir, 'SingletonLock'); if (fs.existsSync(singletonLock)) { try { 
fs.unlinkSync(singletonLock); console.error(`[*] Removed stale SingletonLock: ${singletonLock}`); } catch (e) { console.error(`[!] Failed to remove SingletonLock: ${e.message}`); } } } // Find a free port const debugPort = await findFreePort(); console.error(`[*] Using debug port: ${debugPort}`); // Get base Chrome args from config (static flags from CHROME_ARGS env var) // These come from config.json defaults, merged by get_config() in Python const baseArgs = getEnvArray('CHROME_ARGS', []); // Get extra user-provided args const extraArgs = getEnvArray('CHROME_ARGS_EXTRA', []); // Build dynamic Chrome arguments (these must be computed at runtime) const dynamicArgs = [ // Remote debugging setup `--remote-debugging-port=${debugPort}`, '--remote-debugging-address=127.0.0.1', // Sandbox settings (disable in Docker) ...(sandbox ? [] : ['--no-sandbox', '--disable-setuid-sandbox']), // Docker-specific workarounds '--disable-dev-shm-usage', '--disable-gpu', // Window size `--window-size=${width},${height}`, // User data directory (for persistent sessions with persona) ...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []), // Headless mode ...(headless ? ['--headless=new'] : []), // SSL certificate checking ...(checkSsl ? 
[] : ['--ignore-certificate-errors']), ]; // Combine all args: base (from config) + dynamic (runtime) + extra (user overrides) // Dynamic args come after base so they can override if needed const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs]; // Add extension loading flags if (extensionPaths.length > 0) { const extPathsArg = extensionPaths.join(','); chromiumArgs.push(`--load-extension=${extPathsArg}`); chromiumArgs.push('--enable-unsafe-extension-debugging'); chromiumArgs.push('--disable-features=DisableLoadExtensionCommandLineSwitch,ExtensionManifestV2Unsupported,ExtensionManifestV2Disabled'); console.error(`[*] Loading ${extensionPaths.length} extension(s) via --load-extension`); } chromiumArgs.push('about:blank'); // Write command script for debugging writeCmdScript(path.join(outputDir, 'cmd.sh'), binary, chromiumArgs); try { console.error(`[*] Spawning Chromium (headless=${headless})...`); const chromiumProcess = spawn(binary, chromiumArgs, { stdio: ['ignore', 'pipe', 'pipe'], detached: true, }); const chromePid = chromiumProcess.pid; const chromeStartTime = Date.now() / 1000; if (chromePid) { console.error(`[*] Chromium spawned (PID: ${chromePid})`); writePidWithMtime(path.join(outputDir, 'chrome.pid'), chromePid, chromeStartTime); } // Pipe Chrome output to stderr chromiumProcess.stdout.on('data', (data) => { process.stderr.write(`[chromium:stdout] ${data}`); }); chromiumProcess.stderr.on('data', (data) => { process.stderr.write(`[chromium:stderr] ${data}`); }); // Wait for debug port console.error(`[*] Waiting for debug port ${debugPort}...`); const versionInfo = await waitForDebugPort(debugPort, 30000); const wsUrl = versionInfo.webSocketDebuggerUrl; console.error(`[+] Chromium ready: ${wsUrl}`); fs.writeFileSync(path.join(outputDir, 'cdp_url.txt'), wsUrl); fs.writeFileSync(path.join(outputDir, 'port.txt'), String(debugPort)); return { success: true, cdpUrl: wsUrl, pid: chromePid, port: debugPort, process: chromiumProcess, }; } catch (e) { 
return { success: false, error: `${e.name}: ${e.message}` }; } } /** * Check if a process is still running. * @param {number} pid - Process ID to check * @returns {boolean} - True if process exists */ function isProcessAlive(pid) { try { process.kill(pid, 0); // Signal 0 checks existence without killing return true; } catch (e) { return false; } } /** * Find all Chrome child processes for a given debug port. * @param {number} port - Debug port number * @returns {Array} - Array of PIDs */ function findChromeProcessesByPort(port) { const { execSync } = require('child_process'); const pids = []; try { // Find all Chrome processes using this debug port const output = execSync( `ps aux | grep -i "chrome.*--remote-debugging-port=${port}" | grep -v grep | awk '{print $2}'`, { encoding: 'utf8', timeout: 5000 } ); for (const line of output.split('\n')) { const pid = parseInt(line.trim(), 10); if (!isNaN(pid) && pid > 0) { pids.push(pid); } } } catch (e) { // Command failed or no processes found } return pids; } /** * Kill a Chrome process by PID. * Always sends SIGTERM before SIGKILL, then verifies death. * * @param {number} pid - Process ID to kill * @param {string} [outputDir] - Directory containing PID files to clean up */ async function killChrome(pid, outputDir = null) { if (!pid) return; console.error(`[*] Killing Chrome process tree (PID ${pid})...`); // Get debug port for finding child processes let debugPort = null; if (outputDir) { try { const portFile = path.join(outputDir, 'port.txt'); if (fs.existsSync(portFile)) { debugPort = parseInt(fs.readFileSync(portFile, 'utf8').trim(), 10); } } catch (e) {} } // Step 1: SIGTERM to process group (graceful shutdown) console.error(`[*] Sending SIGTERM to process group -${pid}...`); try { process.kill(-pid, 'SIGTERM'); } catch (e) { try { console.error(`[*] Process group kill failed, trying single process...`); process.kill(pid, 'SIGTERM'); } catch (e2) { console.error(`[!] 
SIGTERM failed: ${e2.message}`); } } // Step 2: Wait for graceful shutdown await new Promise(resolve => setTimeout(resolve, 2000)); // Step 3: Check if still alive if (!isProcessAlive(pid)) { console.error('[+] Chrome process terminated gracefully'); } else { // Step 4: Force kill ENTIRE process group with SIGKILL console.error(`[*] Process still alive, sending SIGKILL to process group -${pid}...`); try { process.kill(-pid, 'SIGKILL'); // Kill entire process group } catch (e) { console.error(`[!] Process group SIGKILL failed, trying single process: ${e.message}`); try { process.kill(pid, 'SIGKILL'); } catch (e2) { console.error(`[!] SIGKILL failed: ${e2.message}`); } } // Step 5: Wait briefly and verify death await new Promise(resolve => setTimeout(resolve, 1000)); if (isProcessAlive(pid)) { console.error(`[!] WARNING: Process ${pid} is unkillable (likely in UNE state)`); console.error(`[!] This typically happens when Chrome crashes in kernel syscall`); console.error(`[!] Process will remain as zombie until system reboot`); console.error(`[!] 
macOS IOSurface crash creates unkillable processes in UNE state`); // Try one more time to kill the entire process group if (debugPort) { const relatedPids = findChromeProcessesByPort(debugPort); if (relatedPids.length > 1) { console.error(`[*] Found ${relatedPids.length} Chrome processes still running on port ${debugPort}`); console.error(`[*] Attempting final process group SIGKILL...`); // Try to kill each unique process group we find const processGroups = new Set(); for (const relatedPid of relatedPids) { if (relatedPid !== pid) { processGroups.add(relatedPid); } } for (const groupPid of processGroups) { try { process.kill(-groupPid, 'SIGKILL'); } catch (e) {} } } } } else { console.error('[+] Chrome process group killed successfully'); } } // Step 8: Clean up PID files // Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup() if (outputDir) { try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {} } console.error('[*] Chrome cleanup completed'); } /** * Install Chromium using @puppeteer/browsers programmatic API. * Uses puppeteer's default cache location, returns the binary path. * * @param {Object} options - Install options * @returns {Promise} - {success, binary, version, error} */ async function installChromium(options = {}) { // Check if CHROME_BINARY is already set and valid const configuredBinary = getEnv('CHROME_BINARY'); if (configuredBinary && fs.existsSync(configuredBinary)) { console.error(`[+] Using configured CHROME_BINARY: ${configuredBinary}`); return { success: true, binary: configuredBinary, version: null }; } // Try to load @puppeteer/browsers from NODE_MODULES_DIR or system let puppeteerBrowsers; try { if (process.env.NODE_MODULES_DIR) { module.paths.unshift(process.env.NODE_MODULES_DIR); } puppeteerBrowsers = require('@puppeteer/browsers'); } catch (e) { console.error(`[!] @puppeteer/browsers not found. 
Install it first with installPuppeteerCore.`); return { success: false, error: '@puppeteer/browsers not installed' }; } console.error(`[*] Installing Chromium via @puppeteer/browsers...`); try { const result = await puppeteerBrowsers.install({ browser: 'chromium', buildId: 'latest', }); const binary = result.executablePath; const version = result.buildId; if (!binary || !fs.existsSync(binary)) { console.error(`[!] Chromium binary not found at: ${binary}`); return { success: false, error: `Chromium binary not found at: ${binary}` }; } console.error(`[+] Chromium installed: ${binary}`); return { success: true, binary, version }; } catch (e) { console.error(`[!] Failed to install Chromium: ${e.message}`); return { success: false, error: e.message }; } } /** * Install puppeteer-core npm package. * * @param {Object} options - Install options * @param {string} [options.npmPrefix] - npm prefix directory (default: DATA_DIR/lib//npm or ./node_modules parent) * @param {number} [options.timeout=60000] - Timeout in milliseconds * @returns {Promise} - {success, path, error} */ async function installPuppeteerCore(options = {}) { const arch = `${process.arch}-${process.platform}`; const defaultPrefix = path.join(getEnv('LIB_DIR', getEnv('DATA_DIR', '.')), 'npm'); const { npmPrefix = defaultPrefix, timeout = 60000, } = options; const nodeModulesDir = path.join(npmPrefix, 'node_modules'); const puppeteerPath = path.join(nodeModulesDir, 'puppeteer-core'); // Check if already installed if (fs.existsSync(puppeteerPath)) { console.error(`[+] puppeteer-core already installed: ${puppeteerPath}`); return { success: true, path: puppeteerPath }; } console.error(`[*] Installing puppeteer-core to ${npmPrefix}...`); // Create directory if (!fs.existsSync(npmPrefix)) { fs.mkdirSync(npmPrefix, { recursive: true }); } try { const { execSync } = require('child_process'); execSync( `npm install --prefix "${npmPrefix}" puppeteer-core`, { encoding: 'utf8', timeout, stdio: ['pipe', 'pipe', 'pipe'] } 
); console.error(`[+] puppeteer-core installed successfully`); return { success: true, path: puppeteerPath }; } catch (e) { console.error(`[!] Failed to install puppeteer-core: ${e.message}`); return { success: false, error: e.message }; } } // Try to import unzipper, fallback to system unzip if not available let unzip = null; try { const unzipper = require('unzipper'); unzip = async (sourcePath, destPath) => { const stream = fs.createReadStream(sourcePath).pipe(unzipper.Extract({ path: destPath })); return stream.promise(); }; } catch (err) { // Will use system unzip command as fallback } /** * Compute the extension ID from the unpacked path. * Chrome uses a SHA256 hash of the unpacked extension directory path to compute a dynamic id. * * @param {string} unpacked_path - Path to the unpacked extension directory * @returns {string} - 32-character extension ID */ function getExtensionId(unpacked_path) { // Chrome uses a SHA256 hash of the unpacked extension directory path const hash = crypto.createHash('sha256'); hash.update(Buffer.from(unpacked_path, 'utf-8')); // Convert first 32 hex chars to characters in the range 'a'-'p' const detected_extension_id = Array.from(hash.digest('hex')) .slice(0, 32) .map(i => String.fromCharCode(parseInt(i, 16) + 'a'.charCodeAt(0))) .join(''); return detected_extension_id; } /** * Download and install a Chrome extension from the Chrome Web Store. 
* * @param {Object} extension - Extension metadata object * @param {string} extension.webstore_id - Chrome Web Store extension ID * @param {string} extension.name - Human-readable extension name * @param {string} extension.crx_url - URL to download the CRX file * @param {string} extension.crx_path - Local path to save the CRX file * @param {string} extension.unpacked_path - Path to extract the extension * @returns {Promise} - True if installation succeeded */ async function installExtension(extension) { const manifest_path = path.join(extension.unpacked_path, 'manifest.json'); // Download CRX file if not already downloaded if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) { console.log(`[🛠️] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`); try { // Ensure parent directory exists const crxDir = path.dirname(extension.crx_path); if (!fs.existsSync(crxDir)) { fs.mkdirSync(crxDir, { recursive: true }); } // Download CRX file from Chrome Web Store const response = await fetch(extension.crx_url); if (!response.ok) { console.warn(`[⚠️] Failed to download extension ${extension.name}: HTTP ${response.status}`); return false; } if (response.body) { const crx_file = fs.createWriteStream(extension.crx_path); const crx_stream = Readable.fromWeb(response.body); await finished(crx_stream.pipe(crx_file)); } else { console.warn(`[⚠️] Failed to download extension ${extension.name}: No response body`); return false; } } catch (err) { console.error(`[❌] Failed to download extension ${extension.name}:`, err); return false; } } // Unzip CRX file to unpacked_path (CRX files have extra header bytes but unzip handles it) await fs.promises.mkdir(extension.unpacked_path, { recursive: true }); try { // Use -q to suppress warnings about extra bytes in CRX header await execAsync(`/usr/bin/unzip -q -o "${extension.crx_path}" -d "${extension.unpacked_path}"`); } catch (err1) { // unzip may return non-zero even on success 
due to CRX header warning, check if manifest exists if (!fs.existsSync(manifest_path)) { if (unzip) { // Fallback to unzipper library try { await unzip(extension.crx_path, extension.unpacked_path); } catch (err2) { console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err2.message); return false; } } else { console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message); return false; } } } if (!fs.existsSync(manifest_path)) { console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`); return false; } return true; } /** * Load or install a Chrome extension, computing all metadata. * * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path) * @param {string} [ext.webstore_id] - Chrome Web Store extension ID * @param {string} [ext.name] - Human-readable extension name * @param {string} [ext.unpacked_path] - Path to unpacked extension * @param {string} [extensions_dir] - Directory to store extensions * @returns {Promise} - Complete extension metadata object */ async function loadOrInstallExtension(ext, extensions_dir = null) { if (!(ext.webstore_id || ext.unpacked_path)) { throw new Error('Extension must have either {webstore_id} or {unpacked_path}'); } // Determine extensions directory // Use provided dir, or fall back to getExtensionsDir() which handles env vars and defaults const EXTENSIONS_DIR = extensions_dir || getExtensionsDir(); // Set statically computable extension metadata ext.webstore_id = ext.webstore_id || ext.id; ext.name = ext.name || ext.webstore_id; ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`; ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`; ext.crx_path = ext.crx_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`); ext.unpacked_path = ext.unpacked_path || 
path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`); const manifest_path = path.join(ext.unpacked_path, 'manifest.json'); ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8')); ext.read_version = () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null; // If extension is not installed, download and unpack it if (!ext.read_version()) { await installExtension(ext); } // Autodetect ID from filesystem path (unpacked extensions don't have stable IDs) ext.id = getExtensionId(ext.unpacked_path); ext.version = ext.read_version(); if (!ext.version) { console.warn(`[❌] Unable to detect ID and version of installed extension ${ext.unpacked_path}`); } else { console.log(`[➕] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`); } return ext; } /** * Check if a Puppeteer target is an extension background page/service worker. * * @param {Object} target - Puppeteer target object * @returns {Promise} - Object with target_is_bg, extension_id, manifest_version, etc. 
*/ async function isTargetExtension(target) { let target_type; let target_ctx; let target_url; try { target_type = target.type(); target_ctx = (await target.worker()) || (await target.page()) || null; target_url = target.url() || target_ctx?.url() || null; } catch (err) { if (String(err).includes('No target with given id found')) { // Target closed during check, ignore harmless race condition target_type = 'closed'; target_ctx = null; target_url = 'about:closed'; } else { throw err; } } // Check if this is an extension background page or service worker const is_chrome_extension = target_url?.startsWith('chrome-extension://'); const is_background_page = target_type === 'background_page'; const is_service_worker = target_type === 'service_worker'; const target_is_bg = is_chrome_extension && (is_background_page || is_service_worker); let extension_id = null; let manifest_version = null; const target_is_extension = is_chrome_extension || target_is_bg; if (target_is_extension) { try { extension_id = target_url?.split('://')[1]?.split('/')[0] || null; if (target_ctx) { const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest()); manifest_version = manifest?.manifest_version || null; } } catch (err) { // Failed to get extension metadata } } return { target_is_extension, target_is_bg, target_type, target_ctx, target_url, extension_id, manifest_version, }; } /** * Load extension metadata and connection handlers from a browser target. 
* * @param {Array} extensions - Array of extension metadata objects to update * @param {Object} target - Puppeteer target object * @returns {Promise} - Updated extension object or null if not an extension */ async function loadExtensionFromTarget(extensions, target) { const { target_is_bg, target_is_extension, target_type, target_ctx, target_url, extension_id, manifest_version, } = await isTargetExtension(target); if (!(target_is_bg && extension_id && target_ctx)) { return null; } // Find matching extension in our list const extension = extensions.find(ext => ext.id === extension_id); if (!extension) { console.warn(`[⚠️] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`); return null; } // Load manifest from the extension context let manifest = null; try { manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest()); } catch (err) { console.error(`[❌] Failed to read manifest for extension ${extension_id}:`, err); return null; } // Create dispatch methods for communicating with the extension const new_extension = { ...extension, target, target_type, target_url, manifest, manifest_version, // Trigger extension toolbar button click dispatchAction: async (tab) => { return await target_ctx.evaluate((tabId) => { return new Promise((resolve) => { chrome.action.onClicked.addListener((tab) => { resolve({ success: true, tab }); }); chrome.action.openPopup(); }); }, tab?.id || null); }, // Send message to extension dispatchMessage: async (message, options = {}) => { return await target_ctx.evaluate((msg, opts) => { return new Promise((resolve) => { chrome.runtime.sendMessage(msg, opts, (response) => { resolve(response); }); }); }, message, options); }, // Trigger extension command (keyboard shortcut) dispatchCommand: async (command) => { return await target_ctx.evaluate((cmd) => { return new Promise((resolve) => { chrome.commands.onCommand.addListener((receivedCommand) => { if (receivedCommand === cmd) { resolve({ success: true, command: 
/**
 * Install all extensions in the list if not already installed.
 *
 * Extensions are handled one at a time, in array order, by delegating to
 * loadOrInstallExtension().
 *
 * @param {Array<Object>} extensions - Array of extension metadata objects
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Array<Object>>} - The same `extensions` array that was passed in
 */
async function installAllExtensions(extensions, extensions_dir = null) {
    console.log(`[⚙️] Installing ${extensions.length} chrome extensions...`);

    for (const extension of extensions) {
        await loadOrInstallExtension(extension, extensions_dir);
    }

    return extensions;
}

/**
 * Load and connect to all extensions from a running browser.
 *
 * Every target currently reported by the browser is passed, sequentially, to
 * loadExtensionFromTarget() together with the extensions list.
 *
 * @param {Object} browser - Puppeteer browser instance
 * @param {Array<Object>} extensions - Array of extension metadata objects
 * @returns {Promise<Array<Object>>} - The same `extensions` array that was passed in
 */
async function loadAllExtensionsFromBrowser(browser, extensions) {
    console.log(`[⚙️] Loading ${extensions.length} chrome extensions from browser...`);

    // Find loaded extensions at runtime by examining browser targets
    for (const target of browser.targets()) {
        await loadExtensionFromTarget(extensions, target);
    }

    return extensions;
}

/**
 * Load an extension's manifest.json file.
 *
 * @param {string} unpacked_path - Path to unpacked extension directory
 * @returns {object|null} - Parsed manifest object, or null if the file is
 *   missing, unreadable, or not valid JSON
 */
function loadExtensionManifest(unpacked_path) {
    const manifest_path = path.join(unpacked_path, 'manifest.json');

    if (!fs.existsSync(manifest_path)) {
        return null;
    }

    try {
        return JSON.parse(fs.readFileSync(manifest_path, 'utf-8'));
    } catch (error) {
        // Invalid JSON or read error: callers treat both as "no manifest"
        return null;
    }
}

/**
 * @deprecated Use puppeteer's enableExtensions option instead.
 *
 * Generate Chrome launch arguments for loading extensions.
 * NOTE: This is deprecated. Use puppeteer.launch({ pipe: true, enableExtensions: [paths] }) instead.
 *
 * Extensions without an `unpacked_path` are ignored. When no usable extension
 * remains, an empty array is returned so that Chrome is never started with an
 * empty `--load-extension=` value.
 *
 * @param {Array<Object>} extensions - Array of extension metadata objects
 * @returns {Array<string>} - Chrome CLI arguments for loading extensions
 */
function getExtensionLaunchArgs(extensions) {
    console.warn('[DEPRECATED] getExtensionLaunchArgs is deprecated. Use puppeteer enableExtensions option instead.');
    if (!extensions || extensions.length === 0) {
        return [];
    }

    // Only extensions that have been unpacked to disk can be loaded
    const validExtensions = extensions.filter(ext => ext.unpacked_path);
    if (validExtensions.length === 0) {
        return [];
    }

    const unpacked_paths = validExtensions.map(ext => ext.unpacked_path);
    // Prefer the id already recorded on the extension; otherwise derive it
    // from the unpacked path via getExtensionId()
    const extension_ids = validExtensions.map(ext => ext.id || getExtensionId(ext.unpacked_path));

    return [
        `--load-extension=${unpacked_paths.join(',')}`,
        `--allowlisted-extension-id=${extension_ids.join(',')}`,
        '--allow-legacy-extension-manifests',
        '--disable-extensions-auto-update',
    ];
}

/**
 * Get extension paths for use with puppeteer's enableExtensions option.
 * Following puppeteer best practices: https://pptr.dev/guides/chrome-extensions
 *
 * @param {Array<Object>} extensions - Array of extension metadata objects
 * @returns {Array<string>} - Unpacked paths of the extensions that have one
 */
function getExtensionPaths(extensions) {
    if (!extensions || extensions.length === 0) {
        return [];
    }
    return extensions
        .filter(ext => ext.unpacked_path)
        .map(ext => ext.unpacked_path);
}
/**
 * Wait for an extension target to be available in the browser.
 * Following puppeteer best practices for accessing extension contexts.
 *
 * For Manifest V3 extensions (service workers):
 *   const worker = await waitForExtensionTarget(browser, extensionId);
 *   // worker is a WebWorker context
 *
 * For Manifest V2 extensions (background pages):
 *   const page = await waitForExtensionTarget(browser, extensionId);
 *   // page is a Page context
 *
 * Lookup order: service worker, then background page, then any target whose
 * URL starts with the extension origin. Each of the three lookups is given the
 * full `timeout`, so the worst-case wait is roughly three times that value.
 *
 * @param {Object} browser - Puppeteer browser instance
 * @param {string} extensionId - Extension ID to wait for (computed from path hash)
 * @param {number} [timeout=30000] - Timeout in milliseconds for each lookup
 * @returns {Promise<Object>} - Worker or Page context for the extension
 * @throws {Error} - Whatever browser.waitForTarget() rejects with when the
 *   final fallback lookup also fails
 */
async function waitForExtensionTarget(browser, extensionId, timeout = 30000) {
    const extensionOrigin = `chrome-extension://${extensionId}`;

    // Preferred contexts, in priority order: MV3 service worker, MV2 background page
    const preferredContexts = [
        { targetType: 'service_worker', openContext: target => target.worker() },
        { targetType: 'background_page', openContext: target => target.page() },
    ];

    for (const { targetType, openContext } of preferredContexts) {
        try {
            const matched = await browser.waitForTarget(
                candidate => candidate.type() === targetType &&
                    candidate.url().includes(extensionOrigin),
                { timeout }
            );
            const context = await openContext(matched);
            if (context) return context;
        } catch (err) {
            // Nothing of this kind showed up (or its context could not be opened);
            // move on to the next kind
        }
    }

    // Last resort: accept any target that belongs to the extension
    const fallbackTarget = await browser.waitForTarget(
        candidate => candidate.url().startsWith(extensionOrigin),
        { timeout }
    );

    // Return worker or page depending on target type
    return fallbackTarget.type() === 'service_worker'
        ? await fallbackTarget.worker()
        : await fallbackTarget.page();
}
/**
 * Get all loaded extension targets from a browser.
 *
 * A target is included when its URL starts with `chrome-extension://` or when
 * its type is `service_worker` or `background_page` (regardless of URL).
 *
 * @param {Object} browser - Puppeteer browser instance
 * @returns {Array<{type: string, url: string, extensionId: (string|null)}>} -
 *   Array of extension target info objects; `extensionId` is null when the URL
 *   does not contain `chrome-extension://`
 */
function getExtensionTargets(browser) {
    return browser.targets()
        .filter(target =>
            target.url().startsWith('chrome-extension://') ||
            target.type() === 'service_worker' ||
            target.type() === 'background_page'
        )
        .map(target => ({
            type: target.type(),
            url: target.url(),
            extensionId: target.url().includes('chrome-extension://')
                ? target.url().split('chrome-extension://')[1]?.split('/')[0]
                : null,
        }));
}

/**
 * Find Chromium/Chrome binary path.
 * Checks CHROME_BINARY env var first, then falls back to system locations.
 *
 * A candidate is accepted only if it exists and running it with `--version`
 * succeeds within 5 seconds. The binary is executed directly (no shell), so
 * paths containing quotes, spaces or `$` are handled safely.
 *
 * @returns {string|null} - Absolute path to browser binary or null if not found
 */
function findChromium() {
    const { execFileSync } = require('child_process');

    // Helper to validate a binary by running --version
    const validateBinary = (binaryPath) => {
        if (!binaryPath || !fs.existsSync(binaryPath)) return false;
        try {
            execFileSync(binaryPath, ['--version'], { encoding: 'utf8', timeout: 5000, stdio: 'pipe' });
            return true;
        } catch (e) {
            // Not executable, crashed, or timed out: treat as "not a usable browser"
            return false;
        }
    };

    // 1. Check CHROME_BINARY env var first
    const chromeBinary = getEnv('CHROME_BINARY');
    if (chromeBinary) {
        const absPath = path.resolve(chromeBinary);
        if (validateBinary(absPath)) {
            return absPath;
        }
        console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`);
    } else {
        // 2. Warn that no CHROME_BINARY is configured, searching fallbacks
        console.error('[!] Warning: CHROME_BINARY not set, searching system locations...');
    }

    // Helper to find Chromium in @puppeteer/browsers directory structure
    const findInPuppeteerDir = (baseDir) => {
        if (!fs.existsSync(baseDir)) return null;

        let versions;
        try {
            versions = fs.readdirSync(baseDir);
        } catch (e) {
            // Unreadable cache directory (e.g. permissions, or not a directory):
            // best-effort search, skip it
            return null;
        }

        // Newest first. Numeric-aware comparison so that e.g. "137.x" ranks
        // above "99.x" (a plain string sort would order them the other way).
        const newestFirst = [...versions].sort(
            (a, b) => b.localeCompare(a, 'en', { numeric: true })
        );

        for (const version of newestFirst) {
            const versionDir = path.join(baseDir, version);
            const candidates = [
                path.join(versionDir, 'chrome-mac-arm64/Chromium.app/Contents/MacOS/Chromium'),
                path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
                path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'),
                path.join(versionDir, 'chrome-linux64/chrome'),
                path.join(versionDir, 'chrome-linux/chrome'),
            ];
            const found = candidates.find(candidate => fs.existsSync(candidate));
            if (found) return found;
        }
        return null;
    };

    // 3. Search fallback locations (Chromium first, then Chrome)
    const fallbackLocations = [
        // System Chromium
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        '/usr/bin/chromium',
        '/usr/bin/chromium-browser',
        // Puppeteer cache
        path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
        path.join(process.env.HOME || '', '.cache/puppeteer'),
        // Chrome (fallback - extensions may not work in 137+)
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
    ];

    for (const loc of fallbackLocations) {
        // Check if it's a puppeteer cache dir
        if (loc.includes('.cache/puppeteer')) {
            const binary = findInPuppeteerDir(loc);
            if (binary && validateBinary(binary)) {
                return binary;
            }
        } else if (validateBinary(loc)) {
            if (loc.includes('Google Chrome') || loc.includes('google-chrome')) {
                console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
            }
            return loc;
        }
    }

    return null;
}

// ============================================================================
// Shared Extension Installer Utilities
// ============================================================================

/**
 * Get the extensions directory path.
 * Centralized path calculation used by extension installers and chrome launch.
 *
 * Path is derived from environment variables in this priority:
 * 1. CHROME_EXTENSIONS_DIR (explicit override)
 * 2. DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions (default)
 *
 * DATA_DIR defaults to '.' and ACTIVE_PERSONA to 'Default'. The result is
 * returned as configured/joined; it is not resolved to an absolute path.
 *
 * @returns {string} - Path to extensions directory
 */
function getExtensionsDir() {
    const dataDir = getEnv('DATA_DIR', '.');
    const persona = getEnv('ACTIVE_PERSONA', 'Default');
    return getEnv('CHROME_EXTENSIONS_DIR') ||
        path.join(dataDir, 'personas', persona, 'chrome_extensions');
}

/**
 * Get machine type string for platform-specific paths.
 * Intended to match Python's archivebox.config.paths.get_machine_type()
 *
 * The MACHINE_TYPE environment variable, when set, is returned verbatim.
 * Otherwise the value is `${arch}-${process.platform}` with the architecture
 * normalized to one of 'arm64', 'x86_64' or 'x86' where recognized.
 *
 * @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin')
 */
function getMachineType() {
    if (process.env.MACHINE_TYPE) {
        return process.env.MACHINE_TYPE;
    }

    let machine = process.arch;
    const system = process.platform;

    // Normalize machine type to match Python's convention
    if (machine === 'arm64' || machine === 'aarch64') {
        machine = 'arm64';
    } else if (machine === 'x64' || machine === 'x86_64' || machine === 'amd64') {
        machine = 'x86_64';
    } else if (machine === 'ia32' || machine === 'x86') {
        machine = 'x86';
    }

    return `${machine}-${system}`;
}
/**
 * Get LIB_DIR path for platform-specific binaries.
 *
 * Uses the LIB_DIR environment variable when set; otherwise computes
 * DATA_DIR/lib/MACHINE_TYPE/ (DATA_DIR defaults to './data').
 *
 * @returns {string} - Absolute path to lib directory
 */
function getLibDir() {
    const override = process.env.LIB_DIR;
    if (override) {
        return path.resolve(override);
    }
    return path.resolve(path.join(getEnv('DATA_DIR', './data'), 'lib', getMachineType()));
}

/**
 * Get NODE_MODULES_DIR path for npm packages.
 *
 * Uses the NODE_MODULES_DIR environment variable when set; otherwise computes
 * LIB_DIR/npm/node_modules/.
 *
 * @returns {string} - Absolute path to node_modules directory
 */
function getNodeModulesDir() {
    const override = process.env.NODE_MODULES_DIR;
    return override
        ? path.resolve(override)
        : path.resolve(path.join(getLibDir(), 'npm', 'node_modules'));
}

/**
 * Get all test environment paths as a JSON-serializable object.
 * This is the single source of truth for path calculations - Python calls this
 * to avoid duplicating path logic.
 *
 * @returns {Object} - Object with keys DATA_DIR, MACHINE_TYPE, LIB_DIR,
 *   NODE_MODULES_DIR, NODE_PATH, NPM_BIN_DIR and CHROME_EXTENSIONS_DIR
 */
function getTestEnv() {
    const env = {};

    env.DATA_DIR = getEnv('DATA_DIR', './data');
    env.MACHINE_TYPE = getMachineType();
    env.LIB_DIR = getLibDir();
    env.NODE_MODULES_DIR = getNodeModulesDir();
    // Node.js uses NODE_PATH for module resolution
    env.NODE_PATH = env.NODE_MODULES_DIR;
    env.NPM_BIN_DIR = path.join(env.LIB_DIR, 'npm', '.bin');
    env.CHROME_EXTENSIONS_DIR = getExtensionsDir();

    return env;
}
/**
 * Install a Chrome extension with caching support.
 *
 * This is the main entry point for extension installer hooks. It handles:
 * - Checking for cached extension metadata
 * - Installing the extension if not cached
 * - Writing cache file for future runs
 *
 * The cache file is `<extensionsDir>/<extension.name>.extension.json`. A cache
 * entry is only trusted when the `unpacked_path` it records still contains a
 * manifest.json; otherwise the extension is (re-)installed.
 *
 * @param {Object} extension - Extension metadata object
 * @param {string} extension.webstore_id - Chrome Web Store extension ID
 * @param {string} extension.name - Human-readable extension name (used for cache file)
 * @param {Object} [options] - Options
 * @param {string} [options.extensionsDir] - Override extensions directory
 * @param {boolean} [options.quiet=false] - Suppress info logging
 * @returns {Promise<Object|null>} - Installed extension metadata or null on failure
 */
async function installExtensionWithCache(extension, options = {}) {
    const {
        extensionsDir = getExtensionsDir(),
        quiet = false,
    } = options;

    const cacheFile = path.join(extensionsDir, `${extension.name}.extension.json`);

    // Check if extension is already cached and valid
    if (fs.existsSync(cacheFile)) {
        try {
            const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
            const manifestPath = path.join(cached.unpacked_path, 'manifest.json');

            if (fs.existsSync(manifestPath)) {
                if (!quiet) {
                    console.log(`[*] ${extension.name} extension already installed (using cache)`);
                }
                return cached;
            }
        } catch (e) {
            // Cache file corrupted (bad JSON or missing unpacked_path), re-install
            console.warn(`[⚠️] Extension cache corrupted for ${extension.name}, re-installing...`);
        }
    }

    // Install extension
    if (!quiet) {
        console.log(`[*] Installing ${extension.name} extension...`);
    }

    const installedExt = await loadOrInstallExtension(extension, extensionsDir);

    // A result without a version is treated as a failed install
    if (!installedExt?.version) {
        console.error(`[❌] Failed to install ${extension.name} extension`);
        return null;
    }

    // Write cache file (best-effort: a failed write only costs a re-check next run)
    try {
        await fs.promises.mkdir(extensionsDir, { recursive: true });
        await fs.promises.writeFile(cacheFile, JSON.stringify(installedExt, null, 2));
        if (!quiet) {
            console.log(`[+] Extension metadata written to ${cacheFile}`);
        }
    } catch (e) {
        console.warn(`[⚠️] Failed to write cache file: ${e.message}`);
    }

    if (!quiet) {
        console.log(`[+] ${extension.name} extension installed`);
    }

    return installedExt;
}

// ============================================================================
// Snapshot Hook Utilities (for CDP-based plugins like ssl, responses, dns)
// ============================================================================

/**
 * Parse command line arguments into an object.
 * Handles --key=value and --flag formats.
 *
 * Dashes in keys become underscores (`--snapshot-id` -> `snapshot_id`).
 * Everything after the first `=` is the value, so values may themselves
 * contain `=`. A flag without a value, or with an empty value (`--key=`),
 * is stored as `true`. Arguments not starting with `--` are ignored.
 *
 * @returns {Object} - Parsed arguments object
 */
function parseArgs() {
    const args = {};
    for (const arg of process.argv.slice(2)) {
        if (!arg.startsWith('--')) continue;
        const [key, ...valueParts] = arg.slice(2).split('=');
        args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
    }
    return args;
}

/**
 * Wait for Chrome session files to be ready.
 * Polls (every 100ms) for cdp_url.txt and target_id.txt in the chrome session directory.
 *
 * The files are always checked at least once, including when `timeoutMs` is 0,
 * and once more after the deadline has passed.
 *
 * @param {string} chromeSessionDir - Path to chrome session directory (e.g., '../chrome')
 * @param {number} [timeoutMs=60000] - Timeout in milliseconds
 * @returns {Promise<boolean>} - True if files are ready, false if timeout
 */
async function waitForChromeSession(chromeSessionDir, timeoutMs = 60000) {
    const cdpFile = path.join(chromeSessionDir, 'cdp_url.txt');
    const targetIdFile = path.join(chromeSessionDir, 'target_id.txt');
    const isReady = () => fs.existsSync(cdpFile) && fs.existsSync(targetIdFile);

    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
        if (isReady()) {
            return true;
        }
        await new Promise(resolve => setTimeout(resolve, 100));
    }

    // Final check: covers timeoutMs <= 0 and files that appeared during the last sleep
    return isReady();
}

/**
 * Read CDP WebSocket URL from chrome session directory.
 *
 * @param {string} chromeSessionDir - Path to chrome session directory
 * @returns {string|null} - Trimmed contents of cdp_url.txt, or null if the file does not exist
 */
function readCdpUrl(chromeSessionDir) {
    const cdpFile = path.join(chromeSessionDir, 'cdp_url.txt');
    if (fs.existsSync(cdpFile)) {
        return fs.readFileSync(cdpFile, 'utf8').trim();
    }
    return null;
}
/**
 * Read target ID from chrome session directory.
 *
 * @param {string} chromeSessionDir - Path to chrome session directory
 * @returns {string|null} - Trimmed contents of target_id.txt, or null if the file does not exist
 */
function readTargetId(chromeSessionDir) {
    const targetIdFile = path.join(chromeSessionDir, 'target_id.txt');
    if (fs.existsSync(targetIdFile)) {
        return fs.readFileSync(targetIdFile, 'utf8').trim();
    }
    return null;
}

/**
 * Connect to Chrome browser and find the target page.
 * This is a high-level utility that handles all the connection logic:
 * 1. Wait for chrome session files
 * 2. Connect to browser via CDP
 * 3. Find the target page by ID
 *
 * If no page matches the recorded target ID, the last page reported by the
 * browser is used instead. If anything fails after the connection has been
 * established, the connection is closed again before the error is rethrown,
 * so a failed call does not leave a dangling CDP session.
 *
 * @param {Object} options - Connection options
 * @param {string} [options.chromeSessionDir='../chrome'] - Path to chrome session directory
 * @param {number} [options.timeoutMs=60000] - Timeout for waiting
 * @param {Object} [options.puppeteer] - Puppeteer module (must be passed in)
 * @returns {Promise<Object>} - { browser, page, targetId, cdpUrl }
 * @throws {Error} - If connection fails or page not found
 */
async function connectToPage(options = {}) {
    const {
        chromeSessionDir = '../chrome',
        timeoutMs = 60000,
        puppeteer,
    } = options;

    if (!puppeteer) {
        throw new Error('puppeteer module must be passed to connectToPage()');
    }

    // Wait for chrome session to be ready
    const sessionReady = await waitForChromeSession(chromeSessionDir, timeoutMs);
    if (!sessionReady) {
        throw new Error(`Chrome session not ready after ${timeoutMs/1000}s (chrome plugin must run first)`);
    }

    // Read session files
    const cdpUrl = readCdpUrl(chromeSessionDir);
    if (!cdpUrl) {
        throw new Error('No Chrome session found (cdp_url.txt missing)');
    }

    const targetId = readTargetId(chromeSessionDir);

    // Connect to browser
    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

    try {
        // Find the target page
        const pages = await browser.pages();
        let page = null;

        if (targetId) {
            page = pages.find(p => {
                const target = p.target();
                return target && target._targetId === targetId;
            });
        }

        // Fallback to last page if target not found
        if (!page) {
            page = pages[pages.length - 1];
        }

        if (!page) {
            throw new Error('No page found in browser');
        }

        return { browser, page, targetId, cdpUrl };
    } catch (err) {
        // Do not leak the CDP connection when we cannot hand the browser back.
        // A failure while disconnecting must not mask the original error.
        try {
            await browser.disconnect();
        } catch (disconnectErr) {
            // ignored on purpose
        }
        throw err;
    }
}

/**
 * Wait for page navigation to complete.
 * Polls (every 100ms) for page_loaded.txt marker file written by chrome_navigate.
 *
 * The timeout is measured against the wall clock, so slow event-loop turns do
 * not stretch the wait beyond roughly `timeoutMs` plus one poll interval.
 *
 * @param {string} chromeSessionDir - Path to chrome session directory
 * @param {number} [timeoutMs=120000] - Timeout in milliseconds
 * @param {number} [postLoadDelayMs=0] - Additional delay after page load marker
 * @returns {Promise<void>}
 * @throws {Error} - If timeout waiting for navigation
 */
async function waitForPageLoaded(chromeSessionDir, timeoutMs = 120000, postLoadDelayMs = 0) {
    const pageLoadedMarker = path.join(chromeSessionDir, 'page_loaded.txt');
    const pollInterval = 100;
    const deadline = Date.now() + timeoutMs;

    while (!fs.existsSync(pageLoadedMarker) && Date.now() < deadline) {
        await new Promise(resolve => setTimeout(resolve, pollInterval));
    }

    if (!fs.existsSync(pageLoadedMarker)) {
        throw new Error('Timeout waiting for navigation (chrome_navigate did not complete)');
    }

    // Optional post-load delay for late responses
    if (postLoadDelayMs > 0) {
        await new Promise(resolve => setTimeout(resolve, postLoadDelayMs));
    }
}
getMachineType, getLibDir, getNodeModulesDir, getExtensionsDir, getTestEnv, // Shared extension installer utilities installExtensionWithCache, // Deprecated - use enableExtensions option instead getExtensionLaunchArgs, // Snapshot hook utilities (for CDP-based plugins) parseArgs, waitForChromeSession, readCdpUrl, readTargetId, connectToPage, waitForPageLoaded, }; // CLI usage if (require.main === module) { const args = process.argv.slice(2); if (args.length === 0) { console.log('Usage: chrome_utils.js [args...]'); console.log(''); console.log('Commands:'); console.log(' findChromium Find Chrome/Chromium binary'); console.log(' installChromium Install Chromium via @puppeteer/browsers'); console.log(' installPuppeteerCore Install puppeteer-core npm package'); console.log(' launchChromium Launch Chrome with CDP debugging'); console.log(' killChrome Kill Chrome process by PID'); console.log(' killZombieChrome Clean up zombie Chrome processes'); console.log(''); console.log(' getMachineType Get machine type (e.g., x86_64-linux)'); console.log(' getLibDir Get LIB_DIR path'); console.log(' getNodeModulesDir Get NODE_MODULES_DIR path'); console.log(' getExtensionsDir Get Chrome extensions directory'); console.log(' getTestEnv Get all paths as JSON (for tests)'); console.log(''); console.log(' getExtensionId Get extension ID from unpacked path'); console.log(' loadExtensionManifest Load extension manifest.json'); console.log(' loadOrInstallExtension Load or install an extension'); console.log(' installExtensionWithCache Install extension with caching'); console.log(''); console.log('Environment variables:'); console.log(' DATA_DIR Base data directory'); console.log(' LIB_DIR Library directory (computed if not set)'); console.log(' MACHINE_TYPE Machine type override'); console.log(' NODE_MODULES_DIR Node modules directory'); console.log(' CHROME_BINARY Chrome binary path'); console.log(' CHROME_EXTENSIONS_DIR Extensions directory'); process.exit(1); } const [command, 
...commandArgs] = args; (async () => { try { switch (command) { case 'findChromium': { const binary = findChromium(); if (binary) { console.log(binary); } else { console.error('Chromium binary not found'); process.exit(1); } break; } case 'installChromium': { const result = await installChromium(); if (result.success) { console.log(JSON.stringify({ binary: result.binary, version: result.version, })); } else { console.error(result.error); process.exit(1); } break; } case 'installPuppeteerCore': { const [npmPrefix] = commandArgs; const result = await installPuppeteerCore({ npmPrefix: npmPrefix || undefined }); if (result.success) { console.log(JSON.stringify({ path: result.path })); } else { console.error(result.error); process.exit(1); } break; } case 'launchChromium': { const [outputDir, extensionPathsJson] = commandArgs; const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : []; const result = await launchChromium({ outputDir: outputDir || 'chrome', extensionPaths, }); if (result.success) { console.log(JSON.stringify({ cdpUrl: result.cdpUrl, pid: result.pid, port: result.port, })); } else { console.error(result.error); process.exit(1); } break; } case 'killChrome': { const [pidStr, outputDir] = commandArgs; const pid = parseInt(pidStr, 10); if (isNaN(pid)) { console.error('Invalid PID'); process.exit(1); } await killChrome(pid, outputDir); break; } case 'killZombieChrome': { const [dataDir] = commandArgs; const killed = killZombieChrome(dataDir); console.log(killed); break; } case 'getExtensionId': { const [unpacked_path] = commandArgs; const id = getExtensionId(unpacked_path); console.log(id); break; } case 'loadExtensionManifest': { const [unpacked_path] = commandArgs; const manifest = loadExtensionManifest(unpacked_path); console.log(JSON.stringify(manifest)); break; } case 'getExtensionLaunchArgs': { const [extensions_json] = commandArgs; const extensions = JSON.parse(extensions_json); const launchArgs = 
getExtensionLaunchArgs(extensions); console.log(JSON.stringify(launchArgs)); break; } case 'loadOrInstallExtension': { const [webstore_id, name, extensions_dir] = commandArgs; const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir); console.log(JSON.stringify(ext, null, 2)); break; } case 'getMachineType': { console.log(getMachineType()); break; } case 'getLibDir': { console.log(getLibDir()); break; } case 'getNodeModulesDir': { console.log(getNodeModulesDir()); break; } case 'getExtensionsDir': { console.log(getExtensionsDir()); break; } case 'getTestEnv': { console.log(JSON.stringify(getTestEnv(), null, 2)); break; } case 'installExtensionWithCache': { const [webstore_id, name] = commandArgs; if (!webstore_id || !name) { console.error('Usage: installExtensionWithCache '); process.exit(1); } const ext = await installExtensionWithCache({ webstore_id, name }); if (ext) { console.log(JSON.stringify(ext, null, 2)); } else { process.exit(1); } break; } default: console.error(`Unknown command: ${command}`); process.exit(1); } } catch (error) { console.error(`Error: ${error.message}`); process.exit(1); } })(); }