diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 3cc8e83e..6485f2c0 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -328,6 +328,21 @@ def run_hook( env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive')) env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', '')) + # Use Machine.config.PATH if set (includes pip/npm bin dirs from providers) + try: + from archivebox.machine.models import Machine + machine = Machine.current() + if machine and machine.config: + machine_path = machine.config.get('config/PATH') + if machine_path: + env['PATH'] = machine_path + # Also set NODE_MODULES_DIR if configured + node_modules_dir = machine.config.get('config/NODE_MODULES_DIR') + if node_modules_dir: + env['NODE_MODULES_DIR'] = node_modules_dir + except Exception: + pass # Fall back to system PATH if Machine not available + # Export all config values to environment (already merged by get_config()) for key, value in config.items(): if value is None: diff --git a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js index fdae84e8..e25136e0 100755 --- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js +++ b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js @@ -17,6 +17,8 @@ const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); // Extractor metadata diff --git a/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js b/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js index c12d9708..45fb8956 100755 --- a/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js +++ b/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js @@ -20,7 +20,7 @@ const path = require('path'); const fs = require('fs'); // 
Import extension utilities -const extensionUtils = require('../chrome/chrome_extension_utils.js'); +const extensionUtils = require('../chrome/chrome_utils.js'); // Extension metadata const EXTENSION = { diff --git a/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js b/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js index 9ad5d6f3..cf528a1b 100755 --- a/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js +++ b/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js @@ -15,6 +15,8 @@ const path = require('path'); const fs = require('fs'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); // Get crawl's chrome directory from environment variable set by hooks.py diff --git a/archivebox/plugins/chrome/chrome_extension_utils.js b/archivebox/plugins/chrome/chrome_extension_utils.js deleted file mode 100755 index cb06e603..00000000 --- a/archivebox/plugins/chrome/chrome_extension_utils.js +++ /dev/null @@ -1,483 +0,0 @@ -#!/usr/bin/env node -/** - * Chrome Extension Management Utilities - * - * Handles downloading, installing, and managing Chrome extensions for browser automation. 
- * Ported from the TypeScript implementation in archivebox.ts - */ - -const fs = require('fs'); -const path = require('path'); -const crypto = require('crypto'); -const { exec } = require('child_process'); -const { promisify } = require('util'); -const { Readable } = require('stream'); -const { finished } = require('stream/promises'); - -const execAsync = promisify(exec); - -// Try to import unzipper, fallback to system unzip if not available -let unzip = null; -try { - const unzipper = require('unzipper'); - unzip = async (sourcePath, destPath) => { - const stream = fs.createReadStream(sourcePath).pipe(unzipper.Extract({ path: destPath })); - return stream.promise(); - }; -} catch (err) { - // Will use system unzip command as fallback -} - -/** - * Compute the extension ID from the unpacked path. - * Chrome uses a SHA256 hash of the unpacked extension directory path to compute a dynamic id. - * - * @param {string} unpacked_path - Path to the unpacked extension directory - * @returns {string} - 32-character extension ID - */ -function getExtensionId(unpacked_path) { - // Chrome uses a SHA256 hash of the unpacked extension directory path - const hash = crypto.createHash('sha256'); - hash.update(Buffer.from(unpacked_path, 'utf-8')); - - // Convert first 32 hex chars to characters in the range 'a'-'p' - const detected_extension_id = Array.from(hash.digest('hex')) - .slice(0, 32) - .map(i => String.fromCharCode(parseInt(i, 16) + 'a'.charCodeAt(0))) - .join(''); - - return detected_extension_id; -} - -/** - * Download and install a Chrome extension from the Chrome Web Store. 
- * - * @param {Object} extension - Extension metadata object - * @param {string} extension.webstore_id - Chrome Web Store extension ID - * @param {string} extension.name - Human-readable extension name - * @param {string} extension.crx_url - URL to download the CRX file - * @param {string} extension.crx_path - Local path to save the CRX file - * @param {string} extension.unpacked_path - Path to extract the extension - * @returns {Promise} - True if installation succeeded - */ -async function installExtension(extension) { - const manifest_path = path.join(extension.unpacked_path, 'manifest.json'); - - // Download CRX file if not already downloaded - if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) { - console.log(`[🛠️] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`); - - try { - // Ensure parent directory exists - const crxDir = path.dirname(extension.crx_path); - if (!fs.existsSync(crxDir)) { - fs.mkdirSync(crxDir, { recursive: true }); - } - - // Download CRX file from Chrome Web Store - const response = await fetch(extension.crx_url); - - if (!response.ok) { - console.warn(`[⚠️] Failed to download extension ${extension.name}: HTTP ${response.status}`); - return false; - } - - if (response.body) { - const crx_file = fs.createWriteStream(extension.crx_path); - const crx_stream = Readable.fromWeb(response.body); - await finished(crx_stream.pipe(crx_file)); - } else { - console.warn(`[⚠️] Failed to download extension ${extension.name}: No response body`); - return false; - } - } catch (err) { - console.error(`[❌] Failed to download extension ${extension.name}:`, err); - return false; - } - } - - // Unzip CRX file to unpacked_path - await fs.promises.mkdir(extension.unpacked_path, { recursive: true }); - - try { - // Try system unzip command first - await execAsync(`/usr/bin/unzip -o ${extension.crx_path} -d ${extension.unpacked_path}`); - } catch (err1) { - if (unzip) { - // Fallback to 
unzipper library - try { - await unzip(extension.crx_path, extension.unpacked_path); - } catch (err2) { - console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message); - return false; - } - } else { - console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message); - return false; - } - } - - if (!fs.existsSync(manifest_path)) { - console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`); - return false; - } - - return true; -} - -/** - * Load or install a Chrome extension, computing all metadata. - * - * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path) - * @param {string} [ext.webstore_id] - Chrome Web Store extension ID - * @param {string} [ext.name] - Human-readable extension name - * @param {string} [ext.unpacked_path] - Path to unpacked extension - * @param {string} [extensions_dir] - Directory to store extensions - * @returns {Promise} - Complete extension metadata object - */ -async function loadOrInstallExtension(ext, extensions_dir = null) { - if (!(ext.webstore_id || ext.unpacked_path)) { - throw new Error('Extension must have either {webstore_id} or {unpacked_path}'); - } - - // Determine extensions directory - const EXTENSIONS_DIR = extensions_dir || process.env.CHROME_EXTENSIONS_DIR || './data/chrome_extensions'; - - // Set statically computable extension metadata - ext.webstore_id = ext.webstore_id || ext.id; - ext.name = ext.name || ext.webstore_id; - ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`; - ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`; - ext.crx_path = ext.crx_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`); - ext.unpacked_path = ext.unpacked_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`); - - const 
manifest_path = path.join(ext.unpacked_path, 'manifest.json'); - ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8')); - ext.read_version = () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null; - - // If extension is not installed, download and unpack it - if (!ext.read_version()) { - await installExtension(ext); - } - - // Autodetect ID from filesystem path (unpacked extensions don't have stable IDs) - ext.id = getExtensionId(ext.unpacked_path); - ext.version = ext.read_version(); - - if (!ext.version) { - console.warn(`[❌] Unable to detect ID and version of installed extension ${ext.unpacked_path}`); - } else { - console.log(`[➕] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`); - } - - return ext; -} - -/** - * Check if a Puppeteer target is an extension background page/service worker. - * - * @param {Object} target - Puppeteer target object - * @returns {Promise} - Object with target_is_bg, extension_id, manifest_version, etc. 
- */ -async function isTargetExtension(target) { - let target_type; - let target_ctx; - let target_url; - - try { - target_type = target.type(); - target_ctx = (await target.worker()) || (await target.page()) || null; - target_url = target.url() || target_ctx?.url() || null; - } catch (err) { - if (String(err).includes('No target with given id found')) { - // Target closed during check, ignore harmless race condition - target_type = 'closed'; - target_ctx = null; - target_url = 'about:closed'; - } else { - throw err; - } - } - - // Check if this is an extension background page or service worker - const is_chrome_extension = target_url?.startsWith('chrome-extension://'); - const is_background_page = target_type === 'background_page'; - const is_service_worker = target_type === 'service_worker'; - const target_is_bg = is_chrome_extension && (is_background_page || is_service_worker); - - let extension_id = null; - let manifest_version = null; - const target_is_extension = is_chrome_extension || target_is_bg; - - if (target_is_extension) { - try { - extension_id = target_url?.split('://')[1]?.split('/')[0] || null; - - if (target_ctx) { - const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest()); - manifest_version = manifest?.manifest_version || null; - } - } catch (err) { - // Failed to get extension metadata - } - } - - return { - target_is_extension, - target_is_bg, - target_type, - target_ctx, - target_url, - extension_id, - manifest_version, - }; -} - -/** - * Load extension metadata and connection handlers from a browser target. 
- * - * @param {Array} extensions - Array of extension metadata objects to update - * @param {Object} target - Puppeteer target object - * @returns {Promise} - Updated extension object or null if not an extension - */ -async function loadExtensionFromTarget(extensions, target) { - const { - target_is_bg, - target_is_extension, - target_type, - target_ctx, - target_url, - extension_id, - manifest_version, - } = await isTargetExtension(target); - - if (!(target_is_bg && extension_id && target_ctx)) { - return null; - } - - // Find matching extension in our list - const extension = extensions.find(ext => ext.id === extension_id); - if (!extension) { - console.warn(`[⚠️] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`); - return null; - } - - // Load manifest from the extension context - let manifest = null; - try { - manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest()); - } catch (err) { - console.error(`[❌] Failed to read manifest for extension ${extension_id}:`, err); - return null; - } - - // Create dispatch methods for communicating with the extension - const new_extension = { - ...extension, - target, - target_type, - target_url, - manifest, - manifest_version, - - // Trigger extension toolbar button click - dispatchAction: async (tab) => { - return await target_ctx.evaluate((tabId) => { - return new Promise((resolve) => { - chrome.action.onClicked.addListener((tab) => { - resolve({ success: true, tab }); - }); - chrome.action.openPopup(); - }); - }, tab?.id || null); - }, - - // Send message to extension - dispatchMessage: async (message, options = {}) => { - return await target_ctx.evaluate((msg, opts) => { - return new Promise((resolve) => { - chrome.runtime.sendMessage(msg, opts, (response) => { - resolve(response); - }); - }); - }, message, options); - }, - - // Trigger extension command (keyboard shortcut) - dispatchCommand: async (command) => { - return await target_ctx.evaluate((cmd) => { - return new 
Promise((resolve) => { - chrome.commands.onCommand.addListener((receivedCommand) => { - if (receivedCommand === cmd) { - resolve({ success: true, command: receivedCommand }); - } - }); - // Note: Actually triggering commands programmatically is not directly supported - // This would need to be done via CDP or keyboard simulation - }); - }, command); - }, - }; - - // Update the extension in the array - Object.assign(extension, new_extension); - - console.log(`[🔌] Connected to extension ${extension.name} (${extension.version})`); - - return new_extension; -} - -/** - * Install all extensions in the list if not already installed. - * - * @param {Array} extensions - Array of extension metadata objects - * @param {string} [extensions_dir] - Directory to store extensions - * @returns {Promise} - Array of installed extension objects - */ -async function installAllExtensions(extensions, extensions_dir = null) { - console.log(`[⚙️] Installing ${extensions.length} chrome extensions...`); - - for (const extension of extensions) { - await loadOrInstallExtension(extension, extensions_dir); - } - - return extensions; -} - -/** - * Load and connect to all extensions from a running browser. 
- * - * @param {Object} browser - Puppeteer browser instance - * @param {Array} extensions - Array of extension metadata objects - * @returns {Promise} - Array of loaded extension objects with connection handlers - */ -async function loadAllExtensionsFromBrowser(browser, extensions) { - console.log(`[⚙️] Loading ${extensions.length} chrome extensions from browser...`); - - // Find loaded extensions at runtime by examining browser targets - for (const target of browser.targets()) { - await loadExtensionFromTarget(extensions, target); - } - - return extensions; -} - -/** - * Load extension manifest.json file - * - * @param {string} unpacked_path - Path to unpacked extension directory - * @returns {object|null} - Parsed manifest object or null if not found/invalid - */ -function loadExtensionManifest(unpacked_path) { - const manifest_path = path.join(unpacked_path, 'manifest.json'); - - if (!fs.existsSync(manifest_path)) { - return null; - } - - try { - const manifest_content = fs.readFileSync(manifest_path, 'utf-8'); - return JSON.parse(manifest_content); - } catch (error) { - // Invalid JSON or read error - return null; - } -} - -/** - * Generate Chrome launch arguments for loading extensions. 
- * - * @param {Array} extensions - Array of extension metadata objects - * @returns {Array} - Chrome CLI arguments for loading extensions - */ -function getExtensionLaunchArgs(extensions) { - if (!extensions || extensions.length === 0) { - return []; - } - - // Filter out extensions without unpacked_path first - const validExtensions = extensions.filter(ext => ext.unpacked_path); - - const unpacked_paths = validExtensions.map(ext => ext.unpacked_path); - const webstore_ids = validExtensions.map(ext => ext.webstore_id || ext.id); - - return [ - `--load-extension=${unpacked_paths.join(',')}`, - `--allowlisted-extension-id=${webstore_ids.join(',')}`, - '--allow-legacy-extension-manifests', - '--disable-extensions-auto-update', - ]; -} - -// Export all functions -module.exports = { - getExtensionId, - loadExtensionManifest, - installExtension, - loadOrInstallExtension, - isTargetExtension, - loadExtensionFromTarget, - installAllExtensions, - loadAllExtensionsFromBrowser, - getExtensionLaunchArgs, -}; - -// CLI usage -if (require.main === module) { - const args = process.argv.slice(2); - - if (args.length === 0) { - console.log('Usage: chrome_extension_utils.js [args...]'); - console.log(''); - console.log('Commands:'); - console.log(' getExtensionId '); - console.log(' loadExtensionManifest '); - console.log(' getExtensionLaunchArgs '); - console.log(' loadOrInstallExtension [extensions_dir]'); - process.exit(1); - } - - const [command, ...commandArgs] = args; - - (async () => { - try { - switch (command) { - case 'getExtensionId': { - const [unpacked_path] = commandArgs; - const id = getExtensionId(unpacked_path); - console.log(id); - break; - } - - case 'loadExtensionManifest': { - const [unpacked_path] = commandArgs; - const manifest = loadExtensionManifest(unpacked_path); - console.log(JSON.stringify(manifest)); - break; - } - - case 'getExtensionLaunchArgs': { - const [extensions_json] = commandArgs; - const extensions = JSON.parse(extensions_json); - const args 
= getExtensionLaunchArgs(extensions); - console.log(JSON.stringify(args)); - break; - } - - case 'loadOrInstallExtension': { - const [webstore_id, name, extensions_dir] = commandArgs; - const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir); - console.log(JSON.stringify(ext, null, 2)); - break; - } - - default: - console.error(`Unknown command: ${command}`); - process.exit(1); - } - } catch (error) { - console.error(`Error: ${error.message}`); - process.exit(1); - } - })(); -} diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js new file mode 100755 index 00000000..333cf418 --- /dev/null +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -0,0 +1,1150 @@ +#!/usr/bin/env node +/** + * Chrome Extension Management Utilities + * + * Handles downloading, installing, and managing Chrome extensions for browser automation. + * Ported from the TypeScript implementation in archivebox.ts + */ + +const fs = require('fs'); +const path = require('path'); +const crypto = require('crypto'); +const http = require('http'); +const net = require('net'); +const { exec, spawn } = require('child_process'); +const { promisify } = require('util'); +const { Readable } = require('stream'); +const { finished } = require('stream/promises'); + +const execAsync = promisify(exec); + +// ============================================================================ +// Environment helpers +// ============================================================================ + +/** + * Get environment variable with default value. + * @param {string} name - Environment variable name + * @param {string} [defaultValue=''] - Default value if not set + * @returns {string} - Trimmed environment variable value + */ +function getEnv(name, defaultValue = '') { + return (process.env[name] || defaultValue).trim(); +} + +/** + * Get boolean environment variable. 
+ * @param {string} name - Environment variable name + * @param {boolean} [defaultValue=false] - Default value if not set + * @returns {boolean} - Boolean value + */ +function getEnvBool(name, defaultValue = false) { + const val = getEnv(name, '').toLowerCase(); + if (['true', '1', 'yes', 'on'].includes(val)) return true; + if (['false', '0', 'no', 'off'].includes(val)) return false; + return defaultValue; +} + +/** + * Parse resolution string into width/height. + * @param {string} resolution - Resolution string like "1440,2000" + * @returns {{width: number, height: number}} - Parsed dimensions + */ +function parseResolution(resolution) { + const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); + return { width: width || 1440, height: height || 2000 }; +} + +// ============================================================================ +// PID file management +// ============================================================================ + +/** + * Write PID file with specific mtime for process validation. + * @param {string} filePath - Path to PID file + * @param {number} pid - Process ID + * @param {number} startTimeSeconds - Process start time in seconds + */ +function writePidWithMtime(filePath, pid, startTimeSeconds) { + fs.writeFileSync(filePath, String(pid)); + const startTimeMs = startTimeSeconds * 1000; + fs.utimesSync(filePath, new Date(startTimeMs), new Date(startTimeMs)); +} + +/** + * Write a shell script that can re-run the Chrome command. + * @param {string} filePath - Path to script file + * @param {string} binary - Chrome binary path + * @param {string[]} args - Chrome arguments + */ +function writeCmdScript(filePath, binary, args) { + const escape = (arg) => + arg.includes(' ') || arg.includes('"') || arg.includes('$') + ? 
`"${arg.replace(/"/g, '\\"')}"` + : arg; + fs.writeFileSync( + filePath, + `#!/bin/bash\n${binary} ${args.map(escape).join(' ')}\n` + ); + fs.chmodSync(filePath, 0o755); +} + +// ============================================================================ +// Port management +// ============================================================================ + +/** + * Find a free port on localhost. + * @returns {Promise} - Available port number + */ +function findFreePort() { + return new Promise((resolve, reject) => { + const server = net.createServer(); + server.unref(); + server.on('error', reject); + server.listen(0, () => { + const port = server.address().port; + server.close(() => resolve(port)); + }); + }); +} + +/** + * Wait for Chrome's DevTools port to be ready. + * @param {number} port - Debug port number + * @param {number} [timeout=30000] - Timeout in milliseconds + * @returns {Promise} - Chrome version info + */ +function waitForDebugPort(port, timeout = 30000) { + const startTime = Date.now(); + + return new Promise((resolve, reject) => { + const tryConnect = () => { + if (Date.now() - startTime > timeout) { + reject(new Error(`Timeout waiting for Chrome debug port ${port}`)); + return; + } + + const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => { + let data = ''; + res.on('data', (chunk) => (data += chunk)); + res.on('end', () => { + try { + const info = JSON.parse(data); + resolve(info); + } catch (e) { + setTimeout(tryConnect, 100); + } + }); + }); + + req.on('error', () => { + setTimeout(tryConnect, 100); + }); + + req.setTimeout(1000, () => { + req.destroy(); + setTimeout(tryConnect, 100); + }); + }; + + tryConnect(); + }); +} + +// ============================================================================ +// Zombie process cleanup +// ============================================================================ + +/** + * Kill zombie Chrome processes from stale crawls. 
+ * Scans DATA_DIR/crawls//chrome/.pid for stale processes. + * @param {string} [dataDir] - Data directory (defaults to DATA_DIR env or '.') + * @returns {number} - Number of zombies killed + */ +function killZombieChrome(dataDir = null) { + dataDir = dataDir || getEnv('DATA_DIR', '.'); + const crawlsDir = path.join(dataDir, 'crawls'); + const now = Date.now(); + const fiveMinutesAgo = now - 300000; + let killed = 0; + + console.error('[*] Checking for zombie Chrome processes...'); + + if (!fs.existsSync(crawlsDir)) { + console.error('[+] No crawls directory found'); + return 0; + } + + try { + const crawls = fs.readdirSync(crawlsDir, { withFileTypes: true }); + + for (const crawl of crawls) { + if (!crawl.isDirectory()) continue; + + const crawlDir = path.join(crawlsDir, crawl.name); + const chromeDir = path.join(crawlDir, 'chrome'); + + if (!fs.existsSync(chromeDir)) continue; + + // Check if crawl was modified recently (still active) + try { + const crawlStats = fs.statSync(crawlDir); + if (crawlStats.mtimeMs > fiveMinutesAgo) { + continue; + } + } catch (e) { + continue; + } + + // Crawl is stale, check for PIDs + try { + const pidFiles = fs.readdirSync(chromeDir).filter(f => f.endsWith('.pid')); + + for (const pidFileName of pidFiles) { + const pidFile = path.join(chromeDir, pidFileName); + + try { + const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10); + if (isNaN(pid) || pid <= 0) continue; + + // Check if process exists + try { + process.kill(pid, 0); + } catch (e) { + // Process dead, remove stale PID file + try { fs.unlinkSync(pidFile); } catch (e) {} + continue; + } + + // Process alive and crawl is stale - zombie! + console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`); + + try { + try { process.kill(-pid, 'SIGKILL'); } catch (e) { process.kill(pid, 'SIGKILL'); } + killed++; + console.error(`[+] Killed zombie (PID ${pid})`); + try { fs.unlinkSync(pidFile); } catch (e) {} + } catch (e) { + console.error(`[!] 
Failed to kill PID ${pid}: ${e.message}`); + } + } catch (e) { + // Skip invalid PID files + } + } + } catch (e) { + // Skip if can't read chrome dir + } + } + } catch (e) { + console.error(`[!] Error scanning crawls: ${e.message}`); + } + + if (killed > 0) { + console.error(`[+] Killed ${killed} zombie process(es)`); + } else { + console.error('[+] No zombies found'); + } + + return killed; +} + +// ============================================================================ +// Chrome launching +// ============================================================================ + +/** + * Launch Chromium with extensions and return connection info. + * + * @param {Object} options - Launch options + * @param {string} [options.binary] - Chrome binary path (auto-detected if not provided) + * @param {string} [options.outputDir='chrome'] - Directory for output files + * @param {string} [options.resolution='1440,2000'] - Window resolution + * @param {boolean} [options.headless=true] - Run in headless mode + * @param {boolean} [options.checkSsl=true] - Check SSL certificates + * @param {string[]} [options.extensionPaths=[]] - Paths to unpacked extensions + * @param {boolean} [options.killZombies=true] - Kill zombie processes first + * @returns {Promise} - {success, cdpUrl, pid, port, process, error} + */ +async function launchChromium(options = {}) { + const { + binary = findChromium(), + outputDir = 'chrome', + resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'), + headless = getEnvBool('CHROME_HEADLESS', true), + checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)), + extensionPaths = [], + killZombies = true, + } = options; + + if (!binary) { + return { success: false, error: 'Chrome binary not found' }; + } + + // Kill zombies first + if (killZombies) { + killZombieChrome(); + } + + const { width, height } = parseResolution(resolution); + + // Create output directory + if (!fs.existsSync(outputDir)) { + 
fs.mkdirSync(outputDir, { recursive: true }); + } + + // Find a free port + const debugPort = await findFreePort(); + console.error(`[*] Using debug port: ${debugPort}`); + + // Build Chrome arguments + const chromiumArgs = [ + `--remote-debugging-port=${debugPort}`, + '--remote-debugging-address=127.0.0.1', + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--disable-sync', + '--no-first-run', + '--no-default-browser-check', + '--disable-default-apps', + '--disable-infobars', + '--disable-blink-features=AutomationControlled', + '--disable-component-update', + '--disable-domain-reliability', + '--disable-breakpad', + '--disable-background-networking', + '--disable-background-timer-throttling', + '--disable-backgrounding-occluded-windows', + '--disable-renderer-backgrounding', + '--disable-ipc-flooding-protection', + '--password-store=basic', + '--use-mock-keychain', + '--font-render-hinting=none', + '--force-color-profile=srgb', + `--window-size=${width},${height}`, + ...(headless ? ['--headless=new'] : []), + ...(checkSsl ? 
[] : ['--ignore-certificate-errors']), + ]; + + // Add extension loading flags + if (extensionPaths.length > 0) { + const extPathsArg = extensionPaths.join(','); + chromiumArgs.push(`--load-extension=${extPathsArg}`); + chromiumArgs.push('--enable-unsafe-extension-debugging'); + chromiumArgs.push('--disable-features=DisableLoadExtensionCommandLineSwitch,ExtensionManifestV2Unsupported,ExtensionManifestV2Disabled'); + console.error(`[*] Loading ${extensionPaths.length} extension(s) via --load-extension`); + } + + chromiumArgs.push('about:blank'); + + // Write command script for debugging + writeCmdScript(path.join(outputDir, 'cmd.sh'), binary, chromiumArgs); + + try { + console.error(`[*] Spawning Chromium (headless=${headless})...`); + const chromiumProcess = spawn(binary, chromiumArgs, { + stdio: ['ignore', 'pipe', 'pipe'], + detached: true, + }); + + const chromePid = chromiumProcess.pid; + const chromeStartTime = Date.now() / 1000; + + if (chromePid) { + console.error(`[*] Chromium spawned (PID: ${chromePid})`); + writePidWithMtime(path.join(outputDir, 'chrome.pid'), chromePid, chromeStartTime); + } + + // Pipe Chrome output to stderr + chromiumProcess.stdout.on('data', (data) => { + process.stderr.write(`[chromium:stdout] ${data}`); + }); + chromiumProcess.stderr.on('data', (data) => { + process.stderr.write(`[chromium:stderr] ${data}`); + }); + + // Wait for debug port + console.error(`[*] Waiting for debug port ${debugPort}...`); + const versionInfo = await waitForDebugPort(debugPort, 30000); + const wsUrl = versionInfo.webSocketDebuggerUrl; + console.error(`[+] Chromium ready: ${wsUrl}`); + + fs.writeFileSync(path.join(outputDir, 'cdp_url.txt'), wsUrl); + fs.writeFileSync(path.join(outputDir, 'port.txt'), String(debugPort)); + + return { + success: true, + cdpUrl: wsUrl, + pid: chromePid, + port: debugPort, + process: chromiumProcess, + }; + } catch (e) { + return { success: false, error: `${e.name}: ${e.message}` }; + } +} + +/** + * Kill a Chrome process by 
PID. + * @param {number} pid - Process ID to kill + * @param {string} [outputDir] - Directory containing PID files to clean up + */ +async function killChrome(pid, outputDir = null) { + if (!pid) return; + + console.error(`[*] Killing Chrome process tree (PID ${pid})...`); + + // Try to kill process group first + try { + process.kill(-pid, 'SIGTERM'); + } catch (e) { + try { process.kill(pid, 'SIGTERM'); } catch (e2) {} + } + + // Wait for graceful shutdown + await new Promise(resolve => setTimeout(resolve, 2000)); + + // Force kill + try { + process.kill(-pid, 'SIGKILL'); + } catch (e) { + try { process.kill(pid, 'SIGKILL'); } catch (e2) {} + } + + // Clean up PID files + if (outputDir) { + try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {} + try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {} + } + + console.error('[*] Chrome process killed'); +} + +// Try to import unzipper, fallback to system unzip if not available +let unzip = null; +try { + const unzipper = require('unzipper'); + unzip = async (sourcePath, destPath) => { + const stream = fs.createReadStream(sourcePath).pipe(unzipper.Extract({ path: destPath })); + return stream.promise(); + }; +} catch (err) { + // Will use system unzip command as fallback +} + +/** + * Compute the extension ID from the unpacked path. + * Chrome uses a SHA256 hash of the unpacked extension directory path to compute a dynamic id. 
+ * + * @param {string} unpacked_path - Path to the unpacked extension directory + * @returns {string} - 32-character extension ID + */ +function getExtensionId(unpacked_path) { + // Chrome uses a SHA256 hash of the unpacked extension directory path + const hash = crypto.createHash('sha256'); + hash.update(Buffer.from(unpacked_path, 'utf-8')); + + // Convert first 32 hex chars to characters in the range 'a'-'p' + const detected_extension_id = Array.from(hash.digest('hex')) + .slice(0, 32) + .map(i => String.fromCharCode(parseInt(i, 16) + 'a'.charCodeAt(0))) + .join(''); + + return detected_extension_id; +} + +/** + * Download and install a Chrome extension from the Chrome Web Store. + * + * @param {Object} extension - Extension metadata object + * @param {string} extension.webstore_id - Chrome Web Store extension ID + * @param {string} extension.name - Human-readable extension name + * @param {string} extension.crx_url - URL to download the CRX file + * @param {string} extension.crx_path - Local path to save the CRX file + * @param {string} extension.unpacked_path - Path to extract the extension + * @returns {Promise} - True if installation succeeded + */ +async function installExtension(extension) { + const manifest_path = path.join(extension.unpacked_path, 'manifest.json'); + + // Download CRX file if not already downloaded + if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) { + console.log(`[🛠️] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`); + + try { + // Ensure parent directory exists + const crxDir = path.dirname(extension.crx_path); + if (!fs.existsSync(crxDir)) { + fs.mkdirSync(crxDir, { recursive: true }); + } + + // Download CRX file from Chrome Web Store + const response = await fetch(extension.crx_url); + + if (!response.ok) { + console.warn(`[⚠️] Failed to download extension ${extension.name}: HTTP ${response.status}`); + return false; + } + + if (response.body) { + const 
crx_file = fs.createWriteStream(extension.crx_path); + const crx_stream = Readable.fromWeb(response.body); + await finished(crx_stream.pipe(crx_file)); + } else { + console.warn(`[⚠️] Failed to download extension ${extension.name}: No response body`); + return false; + } + } catch (err) { + console.error(`[❌] Failed to download extension ${extension.name}:`, err); + return false; + } + } + + // Unzip CRX file to unpacked_path (CRX files have extra header bytes but unzip handles it) + await fs.promises.mkdir(extension.unpacked_path, { recursive: true }); + + try { + // Use -q to suppress warnings about extra bytes in CRX header + await execAsync(`/usr/bin/unzip -q -o "${extension.crx_path}" -d "${extension.unpacked_path}"`); + } catch (err1) { + // unzip may return non-zero even on success due to CRX header warning, check if manifest exists + if (!fs.existsSync(manifest_path)) { + if (unzip) { + // Fallback to unzipper library + try { + await unzip(extension.crx_path, extension.unpacked_path); + } catch (err2) { + console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err2.message); + return false; + } + } else { + console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message); + return false; + } + } + } + + if (!fs.existsSync(manifest_path)) { + console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`); + return false; + } + + return true; +} + +/** + * Load or install a Chrome extension, computing all metadata. 
/**
 * Load or install a Chrome extension, computing all metadata.
 *
 * Mutates `ext` in place: fills in webstore/crx URLs and paths derived from
 * the webstore_id, installs the extension if no manifest is found, then
 * computes the runtime ID from the unpacked path.
 *
 * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path)
 * @param {string} [ext.webstore_id] - Chrome Web Store extension ID
 * @param {string} [ext.name] - Human-readable extension name
 * @param {string} [ext.unpacked_path] - Path to unpacked extension
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise} - Complete extension metadata object (same object as `ext`)
 * @throws {Error} - If neither webstore_id nor unpacked_path is provided
 */
async function loadOrInstallExtension(ext, extensions_dir = null) {
  if (!(ext.webstore_id || ext.unpacked_path)) {
    throw new Error('Extension must have either {webstore_id} or {unpacked_path}');
  }

  // Storage root: explicit arg > env var > default relative data dir.
  const EXTENSIONS_DIR = extensions_dir || process.env.CHROME_EXTENSIONS_DIR || './data/chrome_extensions';

  // Fill in statically computable extension metadata, keeping caller-supplied values.
  ext.webstore_id = ext.webstore_id || ext.id;
  ext.name = ext.name || ext.webstore_id;
  ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`;
  // Google's CRX update endpoint; prodversion must be high enough to be served crx3.
  ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`;
  ext.crx_path = ext.crx_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`);
  ext.unpacked_path = ext.unpacked_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`);

  const manifest_path = path.join(ext.unpacked_path, 'manifest.json');
  // Lazy accessors so a missing/invalid manifest is re-checked after install.
  ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'));
  // Evaluates as (exists && version) || null — yields null when missing.
  ext.read_version = () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null;

  // If the extension is not installed (no readable version), download and unpack it.
  if (!ext.read_version()) {
    await installExtension(ext);
  }

  // Autodetect the ID from the filesystem path — unpacked extensions don't
  // have stable webstore IDs at runtime; Chrome hashes the path instead.
  ext.id = getExtensionId(ext.unpacked_path);
  ext.version = ext.read_version();

  if (!ext.version) {
    console.warn(`[❌] Unable to detect ID and version of installed extension ${ext.unpacked_path}`);
  } else {
    console.log(`[➕] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`);
  }

  return ext;
}

/**
 * Check if a Puppeteer target is an extension background page/service worker.
 *
 * @param {Object} target - Puppeteer target object
 * @returns {Promise} - Object with target_is_extension, target_is_bg,
 *   target_type, target_ctx, target_url, extension_id, manifest_version
 */
async function isTargetExtension(target) {
  let target_type;
  let target_ctx;
  let target_url;

  try {
    target_type = target.type();
    // Execution context: worker for MV3 service workers, page for MV2 pages.
    target_ctx = (await target.worker()) || (await target.page()) || null;
    target_url = target.url() || target_ctx?.url() || null;
  } catch (err) {
    if (String(err).includes('No target with given id found')) {
      // Target closed during the check — harmless race condition, treat as closed.
      target_type = 'closed';
      target_ctx = null;
      target_url = 'about:closed';
    } else {
      throw err;
    }
  }

  // An extension "background" target is a chrome-extension:// URL hosted in
  // either a background page (MV2) or a service worker (MV3).
  const is_chrome_extension = target_url?.startsWith('chrome-extension://');
  const is_background_page = target_type === 'background_page';
  const is_service_worker = target_type === 'service_worker';
  const target_is_bg = is_chrome_extension && (is_background_page || is_service_worker);

  let extension_id = null;
  let manifest_version = null;
  const target_is_extension = is_chrome_extension || target_is_bg;

  if (target_is_extension) {
    try {
      // Extension ID is the host part of the chrome-extension:// URL.
      extension_id = target_url?.split('://')[1]?.split('/')[0] || null;

      if (target_ctx) {
        // Query the live extension context for its manifest version.
        const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
        manifest_version = manifest?.manifest_version || null;
      }
    } catch (err) {
      // Failed to get extension metadata — leave fields null.
    }
  }

  return {
    target_is_extension,
    target_is_bg,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  };
}
/**
 * Load extension metadata and connection handlers from a browser target.
 *
 * Matches the target against the known extensions list and, on success,
 * attaches dispatch helpers bound to the extension's execution context.
 * The matched entry in `extensions` is updated in place.
 *
 * @param {Array} extensions - Array of extension metadata objects to update
 * @param {Object} target - Puppeteer target object
 * @returns {Promise} - Updated extension object or null if not an extension
 */
async function loadExtensionFromTarget(extensions, target) {
  const {
    target_is_bg,
    target_is_extension,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  } = await isTargetExtension(target);

  // Only background contexts with a resolvable ID and live context are usable.
  if (!(target_is_bg && extension_id && target_ctx)) {
    return null;
  }

  // Find the matching extension in our list (IDs are path-hash derived).
  const extension = extensions.find(ext => ext.id === extension_id);
  if (!extension) {
    console.warn(`[⚠️] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`);
    return null;
  }

  // Load the manifest from inside the extension's own context.
  let manifest = null;
  try {
    manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
  } catch (err) {
    console.error(`[❌] Failed to read manifest for extension ${extension_id}:`, err);
    return null;
  }

  // Create dispatch methods for communicating with the extension.
  const new_extension = {
    ...extension,
    target,
    target_type,
    target_url,
    manifest,
    manifest_version,

    // Trigger the extension toolbar button: resolves when the extension's
    // action.onClicked fires after openPopup().
    dispatchAction: async (tab) => {
      return await target_ctx.evaluate((tabId) => {
        return new Promise((resolve) => {
          chrome.action.onClicked.addListener((tab) => {
            resolve({ success: true, tab });
          });
          chrome.action.openPopup();
        });
      }, tab?.id || null);
    },

    // Send a chrome.runtime message to the extension and resolve with its response.
    dispatchMessage: async (message, options = {}) => {
      return await target_ctx.evaluate((msg, opts) => {
        return new Promise((resolve) => {
          chrome.runtime.sendMessage(msg, opts, (response) => {
            resolve(response);
          });
        });
      }, message, options);
    },

    // Listen for an extension command (keyboard shortcut) by name.
    dispatchCommand: async (command) => {
      return await target_ctx.evaluate((cmd) => {
        return new Promise((resolve) => {
          chrome.commands.onCommand.addListener((receivedCommand) => {
            if (receivedCommand === cmd) {
              resolve({ success: true, command: receivedCommand });
            }
          });
          // Note: Actually triggering commands programmatically is not directly supported.
          // This would need to be done via CDP or keyboard simulation.
        });
      }, command);
    },
  };

  // Update the extension in the caller's array in place.
  Object.assign(extension, new_extension);

  console.log(`[🔌] Connected to extension ${extension.name} (${extension.version})`);

  return new_extension;
}

/**
 * Install all extensions in the list if not already installed.
 *
 * Runs sequentially (one install at a time) via loadOrInstallExtension.
 *
 * @param {Array} extensions - Array of extension metadata objects (mutated in place)
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise} - The same array of installed extension objects
 */
async function installAllExtensions(extensions, extensions_dir = null) {
  console.log(`[⚙️] Installing ${extensions.length} chrome extensions...`);

  for (const extension of extensions) {
    await loadOrInstallExtension(extension, extensions_dir);
  }

  return extensions;
}
+ * + * @param {Object} browser - Puppeteer browser instance + * @param {Array} extensions - Array of extension metadata objects + * @returns {Promise} - Array of loaded extension objects with connection handlers + */ +async function loadAllExtensionsFromBrowser(browser, extensions) { + console.log(`[⚙️] Loading ${extensions.length} chrome extensions from browser...`); + + // Find loaded extensions at runtime by examining browser targets + for (const target of browser.targets()) { + await loadExtensionFromTarget(extensions, target); + } + + return extensions; +} + +/** + * Load extension manifest.json file + * + * @param {string} unpacked_path - Path to unpacked extension directory + * @returns {object|null} - Parsed manifest object or null if not found/invalid + */ +function loadExtensionManifest(unpacked_path) { + const manifest_path = path.join(unpacked_path, 'manifest.json'); + + if (!fs.existsSync(manifest_path)) { + return null; + } + + try { + const manifest_content = fs.readFileSync(manifest_path, 'utf-8'); + return JSON.parse(manifest_content); + } catch (error) { + // Invalid JSON or read error + return null; + } +} + +/** + * @deprecated Use puppeteer's enableExtensions option instead. + * + * Generate Chrome launch arguments for loading extensions. + * NOTE: This is deprecated. Use puppeteer.launch({ pipe: true, enableExtensions: [paths] }) instead. + * + * @param {Array} extensions - Array of extension metadata objects + * @returns {Array} - Chrome CLI arguments for loading extensions + */ +function getExtensionLaunchArgs(extensions) { + console.warn('[DEPRECATED] getExtensionLaunchArgs is deprecated. 
Use puppeteer enableExtensions option instead.'); + if (!extensions || extensions.length === 0) { + return []; + } + + // Filter out extensions without unpacked_path first + const validExtensions = extensions.filter(ext => ext.unpacked_path); + + const unpacked_paths = validExtensions.map(ext => ext.unpacked_path); + // Use computed id (from path hash) for allowlisting, as that's what Chrome uses for unpacked extensions + // Fall back to webstore_id if computed id not available + const extension_ids = validExtensions.map(ext => ext.id || getExtensionId(ext.unpacked_path)); + + return [ + `--load-extension=${unpacked_paths.join(',')}`, + `--allowlisted-extension-id=${extension_ids.join(',')}`, + '--allow-legacy-extension-manifests', + '--disable-extensions-auto-update', + ]; +} + +/** + * Get extension paths for use with puppeteer's enableExtensions option. + * Following puppeteer best practices: https://pptr.dev/guides/chrome-extensions + * + * @param {Array} extensions - Array of extension metadata objects + * @returns {Array} - Array of extension unpacked paths + */ +function getExtensionPaths(extensions) { + if (!extensions || extensions.length === 0) { + return []; + } + return extensions + .filter(ext => ext.unpacked_path) + .map(ext => ext.unpacked_path); +} + +/** + * Wait for an extension target to be available in the browser. + * Following puppeteer best practices for accessing extension contexts. 
/**
 * Wait for an extension target to be available in the browser.
 * Following puppeteer best practices for accessing extension contexts.
 *
 * For Manifest V3 extensions the returned context is a WebWorker (service
 * worker); for Manifest V2 extensions it is a Page (background page); as a
 * last resort any page under the extension's origin is returned.
 *
 * @param {Object} browser - Puppeteer browser instance
 * @param {string} extensionId - Extension ID to wait for (computed from path hash)
 * @param {number} [timeout=30000] - Timeout in milliseconds
 * @returns {Promise} - Worker or Page context for the extension
 */
async function waitForExtensionTarget(browser, extensionId, timeout = 30000) {
  const origin = `chrome-extension://${extensionId}`;
  const matchesType = (wantedType) => (target) =>
    target.type() === wantedType && target.url().includes(origin);

  // Manifest V3: background logic runs in a service worker.
  try {
    const swTarget = await browser.waitForTarget(matchesType('service_worker'), { timeout });
    const worker = await swTarget.worker();
    if (worker) return worker;
  } catch (err) {
    // No service worker found — try a background page next.
  }

  // Manifest V2: persistent background page.
  try {
    const bgTarget = await browser.waitForTarget(matchesType('background_page'), { timeout });
    const page = await bgTarget.page();
    if (page) return page;
  } catch (err) {
    // No background page found — fall through to any extension page.
  }

  // Fallback: any target under the extension's origin.
  const anyTarget = await browser.waitForTarget(
    (target) => target.url().startsWith(origin),
    { timeout }
  );

  // Return worker or page depending on what kind of target we found.
  if (anyTarget.type() === 'service_worker') {
    return await anyTarget.worker();
  }
  return await anyTarget.page();
}
/**
 * Get all loaded extension targets from a browser.
 *
 * @param {Object} browser - Puppeteer browser instance
 * @returns {Array} - Array of {type, url, extensionId} info objects
 */
function getExtensionTargets(browser) {
  return browser.targets()
    .filter(target =>
      target.url().startsWith('chrome-extension://') ||
      target.type() === 'service_worker' ||
      target.type() === 'background_page'
    )
    .map(target => ({
      type: target.type(),
      url: target.url(),
      // Extension ID is the host portion of the chrome-extension:// URL.
      extensionId: target.url().includes('chrome-extension://')
        ? target.url().split('chrome-extension://')[1]?.split('/')[0]
        : null,
    }));
}

/**
 * Find Chromium/Chrome binary path.
 * Prefers Chromium over Chrome because Chrome 137+ removed --load-extension support.
 *
 * @param {string} [dataDir] - Data directory to check for puppeteer installs
 * @returns {string|null} - Absolute path to browser binary or null if not found
 */
function findChromium(dataDir = null) {
  // Check CHROME_BINARY env var first.
  const chromeBinary = (process.env.CHROME_BINARY || '').trim();
  if (chromeBinary && fs.existsSync(chromeBinary)) {
    // Ensure absolute path.
    return path.resolve(chromeBinary);
  }

  // Helper to find Chromium in a @puppeteer/browsers directory structure.
  // Always returns absolute paths; scans versions newest-first (lexical sort).
  const findInPuppeteerDir = (baseDir) => {
    const absBaseDir = path.resolve(baseDir);
    if (!fs.existsSync(absBaseDir)) return null;
    try {
      const versions = fs.readdirSync(absBaseDir);
      for (const version of versions.sort().reverse()) {
        const versionDir = path.join(absBaseDir, version);
        // macOS ARM layout
        const macArmBinary = path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium');
        if (fs.existsSync(macArmBinary)) return macArmBinary;
        // macOS x64 layout
        const macX64Binary = path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium');
        if (fs.existsSync(macX64Binary)) return macX64Binary;
        // Linux layout
        const linuxBinary = path.join(versionDir, 'chrome-linux/chrome');
        if (fs.existsSync(linuxBinary)) return linuxBinary;
      }
    } catch (e) {
      // Unreadable directory — continue with other locations.
    }
    return null;
  };

  // Check @puppeteer/browsers install locations.
  const puppeteerDirs = [
    // Local project install (from npx @puppeteer/browsers install)
    path.join(dataDir || process.env.DATA_DIR || '.', 'chromium'),
    path.join(process.cwd(), 'chromium'),
    // User cache locations
    path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
  ];

  for (const puppeteerDir of puppeteerDirs) {
    const binary = findInPuppeteerDir(puppeteerDir);
    if (binary) return binary;
  }

  // Check standard system locations.
  const candidates = [
    // Linux Chromium
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS Chromium (Homebrew or manual install)
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    // Fallback to Chrome (extension loading may not work in Chrome 137+)
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
  ];

  for (const candidate of candidates) {
    if (fs.existsSync(candidate)) {
      // Warn if falling back to Chrome.
      if (candidate.includes('google-chrome') || candidate.includes('Google Chrome')) {
        console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
      }
      return candidate;
    }
  }

  return null;
}

// Export all functions
module.exports = {
  // Environment helpers
  getEnv,
  getEnvBool,
  parseResolution,
  // PID file management
  writePidWithMtime,
  writeCmdScript,
  // Port management
  findFreePort,
  waitForDebugPort,
  // Zombie cleanup
  killZombieChrome,
  // Chrome launching
  launchChromium,
  killChrome,
  // Chrome/Chromium binary finding
  findChromium,
  // Extension utilities
  getExtensionId,
  loadExtensionManifest,
  installExtension,
  loadOrInstallExtension,
  isTargetExtension,
  loadExtensionFromTarget,
  installAllExtensions,
  loadAllExtensionsFromBrowser,
  // New puppeteer best-practices helpers
  getExtensionPaths,
  waitForExtensionTarget,
  getExtensionTargets,
  // Deprecated - use enableExtensions option instead
  getExtensionLaunchArgs,
};

// CLI usage: dispatch a single command with positional args.
// NOTE(review): the usage placeholder strings below appear garbled in the
// source (likely lost '<...>' tokens during extraction) — confirm against VCS.
if (require.main === module) {
  const args = process.argv.slice(2);

  if (args.length === 0) {
    console.log('Usage: chrome_utils.js [args...]');
    console.log('');
    console.log('Commands:');
    console.log('  findChromium [data_dir]');
    console.log('  launchChromium [output_dir] [extension_paths_json]');
    console.log('  killChrome [output_dir]');
    console.log('  killZombieChrome [data_dir]');
    console.log('  getExtensionId ');
    console.log('  loadExtensionManifest ');
    console.log('  getExtensionLaunchArgs ');
    console.log('  loadOrInstallExtension [extensions_dir]');
    process.exit(1);
  }

  const [command, ...commandArgs] = args;

  (async () => {
    try {
      switch (command) {
        case 'findChromium': {
          const [dataDir] = commandArgs;
          const binary = findChromium(dataDir);
          if (binary) {
            console.log(binary);
          } else {
            console.error('Chromium binary not found');
            process.exit(1);
          }
          break;
        }

        case 'launchChromium': {
          const [outputDir, extensionPathsJson] = commandArgs;
          const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : [];
          const result = await launchChromium({
            outputDir: outputDir || 'chrome',
            extensionPaths,
          });
          if (result.success) {
            // Emit machine-readable connection info for the caller.
            console.log(JSON.stringify({
              cdpUrl: result.cdpUrl,
              pid: result.pid,
              port: result.port,
            }));
          } else {
            console.error(result.error);
            process.exit(1);
          }
          break;
        }

        case 'killChrome': {
          const [pidStr, outputDir] = commandArgs;
          const pid = parseInt(pidStr, 10);
          if (isNaN(pid)) {
            console.error('Invalid PID');
            process.exit(1);
          }
          await killChrome(pid, outputDir);
          break;
        }

        case 'killZombieChrome': {
          const [dataDir] = commandArgs;
          const killed = killZombieChrome(dataDir);
          console.log(killed);
          break;
        }

        case 'getExtensionId': {
          const [unpacked_path] = commandArgs;
          const id = getExtensionId(unpacked_path);
          console.log(id);
          break;
        }

        case 'loadExtensionManifest': {
          const [unpacked_path] = commandArgs;
          const manifest = loadExtensionManifest(unpacked_path);
          console.log(JSON.stringify(manifest));
          break;
        }

        case 'getExtensionLaunchArgs': {
          const [extensions_json] = commandArgs;
          const extensions = JSON.parse(extensions_json);
          const launchArgs = getExtensionLaunchArgs(extensions);
          console.log(JSON.stringify(launchArgs));
          break;
        }

        case 'loadOrInstallExtension': {
          const [webstore_id, name, extensions_dir] = commandArgs;
          const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir);
          console.log(JSON.stringify(ext, null, 2));
          break;
        }

        default:
          console.error(`Unknown command: ${command}`);
          process.exit(1);
      }
    } catch (error) {
      console.error(`Error: ${error.message}`);
      process.exit(1);
    }
  })();
}
"""
Install hook for Chrome/Chromium binary.

Runs at crawl start to verify Chromium is available.
Outputs JSONL for Binary and Machine config updates.

Respects CHROME_BINARY env var for custom binary paths.
Falls back to `npx @puppeteer/browsers install chromium@latest` if not found.

NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""

import os
import sys
import json
import subprocess


def install_chromium_via_puppeteer() -> bool:
    """Install Chromium using @puppeteer/browsers.

    Runs `npx @puppeteer/browsers install chromium@latest` with a 5-minute
    timeout. Best-effort: any failure (missing npx, timeout, non-zero exit)
    is reported on stderr and results in False.

    Returns:
        True if the installer exited successfully, False otherwise.
    """
    try:
        print("Chromium not found, attempting to install via @puppeteer/browsers...", file=sys.stderr)
        result = subprocess.run(
            ['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
            capture_output=True,
            text=True,
            timeout=300,
        )
        return result.returncode == 0
    except Exception as e:
        # Exception already subsumes TimeoutExpired and FileNotFoundError;
        # a broad catch is intentional — install is best-effort.
        print(f"Failed to install Chromium: {e}", file=sys.stderr)
        return False
find_chrome() -> dict | None: try: from abx_pkg import Binary, NpmProvider, EnvProvider, BrewProvider, AptProvider - # Try to find chrome using abx-pkg + # Try to find chromium using abx-pkg + # Prefer chromium over chrome because Chrome 137+ removed --load-extension support binary = Binary( - name='chrome', + name='chromium', binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()], overrides={'npm': {'packages': ['@puppeteer/browsers']}} ) @@ -51,7 +56,7 @@ def find_chrome() -> dict | None: loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'chrome', + 'name': 'chromium', 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -59,12 +64,12 @@ def find_chrome() -> dict | None: } # If not found, try to install via @puppeteer/browsers - if install_chrome_via_puppeteer(): + if install_chromium_via_puppeteer(): # Try loading again after install loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'chrome', + 'name': 'chromium', 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -77,7 +82,7 @@ def find_chrome() -> dict | None: def main(): - result = find_chrome() + result = find_chromium() if result and result.get('abspath'): print(json.dumps({ @@ -99,13 +104,13 @@ def main(): print(json.dumps({ 'type': 'Machine', '_method': 'update', - 'key': 'config/CHROME_VERSION', + 'key': 'config/CHROMIUM_VERSION', 'value': result['version'], })) sys.exit(0) else: - print(f"Chrome/Chromium binary not found", file=sys.stderr) + print(f"Chromium binary not found", file=sys.stderr) sys.exit(1) diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js index 781d8c5f..c2d62775 100644 --- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js 
#!/usr/bin/env node
/**
 * Launch a shared Chromium browser session for the entire crawl.
 *
 * This runs once per crawl and keeps Chromium alive for all snapshots to share.
 * Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
 *
 * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
 * --load-extension and --disable-extensions-except flags.
 *
 * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id= --source-url=
 * Output: Creates chrome/ directory under crawl output dir with:
 *   - cdp_url.txt: WebSocket URL for CDP connection
 *   - chrome.pid: Chromium process ID (for cleanup)
 *   - port.txt: Debug port number
 *   - extensions.json: Loaded extensions metadata
 *
 * Environment variables:
 *   NODE_MODULES_DIR: Path to node_modules directory for module resolution
 *   CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
 *   CHROME_RESOLUTION: Page resolution (default: 1440,2000)
 *   CHROME_HEADLESS: Run in headless mode (default: true)
 *   CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
 *   CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
 */

// Add NODE_MODULES_DIR to module resolution paths if set, so puppeteer-core
// installed by a provider (pip/npm bin dirs) can be resolved.
if (process.env.NODE_MODULES_DIR) {
  module.paths.unshift(process.env.NODE_MODULES_DIR);
}

const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const {
  findChromium,
  launchChromium,
  killChrome,
  getEnv,
  writePidWithMtime,
} = require('./chrome_utils.js');

// Extractor metadata
const PLUGIN_NAME = 'chrome_launch';
const OUTPUT_DIR = 'chrome';

// Global state for cleanup — populated once Chromium is launched.
let chromePid = null;
let browserInstance = null;

/**
 * Parse `--key=value` command line arguments into an object.
 * Keys have dashes converted to underscores; bare flags become `true`.
 * @returns {Object} - Parsed arguments
 */
function parseArgs() {
  const args = {};
  process.argv.slice(2).forEach((arg) => {
    if (arg.startsWith('--')) {
      const [key, ...valueParts] = arg.slice(2).split('=');
      args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
    }
  });
  return args;
}

/**
 * Cleanup handler for SIGTERM/SIGINT.
 * Tries a graceful puppeteer browser.close() first, then force-kills the
 * Chromium process tree via killChrome(), and always exits 0.
 */
async function cleanup() {
  console.error('[*] Cleaning up Chrome session...');

  // Try graceful browser close first.
  if (browserInstance) {
    try {
      console.error('[*] Closing browser gracefully...');
      await browserInstance.close();
      browserInstance = null;
      console.error('[+] Browser closed gracefully');
    } catch (e) {
      console.error(`[!] Graceful close failed: ${e.message}`);
    }
  }

  // Kill the Chrome process tree (also removes PID files in OUTPUT_DIR).
  if (chromePid) {
    await killChrome(chromePid, OUTPUT_DIR);
  }

  process.exit(0);
}

process.on('SIGTERM', cleanup);
process.on('SIGINT', cleanup);
server.on('error', reject); - server.listen(0, () => { - const port = server.address().port; - server.close(() => resolve(port)); - }); - }); -} - -// Wait for Chrome's DevTools port to be ready -function waitForDebugPort(port, timeout = 30000) { - const startTime = Date.now(); - - return new Promise((resolve, reject) => { - const tryConnect = () => { - if (Date.now() - startTime > timeout) { - reject(new Error(`Timeout waiting for Chrome debug port ${port}`)); - return; - } - - const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => { - let data = ''; - res.on('data', chunk => data += chunk); - res.on('end', () => { - try { - const info = JSON.parse(data); - resolve(info); - } catch (e) { - setTimeout(tryConnect, 100); - } - }); - }); - - req.on('error', () => { - setTimeout(tryConnect, 100); - }); - - req.setTimeout(1000, () => { - req.destroy(); - setTimeout(tryConnect, 100); - }); - }; - - tryConnect(); - }); -} - -// Kill zombie Chrome processes from stale crawls -function killZombieChrome() { - const dataDir = getEnv('DATA_DIR', '.'); - const crawlsDir = path.join(dataDir, 'crawls'); - const now = Date.now(); - const fiveMinutesAgo = now - 300000; - let killed = 0; - - console.error('[*] Checking for zombie Chrome processes...'); - - if (!fs.existsSync(crawlsDir)) { - console.error('[+] No crawls directory found'); - return; - } - - try { - // Only scan data/crawls/*/chrome/*.pid - no recursion into archive dirs - const crawls = fs.readdirSync(crawlsDir, { withFileTypes: true }); - - for (const crawl of crawls) { - if (!crawl.isDirectory()) continue; - - const crawlDir = path.join(crawlsDir, crawl.name); - const chromeDir = path.join(crawlDir, 'chrome'); - - if (!fs.existsSync(chromeDir)) continue; - - // Check if crawl was modified recently (still active) - try { - const crawlStats = fs.statSync(crawlDir); - if (crawlStats.mtimeMs > fiveMinutesAgo) { - continue; // Crawl modified recently, likely still active - } - } catch (e) { - continue; - 
} - - // Crawl is stale (> 5 minutes since modification), check for PIDs - try { - const pidFiles = fs.readdirSync(chromeDir).filter(f => f.endsWith('.pid')); - - for (const pidFileName of pidFiles) { - const pidFile = path.join(chromeDir, pidFileName); - - try { - const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10); - if (isNaN(pid) || pid <= 0) continue; - - // Check if process exists (simple check, Python will validate properly) - try { - process.kill(pid, 0); - } catch (e) { - // Process dead, remove stale PID file - try { fs.unlinkSync(pidFile); } catch (e) {} - continue; - } - - // Process alive and crawl is stale - zombie! - console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`); - - try { - // Kill process group - try { - process.kill(-pid, 'SIGKILL'); - } catch (e) { - process.kill(pid, 'SIGKILL'); - } - - killed++; - console.error(`[+] Killed zombie (PID ${pid})`); - try { fs.unlinkSync(pidFile); } catch (e) {} - } catch (e) { - console.error(`[!] Failed to kill PID ${pid}: ${e.message}`); - } - } catch (e) { - // Skip invalid PID files - } - } - } catch (e) { - // Skip if can't read chrome dir - } - } - } catch (e) { - console.error(`[!] 
Error scanning crawls: ${e.message}`); - } - - if (killed > 0) { - console.error(`[+] Killed ${killed} zombie process(es)`); - } else { - console.error('[+] No zombies found'); - } -} - -async function launchChrome(binary) { - // First, kill any zombie Chrome from crashed crawls - killZombieChrome(); - - const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); - const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)); - const headless = getEnvBool('CHROME_HEADLESS', true); - - const { width, height } = parseResolution(resolution); - - // Create output directory - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } - - // Find a free port for Chrome DevTools - const debugPort = await findFreePort(); - console.error(`[*] Using debug port: ${debugPort}`); - - // Load any installed extensions - const extensionUtils = require('./chrome_extension_utils.js'); - const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') || - path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions'); - - const installedExtensions = []; - if (fs.existsSync(extensionsDir)) { - const files = fs.readdirSync(extensionsDir); - for (const file of files) { - if (file.endsWith('.extension.json')) { - try { - const extPath = path.join(extensionsDir, file); - const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8')); - if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) { - installedExtensions.push(extData); - console.error(`[*] Loading extension: ${extData.name || file}`); - } - } catch (e) { - // Skip invalid cache files - console.warn(`[!] 
Skipping invalid extension cache: ${file}`); - } - } - } - } - - // Get extension launch arguments - const extensionArgs = extensionUtils.getExtensionLaunchArgs(installedExtensions); - if (extensionArgs.length > 0) { - console.error(`[+] Loaded ${installedExtensions.length} extension(s)`); - // Write extensions metadata for config hooks to use - fs.writeFileSync( - path.join(OUTPUT_DIR, 'extensions.json'), - JSON.stringify(installedExtensions, null, 2) - ); - } - - // Build Chrome arguments - const chromeArgs = [ - `--remote-debugging-port=${debugPort}`, - '--remote-debugging-address=127.0.0.1', - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - '--disable-sync', - '--no-first-run', - '--no-default-browser-check', - '--disable-default-apps', - '--disable-infobars', - '--disable-blink-features=AutomationControlled', - '--disable-component-update', - '--disable-domain-reliability', - '--disable-breakpad', - '--disable-background-networking', - '--disable-background-timer-throttling', - '--disable-backgrounding-occluded-windows', - '--disable-renderer-backgrounding', - '--disable-ipc-flooding-protection', - '--password-store=basic', - '--use-mock-keychain', - '--font-render-hinting=none', - '--force-color-profile=srgb', - `--window-size=${width},${height}`, - ...extensionArgs, // Load extensions - ...(headless ? ['--headless=new'] : []), - ...(checkSsl ? 
[] : ['--ignore-certificate-errors']), - 'about:blank', // Start with blank page - ]; - - // Launch Chrome as a detached process group leader - // This allows us to kill Chrome and all its child processes as a group - const chromeProcess = spawn(binary, chromeArgs, { - detached: true, - stdio: ['ignore', 'ignore', 'ignore'], - }); - chromeProcess.unref(); // Don't keep Node.js process running - - chromePid = chromeProcess.pid; - const chromeStartTime = Date.now() / 1000; // Unix epoch seconds - console.error(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`); - - // Write Chrome PID with mtime set to start time for validation - writePidWithMtime(path.join(OUTPUT_DIR, 'chrome.pid'), chromePid, chromeStartTime); - - // Write command script for validation - writeCmdScript(path.join(OUTPUT_DIR, 'cmd.sh'), binary, chromeArgs); - - fs.writeFileSync(path.join(OUTPUT_DIR, 'port.txt'), String(debugPort)); - - // Write hook's own PID with mtime for validation - const hookStartTime = Date.now() / 1000; - writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime); - - try { - // Wait for Chrome to be ready - const versionInfo = await waitForDebugPort(debugPort, 30000); - console.error(`[+] Chrome ready: ${versionInfo.Browser}`); - - // Build WebSocket URL - const wsUrl = versionInfo.webSocketDebuggerUrl; - fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl); - - return { success: true, cdpUrl: wsUrl, pid: chromePid, port: debugPort }; - - } catch (e) { - // Kill Chrome if setup failed - try { - process.kill(chromePid, 'SIGTERM'); - } catch (killErr) { - // Ignore - } - return { success: false, error: `${e.name}: ${e.message}` }; - } -} - async function main() { const args = parseArgs(); const crawlId = args.crawl_id; - const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; - let version = ''; - try { - const binary = findChrome(); + const binary = findChromium(); if (!binary) { - 
console.error('ERROR: Chrome/Chromium binary not found'); - console.error('DEPENDENCY_NEEDED=chrome'); + console.error('ERROR: Chromium binary not found'); + console.error('DEPENDENCY_NEEDED=chromium'); console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew'); - console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable'); + console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest'); process.exit(1); } - // Get Chrome version + // Get Chromium version + let version = ''; try { const { execSync } = require('child_process'); - version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64); - } catch (e) { - version = ''; + version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }) + .trim() + .slice(0, 64); + } catch (e) {} + + console.error(`[*] Using browser: ${binary}`); + if (version) console.error(`[*] Version: ${version}`); + + // Load installed extensions + const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') || + path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions'); + + const installedExtensions = []; + const extensionPaths = []; + if (fs.existsSync(extensionsDir)) { + const files = fs.readdirSync(extensionsDir); + for (const file of files) { + if (file.endsWith('.extension.json')) { + try { + const extPath = path.join(extensionsDir, file); + const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8')); + if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) { + installedExtensions.push(extData); + extensionPaths.push(extData.unpacked_path); + console.error(`[*] Loading extension: ${extData.name || file}`); + } + } catch (e) { + console.warn(`[!] 
Skipping invalid extension cache: ${file}`); + } + } + } } - const result = await launchChrome(binary); - - if (result.success) { - status = 'succeeded'; - output = OUTPUT_DIR; - console.error(`[+] Chrome session started for crawl ${crawlId}`); - console.error(`[+] CDP URL: ${result.cdpUrl}`); - console.error(`[+] PID: ${result.pid}`); - } else { - status = 'failed'; - error = result.error; + if (installedExtensions.length > 0) { + console.error(`[+] Found ${installedExtensions.length} extension(s) to load`); } + + // Write hook's own PID + const hookStartTime = Date.now() / 1000; + if (!fs.existsSync(OUTPUT_DIR)) { + fs.mkdirSync(OUTPUT_DIR, { recursive: true }); + } + writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime); + + // Launch Chromium using consolidated function + const result = await launchChromium({ + binary, + outputDir: OUTPUT_DIR, + extensionPaths, + }); + + if (!result.success) { + console.error(`ERROR: ${result.error}`); + process.exit(1); + } + + chromePid = result.pid; + const cdpUrl = result.cdpUrl; + + // Write extensions metadata + if (installedExtensions.length > 0) { + fs.writeFileSync( + path.join(OUTPUT_DIR, 'extensions.json'), + JSON.stringify(installedExtensions, null, 2) + ); + } + + // Connect puppeteer for extension verification + console.error(`[*] Connecting puppeteer to CDP...`); + const browser = await puppeteer.connect({ + browserWSEndpoint: cdpUrl, + defaultViewport: null, + }); + browserInstance = browser; + + // Verify extensions loaded + if (extensionPaths.length > 0) { + await new Promise(r => setTimeout(r, 3000)); + + const targets = browser.targets(); + console.error(`[*] All browser targets (${targets.length}):`); + for (const t of targets) { + console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`); + } + + const extTargets = targets.filter(t => + t.url().startsWith('chrome-extension://') || + t.type() === 'service_worker' || + t.type() === 'background_page' + ); + + // Filter out built-in 
extensions + const builtinIds = [ + 'nkeimhogjdpnpccoofpliimaahmaaome', + 'fignfifoniblkonapihmkfakmlgkbkcf', + 'ahfgeienlihckogmohjhadlkjgocpleb', + 'mhjfbmdgcfjbbpaeojofohoefgiehjai', + ]; + const customExtTargets = extTargets.filter(t => { + const url = t.url(); + if (!url.startsWith('chrome-extension://')) return false; + const extId = url.split('://')[1].split('/')[0]; + return !builtinIds.includes(extId); + }); + + console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`); + + for (const target of customExtTargets) { + const url = target.url(); + const extId = url.split('://')[1].split('/')[0]; + console.error(`[+] Extension loaded: ${extId} (${target.type()})`); + } + + if (customExtTargets.length === 0 && extensionPaths.length > 0) { + console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`); + console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`); + } + } + + console.error(`[+] Chromium session started for crawl ${crawlId}`); + console.error(`[+] CDP URL: ${cdpUrl}`); + console.error(`[+] PID: ${chromePid}`); + + // Stay alive to handle cleanup on SIGTERM + console.log('[*] Chromium launch hook staying alive to handle cleanup...'); + setInterval(() => {}, 1000000); + } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - const duration = (endTs - startTs) / 1000; - - if (error) { - console.error(`ERROR: ${error}`); + console.error(`ERROR: ${e.name}: ${e.message}`); process.exit(1); } - - // Background hook - stay running to handle cleanup on SIGTERM - console.log('[*] Chrome launch hook staying alive to handle cleanup...'); - - // Keep process alive by setting an interval (won't actually do anything) - // This allows us to receive SIGTERM when crawl ends - setInterval(() => {}, 1000000); } -main().catch(e => { +main().catch((e) => { console.error(`Fatal error: ${e.message}`); 
process.exit(1); }); diff --git a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js index b2c222c7..f8b740f7 100755 --- a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js +++ b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js @@ -26,7 +26,11 @@ const fs = require('fs'); const path = require('path'); const { spawn } = require('child_process'); const http = require('http'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); + const puppeteer = require('puppeteer-core'); +const { findChromium } = require('./chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'chrome_tab'; @@ -87,31 +91,6 @@ async function cleanup() { process.on('SIGTERM', cleanup); process.on('SIGINT', cleanup); -// Find Chrome binary (for fallback) -function findChrome() { - const chromeBinary = getEnv('CHROME_BINARY'); - if (chromeBinary && fs.existsSync(chromeBinary)) { - return chromeBinary; - } - - const candidates = [ - '/usr/bin/google-chrome', - '/usr/bin/google-chrome-stable', - '/usr/bin/chromium', - '/usr/bin/chromium-browser', - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - ]; - - for (const candidate of candidates) { - if (fs.existsSync(candidate)) { - return candidate; - } - } - - return null; -} - // Parse resolution string function parseResolution(resolution) { const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); @@ -367,7 +346,7 @@ async function main() { let version = ''; try { - const binary = findChrome(); + const binary = findChromium(); if (!binary) { console.error('ERROR: Chrome/Chromium binary not found'); console.error('DEPENDENCY_NEEDED=chrome'); diff --git a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js 
index 400d5bec..5e2c95d6 100644 --- a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js +++ b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js @@ -17,6 +17,8 @@ const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'chrome_navigate'; diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index 6c26735a..380c16ae 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -2,14 +2,18 @@ Integration tests for chrome plugin Tests verify: -1. Chrome install hook checks for Chrome/Chromium binary +1. Chromium install via @puppeteer/browsers 2. Verify deps with abx-pkg 3. Chrome hooks exist -4. Chrome launches at crawl level +4. Chromium launches at crawl level 5. Tab creation at snapshot level 6. Tab navigation works 7. Tab cleanup on SIGTERM -8. Chrome cleanup on crawl end +8. Chromium cleanup on crawl end + +NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for +--load-extension and --disable-extensions-except flags, which are needed for +loading unpacked extensions in headless mode. 
""" import json @@ -40,49 +44,104 @@ def get_lib_dir_and_machine_type(): return Path(lib_dir), machine_type -# Setup NODE_PATH to find npm packages +# Setup NODE_MODULES_DIR to find npm packages LIB_DIR, MACHINE_TYPE = get_lib_dir_and_machine_type() # Note: LIB_DIR already includes machine_type (e.g., data/lib/arm64-darwin) NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' NPM_PREFIX = LIB_DIR / 'npm' +# Chromium install location (relative to DATA_DIR) +CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium' + def get_test_env(): - """Get environment with NODE_PATH set correctly.""" + """Get environment with NODE_MODULES_DIR and CHROME_BINARY set correctly.""" env = os.environ.copy() - env['NODE_PATH'] = str(NODE_MODULES_DIR) + env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) env['LIB_DIR'] = str(LIB_DIR) env['MACHINE_TYPE'] = MACHINE_TYPE + # Ensure CHROME_BINARY is set to Chromium + if 'CHROME_BINARY' not in env: + chromium = find_chromium_binary() + if chromium: + env['CHROME_BINARY'] = chromium return env +def find_chromium_binary(): + """Find the Chromium binary installed by @puppeteer/browsers.""" + if not CHROMIUM_INSTALL_DIR.exists(): + return None + + # Look for versioned directories + for version_dir in sorted(CHROMIUM_INSTALL_DIR.iterdir(), reverse=True): + if not version_dir.is_dir(): + continue + # macOS ARM + mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' + if mac_arm.exists(): + return str(mac_arm) + # macOS x64 + mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' + if mac_x64.exists(): + return str(mac_x64) + # Linux + linux = version_dir / 'chrome-linux' / 'chrome' + if linux.exists(): + return str(linux) + + return None + + @pytest.fixture(scope="session", autouse=True) -def ensure_puppeteer_installed(): - """Ensure puppeteer is installed in LIB_DIR before running tests.""" - from abx_pkg import Binary, NpmProvider, 
BinProviderOverrides +def ensure_chromium_and_puppeteer_installed(): + """Ensure Chromium and puppeteer are installed before running tests.""" + from abx_pkg import Binary, NpmProvider # Rebuild pydantic models NpmProvider.model_rebuild() - # Check if puppeteer-core is already available + # Install puppeteer-core if not available puppeteer_core_path = NODE_MODULES_DIR / 'puppeteer-core' - if puppeteer_core_path.exists(): - return # Already installed + if not puppeteer_core_path.exists(): + print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...") + NPM_PREFIX.mkdir(parents=True, exist_ok=True) - print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...") - NPM_PREFIX.mkdir(parents=True, exist_ok=True) + provider = NpmProvider(npm_prefix=NPM_PREFIX) + try: + binary = Binary( + name='puppeteer', + binproviders=[provider], + overrides={'npm': {'packages': ['puppeteer@^23.5.0']}} + ) + binary.install() + print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}") + except Exception as e: + pytest.skip(f"Failed to install puppeteer: {e}") - # Install puppeteer using NpmProvider with custom prefix - provider = NpmProvider(npm_prefix=NPM_PREFIX) - try: - binary = Binary( - name='puppeteer', - binproviders=[provider], - overrides={'npm': {'packages': ['puppeteer@^23.5.0']}} + # Install Chromium via @puppeteer/browsers if not available + chromium_binary = find_chromium_binary() + if not chromium_binary: + print(f"\n[*] Installing Chromium to {CHROMIUM_INSTALL_DIR}...") + CHROMIUM_INSTALL_DIR.mkdir(parents=True, exist_ok=True) + + result = subprocess.run( + ['npx', '@puppeteer/browsers', 'install', 'chromium@latest'], + cwd=str(CHROMIUM_INSTALL_DIR.parent), + capture_output=True, + text=True, + timeout=300 ) - binary.install() - print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}") - except Exception as e: - pytest.skip(f"Failed to install puppeteer: {e}") + if result.returncode != 0: + pytest.skip(f"Failed to install Chromium: {result.stderr}") + + chromium_binary = 
find_chromium_binary() + if not chromium_binary: + pytest.skip("Chromium installed but binary not found") + + print(f"[*] Chromium installed: {chromium_binary}") + + # Set CHROME_BINARY env var for tests + os.environ['CHROME_BINARY'] = chromium_binary def test_hook_scripts_exist(): @@ -92,26 +151,22 @@ def test_hook_scripts_exist(): assert CHROME_NAVIGATE_HOOK.exists(), f"Hook not found: {CHROME_NAVIGATE_HOOK}" -def test_verify_deps_with_abx_pkg(): - """Verify chrome is available via abx-pkg.""" - from abx_pkg import Binary, NpmProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides +def test_verify_chromium_available(): + """Verify Chromium is available via CHROME_BINARY env var.""" + chromium_binary = os.environ.get('CHROME_BINARY') or find_chromium_binary() - NpmProvider.model_rebuild() - AptProvider.model_rebuild() - BrewProvider.model_rebuild() - EnvProvider.model_rebuild() + assert chromium_binary, "Chromium binary should be available (set by fixture or found)" + assert Path(chromium_binary).exists(), f"Chromium binary should exist at {chromium_binary}" - # Try to find chrome using same config as install hook - chrome_binary = Binary( - name='chrome', - binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()], - overrides={'npm': {'packages': ['@puppeteer/browsers']}} + # Verify it's actually Chromium by checking version + result = subprocess.run( + [chromium_binary, '--version'], + capture_output=True, + text=True, + timeout=10 ) - chrome_loaded = chrome_binary.load() - - # Chrome should be available (either found by install hook or at explicit path) - assert chrome_loaded and chrome_loaded.abspath, "Chrome should be available via abx-pkg after install hook runs" - assert Path(chrome_loaded.abspath).exists(), f"Chrome binary should exist at {chrome_loaded.abspath}" + assert result.returncode == 0, f"Failed to get Chromium version: {result.stderr}" + assert 'Chromium' in result.stdout or 'Chrome' in result.stdout, f"Unexpected 
version output: {result.stdout}" def test_chrome_launch_and_tab_creation(): @@ -121,7 +176,7 @@ def test_chrome_launch_and_tab_creation(): crawl_dir.mkdir() chrome_dir = crawl_dir / 'chrome' - # Get test environment with NODE_PATH set + # Get test environment with NODE_MODULES_DIR set env = get_test_env() env['CHROME_HEADLESS'] = 'true' diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js index da26da8c..b4e4fa63 100755 --- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js +++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js @@ -12,6 +12,8 @@ const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'consolelog'; diff --git a/archivebox/plugins/dom/on_Snapshot__53_dom.js b/archivebox/plugins/dom/on_Snapshot__53_dom.js index aaff0e5d..20e5fcea 100644 --- a/archivebox/plugins/dom/on_Snapshot__53_dom.js +++ b/archivebox/plugins/dom/on_Snapshot__53_dom.js @@ -40,7 +40,11 @@ if (!getEnvBool('DOM_ENABLED', true)) { // Now safe to require puppeteer const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); + const puppeteer = require('puppeteer-core'); +const { findChromium } = require('../chrome/chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'dom'; @@ -96,33 +100,6 @@ function getCdpUrl() { return null; } -// Find Chrome binary -function findChrome() { - const chromeBinary = getEnv('CHROME_BINARY'); - if (chromeBinary && fs.existsSync(chromeBinary)) { - return chromeBinary; - } - - const candidates = [ - // Linux - '/usr/bin/google-chrome', - '/usr/bin/google-chrome-stable', - 
'/usr/bin/chromium', - '/usr/bin/chromium-browser', - // macOS - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - ]; - - for (const candidate of candidates) { - if (candidate.startsWith('/') && fs.existsSync(candidate)) { - return candidate; - } - } - - return null; -} - // Parse resolution string function parseResolution(resolution) { const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); @@ -175,7 +152,7 @@ async function dumpDom(url) { // Fall back to launching new browser if (!browser) { - const executablePath = findChrome(); + const executablePath = findChromium(); if (!executablePath) { return { success: false, error: 'Chrome binary not found' }; } diff --git a/archivebox/plugins/dom/tests/test_dom.py b/archivebox/plugins/dom/tests/test_dom.py index 8980dbc6..494e131a 100644 --- a/archivebox/plugins/dom/tests/test_dom.py +++ b/archivebox/plugins/dom/tests/test_dom.py @@ -27,7 +27,7 @@ DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None) NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None) TEST_URL = 'https://example.com' -# Get LIB_DIR for NODE_PATH +# Get LIB_DIR for NODE_MODULES_DIR def get_lib_dir(): """Get LIB_DIR for tests.""" from archivebox.config.common import STORAGE_CONFIG @@ -37,9 +37,9 @@ LIB_DIR = get_lib_dir() NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' def get_test_env(): - """Get environment with NODE_PATH set correctly.""" + """Get environment with NODE_MODULES_DIR set correctly.""" env = os.environ.copy() - env['NODE_PATH'] = str(NODE_MODULES_DIR) + env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) env['LIB_DIR'] = str(LIB_DIR) return env diff --git a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js index 584dc727..506e8371 100755 --- a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js 
+++ b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js @@ -45,6 +45,8 @@ if (!getEnvBool('INFINISCROLL_ENABLED', true)) { const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'infiniscroll'; diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py index 7a178958..ba0dca66 100644 --- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py +++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py @@ -34,9 +34,9 @@ TEST_URL = 'https://www.singsing.movie/' def get_node_modules_dir(): """Get NODE_MODULES_DIR for tests, checking env first.""" - # Check if NODE_PATH is already set in environment - if os.environ.get('NODE_PATH'): - return Path(os.environ['NODE_PATH']) + # Check if NODE_MODULES_DIR is already set in environment + if os.environ.get('NODE_MODULES_DIR'): + return Path(os.environ['NODE_MODULES_DIR']) # Otherwise compute from LIB_DIR from archivebox.config.common import STORAGE_CONFIG lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)) @@ -47,9 +47,9 @@ NODE_MODULES_DIR = get_node_modules_dir() def get_test_env(): - """Get environment with NODE_PATH set correctly.""" + """Get environment with NODE_MODULES_DIR set correctly.""" env = os.environ.copy() - env['NODE_PATH'] = str(NODE_MODULES_DIR) + env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) return env diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js index 81ba3bc4..f2df6629 100755 --- a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js +++ 
b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js @@ -21,7 +21,7 @@ const path = require('path'); const fs = require('fs'); // Import extension utilities -const extensionUtils = require('../chrome/chrome_extension_utils.js'); +const extensionUtils = require('../chrome/chrome_utils.js'); // Extension metadata const EXTENSION = { diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py index 94564bf0..dfc34a90 100644 --- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py +++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py @@ -6,8 +6,10 @@ Tests invoke the plugin hook as an external process and verify outputs/side effe import json import os +import signal import subprocess import tempfile +import time from pathlib import Path import pytest @@ -120,3 +122,435 @@ def test_no_configuration_required(): # Should not require any API keys or configuration assert "API" not in (result.stdout + result.stderr) or result.returncode == 0 + + +def setup_test_lib_dirs(tmpdir: Path) -> dict: + """Create isolated lib directories for tests and return env dict. 
+ + Sets up: + LIB_DIR: tmpdir/lib/ + NODE_MODULES_DIR: tmpdir/lib//npm/node_modules + NPM_BIN_DIR: tmpdir/lib//npm/bin + PIP_VENV_DIR: tmpdir/lib//pip/venv + PIP_BIN_DIR: tmpdir/lib//pip/venv/bin + """ + import platform + arch = platform.machine() + system = platform.system().lower() + arch_dir = f"{arch}-{system}" + + lib_dir = tmpdir / 'lib' / arch_dir + npm_dir = lib_dir / 'npm' + node_modules_dir = npm_dir / 'node_modules' + npm_bin_dir = npm_dir / 'bin' + pip_venv_dir = lib_dir / 'pip' / 'venv' + pip_bin_dir = pip_venv_dir / 'bin' + + # Create directories + node_modules_dir.mkdir(parents=True, exist_ok=True) + npm_bin_dir.mkdir(parents=True, exist_ok=True) + pip_bin_dir.mkdir(parents=True, exist_ok=True) + + # Install puppeteer-core to the test node_modules if not present + if not (node_modules_dir / 'puppeteer-core').exists(): + result = subprocess.run( + ['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'], + capture_output=True, + text=True, + timeout=120 + ) + if result.returncode != 0: + pytest.skip(f"Failed to install puppeteer-core: {result.stderr}") + + return { + 'LIB_DIR': str(lib_dir), + 'NODE_MODULES_DIR': str(node_modules_dir), + 'NPM_BIN_DIR': str(npm_bin_dir), + 'PIP_VENV_DIR': str(pip_venv_dir), + 'PIP_BIN_DIR': str(pip_bin_dir), + } + + +def find_chromium_binary(): + """Find the Chromium binary installed by @puppeteer/browsers.""" + chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium' + if not chromium_dir.exists(): + return None + + for version_dir in sorted(chromium_dir.iterdir(), reverse=True): + if not version_dir.is_dir(): + continue + # macOS ARM + mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' + if mac_arm.exists(): + return str(mac_arm) + # macOS x64 + mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' + if mac_x64.exists(): + return str(mac_x64) + # Linux + linux = version_dir / 'chrome-linux' / 'chrome' + if 
linux.exists(): + return str(linux) + return None + + +PLUGINS_ROOT = PLUGIN_DIR.parent +CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' + +TEST_URL = 'https://www.filmin.es/' + + +def test_extension_loads_in_chromium(): + """Verify extension loads in Chromium by visiting its options page. + + Uses Chromium with --load-extension to load the extension, then navigates + to chrome-extension:///options.html and checks that the extension name + appears in the page content. + """ + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Set up isolated lib directories for this test + lib_env = setup_test_lib_dirs(tmpdir) + + # Set up extensions directory + ext_dir = tmpdir / 'chrome_extensions' + ext_dir.mkdir(parents=True) + + env = os.environ.copy() + env.update(lib_env) + env['CHROME_EXTENSIONS_DIR'] = str(ext_dir) + env['CHROME_HEADLESS'] = 'true' + + # Ensure CHROME_BINARY points to Chromium + chromium = find_chromium_binary() + if chromium: + env['CHROME_BINARY'] = chromium + + # Step 1: Install the extension + result = subprocess.run( + ['node', str(INSTALL_SCRIPT)], + cwd=str(tmpdir), + capture_output=True, + text=True, + env=env, + timeout=60 + ) + assert result.returncode == 0, f"Extension install failed: {result.stderr}" + + # Verify extension cache was created + cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json' + assert cache_file.exists(), "Extension cache not created" + ext_data = json.loads(cache_file.read_text()) + print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") + + # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) + crawl_dir = tmpdir / 'crawl' + crawl_dir.mkdir() + chrome_dir = crawl_dir / 'chrome' + + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'], + cwd=str(crawl_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + # Wait for 
Chromium to launch and CDP URL to be available + cdp_url = None + for i in range(20): + if chrome_launch_process.poll() is not None: + stdout, stderr = chrome_launch_process.communicate() + raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") + cdp_file = chrome_dir / 'cdp_url.txt' + if cdp_file.exists(): + cdp_url = cdp_file.read_text().strip() + break + time.sleep(1) + + assert cdp_url, "Chromium CDP URL not found after 20s" + print(f"Chromium launched with CDP URL: {cdp_url}") + + # Check that extensions were loaded + extensions_file = chrome_dir / 'extensions.json' + if extensions_file.exists(): + loaded_exts = json.loads(extensions_file.read_text()) + print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}") + + try: + # Step 3: Connect to Chromium and verify extension loaded via options page + test_script = f''' +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); +const puppeteer = require('puppeteer-core'); + +(async () => {{ + const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); + + // Wait for extension to initialize + await new Promise(r => setTimeout(r, 2000)); + + // Find extension targets to get the extension ID + const targets = browser.targets(); + const extTargets = targets.filter(t => + t.url().startsWith('chrome-extension://') || + t.type() === 'service_worker' || + t.type() === 'background_page' + ); + + // Filter out Chrome's built-in extensions + const builtinIds = ['nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf', + 'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai']; + const customExtTargets = extTargets.filter(t => {{ + const url = t.url(); + if (!url.startsWith('chrome-extension://')) return false; + const extId = url.split('://')[1].split('/')[0]; + return !builtinIds.includes(extId); + }}); + + console.error('Custom extension targets found:', customExtTargets.length); + customExtTargets.forEach(t 
=> console.error(' -', t.type(), t.url())); + + if (customExtTargets.length === 0) {{ + console.log(JSON.stringify({{ loaded: false, error: 'No custom extension targets found' }})); + browser.disconnect(); + return; + }} + + // Get the extension ID from the first custom extension target + const extUrl = customExtTargets[0].url(); + const extId = extUrl.split('://')[1].split('/')[0]; + console.error('Extension ID:', extId); + + // Try to navigate to the extension's options.html page + const page = await browser.newPage(); + const optionsUrl = 'chrome-extension://' + extId + '/options.html'; + console.error('Navigating to options page:', optionsUrl); + + try {{ + await page.goto(optionsUrl, {{ waitUntil: 'domcontentloaded', timeout: 10000 }}); + const pageContent = await page.content(); + const pageTitle = await page.title(); + + // Check if extension name appears in the page + const hasExtensionName = pageContent.toLowerCase().includes('cookie') || + pageContent.toLowerCase().includes('idontcareaboutcookies') || + pageTitle.toLowerCase().includes('cookie'); + + console.log(JSON.stringify({{ + loaded: true, + extensionId: extId, + optionsPageLoaded: true, + pageTitle: pageTitle, + hasExtensionName: hasExtensionName, + contentLength: pageContent.length + }})); + }} catch (e) {{ + // options.html may not exist, but extension is still loaded + console.log(JSON.stringify({{ + loaded: true, + extensionId: extId, + optionsPageLoaded: false, + error: e.message + }})); + }} + + browser.disconnect(); +}})(); +''' + script_path = tmpdir / 'test_extension.js' + script_path.write_text(test_script) + + result = subprocess.run( + ['node', str(script_path)], + cwd=str(tmpdir), + capture_output=True, + text=True, + env=env, + timeout=90 + ) + + print(f"stderr: {result.stderr}") + print(f"stdout: {result.stdout}") + + assert result.returncode == 0, f"Test failed: {result.stderr}" + + output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] + assert 
output_lines, f"No JSON output: {result.stdout}" + + test_result = json.loads(output_lines[-1]) + assert test_result.get('loaded'), \ + f"Extension should be loaded in Chromium. Result: {test_result}" + print(f"Extension loaded successfully: {test_result}") + + finally: + # Clean up Chromium + try: + chrome_launch_process.send_signal(signal.SIGTERM) + chrome_launch_process.wait(timeout=5) + except: + pass + chrome_pid_file = chrome_dir / 'chrome.pid' + if chrome_pid_file.exists(): + try: + chrome_pid = int(chrome_pid_file.read_text().strip()) + os.kill(chrome_pid, signal.SIGKILL) + except (OSError, ValueError): + pass + + +def test_hides_cookie_consent_on_filmin(): + """Live test: verify extension hides cookie consent popup on filmin.es. + + Uses Chromium with extensions loaded automatically via chrome hook. + """ + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Set up isolated lib directories for this test + lib_env = setup_test_lib_dirs(tmpdir) + + # Set up extensions directory + ext_dir = tmpdir / 'chrome_extensions' + ext_dir.mkdir(parents=True) + + env = os.environ.copy() + env.update(lib_env) + env['CHROME_EXTENSIONS_DIR'] = str(ext_dir) + env['CHROME_HEADLESS'] = 'true' + + # Ensure CHROME_BINARY points to Chromium + chromium = find_chromium_binary() + if chromium: + env['CHROME_BINARY'] = chromium + + # Step 1: Install the extension + result = subprocess.run( + ['node', str(INSTALL_SCRIPT)], + cwd=str(tmpdir), + capture_output=True, + text=True, + env=env, + timeout=60 + ) + assert result.returncode == 0, f"Extension install failed: {result.stderr}" + + # Verify extension cache was created + cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json' + assert cache_file.exists(), "Extension cache not created" + ext_data = json.loads(cache_file.read_text()) + print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") + + # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) 
+ crawl_dir = tmpdir / 'crawl' + crawl_dir.mkdir() + chrome_dir = crawl_dir / 'chrome' + + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'], + cwd=str(crawl_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + # Wait for Chromium to launch and CDP URL to be available + cdp_url = None + for i in range(20): + if chrome_launch_process.poll() is not None: + stdout, stderr = chrome_launch_process.communicate() + raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") + cdp_file = chrome_dir / 'cdp_url.txt' + if cdp_file.exists(): + cdp_url = cdp_file.read_text().strip() + break + time.sleep(1) + + assert cdp_url, "Chromium CDP URL not found after 20s" + print(f"Chromium launched with CDP URL: {cdp_url}") + + try: + # Step 3: Connect to Chromium and test cookie consent hiding + test_script = f''' +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); +const puppeteer = require('puppeteer-core'); + +(async () => {{ + const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); + + // Wait for extension to initialize + await new Promise(r => setTimeout(r, 2000)); + + const page = await browser.newPage(); + await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'); + await page.setViewport({{ width: 1440, height: 900 }}); + + console.error('Navigating to {TEST_URL}...'); + await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }}); + + // Wait for extension content script to process page + await new Promise(r => setTimeout(r, 5000)); + + // Check cookie consent visibility + const result = await page.evaluate(() => {{ + const selectors = ['.cky-consent-container', '.cky-popup-center', '.cky-overlay']; + for (const sel of selectors) {{ + const el = document.querySelector(sel); + if (el) {{ + const style = window.getComputedStyle(el); + const rect = 
el.getBoundingClientRect(); + const visible = style.display !== 'none' && + style.visibility !== 'hidden' && + rect.width > 0 && rect.height > 0; + if (visible) return {{ visible: true, selector: sel }}; + }} + }} + return {{ visible: false }}; + }}); + + console.error('Cookie consent:', JSON.stringify(result)); + browser.disconnect(); + console.log(JSON.stringify(result)); +}})(); +''' + script_path = tmpdir / 'test_extension.js' + script_path.write_text(test_script) + + result = subprocess.run( + ['node', str(script_path)], + cwd=str(tmpdir), + capture_output=True, + text=True, + env=env, + timeout=90 + ) + + print(f"stderr: {result.stderr}") + print(f"stdout: {result.stdout}") + + assert result.returncode == 0, f"Test failed: {result.stderr}" + + output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] + assert output_lines, f"No JSON output: {result.stdout}" + + test_result = json.loads(output_lines[-1]) + assert not test_result['visible'], \ + f"Cookie consent should be hidden by extension. 
Result: {test_result}" + + finally: + # Clean up Chromium + try: + chrome_launch_process.send_signal(signal.SIGTERM) + chrome_launch_process.wait(timeout=5) + except: + pass + chrome_pid_file = chrome_dir / 'chrome.pid' + if chrome_pid_file.exists(): + try: + chrome_pid = int(chrome_pid_file.read_text().strip()) + os.kill(chrome_pid, signal.SIGKILL) + except (OSError, ValueError): + pass diff --git a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js b/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js index d500f51d..c60b09ec 100644 --- a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js +++ b/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js @@ -44,6 +44,8 @@ if (!getEnvBool('MODALCLOSER_ENABLED', true)) { const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'modalcloser'; @@ -156,22 +158,59 @@ async function closeModals(page) { // Generic fallback - hide unrecognized modals with CSS const genericSelectors = [ - // CookieYes (cky) - popular cookie consent library - '.cky-consent-container', - '.cky-popup-center', - '.cky-overlay', - '.cky-modal', - '#ckyPreferenceCenter', + // CookieYes (cky) + '.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal', '#ckyPreferenceCenter', + // OneTrust + '#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter', '#onetrust-pc-sdk', + // CookieBot + '#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay', '#CookiebotWidget', + // Quantcast / CMP + '.qc-cmp-ui-container', '#qc-cmp2-container', '.qc-cmp2-summary-buttons', + // TrustArc / TrustE + '#truste-consent-track', '.truste-banner', '#truste-consent-content', + // Osano + '.osano-cm-window', '.osano-cm-dialog', + // Klaro + '.klaro .cookie-modal', '.klaro 
.cookie-notice', + // Tarteaucitron + '#tarteaucitronRoot', '#tarteaucitronAlertBig', + // Complianz (WordPress) + '.cmplz-cookiebanner', '#cmplz-cookiebanner-container', + // GDPR Cookie Consent (WordPress) + '#gdpr-cookie-consent-bar', '.gdpr-cookie-consent-popup', + // Cookie Notice (WordPress) + '#cookie-notice', '.cookie-notice-container', + // EU Cookie Law + '.eupopup', '#eu-cookie-law', + // Didomi + '#didomi-popup', '#didomi-host', '.didomi-popup-container', + // Usercentrics + '#usercentrics-root', '.uc-banner', + // Axeptio + '#axeptio_overlay', '#axeptio_btn', + // iubenda + '#iubenda-cs-banner', '.iubenda-cs-container', + // Termly + '.termly-consent-banner', '#termly-code-snippet-support', + // Borlabs Cookie (WordPress) + '#BorlabsCookieBox', '.BorlabsCookie', + // CookieFirst + '.cookiefirst-root', '#cookiefirst-root', + // CookieScript + '#cookiescript_injected', '.cookiescript_injected_wrapper', + // Civic Cookie Control + '#ccc', '#ccc-overlay', + // Generic patterns + '#cookie-consent', '.cookie-banner', '.cookie-notice', + '#cookieConsent', '.cookie-consent', '.cookies-banner', + '[class*="cookie"][class*="banner"]', '[class*="cookie"][class*="notice"]', + '[class*="cookie"][class*="popup"]', '[class*="cookie"][class*="modal"]', + '[class*="consent"][class*="banner"]', '[class*="consent"][class*="popup"]', + '[class*="gdpr"]', '[class*="privacy"][class*="banner"]', // Modal overlays and backdrops '.modal-overlay:not([style*="display: none"])', '.modal-backdrop:not([style*="display: none"])', '.overlay-visible', - // Cookie consent banners - '#cookie-consent', '.cookie-banner', '.cookie-notice', - '#cookieConsent', '.cookie-consent', '.cookies-banner', - '[class*="cookie"][class*="banner"]', - '[class*="cookie"][class*="notice"]', - '[class*="gdpr"]', // Popup overlays '.popup-overlay', '.newsletter-popup', '.age-gate', '.subscribe-popup', '.subscription-modal', diff --git a/archivebox/plugins/modalcloser/tests/test_modalcloser.py 
b/archivebox/plugins/modalcloser/tests/test_modalcloser.py index fd62697f..b0b185f8 100644 --- a/archivebox/plugins/modalcloser/tests/test_modalcloser.py +++ b/archivebox/plugins/modalcloser/tests/test_modalcloser.py @@ -35,9 +35,9 @@ COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/' def get_node_modules_dir(): """Get NODE_MODULES_DIR for tests, checking env first.""" - # Check if NODE_PATH is already set in environment - if os.environ.get('NODE_PATH'): - return Path(os.environ['NODE_PATH']) + # Check if NODE_MODULES_DIR is already set in environment + if os.environ.get('NODE_MODULES_DIR'): + return Path(os.environ['NODE_MODULES_DIR']) # Otherwise compute from LIB_DIR from archivebox.config.common import STORAGE_CONFIG lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)) @@ -48,9 +48,9 @@ NODE_MODULES_DIR = get_node_modules_dir() def get_test_env(): - """Get environment with NODE_PATH set correctly.""" + """Get environment with NODE_MODULES_DIR set correctly.""" env = os.environ.copy() - env['NODE_PATH'] = str(NODE_MODULES_DIR) + env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) return env diff --git a/archivebox/plugins/npm/on_Binary__install_using_npm_provider.py b/archivebox/plugins/npm/on_Binary__install_using_npm_provider.py index 8c56d4fd..4bf1a05c 100644 --- a/archivebox/plugins/npm/on_Binary__install_using_npm_provider.py +++ b/archivebox/plugins/npm/on_Binary__install_using_npm_provider.py @@ -90,6 +90,32 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c } print(json.dumps(record)) + # Emit PATH update if npm bin dir not already in PATH + npm_bin_dir = str(npm_prefix / 'bin') + current_path = os.environ.get('PATH', '') + + # Check if npm_bin_dir is already in PATH + path_dirs = current_path.split(':') + if npm_bin_dir not in path_dirs: + # Prepend npm_bin_dir to PATH + new_path = f"{npm_bin_dir}:{current_path}" if current_path else npm_bin_dir + print(json.dumps({ + 'type': 'Machine', + '_method': 
'update', + 'key': 'config/PATH', + 'value': new_path, + })) + click.echo(f" Added {npm_bin_dir} to PATH", err=True) + + # Also emit NODE_MODULES_DIR for JS module resolution + node_modules_dir = str(npm_prefix / 'node_modules') + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/NODE_MODULES_DIR', + 'value': node_modules_dir, + })) + # Log human-readable info to stderr click.echo(f"Installed {name} at {binary.abspath}", err=True) click.echo(f" version: {binary.version}", err=True) diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js index 766710b2..9eb86c26 100755 --- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js +++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js @@ -20,6 +20,8 @@ const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); // Extractor metadata diff --git a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js index 47db7478..e42a8a6e 100644 --- a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js +++ b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js @@ -40,7 +40,10 @@ if (!getEnvBool('PDF_ENABLED', true)) { // Now safe to require puppeteer const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); +const { findChromium } = require('../chrome/chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'pdf'; @@ -96,33 +99,6 @@ function getCdpUrl() { return null; } -// Find Chrome binary -function findChrome() { - const chromeBinary = 
getEnv('CHROME_BINARY'); - if (chromeBinary && fs.existsSync(chromeBinary)) { - return chromeBinary; - } - - const candidates = [ - // Linux - '/usr/bin/google-chrome', - '/usr/bin/google-chrome-stable', - '/usr/bin/chromium', - '/usr/bin/chromium-browser', - // macOS - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - ]; - - for (const candidate of candidates) { - if (candidate.startsWith('/') && fs.existsSync(candidate)) { - return candidate; - } - } - - return null; -} - // Parse resolution string function parseResolution(resolution) { const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); @@ -175,7 +151,7 @@ async function printToPdf(url) { // Fall back to launching new browser if (!browser) { - const executablePath = findChrome(); + const executablePath = findChromium(); if (!executablePath) { return { success: false, error: 'Chrome binary not found' }; } diff --git a/archivebox/plugins/pdf/tests/test_pdf.py b/archivebox/plugins/pdf/tests/test_pdf.py index 39244152..681e7225 100644 --- a/archivebox/plugins/pdf/tests/test_pdf.py +++ b/archivebox/plugins/pdf/tests/test_pdf.py @@ -28,7 +28,7 @@ PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None) NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py' TEST_URL = 'https://example.com' -# Get LIB_DIR for NODE_PATH +# Get LIB_DIR for NODE_MODULES_DIR def get_lib_dir(): """Get LIB_DIR for tests.""" from archivebox.config.common import STORAGE_CONFIG @@ -38,9 +38,9 @@ LIB_DIR = get_lib_dir() NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' def get_test_env(): - """Get environment with NODE_PATH set correctly.""" + """Get environment with NODE_MODULES_DIR set correctly.""" env = os.environ.copy() - env['NODE_PATH'] = str(NODE_MODULES_DIR) + env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) env['LIB_DIR'] = str(LIB_DIR) return env diff --git 
a/archivebox/plugins/pip/on_Binary__install_using_pip_provider.py b/archivebox/plugins/pip/on_Binary__install_using_pip_provider.py index d0ab1925..edbeef4b 100644 --- a/archivebox/plugins/pip/on_Binary__install_using_pip_provider.py +++ b/archivebox/plugins/pip/on_Binary__install_using_pip_provider.py @@ -15,7 +15,7 @@ import sys from pathlib import Path import rich_click as click -from abx_pkg import Binary, PipProvider +from abx_pkg import Binary, PipProvider, BinProviderOverrides # Fix pydantic forward reference issue PipProvider.model_rebuild() @@ -87,6 +87,23 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override } print(json.dumps(record)) + # Emit PATH update if pip bin dir not already in PATH + pip_bin_dir = str(pip_venv_path / 'bin') + current_path = os.environ.get('PATH', '') + + # Check if pip_bin_dir is already in PATH + path_dirs = current_path.split(':') + if pip_bin_dir not in path_dirs: + # Prepend pip_bin_dir to PATH + new_path = f"{pip_bin_dir}:{current_path}" if current_path else pip_bin_dir + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/PATH', + 'value': new_path, + })) + click.echo(f" Added {pip_bin_dir} to PATH", err=True) + # Log human-readable info to stderr click.echo(f"Installed {name} at {binary.abspath}", err=True) click.echo(f" version: {binary.version}", err=True) diff --git a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js index af95e40b..d6c2497f 100755 --- a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js +++ b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js @@ -12,6 +12,8 @@ const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'redirects'; diff --git 
a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js index f4252801..33697f55 100755 --- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js +++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js @@ -13,6 +13,8 @@ const fs = require('fs'); const path = require('path'); const crypto = require('crypto'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'responses'; diff --git a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js index 71a5995c..da25c459 100644 --- a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js +++ b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js @@ -40,7 +40,10 @@ if (!getEnvBool('SCREENSHOT_ENABLED', true)) { // Now safe to require puppeteer const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); +const { findChromium } = require('../chrome/chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'screenshot'; @@ -96,36 +99,6 @@ function getCdpUrl() { return null; } -// Find Chrome binary -function findChrome() { - const chromeBinary = getEnv('CHROME_BINARY'); - if (chromeBinary && fs.existsSync(chromeBinary)) { - return chromeBinary; - } - - const candidates = [ - // Linux - '/usr/bin/google-chrome', - '/usr/bin/google-chrome-stable', - '/usr/bin/chromium', - '/usr/bin/chromium-browser', - // macOS - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - // Common paths - 'google-chrome', - 'chromium', - ]; - - for (const candidate of 
candidates) { - if (candidate.startsWith('/') && fs.existsSync(candidate)) { - return candidate; - } - } - - return null; -} - // Parse resolution string function parseResolution(resolution) { const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); @@ -178,7 +151,7 @@ async function takeScreenshot(url) { // Fall back to launching new browser if (!browser) { - const executablePath = findChrome(); + const executablePath = findChromium(); if (!executablePath) { return { success: false, error: 'Chrome binary not found' }; } diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index bd29b395..edfbd54a 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -26,7 +26,7 @@ PLUGINS_ROOT = PLUGIN_DIR.parent SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None) TEST_URL = 'https://example.com' -# Get LIB_DIR for NODE_PATH +# Get LIB_DIR for NODE_MODULES_DIR def get_lib_dir(): """Get LIB_DIR for tests.""" from archivebox.config.common import STORAGE_CONFIG @@ -36,9 +36,9 @@ LIB_DIR = get_lib_dir() NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' def get_test_env(): - """Get environment with NODE_PATH set correctly.""" + """Get environment with NODE_MODULES_DIR set correctly.""" env = os.environ.copy() - env['NODE_PATH'] = str(NODE_MODULES_DIR) + env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) env['LIB_DIR'] = str(LIB_DIR) return env diff --git a/archivebox/plugins/seo/on_Snapshot__38_seo.js b/archivebox/plugins/seo/on_Snapshot__38_seo.js index d034468f..bbe1177a 100755 --- a/archivebox/plugins/seo/on_Snapshot__38_seo.js +++ b/archivebox/plugins/seo/on_Snapshot__38_seo.js @@ -17,6 +17,8 @@ const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) 
module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); // Extractor metadata diff --git a/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js index 3eec6c1a..7637bf98 100755 --- a/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js +++ b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js @@ -25,7 +25,7 @@ const { exec } = require('child_process'); const execAsync = promisify(exec); // Import extension utilities -const extensionUtils = require('../chrome/chrome_extension_utils.js'); +const extensionUtils = require('../chrome/chrome_utils.js'); // Extension metadata const EXTENSION = { diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js index cad2e142..83ff4d61 100755 --- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js +++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js @@ -12,6 +12,8 @@ const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'ssl'; diff --git a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js index ddbd933c..5a501694 100644 --- a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js +++ b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js @@ -12,6 +12,8 @@ const fs = require('fs'); const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'staticfile'; diff --git a/archivebox/plugins/ublock/on_Crawl__03_ublock.js b/archivebox/plugins/ublock/on_Crawl__03_ublock.js index 
738d8d82..b8a0219c 100755 --- a/archivebox/plugins/ublock/on_Crawl__03_ublock.js +++ b/archivebox/plugins/ublock/on_Crawl__03_ublock.js @@ -22,7 +22,7 @@ const path = require('path'); const fs = require('fs'); // Import extension utilities -const extensionUtils = require('../chrome/chrome_extension_utils.js'); +const extensionUtils = require('../chrome/chrome_utils.js'); // Extension metadata const EXTENSION = { diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py index 8a1ae211..2ba718e0 100644 --- a/archivebox/plugins/ublock/tests/test_ublock.py +++ b/archivebox/plugins/ublock/tests/test_ublock.py @@ -155,3 +155,461 @@ def test_large_extension_size(): # uBlock Origin with filter lists is typically 2-5 MB size_bytes = crx_file.stat().st_size assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes" + + +def setup_test_lib_dirs(tmpdir: Path) -> dict: + """Create isolated lib directories for tests and return env dict. 
+ + Sets up: + LIB_DIR: tmpdir/lib/ + NODE_MODULES_DIR: tmpdir/lib//npm/node_modules + NPM_BIN_DIR: tmpdir/lib//npm/bin + PIP_VENV_DIR: tmpdir/lib//pip/venv + PIP_BIN_DIR: tmpdir/lib//pip/venv/bin + """ + import platform + arch = platform.machine() + system = platform.system().lower() + arch_dir = f"{arch}-{system}" + + lib_dir = tmpdir / 'lib' / arch_dir + npm_dir = lib_dir / 'npm' + node_modules_dir = npm_dir / 'node_modules' + npm_bin_dir = npm_dir / 'bin' + pip_venv_dir = lib_dir / 'pip' / 'venv' + pip_bin_dir = pip_venv_dir / 'bin' + + # Create directories + node_modules_dir.mkdir(parents=True, exist_ok=True) + npm_bin_dir.mkdir(parents=True, exist_ok=True) + pip_bin_dir.mkdir(parents=True, exist_ok=True) + + # Install puppeteer-core to the test node_modules if not present + if not (node_modules_dir / 'puppeteer-core').exists(): + result = subprocess.run( + ['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'], + capture_output=True, + text=True, + timeout=120 + ) + if result.returncode != 0: + pytest.skip(f"Failed to install puppeteer-core: {result.stderr}") + + return { + 'LIB_DIR': str(lib_dir), + 'NODE_MODULES_DIR': str(node_modules_dir), + 'NPM_BIN_DIR': str(npm_bin_dir), + 'PIP_VENV_DIR': str(pip_venv_dir), + 'PIP_BIN_DIR': str(pip_bin_dir), + } + + +def find_chromium_binary(): + """Find the Chromium binary installed by @puppeteer/browsers.""" + chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium' + if not chromium_dir.exists(): + return None + + for version_dir in sorted(chromium_dir.iterdir(), reverse=True): + if not version_dir.is_dir(): + continue + # macOS ARM + mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' + if mac_arm.exists(): + return str(mac_arm) + # macOS x64 + mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' + if mac_x64.exists(): + return str(mac_x64) + # Linux + linux = version_dir / 'chrome-linux' / 'chrome' + if 
linux.exists(): + return str(linux) + return None + + +PLUGINS_ROOT = PLUGIN_DIR.parent +CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' + +# Test URL: ad blocker test page that shows if ads are blocked +TEST_URL = 'https://d3ward.github.io/toolz/adblock.html' + + +def test_extension_loads_in_chromium(): + """Verify uBlock extension loads in Chromium by visiting its dashboard page. + + Uses Chromium with --load-extension to load the extension, then navigates + to chrome-extension:///dashboard.html and checks that "uBlock" appears + in the page content. + """ + import signal + import time + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Set up isolated lib directories for this test + lib_env = setup_test_lib_dirs(tmpdir) + + # Set up extensions directory + ext_dir = tmpdir / 'chrome_extensions' + ext_dir.mkdir(parents=True) + + env = os.environ.copy() + env.update(lib_env) + env['CHROME_EXTENSIONS_DIR'] = str(ext_dir) + env['CHROME_HEADLESS'] = 'true' + + # Ensure CHROME_BINARY points to Chromium + chromium = find_chromium_binary() + if chromium: + env['CHROME_BINARY'] = chromium + + # Step 1: Install the uBlock extension + result = subprocess.run( + ['node', str(INSTALL_SCRIPT)], + cwd=str(tmpdir), + capture_output=True, + text=True, + env=env, + timeout=120 + ) + assert result.returncode == 0, f"Extension install failed: {result.stderr}" + + # Verify extension cache was created + cache_file = ext_dir / 'ublock.extension.json' + assert cache_file.exists(), "Extension cache not created" + ext_data = json.loads(cache_file.read_text()) + print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") + + # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) + crawl_dir = tmpdir / 'crawl' + crawl_dir.mkdir() + chrome_dir = crawl_dir / 'chrome' + + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'], + 
def test_extension_loads_in_chromium():
    """Verify uBlock extension loads in Chromium by visiting its dashboard page.

    Uses Chromium with --load-extension to load the extension, then navigates
    to chrome-extension://<id>/dashboard.html and checks that "uBlock" appears
    in the page content.
    """
    import signal
    import time

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set up isolated lib directories for this test
        lib_env = setup_test_lib_dirs(tmpdir)

        # Set up extensions directory
        ext_dir = tmpdir / 'chrome_extensions'
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env.update(lib_env)
        env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
        env['CHROME_HEADLESS'] = 'true'

        # Ensure CHROME_BINARY points to Chromium (falls back to hook default if absent)
        chromium = find_chromium_binary()
        if chromium:
            env['CHROME_BINARY'] = chromium

        # Step 1: Install the uBlock extension
        result = subprocess.run(
            ['node', str(INSTALL_SCRIPT)],
            cwd=str(tmpdir),
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )
        assert result.returncode == 0, f"Extension install failed: {result.stderr}"

        # Verify extension cache was created
        cache_file = ext_dir / 'ublock.extension.json'
        assert cache_file.exists(), "Extension cache not created"
        ext_data = json.loads(cache_file.read_text())
        print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")

        # Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
        crawl_dir = tmpdir / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'

        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
            cwd=str(crawl_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )

        # Wait up to 20s for Chromium to launch and write its CDP URL
        cdp_url = None
        for _ in range(20):
            if chrome_launch_process.poll() is not None:
                stdout, stderr = chrome_launch_process.communicate()
                raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
            cdp_file = chrome_dir / 'cdp_url.txt'
            if cdp_file.exists():
                cdp_url = cdp_file.read_text().strip()
                break
            time.sleep(1)

        assert cdp_url, "Chromium CDP URL not found after 20s"
        print(f"Chromium launched with CDP URL: {cdp_url}")

        # Print chrome hook stderr for debugging.
        # os.read() returns only the bytes currently buffered; a plain .read()
        # would block until the (still running) hook process closes stderr.
        # NOTE(review): select() on a pipe fd is POSIX-only — confirm these
        # tests never run on Windows.
        import select
        if select.select([chrome_launch_process.stderr], [], [], 0.1)[0]:
            chrome_stderr = os.read(chrome_launch_process.stderr.fileno(), 65536).decode('utf-8', 'replace')
            print(f"Chrome hook stderr:\n{chrome_stderr}")

        # Check what extensions were loaded by chrome hook
        extensions_file = chrome_dir / 'extensions.json'
        if extensions_file.exists():
            loaded_exts = json.loads(extensions_file.read_text())
            print(f"Extensions loaded by chrome hook: {[e.get('name') for e in loaded_exts]}")
        else:
            print("Warning: extensions.json not found")

        # Get the unpacked extension ID - Chrome computes this from the path
        unpacked_path = ext_data.get('unpacked_path', '')
        print(f"Extension unpacked path: {unpacked_path}")

        try:
            # Step 3: Connect to Chromium and verify extension loads.
            # First use CDP to get all targets and find extension ID.
            test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');

(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});

    // Wait for extension to initialize
    await new Promise(r => setTimeout(r, 3000));

    // Use CDP to get all targets including service workers
    const pages = await browser.pages();
    const page = pages[0] || await browser.newPage();
    const client = await page.createCDPSession();

    const {{ targetInfos }} = await client.send('Target.getTargets');
    console.error('All CDP targets:');
    targetInfos.forEach(t => console.error(' -', t.type, t.url.slice(0, 100)));

    // Find any chrome-extension:// URLs
    const extTargets = targetInfos.filter(t => t.url.startsWith('chrome-extension://'));
    console.error('Extension targets:', extTargets.length);

    // Filter out built-in extensions
    const builtinIds = ['nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf',
        'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai'];
    const customExts = extTargets.filter(t => {{
        const extId = t.url.split('://')[1].split('/')[0];
        return !builtinIds.includes(extId);
    }});

    if (customExts.length === 0) {{
        console.log(JSON.stringify({{ loaded: false, error: 'No custom extension found via CDP' }}));
        browser.disconnect();
        return;
    }}

    // Get extension ID from first custom extension
    const extId = customExts[0].url.split('://')[1].split('/')[0];
    console.error('Found extension ID:', extId);

    // Try to load dashboard.html
    const newPage = await browser.newPage();
    const dashboardUrl = 'chrome-extension://' + extId + '/dashboard.html';
    console.error('Loading:', dashboardUrl);

    try {{
        await newPage.goto(dashboardUrl, {{ waitUntil: 'domcontentloaded', timeout: 15000 }});
        const title = await newPage.title();
        const content = await newPage.content();
        const hasUblock = content.toLowerCase().includes('ublock') || title.toLowerCase().includes('ublock');

        console.log(JSON.stringify({{
            loaded: true,
            extensionId: extId,
            pageTitle: title,
            hasExtensionName: hasUblock,
            contentLength: content.length
        }}));
    }} catch (e) {{
        console.error('Dashboard load failed:', e.message);
        console.log(JSON.stringify({{ loaded: true, extensionId: extId, dashboardError: e.message }}));
    }}

    browser.disconnect();
}})();
'''
            script_path = tmpdir / 'test_ublock.js'
            script_path.write_text(test_script)

            result = subprocess.run(
                ['node', str(script_path)],
                cwd=str(tmpdir),
                capture_output=True,
                text=True,
                env=env,
                timeout=90
            )

            print(f"stderr: {result.stderr}")
            print(f"stdout: {result.stdout}")

            assert result.returncode == 0, f"Test failed: {result.stderr}"

            # The node script logs debug info to stderr and exactly one JSON
            # object per run to stdout; take the last JSON-looking line.
            output_lines = [line for line in result.stdout.strip().split('\n') if line.startswith('{')]
            assert output_lines, f"No JSON output: {result.stdout}"

            test_result = json.loads(output_lines[-1])
            assert test_result.get('loaded'), \
                f"uBlock extension should be loaded in Chromium. Result: {test_result}"
            print(f"Extension loaded successfully: {test_result}")

        finally:
            # Clean up Chromium: polite SIGTERM first, then SIGKILL via the
            # pid file the hook writes (covers a detached browser process).
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:  # narrow from bare except: don't swallow KeyboardInterrupt
                pass
            chrome_pid_file = chrome_dir / 'chrome.pid'
            if chrome_pid_file.exists():
                try:
                    chrome_pid = int(chrome_pid_file.read_text().strip())
                    os.kill(chrome_pid, signal.SIGKILL)
                except (OSError, ValueError):
                    pass
+ """ + import signal + import time + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Set up isolated lib directories for this test + lib_env = setup_test_lib_dirs(tmpdir) + + # Set up extensions directory + ext_dir = tmpdir / 'chrome_extensions' + ext_dir.mkdir(parents=True) + + env = os.environ.copy() + env.update(lib_env) + env['CHROME_EXTENSIONS_DIR'] = str(ext_dir) + env['CHROME_HEADLESS'] = 'true' + + # Ensure CHROME_BINARY points to Chromium + chromium = find_chromium_binary() + if chromium: + env['CHROME_BINARY'] = chromium + + # Step 1: Install the uBlock extension + result = subprocess.run( + ['node', str(INSTALL_SCRIPT)], + cwd=str(tmpdir), + capture_output=True, + text=True, + env=env, + timeout=120 + ) + assert result.returncode == 0, f"Extension install failed: {result.stderr}" + + # Verify extension cache was created + cache_file = ext_dir / 'ublock.extension.json' + assert cache_file.exists(), "Extension cache not created" + ext_data = json.loads(cache_file.read_text()) + print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") + + # Step 2: Launch Chrome using the chrome hook (loads extensions automatically) + crawl_dir = tmpdir / 'crawl' + crawl_dir.mkdir() + chrome_dir = crawl_dir / 'chrome' + + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'], + cwd=str(crawl_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + # Wait for Chrome to launch and CDP URL to be available + cdp_url = None + for i in range(20): + if chrome_launch_process.poll() is not None: + stdout, stderr = chrome_launch_process.communicate() + raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}") + cdp_file = chrome_dir / 'cdp_url.txt' + if cdp_file.exists(): + cdp_url = cdp_file.read_text().strip() + break + time.sleep(1) + + assert cdp_url, "Chrome CDP URL not found after 20s" + print(f"Chrome launched with CDP 
URL: {cdp_url}") + + # Check that extensions were loaded + extensions_file = chrome_dir / 'extensions.json' + if extensions_file.exists(): + loaded_exts = json.loads(extensions_file.read_text()) + print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}") + + try: + # Step 3: Connect to Chrome and test ad blocking + test_script = f''' +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); +const puppeteer = require('puppeteer-core'); + +(async () => {{ + const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); + + // Wait for extension to initialize + await new Promise(r => setTimeout(r, 3000)); + + // Check extension loaded by looking at targets + const targets = browser.targets(); + const extTargets = targets.filter(t => + t.url().startsWith('chrome-extension://') || + t.type() === 'service_worker' || + t.type() === 'background_page' + ); + console.error('Extension targets found:', extTargets.length); + extTargets.forEach(t => console.error(' -', t.type(), t.url().substring(0, 60))); + + const page = await browser.newPage(); + await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'); + await page.setViewport({{ width: 1440, height: 900 }}); + + console.error('Navigating to {TEST_URL}...'); + await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }}); + + // Wait for the test page to run its checks + await new Promise(r => setTimeout(r, 5000)); + + // The d3ward test page shows blocked percentage + const result = await page.evaluate(() => {{ + const scoreEl = document.querySelector('#score'); + const score = scoreEl ? scoreEl.textContent : null; + const blockedItems = document.querySelectorAll('.blocked').length; + const totalItems = document.querySelectorAll('.testlist li').length; + return {{ + score, + blockedItems, + totalItems, + percentBlocked: totalItems > 0 ? 
Math.round((blockedItems / totalItems) * 100) : 0 + }}; + }}); + + console.error('Ad blocking result:', JSON.stringify(result)); + browser.disconnect(); + console.log(JSON.stringify(result)); +}})(); +''' + script_path = tmpdir / 'test_ublock.js' + script_path.write_text(test_script) + + result = subprocess.run( + ['node', str(script_path)], + cwd=str(tmpdir), + capture_output=True, + text=True, + env=env, + timeout=90 + ) + + print(f"stderr: {result.stderr}") + print(f"stdout: {result.stdout}") + + assert result.returncode == 0, f"Test failed: {result.stderr}" + + output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] + assert output_lines, f"No JSON output: {result.stdout}" + + test_result = json.loads(output_lines[-1]) + + # uBlock should block most ad domains on the test page + assert test_result['percentBlocked'] >= 50, \ + f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}" + + finally: + # Clean up Chrome + try: + chrome_launch_process.send_signal(signal.SIGTERM) + chrome_launch_process.wait(timeout=5) + except: + pass + chrome_pid_file = chrome_dir / 'chrome.pid' + if chrome_pid_file.exists(): + try: + chrome_pid = int(chrome_pid_file.read_text().strip()) + os.kill(chrome_pid, signal.SIGKILL) + except (OSError, ValueError): + pass