From 7e6e3be9e74795f500818a5e99d417019cbd3bc9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 29 Dec 2025 18:49:36 -0800 Subject: [PATCH] messing with chrome install process to reuse cached chromium with pinned version --- archivebox/plugins/chrome/chrome_utils.js | 245 ++++++++++++++---- .../chrome/on_Crawl__00_chrome_install.py | 166 ++++++++---- .../plugins/chrome/tests/test_chrome.py | 41 +-- .../tests/test_istilldontcareaboutcookies.py | 41 ++- .../plugins/ublock/tests/test_ublock.py | 174 +++++++------ 5 files changed, 448 insertions(+), 219 deletions(-) diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index fa331ee5..fd09fbb3 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -433,6 +433,103 @@ async function killChrome(pid, outputDir = null) { console.error('[*] Chrome process killed'); } +/** + * Install Chromium using @puppeteer/browsers programmatic API. + * Uses puppeteer's default cache location, returns the binary path. + * + * @param {Object} options - Install options + * @returns {Promise} - {success, binary, version, error} + */ +async function installChromium(options = {}) { + // Check if CHROME_BINARY is already set and valid + const configuredBinary = getEnv('CHROME_BINARY'); + if (configuredBinary && fs.existsSync(configuredBinary)) { + console.error(`[+] Using configured CHROME_BINARY: ${configuredBinary}`); + return { success: true, binary: configuredBinary, version: null }; + } + + // Try to load @puppeteer/browsers from NODE_MODULES_DIR or system + let puppeteerBrowsers; + try { + if (process.env.NODE_MODULES_DIR) { + module.paths.unshift(process.env.NODE_MODULES_DIR); + } + puppeteerBrowsers = require('@puppeteer/browsers'); + } catch (e) { + console.error(`[!] @puppeteer/browsers not found. Install it first with installPuppeteerCore.`); + return { success: false, error: '@puppeteer/browsers not installed' }; + } + + console.error(`[*] Installing Chromium via @puppeteer/browsers...`); + + try { + const result = await puppeteerBrowsers.install({ + browser: 'chromium', + buildId: 'latest', + }); + + const binary = result.executablePath; + const version = result.buildId; + + if (!binary || !fs.existsSync(binary)) { + console.error(`[!] Chromium binary not found at: ${binary}`); + return { success: false, error: `Chromium binary not found at: ${binary}` }; + } + + console.error(`[+] Chromium installed: ${binary}`); + return { success: true, binary, version }; + } catch (e) { + console.error(`[!] Failed to install Chromium: ${e.message}`); + return { success: false, error: e.message }; + } +} + +/** + * Install puppeteer-core npm package. + * + * @param {Object} options - Install options + * @param {string} [options.npmPrefix] - npm prefix directory (default: DATA_DIR/lib//npm or ./node_modules parent) + * @param {number} [options.timeout=60000] - Timeout in milliseconds + * @returns {Promise} - {success, path, error} + */ +async function installPuppeteerCore(options = {}) { + const arch = `${process.arch}-${process.platform}`; + const defaultPrefix = path.join(getEnv('LIB_DIR', getEnv('DATA_DIR', '.')), 'npm'); + const { + npmPrefix = defaultPrefix, + timeout = 60000, + } = options; + + const nodeModulesDir = path.join(npmPrefix, 'node_modules'); + const puppeteerPath = path.join(nodeModulesDir, 'puppeteer-core'); + + // Check if already installed + if (fs.existsSync(puppeteerPath)) { + console.error(`[+] puppeteer-core already installed: ${puppeteerPath}`); + return { success: true, path: puppeteerPath }; + } + + console.error(`[*] Installing puppeteer-core to ${npmPrefix}...`); + + // Create directory + if (!fs.existsSync(npmPrefix)) { + fs.mkdirSync(npmPrefix, { recursive: true }); + } + + try { + const { execSync } = require('child_process'); + execSync( + `npm install --prefix "${npmPrefix}" puppeteer-core`, + { encoding: 'utf8', timeout, stdio: ['pipe', 'pipe', 'pipe'] } + ); + console.error(`[+] puppeteer-core installed successfully`); + return { success: true, path: puppeteerPath }; + } catch (e) { + console.error(`[!] Failed to install puppeteer-core: ${e.message}`); + return { success: false, error: e.message }; + } +} + // Try to import unzipper, fallback to system unzip if not available let unzip = null; try { @@ -932,78 +1029,88 @@ function getExtensionTargets(browser) { /** * Find Chromium/Chrome binary path. - * Prefers Chromium over Chrome because Chrome 137+ removed --load-extension support. + * Checks CHROME_BINARY env var first, then falls back to system locations. * - * @param {string} [dataDir] - Data directory to check for puppeteer installs * @returns {string|null} - Absolute path to browser binary or null if not found */ -function findChromium(dataDir = null) { - // Check CHROME_BINARY env var first - const chromeBinary = (process.env.CHROME_BINARY || '').trim(); - if (chromeBinary && fs.existsSync(chromeBinary)) { - // Ensure absolute path - return path.resolve(chromeBinary); +function findChromium() { + const { execSync } = require('child_process'); + + // Helper to validate a binary by running --version + const validateBinary = (binaryPath) => { + if (!binaryPath || !fs.existsSync(binaryPath)) return false; + try { + execSync(`"${binaryPath}" --version`, { encoding: 'utf8', timeout: 5000, stdio: 'pipe' }); + return true; + } catch (e) { + return false; + } + }; + + // 1. Check CHROME_BINARY env var first + const chromeBinary = getEnv('CHROME_BINARY'); + if (chromeBinary) { + const absPath = path.resolve(chromeBinary); + if (validateBinary(absPath)) { + return absPath; + } + console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`); + } + + // 2. Warn that no CHROME_BINARY is configured, searching fallbacks + if (!chromeBinary) { + console.error('[!] Warning: CHROME_BINARY not set, searching system locations...'); } // Helper to find Chromium in @puppeteer/browsers directory structure - // Always returns absolute paths const findInPuppeteerDir = (baseDir) => { - const absBaseDir = path.resolve(baseDir); - if (!fs.existsSync(absBaseDir)) return null; + if (!fs.existsSync(baseDir)) return null; try { - const versions = fs.readdirSync(absBaseDir); + const versions = fs.readdirSync(baseDir); for (const version of versions.sort().reverse()) { - const versionDir = path.join(absBaseDir, version); - // Check for macOS ARM structure - const macArmBinary = path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'); - if (fs.existsSync(macArmBinary)) return macArmBinary; - // Check for macOS x64 structure - const macX64Binary = path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'); - if (fs.existsSync(macX64Binary)) return macX64Binary; - // Check for Linux structure - const linuxBinary = path.join(versionDir, 'chrome-linux/chrome'); - if (fs.existsSync(linuxBinary)) return linuxBinary; + const versionDir = path.join(baseDir, version); + const candidates = [ + path.join(versionDir, 'chrome-mac-arm64/Chromium.app/Contents/MacOS/Chromium'), + path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'), + path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'), + path.join(versionDir, 'chrome-linux64/chrome'), + path.join(versionDir, 'chrome-linux/chrome'), + ]; + for (const c of candidates) { + if (fs.existsSync(c)) return c; + } } - } catch (e) { - // Continue - } + } catch (e) {} return null; }; - // Check @puppeteer/browsers install locations - const puppeteerDirs = [ - // Local project install (from npx @puppeteer/browsers install) - path.join(dataDir || process.env.DATA_DIR || '.', 'chromium'), - path.join(process.cwd(), 'chromium'), - // User cache locations - path.join(process.env.HOME || '', '.cache/puppeteer/chromium'), - ]; - - for (const puppeteerDir of puppeteerDirs) { - const binary = findInPuppeteerDir(puppeteerDir); - if (binary) return binary; - } - - // Check standard system locations - const candidates = [ - // Linux Chromium + // 3. Search fallback locations (Chromium first, then Chrome) + const fallbackLocations = [ + // System Chromium + '/Applications/Chromium.app/Contents/MacOS/Chromium', '/usr/bin/chromium', '/usr/bin/chromium-browser', - // macOS Chromium (Homebrew or manual install) - '/Applications/Chromium.app/Contents/MacOS/Chromium', - // Fallback to Chrome (extension loading may not work in Chrome 137+) + // Puppeteer cache + path.join(process.env.HOME || '', '.cache/puppeteer/chromium'), + path.join(process.env.HOME || '', '.cache/puppeteer'), + // Chrome (fallback - extensions may not work in 137+) + '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', '/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', ]; - for (const candidate of candidates) { - if (fs.existsSync(candidate)) { - // Warn if falling back to Chrome - if (candidate.includes('google-chrome') || candidate.includes('Google Chrome')) { + for (const loc of fallbackLocations) { + // Check if it's a puppeteer cache dir + if (loc.includes('.cache/puppeteer')) { + const binary = findInPuppeteerDir(loc); + if (binary && validateBinary(binary)) { + return binary; + } + } else if (validateBinary(loc)) { + if (loc.includes('Google Chrome') || loc.includes('google-chrome')) { console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+'); } - return candidate; + return loc; } } @@ -1028,6 +1135,9 @@ module.exports = { // Chrome launching launchChromium, killChrome, + // Chrome/Chromium install + installChromium, + installPuppeteerCore, // Chrome/Chromium binary finding findChromium, // Extension utilities @@ -1055,7 +1165,9 @@ if (require.main === module) { console.log('Usage: chrome_utils.js [args...]'); console.log(''); console.log('Commands:'); - console.log(' findChromium [data_dir]'); + console.log(' findChromium'); + console.log(' installChromium'); + console.log(' installPuppeteerCore [npm_prefix]'); console.log(' launchChromium [output_dir] [extension_paths_json]'); console.log(' killChrome [output_dir]'); console.log(' killZombieChrome [data_dir]'); @@ -1072,8 +1184,7 @@ if (require.main === module) { try { switch (command) { case 'findChromium': { - const [dataDir] = commandArgs; - const binary = findChromium(dataDir); + const binary = findChromium(); if (binary) { console.log(binary); } else { @@ -1083,6 +1194,32 @@ if (require.main === module) { break; } + case 'installChromium': { + const result = await installChromium(); + if (result.success) { + console.log(JSON.stringify({ + binary: result.binary, + version: result.version, + })); + } else { + console.error(result.error); + process.exit(1); + } + break; + } + + case 'installPuppeteerCore': { + const [npmPrefix] = commandArgs; + const result = await installPuppeteerCore({ npmPrefix: npmPrefix || undefined }); + if (result.success) { + console.log(JSON.stringify({ path: result.path })); + } else { + console.error(result.error); + process.exit(1); + } + break; + } + case 'launchChromium': { const [outputDir, extensionPathsJson] = commandArgs; const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : []; diff --git a/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py b/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py index 589c58c0..4c6bbbdd 100644 --- a/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py +++ b/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 """ -Install hook for Chrome/Chromium binary. +Install hook for Chrome/Chromium and puppeteer-core. -Runs at crawl start to verify Chromium is available. +Runs at crawl start to install/find Chromium and puppeteer-core. Outputs JSONL for Binary and Machine config updates. Respects CHROME_BINARY env var for custom binary paths. -Falls back to `npx @puppeteer/browsers install chromium@latest` if not found. +Uses `npx @puppeteer/browsers install chromium@latest` and parses output. NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for --load-extension and --disable-extensions-except flags, which are needed for @@ -16,73 +16,139 @@ import os import sys import json import subprocess +from pathlib import Path -def install_chromium_via_puppeteer() -> bool: - """Install Chromium using @puppeteer/browsers.""" +def get_chrome_version(binary_path: str) -> str | None: + """Get Chrome/Chromium version string.""" try: - print("Chromium not found, attempting to install via @puppeteer/browsers...", file=sys.stderr) result = subprocess.run( - ['npx', '@puppeteer/browsers', 'install', 'chromium@latest'], + [binary_path, '--version'], capture_output=True, text=True, - timeout=300 + timeout=5 ) - return result.returncode == 0 - except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e: - print(f"Failed to install Chromium: {e}", file=sys.stderr) + if result.returncode == 0: + return result.stdout.strip() + except Exception: + pass + return None + + +def install_puppeteer_core() -> bool: + """Install puppeteer-core to NODE_MODULES_DIR if not present.""" + node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip() + if not node_modules_dir: + # No isolated node_modules, skip (will use global) + return True + + node_modules_path = Path(node_modules_dir) + if (node_modules_path / 'puppeteer-core').exists(): + return True + + # Get npm prefix from NODE_MODULES_DIR (parent of node_modules) + npm_prefix = node_modules_path.parent + + try: + print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr) + result = subprocess.run( + ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'], + capture_output=True, + text=True, + timeout=60 + ) + if result.returncode == 0: + print(f"[+] puppeteer-core installed", file=sys.stderr) + return True + else: + print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr) + return False + except Exception as e: + print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr) return False -def find_chromium() -> dict | None: - """Find Chromium binary, respecting CHROME_BINARY env var.""" - # Quick check: if CHROME_BINARY is set and exists, skip expensive lookup - configured_binary = os.environ.get('CHROME_BINARY', '').strip() - if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK): - # Binary is already configured and valid - exit immediately - sys.exit(0) +def install_chromium() -> dict | None: + """Install Chromium using @puppeteer/browsers and parse output for binary path. + Output format: "chromium@ " + e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium" + + Note: npx is fast when chromium is already cached - it returns the path without re-downloading. + """ try: - from abx_pkg import Binary, NpmProvider, EnvProvider, BrewProvider, AptProvider + print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr) - # Try to find chromium using abx-pkg - # Prefer chromium over chrome because Chrome 137+ removed --load-extension support - binary = Binary( - name='chromium', - binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()], - overrides={'npm': {'packages': ['@puppeteer/browsers']}} + # Use --path to install to puppeteer's standard cache location + cache_path = os.path.expanduser('~/.cache/puppeteer') + + result = subprocess.run( + ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'], + capture_output=True, + text=True, + stdin=subprocess.DEVNULL, + timeout=300 ) - loaded = binary.load() - if loaded and loaded.abspath: - return { - 'name': 'chromium', - 'abspath': str(loaded.abspath), - 'version': str(loaded.version) if loaded.version else None, - 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, - 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', - } + if result.returncode != 0: + print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr) + return None - # If not found, try to install via @puppeteer/browsers - if install_chromium_via_puppeteer(): - # Try loading again after install - loaded = binary.load() - if loaded and loaded.abspath: - return { - 'name': 'chromium', - 'abspath': str(loaded.abspath), - 'version': str(loaded.version) if loaded.version else None, - 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, - 'binprovider': loaded.binprovider.name if loaded.binprovider else 'npm', - } - except Exception: - pass + # Parse output: "chromium@1563294 /path/to/Chromium" + output = result.stdout.strip() + parts = output.split(' ', 1) + if len(parts) != 2: + print(f"[!] Failed to parse install output: {output}", file=sys.stderr) + return None + + version_str = parts[0] # "chromium@1563294" + binary_path = parts[1].strip() + + if not binary_path or not os.path.exists(binary_path): + print(f"[!] Binary not found at: {binary_path}", file=sys.stderr) + return None + + # Extract version number + version = version_str.split('@')[1] if '@' in version_str else None + + print(f"[+] Chromium installed: {binary_path}", file=sys.stderr) + + return { + 'name': 'chromium', + 'abspath': binary_path, + 'version': version, + 'binprovider': 'puppeteer', + } + + except subprocess.TimeoutExpired: + print("[!] Chromium install timed out", file=sys.stderr) + except FileNotFoundError: + print("[!] npx not found - is Node.js installed?", file=sys.stderr) + except Exception as e: + print(f"[!] Failed to install Chromium: {e}", file=sys.stderr) return None def main(): - result = find_chromium() + # Install puppeteer-core if NODE_MODULES_DIR is set + install_puppeteer_core() + + # Check if CHROME_BINARY is already set and valid + configured_binary = os.environ.get('CHROME_BINARY', '').strip() + if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK): + version = get_chrome_version(configured_binary) + print(json.dumps({ + 'type': 'Binary', + 'name': 'chromium', + 'abspath': configured_binary, + 'version': version, + 'binprovider': 'env', + })) + sys.exit(0) + + # Install/find Chromium via puppeteer + result = install_chromium() if result and result.get('abspath'): print(json.dumps({ @@ -110,7 +176,7 @@ def main(): sys.exit(0) else: - print(f"Chromium binary not found", file=sys.stderr) + print("Chromium binary not found", file=sys.stderr) sys.exit(1) diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index 0d580244..699dad70 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -67,28 +67,29 @@ def get_test_env(): return env -def find_chromium_binary(): - """Find the Chromium binary installed by @puppeteer/browsers.""" - if not CHROMIUM_INSTALL_DIR.exists(): - return None +def find_chromium_binary(data_dir=None): + """Find the Chromium binary using chrome_utils.js findChromium(). - # Look for versioned directories - for version_dir in sorted(CHROMIUM_INSTALL_DIR.iterdir(), reverse=True): - if not version_dir.is_dir(): - continue - # macOS ARM - mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' - if mac_arm.exists(): - return str(mac_arm) - # macOS x64 - mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' - if mac_x64.exists(): - return str(mac_x64) - # Linux - linux = version_dir / 'chrome-linux' / 'chrome' - if linux.exists(): - return str(linux) + This uses the centralized findChromium() function which checks: + - CHROME_BINARY env var + - @puppeteer/browsers install locations (in data_dir/chromium) + - System Chromium locations + - Falls back to Chrome (with warning) + Args: + data_dir: Directory where chromium was installed (contains chromium/ subdir) + """ + chrome_utils = PLUGIN_DIR / 'chrome_utils.js' + # Use provided data_dir, or fall back to env var, or current dir + search_dir = data_dir or os.environ.get('DATA_DIR', '.') + result = subprocess.run( + ['node', str(chrome_utils), 'findChromium', str(search_dir)], + capture_output=True, + text=True, + timeout=10 + ) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() return None diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py index dfc34a90..63fa0f9a 100644 --- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py +++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py @@ -171,31 +171,30 @@ def setup_test_lib_dirs(tmpdir: Path) -> dict: } -def find_chromium_binary(): - """Find the Chromium binary installed by @puppeteer/browsers.""" - chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium' - if not chromium_dir.exists(): - return None +PLUGINS_ROOT = PLUGIN_DIR.parent - for version_dir in sorted(chromium_dir.iterdir(), reverse=True): - if not version_dir.is_dir(): - continue - # macOS ARM - mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' - if mac_arm.exists(): - return str(mac_arm) - # macOS x64 - mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' - if mac_x64.exists(): - return str(mac_x64) - # Linux - linux = version_dir / 'chrome-linux' / 'chrome' - if linux.exists(): - return str(linux) + +def find_chromium_binary(): + """Find the Chromium binary using chrome_utils.js findChromium(). + + This uses the centralized findChromium() function which checks: + - CHROME_BINARY env var + - @puppeteer/browsers install locations + - System Chromium locations + - Falls back to Chrome (with warning) + """ + chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js' + result = subprocess.run( + ['node', str(chrome_utils), 'findChromium'], + capture_output=True, + text=True, + timeout=10 + ) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() return None -PLUGINS_ROOT = PLUGIN_DIR.parent CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' TEST_URL = 'https://www.filmin.es/' diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py index 5780e0b2..dd203d86 100644 --- a/archivebox/plugins/ublock/tests/test_ublock.py +++ b/archivebox/plugins/ublock/tests/test_ublock.py @@ -157,54 +157,94 @@ def test_large_extension_size(): assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes" -def setup_test_lib_dirs(tmpdir: Path) -> dict: - """Get lib directories for tests, using project's existing node_modules. - - Uses the project's node_modules to avoid slow npm install during tests. - """ - # Use project's existing node_modules (puppeteer-core already installed) - project_root = Path(__file__).parent.parent.parent.parent.parent - node_modules_dir = project_root / 'node_modules' - - if not (node_modules_dir / 'puppeteer-core').exists(): - pytest.skip("puppeteer-core not installed in project node_modules") - - return { - 'NODE_MODULES_DIR': str(node_modules_dir), - } - - -def find_chromium_binary(): - """Find the Chromium binary installed by @puppeteer/browsers.""" - chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium' - if not chromium_dir.exists(): - return None - - for version_dir in sorted(chromium_dir.iterdir(), reverse=True): - if not version_dir.is_dir(): - continue - # macOS ARM - mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' - if mac_arm.exists(): - return str(mac_arm) - # macOS x64 - mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' - if mac_x64.exists(): - return str(mac_x64) - # Linux - linux = version_dir / 'chrome-linux' / 'chrome' - if linux.exists(): - return str(linux) - return None - - PLUGINS_ROOT = PLUGIN_DIR.parent +CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py' CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' + +def setup_test_env(tmpdir: Path) -> dict: + """Set up isolated data/lib directory structure for tests. + + Creates structure like: + /data/ + lib/ + arm64-darwin/ (or x86_64-linux, etc.) + npm/ + bin/ + node_modules/ + chrome_extensions/ + + Calls chrome install hook which handles puppeteer-core and chromium installation. + Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. + """ + import platform + + # Determine machine type (matches archivebox.config.paths.get_machine_type()) + machine = platform.machine().lower() + system = platform.system().lower() + if machine in ('arm64', 'aarch64'): + machine = 'arm64' + elif machine in ('x86_64', 'amd64'): + machine = 'x86_64' + machine_type = f"{machine}-{system}" + + # Create proper directory structure + data_dir = tmpdir / 'data' + lib_dir = data_dir / 'lib' / machine_type + npm_dir = lib_dir / 'npm' + npm_bin_dir = npm_dir / 'bin' + node_modules_dir = npm_dir / 'node_modules' + chrome_extensions_dir = data_dir / 'chrome_extensions' + + # Create all directories + node_modules_dir.mkdir(parents=True, exist_ok=True) + npm_bin_dir.mkdir(parents=True, exist_ok=True) + chrome_extensions_dir.mkdir(parents=True, exist_ok=True) + + # Build complete env dict + env = os.environ.copy() + env.update({ + 'DATA_DIR': str(data_dir), + 'LIB_DIR': str(lib_dir), + 'MACHINE_TYPE': machine_type, + 'NPM_BIN_DIR': str(npm_bin_dir), + 'NODE_MODULES_DIR': str(node_modules_dir), + 'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir), + }) + + # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) + result = subprocess.run( + ['python', str(CHROME_INSTALL_HOOK)], + capture_output=True, text=True, timeout=10, env=env + ) + if result.returncode != 0: + pytest.skip(f"Chrome install hook failed: {result.stderr}") + + # Parse JSONL output to get CHROME_BINARY + chrome_binary = None + for line in result.stdout.strip().split('\n'): + if not line.strip(): + continue + try: + data = json.loads(line) + if data.get('type') == 'Binary' and data.get('abspath'): + chrome_binary = data['abspath'] + break + except json.JSONDecodeError: + continue + + if not chrome_binary or not Path(chrome_binary).exists(): + pytest.skip(f"Chromium binary not found: {chrome_binary}") + + env['CHROME_BINARY'] = chrome_binary + return env + + # Test URL: ad blocker test page that shows if ads are blocked TEST_URL = 'https://d3ward.github.io/toolz/adblock.html' +@pytest.mark.timeout(15) def test_extension_loads_in_chromium(): """Verify uBlock extension loads in Chromium by visiting its dashboard page. @@ -214,35 +254,30 @@ def test_extension_loads_in_chromium(): """ import signal import time + print("[test] Starting test_extension_loads_in_chromium", flush=True) with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) + print(f"[test] tmpdir={tmpdir}", flush=True) - # Set up isolated lib directories for this test - lib_env = setup_test_lib_dirs(tmpdir) + # Set up isolated env with proper directory structure + env = setup_test_env(tmpdir) + env.setdefault('CHROME_HEADLESS', 'true') + print(f"[test] DATA_DIR={env.get('DATA_DIR')}", flush=True) + print(f"[test] CHROME_BINARY={env.get('CHROME_BINARY')}", flush=True) - # Set up extensions directory - ext_dir = tmpdir / 'chrome_extensions' - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env.update(lib_env) - env['CHROME_EXTENSIONS_DIR'] = str(ext_dir) - env['CHROME_HEADLESS'] = 'true' - - # Ensure CHROME_BINARY points to Chromium - chromium = find_chromium_binary() - if chromium: - env['CHROME_BINARY'] = chromium + ext_dir = Path(env['CHROME_EXTENSIONS_DIR']) # Step 1: Install the uBlock extension + print("[test] Installing uBlock extension...", flush=True) result = subprocess.run( ['node', str(INSTALL_SCRIPT)], capture_output=True, text=True, env=env, - timeout=15 + timeout=5 ) + print(f"[test] Extension install rc={result.returncode}", flush=True) assert result.returncode == 0, f"Extension install failed: {result.stderr}" # Verify extension cache was created @@ -252,7 +287,8 @@ def test_extension_loads_in_chromium(): print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) - crawl_dir = tmpdir / 'crawl' + data_dir = Path(env['DATA_DIR']) + crawl_dir = data_dir / 'crawl' crawl_dir.mkdir() chrome_dir = crawl_dir / 'chrome' @@ -422,22 +458,11 @@ def test_blocks_ads_on_test_page(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Set up isolated lib directories for this test - lib_env = setup_test_lib_dirs(tmpdir) - - # Set up extensions directory - ext_dir = tmpdir / 'chrome_extensions' - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env.update(lib_env) - env['CHROME_EXTENSIONS_DIR'] = str(ext_dir) + # Set up isolated env with proper directory structure + env = setup_test_env(tmpdir) env['CHROME_HEADLESS'] = 'true' - # Ensure CHROME_BINARY points to Chromium - chromium = find_chromium_binary() - if chromium: - env['CHROME_BINARY'] = chromium + ext_dir = Path(env['CHROME_EXTENSIONS_DIR']) # Step 1: Install the uBlock extension result = subprocess.run( @@ -455,8 +480,9 @@ def test_blocks_ads_on_test_page(): ext_data = json.loads(cache_file.read_text()) print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") - # Step 2: Launch Chrome using the chrome hook (loads extensions automatically) - crawl_dir = tmpdir / 'crawl' + # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) + data_dir = Path(env['DATA_DIR']) + crawl_dir = data_dir / 'crawl' crawl_dir.mkdir() chrome_dir = crawl_dir / 'chrome'