messing with chrome install process to reuse cached chromium with pinned version

This commit is contained in:
Nick Sweeting
2025-12-29 18:49:36 -08:00
parent b670612685
commit 7e6e3be9e7
5 changed files with 448 additions and 219 deletions

View File

@@ -433,6 +433,103 @@ async function killChrome(pid, outputDir = null) {
console.error('[*] Chrome process killed'); console.error('[*] Chrome process killed');
} }
/**
 * Install Chromium using the @puppeteer/browsers programmatic API.
 *
 * Resolution order:
 *   1. If CHROME_BINARY is set and the file exists it is returned as-is
 *      (nothing is downloaded and `version` is reported as null).
 *   2. Otherwise @puppeteer/browsers is loaded (NODE_MODULES_DIR is added to
 *      the module search path first, when set) and Chromium is installed
 *      into the cache directory.
 *
 * The programmatic `install()` call needs an explicit `cacheDir` and a
 * concrete build id, so the 'latest' tag is resolved before installing.
 *
 * @param {Object} options - Install options
 * @param {string} [options.buildId='latest'] - Chromium build id (revision), or the tag 'latest'
 * @param {string} [options.cacheDir] - Download cache directory (default: ~/.cache/puppeteer)
 * @returns {Promise<Object>} - {success, binary, version, error}
 */
async function installChromium(options = {}) {
    const { buildId = 'latest', cacheDir = null } = options;

    // An explicitly configured binary that exists wins; nothing gets installed.
    const configuredBinary = getEnv('CHROME_BINARY');
    if (configuredBinary && fs.existsSync(configuredBinary)) {
        console.error(`[+] Using configured CHROME_BINARY: ${configuredBinary}`);
        return { success: true, binary: configuredBinary, version: null };
    }

    // Try to load @puppeteer/browsers from NODE_MODULES_DIR or system
    let puppeteerBrowsers;
    try {
        const nodeModulesDir = process.env.NODE_MODULES_DIR;
        // Only register the directory once, repeated calls must not grow module.paths.
        if (nodeModulesDir && !module.paths.includes(nodeModulesDir)) {
            module.paths.unshift(nodeModulesDir);
        }
        puppeteerBrowsers = require('@puppeteer/browsers');
    } catch (e) {
        console.error(`[!] @puppeteer/browsers not found. Install it first with installPuppeteerCore.`);
        return { success: false, error: '@puppeteer/browsers not installed' };
    }

    console.error(`[*] Installing Chromium via @puppeteer/browsers...`);
    try {
        const platform = puppeteerBrowsers.detectBrowserPlatform();
        if (!platform) {
            throw new Error(`Unsupported platform: ${process.platform}-${process.arch}`);
        }

        // 'latest' is only a tag; install() needs the concrete build id behind it.
        const resolvedBuildId = buildId === 'latest'
            ? await puppeteerBrowsers.resolveBuildId('chromium', platform, 'latest')
            : String(buildId);

        // Same location the CLI-based install hook and findChromium() look in.
        const resolvedCacheDir = cacheDir
            || path.join(require('os').homedir(), '.cache', 'puppeteer');

        const result = await puppeteerBrowsers.install({
            browser: 'chromium',
            buildId: resolvedBuildId,
            cacheDir: resolvedCacheDir,
            platform,
        });

        const binary = result.executablePath;
        const version = result.buildId;
        if (!binary || !fs.existsSync(binary)) {
            console.error(`[!] Chromium binary not found at: ${binary}`);
            return { success: false, error: `Chromium binary not found at: ${binary}` };
        }

        console.error(`[+] Chromium installed: ${binary}`);
        return { success: true, binary, version };
    } catch (e) {
        console.error(`[!] Failed to install Chromium: ${e.message}`);
        return { success: false, error: e.message };
    }
}
/**
 * Install the puppeteer-core npm package.
 *
 * If `<npmPrefix>/node_modules/puppeteer-core` already exists nothing is run;
 * otherwise `npm install --prefix <npmPrefix> puppeteer-core` is executed.
 *
 * @param {Object} options - Install options
 * @param {string} [options.npmPrefix] - npm prefix directory (default: <LIB_DIR>/npm, falling back to <DATA_DIR>/npm, then ./npm)
 * @param {number} [options.timeout=60000] - Timeout in milliseconds
 * @returns {Promise<Object>} - {success, path, error}
 */
async function installPuppeteerCore(options = {}) {
    const defaultPrefix = path.join(getEnv('LIB_DIR', getEnv('DATA_DIR', '.')), 'npm');
    const {
        npmPrefix = defaultPrefix,
        timeout = 60000,
    } = options;

    const nodeModulesDir = path.join(npmPrefix, 'node_modules');
    const puppeteerPath = path.join(nodeModulesDir, 'puppeteer-core');

    // Check if already installed
    if (fs.existsSync(puppeteerPath)) {
        console.error(`[+] puppeteer-core already installed: ${puppeteerPath}`);
        return { success: true, path: puppeteerPath };
    }

    console.error(`[*] Installing puppeteer-core to ${npmPrefix}...`);

    try {
        // Inside the try so a directory-creation failure is reported via
        // {success: false, error} like every other failure of this function.
        fs.mkdirSync(npmPrefix, { recursive: true });

        // Pass arguments as an array: the prefix is never parsed by a shell,
        // so quotes, spaces or `$` in the path cannot break or inject commands.
        const { execFileSync } = require('child_process');
        execFileSync(
            'npm',
            ['install', '--prefix', npmPrefix, 'puppeteer-core'],
            { encoding: 'utf8', timeout, stdio: ['pipe', 'pipe', 'pipe'] }
        );

        console.error(`[+] puppeteer-core installed successfully`);
        return { success: true, path: puppeteerPath };
    } catch (e) {
        console.error(`[!] Failed to install puppeteer-core: ${e.message}`);
        return { success: false, error: e.message };
    }
}
// Try to import unzipper, fallback to system unzip if not available // Try to import unzipper, fallback to system unzip if not available
let unzip = null; let unzip = null;
try { try {
@@ -932,78 +1029,88 @@ function getExtensionTargets(browser) {
/** /**
* Find Chromium/Chrome binary path. * Find Chromium/Chrome binary path.
* Prefers Chromium over Chrome because Chrome 137+ removed --load-extension support. * Checks CHROME_BINARY env var first, then falls back to system locations.
* *
* @param {string} [dataDir] - Data directory to check for puppeteer installs
* @returns {string|null} - Absolute path to browser binary or null if not found * @returns {string|null} - Absolute path to browser binary or null if not found
*/ */
function findChromium(dataDir = null) { function findChromium() {
// Check CHROME_BINARY env var first const { execSync } = require('child_process');
const chromeBinary = (process.env.CHROME_BINARY || '').trim();
if (chromeBinary && fs.existsSync(chromeBinary)) { // Helper to validate a binary by running --version
// Ensure absolute path const validateBinary = (binaryPath) => {
return path.resolve(chromeBinary); if (!binaryPath || !fs.existsSync(binaryPath)) return false;
try {
execSync(`"${binaryPath}" --version`, { encoding: 'utf8', timeout: 5000, stdio: 'pipe' });
return true;
} catch (e) {
return false;
}
};
// 1. Check CHROME_BINARY env var first
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary) {
const absPath = path.resolve(chromeBinary);
if (validateBinary(absPath)) {
return absPath;
}
console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`);
}
// 2. Warn that no CHROME_BINARY is configured, searching fallbacks
if (!chromeBinary) {
console.error('[!] Warning: CHROME_BINARY not set, searching system locations...');
} }
// Helper to find Chromium in @puppeteer/browsers directory structure // Helper to find Chromium in @puppeteer/browsers directory structure
// Always returns absolute paths
const findInPuppeteerDir = (baseDir) => { const findInPuppeteerDir = (baseDir) => {
const absBaseDir = path.resolve(baseDir); if (!fs.existsSync(baseDir)) return null;
if (!fs.existsSync(absBaseDir)) return null;
try { try {
const versions = fs.readdirSync(absBaseDir); const versions = fs.readdirSync(baseDir);
for (const version of versions.sort().reverse()) { for (const version of versions.sort().reverse()) {
const versionDir = path.join(absBaseDir, version); const versionDir = path.join(baseDir, version);
// Check for macOS ARM structure const candidates = [
const macArmBinary = path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'); path.join(versionDir, 'chrome-mac-arm64/Chromium.app/Contents/MacOS/Chromium'),
if (fs.existsSync(macArmBinary)) return macArmBinary; path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
// Check for macOS x64 structure path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'),
const macX64Binary = path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'); path.join(versionDir, 'chrome-linux64/chrome'),
if (fs.existsSync(macX64Binary)) return macX64Binary; path.join(versionDir, 'chrome-linux/chrome'),
// Check for Linux structure ];
const linuxBinary = path.join(versionDir, 'chrome-linux/chrome'); for (const c of candidates) {
if (fs.existsSync(linuxBinary)) return linuxBinary; if (fs.existsSync(c)) return c;
}
} }
} catch (e) { } catch (e) {}
// Continue
}
return null; return null;
}; };
// Check @puppeteer/browsers install locations // 3. Search fallback locations (Chromium first, then Chrome)
const puppeteerDirs = [ const fallbackLocations = [
// Local project install (from npx @puppeteer/browsers install) // System Chromium
path.join(dataDir || process.env.DATA_DIR || '.', 'chromium'), '/Applications/Chromium.app/Contents/MacOS/Chromium',
path.join(process.cwd(), 'chromium'),
// User cache locations
path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
];
for (const puppeteerDir of puppeteerDirs) {
const binary = findInPuppeteerDir(puppeteerDir);
if (binary) return binary;
}
// Check standard system locations
const candidates = [
// Linux Chromium
'/usr/bin/chromium', '/usr/bin/chromium',
'/usr/bin/chromium-browser', '/usr/bin/chromium-browser',
// macOS Chromium (Homebrew or manual install) // Puppeteer cache
'/Applications/Chromium.app/Contents/MacOS/Chromium', path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
// Fallback to Chrome (extension loading may not work in Chrome 137+) path.join(process.env.HOME || '', '.cache/puppeteer'),
// Chrome (fallback - extensions may not work in 137+)
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/usr/bin/google-chrome', '/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-stable',
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
]; ];
for (const candidate of candidates) { for (const loc of fallbackLocations) {
if (fs.existsSync(candidate)) { // Check if it's a puppeteer cache dir
// Warn if falling back to Chrome if (loc.includes('.cache/puppeteer')) {
if (candidate.includes('google-chrome') || candidate.includes('Google Chrome')) { const binary = findInPuppeteerDir(loc);
if (binary && validateBinary(binary)) {
return binary;
}
} else if (validateBinary(loc)) {
if (loc.includes('Google Chrome') || loc.includes('google-chrome')) {
console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+'); console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
} }
return candidate; return loc;
} }
} }
@@ -1028,6 +1135,9 @@ module.exports = {
// Chrome launching // Chrome launching
launchChromium, launchChromium,
killChrome, killChrome,
// Chrome/Chromium install
installChromium,
installPuppeteerCore,
// Chrome/Chromium binary finding // Chrome/Chromium binary finding
findChromium, findChromium,
// Extension utilities // Extension utilities
@@ -1055,7 +1165,9 @@ if (require.main === module) {
console.log('Usage: chrome_utils.js <command> [args...]'); console.log('Usage: chrome_utils.js <command> [args...]');
console.log(''); console.log('');
console.log('Commands:'); console.log('Commands:');
console.log(' findChromium [data_dir]'); console.log(' findChromium');
console.log(' installChromium');
console.log(' installPuppeteerCore [npm_prefix]');
console.log(' launchChromium [output_dir] [extension_paths_json]'); console.log(' launchChromium [output_dir] [extension_paths_json]');
console.log(' killChrome <pid> [output_dir]'); console.log(' killChrome <pid> [output_dir]');
console.log(' killZombieChrome [data_dir]'); console.log(' killZombieChrome [data_dir]');
@@ -1072,8 +1184,7 @@ if (require.main === module) {
try { try {
switch (command) { switch (command) {
case 'findChromium': { case 'findChromium': {
const [dataDir] = commandArgs; const binary = findChromium();
const binary = findChromium(dataDir);
if (binary) { if (binary) {
console.log(binary); console.log(binary);
} else { } else {
@@ -1083,6 +1194,32 @@ if (require.main === module) {
break; break;
} }
case 'installChromium': {
const result = await installChromium();
if (result.success) {
console.log(JSON.stringify({
binary: result.binary,
version: result.version,
}));
} else {
console.error(result.error);
process.exit(1);
}
break;
}
case 'installPuppeteerCore': {
const [npmPrefix] = commandArgs;
const result = await installPuppeteerCore({ npmPrefix: npmPrefix || undefined });
if (result.success) {
console.log(JSON.stringify({ path: result.path }));
} else {
console.error(result.error);
process.exit(1);
}
break;
}
case 'launchChromium': { case 'launchChromium': {
const [outputDir, extensionPathsJson] = commandArgs; const [outputDir, extensionPathsJson] = commandArgs;
const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : []; const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : [];

View File

@@ -1,11 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Install hook for Chrome/Chromium binary. Install hook for Chrome/Chromium and puppeteer-core.
Runs at crawl start to verify Chromium is available. Runs at crawl start to install/find Chromium and puppeteer-core.
Outputs JSONL for Binary and Machine config updates. Outputs JSONL for Binary and Machine config updates.
Respects CHROME_BINARY env var for custom binary paths. Respects CHROME_BINARY env var for custom binary paths.
Falls back to `npx @puppeteer/browsers install chromium@latest` if not found. Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for --load-extension and --disable-extensions-except flags, which are needed for
@@ -16,73 +16,139 @@ import os
import sys import sys
import json import json
import subprocess import subprocess
from pathlib import Path
def install_chromium_via_puppeteer() -> bool: def get_chrome_version(binary_path: str) -> str | None:
"""Install Chromium using @puppeteer/browsers.""" """Get Chrome/Chromium version string."""
try: try:
print("Chromium not found, attempting to install via @puppeteer/browsers...", file=sys.stderr)
result = subprocess.run( result = subprocess.run(
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'], [binary_path, '--version'],
capture_output=True, capture_output=True,
text=True, text=True,
timeout=300 timeout=5
) )
return result.returncode == 0 if result.returncode == 0:
except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e: return result.stdout.strip()
print(f"Failed to install Chromium: {e}", file=sys.stderr) except Exception:
pass
return None
def install_puppeteer_core() -> bool:
    """Install puppeteer-core into NODE_MODULES_DIR if it is not already there.

    Returns:
        True when NODE_MODULES_DIR is unset (nothing to do), when
        ``<NODE_MODULES_DIR>/puppeteer-core`` already exists, or when
        ``npm install`` exits with status 0. False when npm fails, times out
        (60s), or cannot be started.
    """
    node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
    if not node_modules_dir:
        # No isolated node_modules, skip (will use global)
        return True

    node_modules_path = Path(node_modules_dir)
    if (node_modules_path / 'puppeteer-core').exists():
        return True

    # Get npm prefix from NODE_MODULES_DIR (parent of node_modules)
    npm_prefix = node_modules_path.parent

    try:
        print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
        result = subprocess.run(
            ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
            capture_output=True,
            text=True,
            # Detach stdin so npm can never block waiting for input.
            stdin=subprocess.DEVNULL,
            timeout=60
        )
    except Exception as e:
        # Best-effort: a missing npm or a timeout is reported, not raised.
        print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
        return False

    if result.returncode == 0:
        print("[+] puppeteer-core installed", file=sys.stderr)
        return True

    print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr)
    return False
def find_chromium() -> dict | None: def install_chromium() -> dict | None:
"""Find Chromium binary, respecting CHROME_BINARY env var.""" """Install Chromium using @puppeteer/browsers and parse output for binary path.
# Quick check: if CHROME_BINARY is set and exists, skip expensive lookup
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
# Binary is already configured and valid - exit immediately
sys.exit(0)
Output format: "chromium@<version> <path_to_binary>"
e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"
Note: npx is fast when chromium is already cached - it returns the path without re-downloading.
"""
try: try:
from abx_pkg import Binary, NpmProvider, EnvProvider, BrewProvider, AptProvider print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
# Try to find chromium using abx-pkg # Use --path to install to puppeteer's standard cache location
# Prefer chromium over chrome because Chrome 137+ removed --load-extension support cache_path = os.path.expanduser('~/.cache/puppeteer')
binary = Binary(
name='chromium', result = subprocess.run(
binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()], ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
overrides={'npm': {'packages': ['@puppeteer/browsers']}} capture_output=True,
text=True,
stdin=subprocess.DEVNULL,
timeout=300
) )
loaded = binary.load() if result.returncode != 0:
if loaded and loaded.abspath: print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
return { return None
'name': 'chromium',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
# If not found, try to install via @puppeteer/browsers # Parse output: "chromium@1563294 /path/to/Chromium"
if install_chromium_via_puppeteer(): output = result.stdout.strip()
# Try loading again after install parts = output.split(' ', 1)
loaded = binary.load() if len(parts) != 2:
if loaded and loaded.abspath: print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
return { return None
'name': 'chromium',
'abspath': str(loaded.abspath), version_str = parts[0] # "chromium@1563294"
'version': str(loaded.version) if loaded.version else None, binary_path = parts[1].strip()
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'npm', if not binary_path or not os.path.exists(binary_path):
} print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
except Exception: return None
pass
# Extract version number
version = version_str.split('@')[1] if '@' in version_str else None
print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
return {
'name': 'chromium',
'abspath': binary_path,
'version': version,
'binprovider': 'puppeteer',
}
except subprocess.TimeoutExpired:
print("[!] Chromium install timed out", file=sys.stderr)
except FileNotFoundError:
print("[!] npx not found - is Node.js installed?", file=sys.stderr)
except Exception as e:
print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
return None return None
def main(): def main():
result = find_chromium() # Install puppeteer-core if NODE_MODULES_DIR is set
install_puppeteer_core()
# Check if CHROME_BINARY is already set and valid
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
version = get_chrome_version(configured_binary)
print(json.dumps({
'type': 'Binary',
'name': 'chromium',
'abspath': configured_binary,
'version': version,
'binprovider': 'env',
}))
sys.exit(0)
# Install/find Chromium via puppeteer
result = install_chromium()
if result and result.get('abspath'): if result and result.get('abspath'):
print(json.dumps({ print(json.dumps({
@@ -110,7 +176,7 @@ def main():
sys.exit(0) sys.exit(0)
else: else:
print(f"Chromium binary not found", file=sys.stderr) print("Chromium binary not found", file=sys.stderr)
sys.exit(1) sys.exit(1)

View File

@@ -67,28 +67,29 @@ def get_test_env():
return env return env
def find_chromium_binary(): def find_chromium_binary(data_dir=None):
"""Find the Chromium binary installed by @puppeteer/browsers.""" """Find the Chromium binary using chrome_utils.js findChromium().
if not CHROMIUM_INSTALL_DIR.exists():
return None
# Look for versioned directories This uses the centralized findChromium() function which checks:
for version_dir in sorted(CHROMIUM_INSTALL_DIR.iterdir(), reverse=True): - CHROME_BINARY env var
if not version_dir.is_dir(): - @puppeteer/browsers install locations (in data_dir/chromium)
continue - System Chromium locations
# macOS ARM - Falls back to Chrome (with warning)
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
if mac_arm.exists():
return str(mac_arm)
# macOS x64
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
if mac_x64.exists():
return str(mac_x64)
# Linux
linux = version_dir / 'chrome-linux' / 'chrome'
if linux.exists():
return str(linux)
Args:
data_dir: Directory where chromium was installed (contains chromium/ subdir)
"""
chrome_utils = PLUGIN_DIR / 'chrome_utils.js'
# Use provided data_dir, or fall back to env var, or current dir
search_dir = data_dir or os.environ.get('DATA_DIR', '.')
result = subprocess.run(
['node', str(chrome_utils), 'findChromium', str(search_dir)],
capture_output=True,
text=True,
timeout=10
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return None return None

View File

@@ -171,31 +171,30 @@ def setup_test_lib_dirs(tmpdir: Path) -> dict:
} }
def find_chromium_binary(): PLUGINS_ROOT = PLUGIN_DIR.parent
"""Find the Chromium binary installed by @puppeteer/browsers."""
chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
if not chromium_dir.exists():
return None
for version_dir in sorted(chromium_dir.iterdir(), reverse=True):
if not version_dir.is_dir(): def find_chromium_binary():
continue """Find the Chromium binary using chrome_utils.js findChromium().
# macOS ARM
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' This uses the centralized findChromium() function which checks:
if mac_arm.exists(): - CHROME_BINARY env var
return str(mac_arm) - @puppeteer/browsers install locations
# macOS x64 - System Chromium locations
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium' - Falls back to Chrome (with warning)
if mac_x64.exists(): """
return str(mac_x64) chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js'
# Linux result = subprocess.run(
linux = version_dir / 'chrome-linux' / 'chrome' ['node', str(chrome_utils), 'findChromium'],
if linux.exists(): capture_output=True,
return str(linux) text=True,
timeout=10
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return None return None
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
TEST_URL = 'https://www.filmin.es/' TEST_URL = 'https://www.filmin.es/'

View File

@@ -157,54 +157,94 @@ def test_large_extension_size():
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes" assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
def setup_test_lib_dirs(tmpdir: Path) -> dict:
    """Return the lib-dir env vars for tests, pointing at an existing node_modules.

    The directory five levels above this test file is treated as the project
    root and its ``node_modules`` is reused, so tests do not have to run a
    slow ``npm install``. ``tmpdir`` is accepted but not used here.

    The test is skipped when ``puppeteer-core`` is missing from that
    ``node_modules`` directory.
    """
    # Climb five parents from this file to reach the (presumed) project root.
    project_root = Path(__file__)
    for _ in range(5):
        project_root = project_root.parent

    node_modules_dir = project_root / 'node_modules'
    puppeteer_core_dir = node_modules_dir / 'puppeteer-core'

    if not puppeteer_core_dir.exists():
        pytest.skip("puppeteer-core not installed in project node_modules")

    lib_env = {}
    lib_env['NODE_MODULES_DIR'] = str(node_modules_dir)
    return lib_env
def find_chromium_binary():
    """Find the Chromium binary installed by @puppeteer/browsers.

    Searches ``$DATA_DIR/chromium`` (DATA_DIR defaults to the current
    directory). Each sub-directory is treated as one installed version; they
    are visited in reverse-sorted name order and the first known binary
    layout that exists is returned.

    Returns:
        Absolute path of the binary as a string, or None if nothing is found.
    """
    chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
    # is_dir() rather than exists(): a stray *file* named "chromium" must not
    # make iterdir() raise.
    if not chromium_dir.is_dir():
        return None

    # Same layouts, in the same order, as findInPuppeteerDir() in chrome_utils.js.
    mac_app = Path('Chromium.app') / 'Contents' / 'MacOS' / 'Chromium'
    candidates = (
        Path('chrome-mac-arm64') / mac_app,  # macOS ARM (newer layout)
        Path('chrome-mac') / mac_app,        # macOS ARM
        Path('chrome-mac-x64') / mac_app,    # macOS x64
        Path('chrome-linux64') / 'chrome',   # Linux (newer layout)
        Path('chrome-linux') / 'chrome',     # Linux
    )

    for version_dir in sorted(chromium_dir.iterdir(), reverse=True):
        if not version_dir.is_dir():
            continue
        for relative in candidates:
            binary = version_dir / relative
            if binary.exists():
                return str(binary)

    return None
PLUGINS_ROOT = PLUGIN_DIR.parent PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
def setup_test_env(tmpdir: Path, install_timeout: float = 10) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure like:
        <tmpdir>/data/
            lib/
                arm64-darwin/   (or x86_64-linux, etc.)
                    npm/
                        bin/
                        node_modules/
            chrome_extensions/

    Then runs the chrome install hook with that environment and reads the
    Chromium path from the hook's JSONL output.

    Args:
        tmpdir: Scratch directory that will contain the ``data/`` tree.
        install_timeout: Seconds to wait for the install hook before skipping.

    Returns:
        A copy of ``os.environ`` extended with DATA_DIR, LIB_DIR, MACHINE_TYPE,
        NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_EXTENSIONS_DIR and CHROME_BINARY.

    The calling test is skipped (via ``pytest.skip``) when the hook times out,
    exits non-zero, or does not report an existing Chromium binary.
    """
    import platform
    import sys

    # Determine machine type (intended to match archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"

    # Create proper directory structure
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / 'bin'
    node_modules_dir = npm_dir / 'node_modules'
    chrome_extensions_dir = data_dir / 'chrome_extensions'

    # Create all directories
    node_modules_dir.mkdir(parents=True, exist_ok=True)
    npm_bin_dir.mkdir(parents=True, exist_ok=True)
    chrome_extensions_dir.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
    })

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # sys.executable keeps the hook on the same interpreter as the test run; a
    # bare 'python' may be missing from PATH or belong to another environment.
    try:
        result = subprocess.run(
            [sys.executable, str(CHROME_INSTALL_HOOK)],
            capture_output=True, text=True, timeout=install_timeout, env=env
        )
    except subprocess.TimeoutExpired:
        # A slow download is an environment problem, not a test failure.
        pytest.skip(f"Chrome install hook timed out after {install_timeout}s")
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY (first Binary record with a path wins)
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            # Non-JSON lines in the hook's stdout are ignored.
            continue
        if data.get('type') == 'Binary' and data.get('abspath'):
            chrome_binary = data['abspath']
            break

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
# Test URL: ad blocker test page that shows if ads are blocked # Test URL: ad blocker test page that shows if ads are blocked
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html' TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
@pytest.mark.timeout(15)
def test_extension_loads_in_chromium(): def test_extension_loads_in_chromium():
"""Verify uBlock extension loads in Chromium by visiting its dashboard page. """Verify uBlock extension loads in Chromium by visiting its dashboard page.
@@ -214,35 +254,30 @@ def test_extension_loads_in_chromium():
""" """
import signal import signal
import time import time
print("[test] Starting test_extension_loads_in_chromium", flush=True)
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir) tmpdir = Path(tmpdir)
print(f"[test] tmpdir={tmpdir}", flush=True)
# Set up isolated lib directories for this test # Set up isolated env with proper directory structure
lib_env = setup_test_lib_dirs(tmpdir) env = setup_test_env(tmpdir)
env.setdefault('CHROME_HEADLESS', 'true')
print(f"[test] DATA_DIR={env.get('DATA_DIR')}", flush=True)
print(f"[test] CHROME_BINARY={env.get('CHROME_BINARY')}", flush=True)
# Set up extensions directory ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
# Step 1: Install the uBlock extension # Step 1: Install the uBlock extension
print("[test] Installing uBlock extension...", flush=True)
result = subprocess.run( result = subprocess.run(
['node', str(INSTALL_SCRIPT)], ['node', str(INSTALL_SCRIPT)],
capture_output=True, capture_output=True,
text=True, text=True,
env=env, env=env,
timeout=15 timeout=5
) )
print(f"[test] Extension install rc={result.returncode}", flush=True)
assert result.returncode == 0, f"Extension install failed: {result.stderr}" assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created # Verify extension cache was created
@@ -252,7 +287,8 @@ def test_extension_loads_in_chromium():
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically) # Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl' data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir() crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome' chrome_dir = crawl_dir / 'chrome'
@@ -422,22 +458,11 @@ def test_blocks_ads_on_test_page():
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir) tmpdir = Path(tmpdir)
# Set up isolated lib directories for this test # Set up isolated env with proper directory structure
lib_env = setup_test_lib_dirs(tmpdir) env = setup_test_env(tmpdir)
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true' env['CHROME_HEADLESS'] = 'true'
# Ensure CHROME_BINARY points to Chromium ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
# Step 1: Install the uBlock extension # Step 1: Install the uBlock extension
result = subprocess.run( result = subprocess.run(
@@ -455,8 +480,9 @@ def test_blocks_ads_on_test_page():
ext_data = json.loads(cache_file.read_text()) ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chrome using the chrome hook (loads extensions automatically) # Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl' data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir() crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome' chrome_dir = crawl_dir / 'chrome'