messing with chrome install process to reuse cached chromium with pinned version

This commit is contained in:
Nick Sweeting
2025-12-29 18:49:36 -08:00
parent b670612685
commit 7e6e3be9e7
5 changed files with 448 additions and 219 deletions

View File

@@ -433,6 +433,103 @@ async function killChrome(pid, outputDir = null) {
console.error('[*] Chrome process killed');
}
/**
 * Install Chromium using the @puppeteer/browsers programmatic API.
 *
 * Returns immediately when CHROME_BINARY already points at an existing file.
 * Otherwise resolves the requested build, then installs it into the cache
 * directory and returns the path of the installed binary.
 *
 * @param {Object} [options] - Install options
 * @param {string} [options.buildId='latest'] - Chromium build id, or a tag such as 'latest'
 * @param {string} [options.cacheDir] - Browser cache directory (default: ~/.cache/puppeteer)
 * @returns {Promise<Object>} - {success, binary, version, error}
 */
async function installChromium(options = {}) {
    // Honor an explicitly configured binary before doing any install work
    const configuredBinary = getEnv('CHROME_BINARY');
    if (configuredBinary && fs.existsSync(configuredBinary)) {
        console.error(`[+] Using configured CHROME_BINARY: ${configuredBinary}`);
        return { success: true, binary: configuredBinary, version: null };
    }

    const {
        buildId = 'latest',
        // install() requires an explicit cacheDir; default to the same
        // ~/.cache/puppeteer location the rest of the tooling searches
        cacheDir = path.join(require('os').homedir(), '.cache', 'puppeteer'),
    } = options;

    // Try to load @puppeteer/browsers from NODE_MODULES_DIR or system
    let puppeteerBrowsers;
    try {
        const nodeModulesDir = process.env.NODE_MODULES_DIR;
        // Guard against pushing the same directory again on repeated calls
        if (nodeModulesDir && !module.paths.includes(nodeModulesDir)) {
            module.paths.unshift(nodeModulesDir);
        }
        puppeteerBrowsers = require('@puppeteer/browsers');
    } catch (e) {
        console.error(`[!] @puppeteer/browsers not found. Install it first with installPuppeteerCore.`);
        return { success: false, error: '@puppeteer/browsers not installed' };
    }

    console.error(`[*] Installing Chromium via @puppeteer/browsers...`);
    try {
        const platform = puppeteerBrowsers.detectBrowserPlatform();
        if (!platform) {
            throw new Error(`Unsupported platform: ${process.platform}-${process.arch}`);
        }

        // Tags such as 'latest' are not valid build ids for install();
        // they have to be resolved to a concrete build id first.
        const resolvedBuildId = await puppeteerBrowsers.resolveBuildId('chromium', platform, buildId);

        const result = await puppeteerBrowsers.install({
            browser: 'chromium',
            buildId: resolvedBuildId,
            platform,
            cacheDir,
        });

        const binary = result.executablePath;
        const version = result.buildId;
        if (!binary || !fs.existsSync(binary)) {
            console.error(`[!] Chromium binary not found at: ${binary}`);
            return { success: false, error: `Chromium binary not found at: ${binary}` };
        }

        console.error(`[+] Chromium installed: ${binary}`);
        return { success: true, binary, version };
    } catch (e) {
        console.error(`[!] Failed to install Chromium: ${e.message}`);
        return { success: false, error: e.message };
    }
}
/**
 * Install the puppeteer-core and @puppeteer/browsers npm packages.
 *
 * Both packages are installed together because installChromium() loads
 * @puppeteer/browsers and directs callers here when it is missing.
 *
 * @param {Object} options - Install options
 * @param {string} [options.npmPrefix] - npm prefix directory (default: <LIB_DIR or DATA_DIR or .>/npm)
 * @param {number} [options.timeout=60000] - Timeout in milliseconds
 * @returns {Promise<Object>} - {success, path, error}
 */
async function installPuppeteerCore(options = {}) {
    const defaultPrefix = path.join(getEnv('LIB_DIR', getEnv('DATA_DIR', '.')), 'npm');
    const {
        npmPrefix = defaultPrefix,
        timeout = 60000,
    } = options;

    const nodeModulesDir = path.join(npmPrefix, 'node_modules');
    const puppeteerPath = path.join(nodeModulesDir, 'puppeteer-core');
    const browsersPath = path.join(nodeModulesDir, '@puppeteer', 'browsers');

    // Only skip the install when every required package is present
    if (fs.existsSync(puppeteerPath) && fs.existsSync(browsersPath)) {
        console.error(`[+] puppeteer-core already installed: ${puppeteerPath}`);
        return { success: true, path: puppeteerPath };
    }

    console.error(`[*] Installing puppeteer-core to ${npmPrefix}...`);

    try {
        // Inside the try so a filesystem failure is reported through the
        // {success: false, error} contract instead of rejecting the promise
        fs.mkdirSync(npmPrefix, { recursive: true });

        // Argument array (no shell) so a prefix containing quotes or other
        // shell metacharacters cannot break or alter the command
        const { execFileSync } = require('child_process');
        execFileSync(
            'npm',
            ['install', '--prefix', npmPrefix, 'puppeteer-core', '@puppeteer/browsers'],
            { encoding: 'utf8', timeout, stdio: ['pipe', 'pipe', 'pipe'] }
        );

        console.error(`[+] puppeteer-core installed successfully`);
        return { success: true, path: puppeteerPath };
    } catch (e) {
        console.error(`[!] Failed to install puppeteer-core: ${e.message}`);
        return { success: false, error: e.message };
    }
}
// Try to import unzipper, fallback to system unzip if not available
let unzip = null;
try {
@@ -932,78 +1029,88 @@ function getExtensionTargets(browser) {
/**
* Find Chromium/Chrome binary path.
* Prefers Chromium over Chrome because Chrome 137+ removed --load-extension support.
* Checks CHROME_BINARY env var first, then falls back to system locations.
*
* @param {string} [dataDir] - Data directory to check for puppeteer installs
* @returns {string|null} - Absolute path to browser binary or null if not found
*/
function findChromium(dataDir = null) {
// Check CHROME_BINARY env var first
const chromeBinary = (process.env.CHROME_BINARY || '').trim();
if (chromeBinary && fs.existsSync(chromeBinary)) {
// Ensure absolute path
return path.resolve(chromeBinary);
function findChromium() {
const { execSync } = require('child_process');
// Helper to validate a binary by running --version
const validateBinary = (binaryPath) => {
if (!binaryPath || !fs.existsSync(binaryPath)) return false;
try {
execSync(`"${binaryPath}" --version`, { encoding: 'utf8', timeout: 5000, stdio: 'pipe' });
return true;
} catch (e) {
return false;
}
};
// 1. Check CHROME_BINARY env var first
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary) {
const absPath = path.resolve(chromeBinary);
if (validateBinary(absPath)) {
return absPath;
}
console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`);
}
// 2. Warn that no CHROME_BINARY is configured, searching fallbacks
if (!chromeBinary) {
console.error('[!] Warning: CHROME_BINARY not set, searching system locations...');
}
// Helper to find Chromium in @puppeteer/browsers directory structure
// Always returns absolute paths
const findInPuppeteerDir = (baseDir) => {
const absBaseDir = path.resolve(baseDir);
if (!fs.existsSync(absBaseDir)) return null;
if (!fs.existsSync(baseDir)) return null;
try {
const versions = fs.readdirSync(absBaseDir);
const versions = fs.readdirSync(baseDir);
for (const version of versions.sort().reverse()) {
const versionDir = path.join(absBaseDir, version);
// Check for macOS ARM structure
const macArmBinary = path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium');
if (fs.existsSync(macArmBinary)) return macArmBinary;
// Check for macOS x64 structure
const macX64Binary = path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium');
if (fs.existsSync(macX64Binary)) return macX64Binary;
// Check for Linux structure
const linuxBinary = path.join(versionDir, 'chrome-linux/chrome');
if (fs.existsSync(linuxBinary)) return linuxBinary;
const versionDir = path.join(baseDir, version);
const candidates = [
path.join(versionDir, 'chrome-mac-arm64/Chromium.app/Contents/MacOS/Chromium'),
path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'),
path.join(versionDir, 'chrome-linux64/chrome'),
path.join(versionDir, 'chrome-linux/chrome'),
];
for (const c of candidates) {
if (fs.existsSync(c)) return c;
}
}
} catch (e) {
// Continue
}
} catch (e) {}
return null;
};
// Check @puppeteer/browsers install locations
const puppeteerDirs = [
// Local project install (from npx @puppeteer/browsers install)
path.join(dataDir || process.env.DATA_DIR || '.', 'chromium'),
path.join(process.cwd(), 'chromium'),
// User cache locations
path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
];
for (const puppeteerDir of puppeteerDirs) {
const binary = findInPuppeteerDir(puppeteerDir);
if (binary) return binary;
}
// Check standard system locations
const candidates = [
// Linux Chromium
// 3. Search fallback locations (Chromium first, then Chrome)
const fallbackLocations = [
// System Chromium
'/Applications/Chromium.app/Contents/MacOS/Chromium',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
// macOS Chromium (Homebrew or manual install)
'/Applications/Chromium.app/Contents/MacOS/Chromium',
// Fallback to Chrome (extension loading may not work in Chrome 137+)
// Puppeteer cache
path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
path.join(process.env.HOME || '', '.cache/puppeteer'),
// Chrome (fallback - extensions may not work in 137+)
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
];
for (const candidate of candidates) {
if (fs.existsSync(candidate)) {
// Warn if falling back to Chrome
if (candidate.includes('google-chrome') || candidate.includes('Google Chrome')) {
for (const loc of fallbackLocations) {
// Check if it's a puppeteer cache dir
if (loc.includes('.cache/puppeteer')) {
const binary = findInPuppeteerDir(loc);
if (binary && validateBinary(binary)) {
return binary;
}
} else if (validateBinary(loc)) {
if (loc.includes('Google Chrome') || loc.includes('google-chrome')) {
console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
}
return candidate;
return loc;
}
}
@@ -1028,6 +1135,9 @@ module.exports = {
// Chrome launching
launchChromium,
killChrome,
// Chrome/Chromium install
installChromium,
installPuppeteerCore,
// Chrome/Chromium binary finding
findChromium,
// Extension utilities
@@ -1055,7 +1165,9 @@ if (require.main === module) {
console.log('Usage: chrome_utils.js <command> [args...]');
console.log('');
console.log('Commands:');
console.log(' findChromium [data_dir]');
console.log(' findChromium');
console.log(' installChromium');
console.log(' installPuppeteerCore [npm_prefix]');
console.log(' launchChromium [output_dir] [extension_paths_json]');
console.log(' killChrome <pid> [output_dir]');
console.log(' killZombieChrome [data_dir]');
@@ -1072,8 +1184,7 @@ if (require.main === module) {
try {
switch (command) {
case 'findChromium': {
const [dataDir] = commandArgs;
const binary = findChromium(dataDir);
const binary = findChromium();
if (binary) {
console.log(binary);
} else {
@@ -1083,6 +1194,32 @@ if (require.main === module) {
break;
}
case 'installChromium': {
const result = await installChromium();
if (result.success) {
console.log(JSON.stringify({
binary: result.binary,
version: result.version,
}));
} else {
console.error(result.error);
process.exit(1);
}
break;
}
case 'installPuppeteerCore': {
const [npmPrefix] = commandArgs;
const result = await installPuppeteerCore({ npmPrefix: npmPrefix || undefined });
if (result.success) {
console.log(JSON.stringify({ path: result.path }));
} else {
console.error(result.error);
process.exit(1);
}
break;
}
case 'launchChromium': {
const [outputDir, extensionPathsJson] = commandArgs;
const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : [];

View File

@@ -1,11 +1,11 @@
#!/usr/bin/env python3
"""
Install hook for Chrome/Chromium binary.
Install hook for Chrome/Chromium and puppeteer-core.
Runs at crawl start to verify Chromium is available.
Runs at crawl start to install/find Chromium and puppeteer-core.
Outputs JSONL for Binary and Machine config updates.
Respects CHROME_BINARY env var for custom binary paths.
Falls back to `npx @puppeteer/browsers install chromium@latest` if not found.
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
@@ -16,73 +16,139 @@ import os
import sys
import json
import subprocess
from pathlib import Path
def install_chromium_via_puppeteer() -> bool:
"""Install Chromium using @puppeteer/browsers."""
def get_chrome_version(binary_path: str) -> str | None:
"""Get Chrome/Chromium version string."""
try:
print("Chromium not found, attempting to install via @puppeteer/browsers...", file=sys.stderr)
result = subprocess.run(
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
[binary_path, '--version'],
capture_output=True,
text=True,
timeout=300
timeout=5
)
return result.returncode == 0
except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e:
print(f"Failed to install Chromium: {e}", file=sys.stderr)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def install_puppeteer_core() -> bool:
    """Install puppeteer-core and @puppeteer/browsers into NODE_MODULES_DIR.

    Returns True when nothing needs to be done (NODE_MODULES_DIR is unset, or
    both packages are already present) or when `npm install` succeeds.
    Returns False when npm exits non-zero or cannot be run.
    """
    node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
    if not node_modules_dir:
        # No isolated node_modules, skip (will use global)
        return True

    node_modules_path = Path(node_modules_dir)
    required_packages = (
        node_modules_path / 'puppeteer-core',
        node_modules_path / '@puppeteer' / 'browsers',
    )
    # Both packages are installed below, so both must exist before skipping;
    # otherwise a tree that only has puppeteer-core would never be repaired.
    if all(package_dir.exists() for package_dir in required_packages):
        return True

    # Get npm prefix from NODE_MODULES_DIR (parent of node_modules)
    npm_prefix = node_modules_path.parent

    try:
        print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
        result = subprocess.run(
            ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
            capture_output=True,
            text=True,
            timeout=60
        )
    except Exception as e:
        # Covers a missing npm executable as well as the 60s timeout
        print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
        return False

    if result.returncode == 0:
        print("[+] puppeteer-core installed", file=sys.stderr)
        return True

    print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr)
    return False
def find_chromium() -> dict | None:
"""Find Chromium binary, respecting CHROME_BINARY env var."""
# Quick check: if CHROME_BINARY is set and exists, skip expensive lookup
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
# Binary is already configured and valid - exit immediately
sys.exit(0)
def install_chromium() -> dict | None:
"""Install Chromium using @puppeteer/browsers and parse output for binary path.
Output format: "chromium@<version> <path_to_binary>"
e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"
Note: npx is fast when chromium is already cached - it returns the path without re-downloading.
"""
try:
from abx_pkg import Binary, NpmProvider, EnvProvider, BrewProvider, AptProvider
print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
# Try to find chromium using abx-pkg
# Prefer chromium over chrome because Chrome 137+ removed --load-extension support
binary = Binary(
name='chromium',
binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()],
overrides={'npm': {'packages': ['@puppeteer/browsers']}}
# Use --path to install to puppeteer's standard cache location
cache_path = os.path.expanduser('~/.cache/puppeteer')
result = subprocess.run(
['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
capture_output=True,
text=True,
stdin=subprocess.DEVNULL,
timeout=300
)
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'chromium',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
if result.returncode != 0:
print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
return None
# If not found, try to install via @puppeteer/browsers
if install_chromium_via_puppeteer():
# Try loading again after install
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'chromium',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'npm',
}
except Exception:
pass
# Parse output: "chromium@1563294 /path/to/Chromium"
output = result.stdout.strip()
parts = output.split(' ', 1)
if len(parts) != 2:
print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
return None
version_str = parts[0] # "chromium@1563294"
binary_path = parts[1].strip()
if not binary_path or not os.path.exists(binary_path):
print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
return None
# Extract version number
version = version_str.split('@')[1] if '@' in version_str else None
print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
return {
'name': 'chromium',
'abspath': binary_path,
'version': version,
'binprovider': 'puppeteer',
}
except subprocess.TimeoutExpired:
print("[!] Chromium install timed out", file=sys.stderr)
except FileNotFoundError:
print("[!] npx not found - is Node.js installed?", file=sys.stderr)
except Exception as e:
print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
return None
def main():
result = find_chromium()
# Install puppeteer-core if NODE_MODULES_DIR is set
install_puppeteer_core()
# Check if CHROME_BINARY is already set and valid
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
version = get_chrome_version(configured_binary)
print(json.dumps({
'type': 'Binary',
'name': 'chromium',
'abspath': configured_binary,
'version': version,
'binprovider': 'env',
}))
sys.exit(0)
# Install/find Chromium via puppeteer
result = install_chromium()
if result and result.get('abspath'):
print(json.dumps({
@@ -110,7 +176,7 @@ def main():
sys.exit(0)
else:
print(f"Chromium binary not found", file=sys.stderr)
print("Chromium binary not found", file=sys.stderr)
sys.exit(1)

View File

@@ -67,28 +67,29 @@ def get_test_env():
return env
def find_chromium_binary():
"""Find the Chromium binary installed by @puppeteer/browsers."""
if not CHROMIUM_INSTALL_DIR.exists():
return None
def find_chromium_binary(data_dir=None):
"""Find the Chromium binary using chrome_utils.js findChromium().
# Look for versioned directories
for version_dir in sorted(CHROMIUM_INSTALL_DIR.iterdir(), reverse=True):
if not version_dir.is_dir():
continue
# macOS ARM
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
if mac_arm.exists():
return str(mac_arm)
# macOS x64
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
if mac_x64.exists():
return str(mac_x64)
# Linux
linux = version_dir / 'chrome-linux' / 'chrome'
if linux.exists():
return str(linux)
This uses the centralized findChromium() function which checks:
- CHROME_BINARY env var
- @puppeteer/browsers install locations (in data_dir/chromium)
- System Chromium locations
- Falls back to Chrome (with warning)
Args:
data_dir: Directory where chromium was installed (contains chromium/ subdir)
"""
chrome_utils = PLUGIN_DIR / 'chrome_utils.js'
# Use provided data_dir, or fall back to env var, or current dir
search_dir = data_dir or os.environ.get('DATA_DIR', '.')
result = subprocess.run(
['node', str(chrome_utils), 'findChromium', str(search_dir)],
capture_output=True,
text=True,
timeout=10
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return None

View File

@@ -171,31 +171,30 @@ def setup_test_lib_dirs(tmpdir: Path) -> dict:
}
def find_chromium_binary():
"""Find the Chromium binary installed by @puppeteer/browsers."""
chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
if not chromium_dir.exists():
return None
PLUGINS_ROOT = PLUGIN_DIR.parent
for version_dir in sorted(chromium_dir.iterdir(), reverse=True):
if not version_dir.is_dir():
continue
# macOS ARM
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
if mac_arm.exists():
return str(mac_arm)
# macOS x64
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
if mac_x64.exists():
return str(mac_x64)
# Linux
linux = version_dir / 'chrome-linux' / 'chrome'
if linux.exists():
return str(linux)
def find_chromium_binary():
    """Locate a Chromium binary by delegating to chrome_utils.js.

    Runs the `findChromium` command of chrome/chrome_utils.js under node
    (10 second limit) and returns whatever path it prints on stdout.

    Returns:
        The stripped stdout of the command, or None when the command exits
        non-zero or prints nothing.
    """
    proc = subprocess.run(
        ['node', str(PLUGINS_ROOT / 'chrome' / 'chrome_utils.js'), 'findChromium'],
        capture_output=True,
        text=True,
        timeout=10,
    )
    binary_path = proc.stdout.strip()
    if proc.returncode != 0 or not binary_path:
        return None
    return binary_path
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
TEST_URL = 'https://www.filmin.es/'

View File

@@ -157,54 +157,94 @@ def test_large_extension_size():
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
def setup_test_lib_dirs(tmpdir: Path) -> dict:
    """Return the lib-directory env vars for tests, backed by the project's node_modules.

    Reusing the project's existing install avoids a slow `npm install` per test.
    The test is skipped when puppeteer-core is not present there.

    Args:
        tmpdir: Accepted for signature compatibility; not used by this helper.

    Returns:
        A dict containing NODE_MODULES_DIR.
    """
    # Walk five directory levels up from this file to reach the project root
    project_root = Path(__file__)
    for _ in range(5):
        project_root = project_root.parent

    node_modules_dir = project_root / 'node_modules'
    puppeteer_core_dir = node_modules_dir / 'puppeteer-core'
    if not puppeteer_core_dir.exists():
        pytest.skip("puppeteer-core not installed in project node_modules")

    return {'NODE_MODULES_DIR': str(node_modules_dir)}
def find_chromium_binary():
    """Find the Chromium binary installed by @puppeteer/browsers.

    Scans <DATA_DIR>/chromium (DATA_DIR defaults to the current directory)
    for version directories, highest name first, and returns the first
    binary found under one of the known per-platform layouts.

    Returns:
        Absolute path to the binary as a string, or None if none is found.
    """
    chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
    if not chromium_dir.exists():
        return None

    # Same layouts, in the same order, as findInPuppeteerDir in chrome_utils.js
    layouts = (
        ('chrome-mac-arm64', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),
        ('chrome-mac', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),
        ('chrome-mac-x64', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),
        ('chrome-linux64', 'chrome'),
        ('chrome-linux', 'chrome'),
    )

    for version_dir in sorted(chromium_dir.iterdir(), reverse=True):
        if not version_dir.is_dir():
            continue
        for layout in layouts:
            candidate = version_dir.joinpath(*layout)
            if candidate.exists():
                return str(candidate)
    return None
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
def setup_test_env(tmpdir: Path) -> dict:
    """Set up an isolated data/lib directory structure for tests.

    Creates a structure like:
        <tmpdir>/data/
            lib/
                arm64-darwin/   (or x86_64-linux, etc.)
                    npm/
                        bin/
                        node_modules/
            chrome_extensions/

    Then runs the chrome install hook with that environment and reads the
    Chromium path from the hook's JSONL output.

    The calling test is skipped (not failed) when the hook exits non-zero,
    exceeds its time limit, or reports no usable binary.

    Returns:
        A copy of os.environ extended with DATA_DIR, LIB_DIR, MACHINE_TYPE,
        NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_EXTENSIONS_DIR and CHROME_BINARY.
    """
    import platform
    import sys

    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"

    # Create proper directory structure
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / 'bin'
    node_modules_dir = npm_dir / 'node_modules'
    chrome_extensions_dir = data_dir / 'chrome_extensions'

    # Create all directories
    node_modules_dir.mkdir(parents=True, exist_ok=True)
    npm_bin_dir.mkdir(parents=True, exist_ok=True)
    chrome_extensions_dir.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
    })

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # sys.executable runs the hook under the same interpreter as the tests, so it
    # works where no bare `python` is on PATH.
    try:
        result = subprocess.run(
            [sys.executable, str(CHROME_INSTALL_HOOK)],
            capture_output=True, text=True, timeout=10, env=env
        )
    except subprocess.TimeoutExpired:
        # Treat a slow install like any other hook failure: skip, do not error
        pytest.skip("Chrome install hook timed out")
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            # Ignore any non-JSON lines the hook may have printed
            continue
        if data.get('type') == 'Binary' and data.get('abspath'):
            chrome_binary = data['abspath']
            break

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
# Test URL: ad blocker test page that shows if ads are blocked
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
@pytest.mark.timeout(15)
def test_extension_loads_in_chromium():
"""Verify uBlock extension loads in Chromium by visiting its dashboard page.
@@ -214,35 +254,30 @@ def test_extension_loads_in_chromium():
"""
import signal
import time
print("[test] Starting test_extension_loads_in_chromium", flush=True)
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
print(f"[test] tmpdir={tmpdir}", flush=True)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Set up isolated env with proper directory structure
env = setup_test_env(tmpdir)
env.setdefault('CHROME_HEADLESS', 'true')
print(f"[test] DATA_DIR={env.get('DATA_DIR')}", flush=True)
print(f"[test] CHROME_BINARY={env.get('CHROME_BINARY')}", flush=True)
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
# Step 1: Install the uBlock extension
print("[test] Installing uBlock extension...", flush=True)
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=15
timeout=5
)
print(f"[test] Extension install rc={result.returncode}", flush=True)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
@@ -252,7 +287,8 @@ def test_extension_loads_in_chromium():
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
@@ -422,22 +458,11 @@ def test_blocks_ads_on_test_page():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
# Set up isolated env with proper directory structure
env = setup_test_env(tmpdir)
env['CHROME_HEADLESS'] = 'true'
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
# Step 1: Install the uBlock extension
result = subprocess.run(
@@ -455,8 +480,9 @@ def test_blocks_ads_on_test_page():
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chrome using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'