fix extension loading and consolidate chromium logic

This commit is contained in:
Nick Sweeting
2025-12-29 17:47:37 -08:00
parent 638b3ba774
commit 4ba3e8d120
35 changed files with 2503 additions and 1115 deletions

View File

@@ -328,6 +328,21 @@ def run_hook(
env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
# Use Machine.config.PATH if set (includes pip/npm bin dirs from providers)
try:
from archivebox.machine.models import Machine
machine = Machine.current()
if machine and machine.config:
machine_path = machine.config.get('config/PATH')
if machine_path:
env['PATH'] = machine_path
# Also set NODE_MODULES_DIR if configured
node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
if node_modules_dir:
env['NODE_MODULES_DIR'] = node_modules_dir
except Exception:
pass # Fall back to system PATH if Machine not available
# Export all config values to environment (already merged by get_config())
for key, value in config.items():
if value is None:

View File

@@ -17,6 +17,8 @@
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Extractor metadata

View File

@@ -20,7 +20,7 @@ const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome/chrome_extension_utils.js');
const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {

View File

@@ -15,6 +15,8 @@
const path = require('path');
const fs = require('fs');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Get crawl's chrome directory from environment variable set by hooks.py

View File

@@ -1,483 +0,0 @@
#!/usr/bin/env node
/**
* Chrome Extension Management Utilities
*
* Handles downloading, installing, and managing Chrome extensions for browser automation.
* Ported from the TypeScript implementation in archivebox.ts
*/
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const { exec } = require('child_process');
const { promisify } = require('util');
const { Readable } = require('stream');
const { finished } = require('stream/promises');
const execAsync = promisify(exec);
// Try to import unzipper, fallback to system unzip if not available
// NOTE: `unzipper` is an optional third-party dependency. When it is not
// installed, `unzip` stays null and installExtension() shells out to
// /usr/bin/unzip instead.
let unzip = null;
try {
  const unzipper = require('unzipper');
  // Adapter with a (sourcePath, destPath) signature: extracts the archive and
  // resolves when the underlying extraction stream finishes.
  unzip = async (sourcePath, destPath) => {
    const stream = fs.createReadStream(sourcePath).pipe(unzipper.Extract({ path: destPath }));
    return stream.promise();
  };
} catch (err) {
  // Will use system unzip command as fallback
}
/**
 * Derive the dynamic extension ID Chrome assigns to an unpacked extension.
 *
 * Chrome hashes the unpacked extension directory path with SHA256 and maps
 * the first 32 hex digits onto the letters 'a'-'p'.
 *
 * @param {string} unpacked_path - Path to the unpacked extension directory
 * @returns {string} - 32-character extension ID
 */
function getExtensionId(unpacked_path) {
  const digest = crypto
    .createHash('sha256')
    .update(Buffer.from(unpacked_path, 'utf-8'))
    .digest('hex');
  // Map each of the first 32 hex digits (0-f) onto a character in 'a'-'p'.
  let extensionId = '';
  for (const hexChar of digest.slice(0, 32)) {
    extensionId += String.fromCharCode(parseInt(hexChar, 16) + 'a'.charCodeAt(0));
  }
  return extensionId;
}
/**
 * Download and install a Chrome extension from the Chrome Web Store.
 *
 * Downloads the CRX to extension.crx_path (only when neither the unpacked
 * manifest nor the CRX already exist on disk), then extracts it into
 * extension.unpacked_path and verifies a manifest.json was produced.
 *
 * @param {Object} extension - Extension metadata object
 * @param {string} extension.webstore_id - Chrome Web Store extension ID
 * @param {string} extension.name - Human-readable extension name
 * @param {string} extension.crx_url - URL to download the CRX file
 * @param {string} extension.crx_path - Local path to save the CRX file
 * @param {string} extension.unpacked_path - Path to extract the extension
 * @returns {Promise<boolean>} - True if installation succeeded
 */
async function installExtension(extension) {
  const manifest_path = path.join(extension.unpacked_path, 'manifest.json');
  // Download CRX file if not already downloaded (and not already unpacked)
  if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
    console.log(`[🛠️] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`);
    try {
      // Ensure parent directory exists
      const crxDir = path.dirname(extension.crx_path);
      if (!fs.existsSync(crxDir)) {
        fs.mkdirSync(crxDir, { recursive: true });
      }
      // Download CRX file from Chrome Web Store
      const response = await fetch(extension.crx_url);
      if (!response.ok) {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: HTTP ${response.status}`);
        return false;
      }
      if (!response.body) {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: No response body`);
        return false;
      }
      // Stream the body straight to disk to avoid buffering large CRX files
      const crx_file = fs.createWriteStream(extension.crx_path);
      const crx_stream = Readable.fromWeb(response.body);
      await finished(crx_stream.pipe(crx_file));
    } catch (err) {
      console.error(`[❌] Failed to download extension ${extension.name}:`, err);
      return false;
    }
  }
  // Unzip CRX file to unpacked_path
  await fs.promises.mkdir(extension.unpacked_path, { recursive: true });
  try {
    // Try system unzip command first.
    // BUGFIX: quote both paths so directories containing spaces (or shell
    // metacharacters other than double quotes) don't break the command —
    // previously the paths were interpolated unquoted.
    await execAsync(`/usr/bin/unzip -o "${extension.crx_path}" -d "${extension.unpacked_path}"`);
  } catch (err1) {
    if (unzip) {
      // Fallback to unzipper library
      try {
        await unzip(extension.crx_path, extension.unpacked_path);
      } catch (err2) {
        console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message);
        return false;
      }
    } else {
      console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message);
      return false;
    }
  }
  // Verify the unpack actually produced a manifest
  if (!fs.existsSync(manifest_path)) {
    console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`);
    return false;
  }
  return true;
}
/**
 * Load or install a Chrome extension, computing all metadata.
 *
 * Mutates `ext` in place: fills in derivable fields (URLs, paths), attaches
 * manifest readers, installs the extension on first use, and resolves its
 * path-derived runtime id and version.
 *
 * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path)
 * @param {string} [ext.webstore_id] - Chrome Web Store extension ID
 * @param {string} [ext.name] - Human-readable extension name
 * @param {string} [ext.unpacked_path] - Path to unpacked extension
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Object>} - Complete extension metadata object
 */
async function loadOrInstallExtension(ext, extensions_dir = null) {
  if (!ext.webstore_id && !ext.unpacked_path) {
    throw new Error('Extension must have either {webstore_id} or {unpacked_path}');
  }
  // Resolve the directory extensions are stored under
  const extensionsRoot = extensions_dir || process.env.CHROME_EXTENSIONS_DIR || './data/chrome_extensions';
  // Fill in statically computable metadata, preserving caller-supplied values
  ext.webstore_id = ext.webstore_id || ext.id;
  ext.name = ext.name || ext.webstore_id;
  ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`;
  ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`;
  ext.crx_path = ext.crx_path || path.join(extensionsRoot, `${ext.webstore_id}__${ext.name}.crx`);
  ext.unpacked_path = ext.unpacked_path || path.join(extensionsRoot, `${ext.webstore_id}__${ext.name}`);
  const manifest_path = path.join(ext.unpacked_path, 'manifest.json');
  // Lazy readers so callers can re-check the on-disk state later
  ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'));
  ext.read_version = () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null;
  // No readable version means it isn't unpacked yet: download + extract now
  if (!ext.read_version()) {
    await installExtension(ext);
  }
  // Unpacked extensions don't have stable IDs; derive from the install path
  ext.id = getExtensionId(ext.unpacked_path);
  ext.version = ext.read_version();
  if (ext.version) {
    console.log(`[] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`);
  } else {
    console.warn(`[❌] Unable to detect ID and version of installed extension ${ext.unpacked_path}`);
  }
  return ext;
}
/**
 * Check if a Puppeteer target is an extension background page/service worker.
 *
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object>} - Object with target_is_bg, extension_id, manifest_version, etc.
 */
async function isTargetExtension(target) {
  let target_type;
  let target_ctx;
  let target_url;
  try {
    target_type = target.type();
    target_ctx = (await target.worker()) || (await target.page()) || null;
    target_url = target.url() || target_ctx?.url() || null;
  } catch (err) {
    // Targets can close between enumeration and inspection; treat that exact
    // race as a harmless "closed" target and rethrow anything else.
    if (!String(err).includes('No target with given id found')) {
      throw err;
    }
    target_type = 'closed';
    target_ctx = null;
    target_url = 'about:closed';
  }
  // Classify: is this an extension, and is it a background context?
  const is_chrome_extension = target_url?.startsWith('chrome-extension://');
  const is_bg_type = target_type === 'background_page' || target_type === 'service_worker';
  const target_is_bg = is_chrome_extension && is_bg_type;
  const target_is_extension = is_chrome_extension || target_is_bg;
  let extension_id = null;
  let manifest_version = null;
  if (target_is_extension) {
    try {
      // chrome-extension://<id>/... -> <id>
      extension_id = target_url?.split('://')[1]?.split('/')[0] || null;
      if (target_ctx) {
        const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
        manifest_version = manifest?.manifest_version || null;
      }
    } catch (err) {
      // Metadata is best-effort; leave the fields null on failure
    }
  }
  return {
    target_is_extension,
    target_is_bg,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  };
}
/**
 * Load extension metadata and connection handlers from a browser target.
 *
 * Accepts only background pages / service workers of extensions already
 * present in `extensions` (matched by path-derived id); everything else
 * returns null. On success the matching entry is mutated in place via
 * Object.assign and the merged object is returned.
 *
 * @param {Array} extensions - Array of extension metadata objects to update
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object|null>} - Updated extension object or null if not an extension
 */
async function loadExtensionFromTarget(extensions, target) {
  const {
    target_is_bg,
    target_is_extension,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  } = await isTargetExtension(target);
  // Only background contexts we can evaluate code in are useful here
  if (!(target_is_bg && extension_id && target_ctx)) {
    return null;
  }
  // Find matching extension in our list (ids are path-derived, see getExtensionId)
  const extension = extensions.find(ext => ext.id === extension_id);
  if (!extension) {
    console.warn(`[⚠️] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`);
    return null;
  }
  // Load manifest from the extension context
  let manifest = null;
  try {
    manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
  } catch (err) {
    console.error(`[❌] Failed to read manifest for extension ${extension_id}:`, err);
    return null;
  }
  // Create dispatch methods for communicating with the extension
  const new_extension = {
    ...extension,
    target,
    target_type,
    target_url,
    manifest,
    manifest_version,
    // Trigger extension toolbar button click.
    // NOTE(review): resolves only when chrome.action.onClicked fires —
    // whether openPopup() reliably triggers onClicked is version-dependent;
    // confirm against the target Chrome version.
    dispatchAction: async (tab) => {
      return await target_ctx.evaluate((tabId) => {
        return new Promise((resolve) => {
          chrome.action.onClicked.addListener((tab) => {
            resolve({ success: true, tab });
          });
          chrome.action.openPopup();
        });
      }, tab?.id || null);
    },
    // Send message to extension via chrome.runtime.sendMessage
    dispatchMessage: async (message, options = {}) => {
      return await target_ctx.evaluate((msg, opts) => {
        return new Promise((resolve) => {
          chrome.runtime.sendMessage(msg, opts, (response) => {
            resolve(response);
          });
        });
      }, message, options);
    },
    // Trigger extension command (keyboard shortcut).
    // NOTE(review): this only registers a listener — the promise resolves
    // only if something else fires the command (see note below).
    dispatchCommand: async (command) => {
      return await target_ctx.evaluate((cmd) => {
        return new Promise((resolve) => {
          chrome.commands.onCommand.addListener((receivedCommand) => {
            if (receivedCommand === cmd) {
              resolve({ success: true, command: receivedCommand });
            }
          });
          // Note: Actually triggering commands programmatically is not directly supported
          // This would need to be done via CDP or keyboard simulation
        });
      }, command);
    },
  };
  // Update the extension in the array
  Object.assign(extension, new_extension);
  console.log(`[🔌] Connected to extension ${extension.name} (${extension.version})`);
  return new_extension;
}
/**
 * Install all extensions in the list if not already installed.
 *
 * @param {Array} extensions - Array of extension metadata objects
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Array>} - The same array, with each entry installed/updated
 */
async function installAllExtensions(extensions, extensions_dir = null) {
  console.log(`[⚙️] Installing ${extensions.length} chrome extensions...`);
  // Install sequentially so concurrent downloads never race on shared paths.
  for (let idx = 0; idx < extensions.length; idx += 1) {
    await loadOrInstallExtension(extensions[idx], extensions_dir);
  }
  return extensions;
}
/**
 * Load and connect to all extensions from a running browser.
 *
 * @param {Object} browser - Puppeteer browser instance
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Promise<Array>} - Array of loaded extension objects with connection handlers
 */
async function loadAllExtensionsFromBrowser(browser, extensions) {
  console.log(`[⚙️] Loading ${extensions.length} chrome extensions from browser...`);
  // Inspect every live target; matching entries are updated in place.
  const targets = browser.targets();
  for (let idx = 0; idx < targets.length; idx += 1) {
    await loadExtensionFromTarget(extensions, targets[idx]);
  }
  return extensions;
}
/**
 * Load extension manifest.json file
 *
 * @param {string} unpacked_path - Path to unpacked extension directory
 * @returns {object|null} - Parsed manifest object or null if not found/invalid
 */
function loadExtensionManifest(unpacked_path) {
  const manifestFile = path.join(unpacked_path, 'manifest.json');
  try {
    // Read + parse in one guarded step: a missing file, unreadable file, or
    // invalid JSON all yield null.
    const raw = fs.readFileSync(manifestFile, 'utf-8');
    return JSON.parse(raw);
  } catch (error) {
    return null;
  }
}
/**
 * Generate Chrome launch arguments for loading extensions.
 *
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Array<string>} - Chrome CLI arguments for loading extensions
 */
function getExtensionLaunchArgs(extensions) {
  if (!extensions || extensions.length === 0) {
    return [];
  }
  // Only extensions that are actually unpacked on disk can be loaded
  const validExtensions = extensions.filter(ext => ext.unpacked_path);
  // BUGFIX: previously an all-invalid list still emitted `--load-extension=`
  // with an empty value; return no extension args at all in that case.
  if (validExtensions.length === 0) {
    return [];
  }
  const unpacked_paths = validExtensions.map(ext => ext.unpacked_path);
  const webstore_ids = validExtensions.map(ext => ext.webstore_id || ext.id);
  return [
    `--load-extension=${unpacked_paths.join(',')}`,
    `--allowlisted-extension-id=${webstore_ids.join(',')}`,
    '--allow-legacy-extension-manifests',
    '--disable-extensions-auto-update',
  ];
}
// Export all functions
module.exports = {
getExtensionId,
loadExtensionManifest,
installExtension,
loadOrInstallExtension,
isTargetExtension,
loadExtensionFromTarget,
installAllExtensions,
loadAllExtensionsFromBrowser,
getExtensionLaunchArgs,
};
// CLI usage
// When executed directly (not require()d), expose a small command-line
// interface over the exported helpers. Exit code 1 signals usage errors,
// unknown commands, or runtime failures.
if (require.main === module) {
  const args = process.argv.slice(2);
  if (args.length === 0) {
    console.log('Usage: chrome_extension_utils.js <command> [args...]');
    console.log('');
    console.log('Commands:');
    console.log(' getExtensionId <path>');
    console.log(' loadExtensionManifest <path>');
    console.log(' getExtensionLaunchArgs <extensions_json>');
    console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
    process.exit(1);
  }
  const [command, ...commandArgs] = args;
  // Async IIFE so commands can await; results are printed to stdout.
  (async () => {
    try {
      switch (command) {
        case 'getExtensionId': {
          const [unpacked_path] = commandArgs;
          const id = getExtensionId(unpacked_path);
          console.log(id);
          break;
        }
        case 'loadExtensionManifest': {
          const [unpacked_path] = commandArgs;
          const manifest = loadExtensionManifest(unpacked_path);
          console.log(JSON.stringify(manifest));
          break;
        }
        case 'getExtensionLaunchArgs': {
          // Takes a JSON array of extension metadata objects
          const [extensions_json] = commandArgs;
          const extensions = JSON.parse(extensions_json);
          const args = getExtensionLaunchArgs(extensions);
          console.log(JSON.stringify(args));
          break;
        }
        case 'loadOrInstallExtension': {
          // May download from the Chrome Web Store as a side effect
          const [webstore_id, name, extensions_dir] = commandArgs;
          const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir);
          console.log(JSON.stringify(ext, null, 2));
          break;
        }
        default:
          console.error(`Unknown command: ${command}`);
          process.exit(1);
      }
    } catch (error) {
      console.error(`Error: ${error.message}`);
      process.exit(1);
    }
  })();
}

File diff suppressed because it is too large Load Diff

View File

@@ -2,10 +2,14 @@
"""
Install hook for Chrome/Chromium binary.
Runs at crawl start to verify Chrome is available.
Runs at crawl start to verify Chromium is available.
Outputs JSONL for Binary and Machine config updates.
Respects CHROME_BINARY env var for custom binary paths.
Falls back to `npx @puppeteer/browsers install chrome@stable` if not found.
Falls back to `npx @puppeteer/browsers install chromium@latest` if not found.
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""
import os
@@ -14,24 +18,24 @@ import json
import subprocess
def install_chrome_via_puppeteer() -> bool:
"""Install Chrome using @puppeteer/browsers."""
def install_chromium_via_puppeteer() -> bool:
"""Install Chromium using @puppeteer/browsers."""
try:
print("Chrome not found, attempting to install via @puppeteer/browsers...", file=sys.stderr)
print("Chromium not found, attempting to install via @puppeteer/browsers...", file=sys.stderr)
result = subprocess.run(
['npx', '@puppeteer/browsers', 'install', 'chrome@stable'],
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
capture_output=True,
text=True,
timeout=300
)
return result.returncode == 0
except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e:
print(f"Failed to install Chrome: {e}", file=sys.stderr)
print(f"Failed to install Chromium: {e}", file=sys.stderr)
return False
def find_chrome() -> dict | None:
"""Find Chrome/Chromium binary, respecting CHROME_BINARY env var."""
def find_chromium() -> dict | None:
"""Find Chromium binary, respecting CHROME_BINARY env var."""
# Quick check: if CHROME_BINARY is set and exists, skip expensive lookup
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
@@ -41,9 +45,10 @@ def find_chrome() -> dict | None:
try:
from abx_pkg import Binary, NpmProvider, EnvProvider, BrewProvider, AptProvider
# Try to find chrome using abx-pkg
# Try to find chromium using abx-pkg
# Prefer chromium over chrome because Chrome 137+ removed --load-extension support
binary = Binary(
name='chrome',
name='chromium',
binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()],
overrides={'npm': {'packages': ['@puppeteer/browsers']}}
)
@@ -51,7 +56,7 @@ def find_chrome() -> dict | None:
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'chrome',
'name': 'chromium',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
@@ -59,12 +64,12 @@ def find_chrome() -> dict | None:
}
# If not found, try to install via @puppeteer/browsers
if install_chrome_via_puppeteer():
if install_chromium_via_puppeteer():
# Try loading again after install
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'chrome',
'name': 'chromium',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
@@ -77,7 +82,7 @@ def find_chrome() -> dict | None:
def main():
result = find_chrome()
result = find_chromium()
if result and result.get('abspath'):
print(json.dumps({
@@ -99,13 +104,13 @@ def main():
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/CHROME_VERSION',
'key': 'config/CHROMIUM_VERSION',
'value': result['version'],
}))
sys.exit(0)
else:
print(f"Chrome/Chromium binary not found", file=sys.stderr)
print(f"Chromium binary not found", file=sys.stderr)
sys.exit(1)

View File

@@ -1,55 +1,57 @@
#!/usr/bin/env node
/**
* Launch a shared Chrome browser session for the entire crawl.
* Launch a shared Chromium browser session for the entire crawl.
*
* This runs once per crawl and keeps Chrome alive for all snapshots to share.
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
* Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
*
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Creates chrome/ directory under crawl output dir with:
* - cdp_url.txt: WebSocket URL for CDP connection
* - pid.txt: Chrome process ID (for cleanup)
* - chrome.pid: Chromium process ID (for cleanup)
* - port.txt: Debug port number
* - extensions.json: Loaded extensions metadata
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* NODE_MODULES_DIR: Path to node_modules directory for module resolution
* CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_HEADLESS: Run in headless mode (default: true)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
*/
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) {
module.paths.unshift(process.env.NODE_MODULES_DIR);
}
const fs = require('fs');
const path = require('path');
const { spawn } = require('child_process');
const http = require('http');
const puppeteer = require('puppeteer-core');
const {
findChromium,
launchChromium,
killChrome,
getEnv,
writePidWithMtime,
} = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_launch';
const OUTPUT_DIR = 'chrome';
// Helpers for PID file creation
/**
 * Write a PID file whose mtime encodes the process start time, so later
 * validators can detect PID reuse after a crash.
 *
 * @param {string} filePath - Destination .pid file
 * @param {number} pid - Process ID to record
 * @param {number} startTimeSeconds - Process start time (unix epoch seconds)
 */
function writePidWithMtime(filePath, pid, startTimeSeconds) {
  fs.writeFileSync(filePath, String(pid));
  // Stamp both atime and mtime with the process start time
  const stamp = new Date(startTimeSeconds * 1000);
  fs.utimesSync(filePath, stamp, stamp);
}
/**
 * Write an executable bash script that re-runs the given command, so the
 * exact launch invocation can be inspected or validated later.
 *
 * @param {string} filePath - Destination script path (chmod 0755)
 * @param {string} binary - Executable to invoke
 * @param {Array<string>} args - CLI arguments (quoted when needed)
 */
function writeCmdScript(filePath, binary, args) {
  // Wrap any argument containing spaces, double quotes, or `$` in quotes,
  // escaping embedded double quotes.
  const quote = (arg) => {
    if (arg.includes(' ') || arg.includes('"') || arg.includes('$')) {
      return `"${arg.replace(/"/g, '\\"')}"`;
    }
    return arg;
  };
  const script = `#!/bin/bash\n${binary} ${args.map(quote).join(' ')}\n`;
  fs.writeFileSync(filePath, script);
  fs.chmodSync(filePath, 0o755);
}
// Global state for cleanup
let chromePid = null;
let browserInstance = null;
// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
process.argv.slice(2).forEach((arg) => {
if (arg.startsWith('--')) {
const [key, ...valueParts] = arg.slice(2).split('=');
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
@@ -58,63 +60,27 @@ function parseArgs() {
return args;
}
// Get environment variable with default
/**
 * Read an environment variable, trimmed, with a string fallback.
 * @param {string} name - Environment variable name
 * @param {string} [defaultValue=''] - Value used when unset/empty
 * @returns {string}
 */
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
/**
 * Read a boolean environment variable.
 * "true"/"1"/"yes"/"on" -> true; "false"/"0"/"no"/"off" -> false;
 * anything else (including unset) -> defaultValue.
 * @param {string} name - Environment variable name
 * @param {boolean} [defaultValue=false]
 * @returns {boolean}
 */
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Cleanup handler for SIGTERM - kill Chrome and all child processes
// Cleanup handler for SIGTERM
async function cleanup() {
if (!chromePid) {
process.exit(0);
return;
}
console.error('[*] Cleaning up Chrome session...');
console.log(`[*] Killing Chrome process tree (PID ${chromePid})...`);
try {
// Try to kill the entire process group
process.kill(-chromePid, 'SIGTERM');
} catch (e) {
// Fall back to killing just the process
// Try graceful browser close first
if (browserInstance) {
try {
process.kill(chromePid, 'SIGTERM');
} catch (e2) {
// Already dead
console.error('[*] Closing browser gracefully...');
await browserInstance.close();
browserInstance = null;
console.error('[+] Browser closed gracefully');
} catch (e) {
console.error(`[!] Graceful close failed: ${e.message}`);
}
}
// Wait 2 seconds for graceful shutdown
await new Promise(resolve => setTimeout(resolve, 2000));
// Force kill with SIGKILL
try {
process.kill(-chromePid, 'SIGKILL');
} catch (e) {
try {
process.kill(chromePid, 'SIGKILL');
} catch (e2) {
// Already dead
}
// Kill Chrome process
if (chromePid) {
await killChrome(chromePid, OUTPUT_DIR);
}
console.log('[*] Chrome process tree killed');
// Delete PID files to prevent PID reuse issues
try {
fs.unlinkSync(path.join(OUTPUT_DIR, 'chrome.pid'));
} catch (e) {}
try {
fs.unlinkSync(path.join(OUTPUT_DIR, 'hook.pid'));
} catch (e) {}
process.exit(0);
}
@@ -122,379 +88,158 @@ async function cleanup() {
process.on('SIGTERM', cleanup);
process.on('SIGINT', cleanup);
// Find Chrome binary
/**
 * Locate a Chrome/Chromium binary on disk.
 * The CHROME_BINARY env var wins when it points at an existing file;
 * otherwise well-known Linux and macOS install paths are probed in order.
 *
 * @returns {string|null} - Path to the binary, or null if none found
 */
function findChrome() {
  const configured = getEnv('CHROME_BINARY');
  if (configured && fs.existsSync(configured)) {
    return configured;
  }
  const wellKnownPaths = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];
  return wellKnownPaths.find(candidate => fs.existsSync(candidate)) || null;
}
// Parse resolution string
/**
 * Parse a "WIDTH,HEIGHT" resolution string.
 * Missing or unparseable components fall back to 1440x2000.
 *
 * @param {string} resolution - e.g. "1440,2000"
 * @returns {{width: number, height: number}}
 */
function parseResolution(resolution) {
  const parts = resolution.split(',');
  const width = parseInt(parts[0]?.trim(), 10) || 1440;
  const height = parseInt(parts[1]?.trim(), 10) || 2000;
  return { width, height };
}
// Find a free port
/**
 * Ask the OS for an unused TCP port by binding to port 0 and reading back
 * the assigned address, then releasing it.
 *
 * @returns {Promise<number>} - A currently-free port number
 */
function findFreePort() {
  return new Promise((resolve, reject) => {
    const probe = require('net').createServer();
    probe.unref();
    probe.on('error', reject);
    probe.listen(0, () => {
      const { port } = probe.address();
      // Release the port before resolving so the caller can bind it
      probe.close(() => resolve(port));
    });
  });
}
// Wait for Chrome's DevTools port to be ready
/**
 * Poll Chrome's DevTools HTTP endpoint until it responds with valid JSON.
 * Retries every 100ms (per-request timeout 1s) until `timeout` ms elapse.
 *
 * @param {number} port - DevTools debug port to poll
 * @param {number} [timeout=30000] - Overall timeout in milliseconds
 * @returns {Promise<Object>} - Parsed /json/version payload
 */
function waitForDebugPort(port, timeout = 30000) {
  const deadline = Date.now() + timeout;
  return new Promise((resolve, reject) => {
    const attempt = () => {
      if (Date.now() > deadline) {
        reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
        return;
      }
      const retry = () => setTimeout(attempt, 100);
      const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
        const chunks = [];
        res.on('data', (chunk) => chunks.push(chunk));
        res.on('end', () => {
          try {
            resolve(JSON.parse(chunks.join('')));
          } catch (e) {
            // Partial/invalid payload: keep polling
            retry();
          }
        });
      });
      req.on('error', retry);
      req.setTimeout(1000, () => {
        req.destroy();
        retry();
      });
    };
    attempt();
  });
}
// Kill zombie Chrome processes from stale crawls
/**
 * Scan data/crawls/<id>/chrome/*.pid for Chrome processes left over from
 * crashed crawls and SIGKILL them (process group first, then the single PID).
 * A crawl is considered stale when its directory mtime is older than 5
 * minutes; stale PID files for already-dead processes are deleted.
 * Best-effort: all errors are swallowed per-entry so one bad crawl dir
 * cannot abort the sweep. Logs progress to stderr.
 */
function killZombieChrome() {
  const dataDir = getEnv('DATA_DIR', '.');
  const crawlsDir = path.join(dataDir, 'crawls');
  const now = Date.now();
  const fiveMinutesAgo = now - 300000;
  let killed = 0;
  console.error('[*] Checking for zombie Chrome processes...');
  if (!fs.existsSync(crawlsDir)) {
    console.error('[+] No crawls directory found');
    return;
  }
  try {
    // Only scan data/crawls/*/chrome/*.pid - no recursion into archive dirs
    const crawls = fs.readdirSync(crawlsDir, { withFileTypes: true });
    for (const crawl of crawls) {
      if (!crawl.isDirectory()) continue;
      const crawlDir = path.join(crawlsDir, crawl.name);
      const chromeDir = path.join(crawlDir, 'chrome');
      if (!fs.existsSync(chromeDir)) continue;
      // Check if crawl was modified recently (still active)
      try {
        const crawlStats = fs.statSync(crawlDir);
        if (crawlStats.mtimeMs > fiveMinutesAgo) {
          continue; // Crawl modified recently, likely still active
        }
      } catch (e) {
        continue;
      }
      // Crawl is stale (> 5 minutes since modification), check for PIDs
      try {
        const pidFiles = fs.readdirSync(chromeDir).filter(f => f.endsWith('.pid'));
        for (const pidFileName of pidFiles) {
          const pidFile = path.join(chromeDir, pidFileName);
          try {
            const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
            if (isNaN(pid) || pid <= 0) continue;
            // Check if process exists (simple check, Python will validate properly)
            // signal 0 = existence probe, does not actually signal the process
            try {
              process.kill(pid, 0);
            } catch (e) {
              // Process dead, remove stale PID file
              try { fs.unlinkSync(pidFile); } catch (e) {}
              continue;
            }
            // Process alive and crawl is stale - zombie!
            console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
            try {
              // Kill process group (negative PID); fall back to the single PID
              try {
                process.kill(-pid, 'SIGKILL');
              } catch (e) {
                process.kill(pid, 'SIGKILL');
              }
              killed++;
              console.error(`[+] Killed zombie (PID ${pid})`);
              try { fs.unlinkSync(pidFile); } catch (e) {}
            } catch (e) {
              console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
            }
          } catch (e) {
            // Skip invalid PID files
          }
        }
      } catch (e) {
        // Skip if can't read chrome dir
      }
    }
  } catch (e) {
    console.error(`[!] Error scanning crawls: ${e.message}`);
  }
  if (killed > 0) {
    console.error(`[+] Killed ${killed} zombie process(es)`);
  } else {
    console.error('[+] No zombies found');
  }
}
/**
 * Launch a detached Chrome process with a DevTools debug port for the crawl.
 *
 * Side effects: sweeps zombie Chrome processes first, creates OUTPUT_DIR,
 * loads cached *.extension.json metadata from the persona's extensions dir,
 * and writes chrome.pid / hook.pid / cmd.sh / port.txt / extensions.json /
 * cdp_url.txt under OUTPUT_DIR for later validation and cleanup.
 *
 * @param {string} binary - Path to the Chrome/Chromium executable
 * @returns {Promise<Object>} - { success, cdpUrl, pid, port } on success,
 *                              { success: false, error } on failure
 */
async function launchChrome(binary) {
  // First, kill any zombie Chrome from crashed crawls
  killZombieChrome();
  // CHROME_*-prefixed env vars win over the generic fallbacks
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  // Find a free port for Chrome DevTools
  const debugPort = await findFreePort();
  console.error(`[*] Using debug port: ${debugPort}`);
  // Load any installed extensions (cached *.extension.json metadata files)
  const extensionUtils = require('./chrome_extension_utils.js');
  const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
  path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
  const installedExtensions = [];
  if (fs.existsSync(extensionsDir)) {
    const files = fs.readdirSync(extensionsDir);
    for (const file of files) {
      if (file.endsWith('.extension.json')) {
        try {
          const extPath = path.join(extensionsDir, file);
          const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
          // Only load extensions whose unpacked dir still exists on disk
          if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
            installedExtensions.push(extData);
            console.error(`[*] Loading extension: ${extData.name || file}`);
          }
        } catch (e) {
          // Skip invalid cache files
          console.warn(`[!] Skipping invalid extension cache: ${file}`);
        }
      }
    }
  }
  // Get extension launch arguments
  const extensionArgs = extensionUtils.getExtensionLaunchArgs(installedExtensions);
  if (extensionArgs.length > 0) {
    console.error(`[+] Loaded ${installedExtensions.length} extension(s)`);
    // Write extensions metadata for config hooks to use
    fs.writeFileSync(
      path.join(OUTPUT_DIR, 'extensions.json'),
      JSON.stringify(installedExtensions, null, 2)
    );
  }
  // Build Chrome arguments
  const chromeArgs = [
    `--remote-debugging-port=${debugPort}`,
    '--remote-debugging-address=127.0.0.1',
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--disable-sync',
    '--no-first-run',
    '--no-default-browser-check',
    '--disable-default-apps',
    '--disable-infobars',
    '--disable-blink-features=AutomationControlled',
    '--disable-component-update',
    '--disable-domain-reliability',
    '--disable-breakpad',
    '--disable-background-networking',
    '--disable-background-timer-throttling',
    '--disable-backgrounding-occluded-windows',
    '--disable-renderer-backgrounding',
    '--disable-ipc-flooding-protection',
    '--password-store=basic',
    '--use-mock-keychain',
    '--font-render-hinting=none',
    '--force-color-profile=srgb',
    `--window-size=${width},${height}`,
    ...extensionArgs, // Load extensions
    ...(headless ? ['--headless=new'] : []),
    ...(checkSsl ? [] : ['--ignore-certificate-errors']),
    'about:blank', // Start with blank page
  ];
  // Launch Chrome as a detached process group leader
  // This allows us to kill Chrome and all its child processes as a group
  const chromeProcess = spawn(binary, chromeArgs, {
    detached: true,
    stdio: ['ignore', 'ignore', 'ignore'],
  });
  chromeProcess.unref(); // Don't keep Node.js process running
  chromePid = chromeProcess.pid;
  const chromeStartTime = Date.now() / 1000; // Unix epoch seconds
  console.error(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
  // Write Chrome PID with mtime set to start time for validation
  writePidWithMtime(path.join(OUTPUT_DIR, 'chrome.pid'), chromePid, chromeStartTime);
  // Write command script for validation
  writeCmdScript(path.join(OUTPUT_DIR, 'cmd.sh'), binary, chromeArgs);
  fs.writeFileSync(path.join(OUTPUT_DIR, 'port.txt'), String(debugPort));
  // Write hook's own PID with mtime for validation
  const hookStartTime = Date.now() / 1000;
  writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
  try {
    // Wait for Chrome to be ready
    const versionInfo = await waitForDebugPort(debugPort, 30000);
    console.error(`[+] Chrome ready: ${versionInfo.Browser}`);
    // Build WebSocket URL
    const wsUrl = versionInfo.webSocketDebuggerUrl;
    fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl);
    return { success: true, cdpUrl: wsUrl, pid: chromePid, port: debugPort };
  } catch (e) {
    // Kill Chrome if setup failed
    try {
      process.kill(chromePid, 'SIGTERM');
    } catch (killErr) {
      // Ignore
    }
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
async function main() {
  // Entry point for the Chromium launch hook: find the browser, collect
  // installed extensions, launch Chromium via launchChromium(), verify the
  // extensions actually loaded over CDP, then stay alive so the crawl can
  // SIGTERM us for cleanup.
  const args = parseArgs();
  const crawlId = args.crawl_id;

  try {
    const binary = findChromium();
    if (!binary) {
      console.error('ERROR: Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chromium');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
      process.exit(1);
    }

    // Get Chromium version (best-effort; failure is non-fatal)
    let version = '';
    try {
      const { execSync } = require('child_process');
      version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 })
        .trim()
        .slice(0, 64);
    } catch (e) {}
    console.error(`[*] Using browser: ${binary}`);
    if (version) console.error(`[*] Version: ${version}`);

    // Load installed extensions from the active persona's extensions dir.
    // Each *.extension.json cache file must point at an existing unpacked dir.
    const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
      path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
    const installedExtensions = [];
    const extensionPaths = [];
    if (fs.existsSync(extensionsDir)) {
      for (const file of fs.readdirSync(extensionsDir)) {
        if (!file.endsWith('.extension.json')) continue;
        try {
          const extData = JSON.parse(fs.readFileSync(path.join(extensionsDir, file), 'utf-8'));
          if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
            installedExtensions.push(extData);
            extensionPaths.push(extData.unpacked_path);
            console.error(`[*] Loading extension: ${extData.name || file}`);
          }
        } catch (e) {
          // Skip invalid cache files rather than failing the whole launch
          console.warn(`[!] Skipping invalid extension cache: ${file}`);
        }
      }
    }
    if (installedExtensions.length > 0) {
      console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
    }

    // Write hook's own PID with mtime set to start time for validation
    const hookStartTime = Date.now() / 1000;
    if (!fs.existsSync(OUTPUT_DIR)) {
      fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }
    writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);

    // Launch Chromium using consolidated function
    const result = await launchChromium({
      binary,
      outputDir: OUTPUT_DIR,
      extensionPaths,
    });
    if (!result.success) {
      console.error(`ERROR: ${result.error}`);
      process.exit(1);
    }
    chromePid = result.pid;
    const cdpUrl = result.cdpUrl;

    // Write extensions metadata for config hooks to use
    if (installedExtensions.length > 0) {
      fs.writeFileSync(
        path.join(OUTPUT_DIR, 'extensions.json'),
        JSON.stringify(installedExtensions, null, 2)
      );
    }

    // Connect puppeteer over CDP so we can verify extensions loaded
    console.error(`[*] Connecting puppeteer to CDP...`);
    const browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
      defaultViewport: null,
    });
    browserInstance = browser;

    // Verify extensions loaded by inspecting browser targets
    if (extensionPaths.length > 0) {
      // Give extension service workers / background pages time to spin up
      await new Promise(r => setTimeout(r, 3000));
      const targets = browser.targets();
      console.error(`[*] All browser targets (${targets.length}):`);
      for (const t of targets) {
        console.error(`  - ${t.type()}: ${t.url().slice(0, 80)}`);
      }
      const extTargets = targets.filter(t =>
        t.url().startsWith('chrome-extension://') ||
        t.type() === 'service_worker' ||
        t.type() === 'background_page'
      );
      // Filter out Chromium's built-in component extensions by well-known IDs
      const builtinIds = [
        'nkeimhogjdpnpccoofpliimaahmaaome',
        'fignfifoniblkonapihmkfakmlgkbkcf',
        'ahfgeienlihckogmohjhadlkjgocpleb',
        'mhjfbmdgcfjbbpaeojofohoefgiehjai',
      ];
      const customExtTargets = extTargets.filter(t => {
        const url = t.url();
        if (!url.startsWith('chrome-extension://')) return false;
        const extId = url.split('://')[1].split('/')[0];
        return !builtinIds.includes(extId);
      });
      console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`);
      for (const target of customExtTargets) {
        const url = target.url();
        const extId = url.split('://')[1].split('/')[0];
        console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
      }
      if (customExtTargets.length === 0 && extensionPaths.length > 0) {
        console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
        console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
      }
    }

    console.error(`[+] Chromium session started for crawl ${crawlId}`);
    console.error(`[+] CDP URL: ${cdpUrl}`);
    console.error(`[+] PID: ${chromePid}`);

    // Background hook - stay alive so we receive SIGTERM for cleanup when
    // the crawl ends (the interval itself does nothing)
    console.log('[*] Chromium launch hook staying alive to handle cleanup...');
    setInterval(() => {}, 1000000);
  } catch (e) {
    console.error(`ERROR: ${e.name}: ${e.message}`);
    process.exit(1);
  }
}
// Run the hook; any unhandled rejection is fatal for the launch.
main().catch((e) => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -26,7 +26,11 @@ const fs = require('fs');
const path = require('path');
const { spawn } = require('child_process');
const http = require('http');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const { findChromium } = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_tab';
@@ -87,31 +91,6 @@ async function cleanup() {
process.on('SIGTERM', cleanup);
process.on('SIGINT', cleanup);
// Find Chrome binary (for fallback)
function findChrome() {
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary && fs.existsSync(chromeBinary)) {
return chromeBinary;
}
const candidates = [
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
];
for (const candidate of candidates) {
if (fs.existsSync(candidate)) {
return candidate;
}
}
return null;
}
// Parse resolution string
function parseResolution(resolution) {
const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
@@ -367,7 +346,7 @@ async function main() {
let version = '';
try {
const binary = findChrome();
const binary = findChromium();
if (!binary) {
console.error('ERROR: Chrome/Chromium binary not found');
console.error('DEPENDENCY_NEEDED=chrome');

View File

@@ -17,6 +17,8 @@
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'chrome_navigate';

View File

@@ -2,14 +2,18 @@
Integration tests for chrome plugin
Tests verify:
1. Chrome install hook checks for Chrome/Chromium binary
1. Chromium install via @puppeteer/browsers
2. Verify deps with abx-pkg
3. Chrome hooks exist
4. Chrome launches at crawl level
4. Chromium launches at crawl level
5. Tab creation at snapshot level
6. Tab navigation works
7. Tab cleanup on SIGTERM
8. Chrome cleanup on crawl end
8. Chromium cleanup on crawl end
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""
import json
@@ -40,49 +44,104 @@ def get_lib_dir_and_machine_type():
return Path(lib_dir), machine_type
# Setup NODE_PATH to find npm packages
# Setup NODE_MODULES_DIR to find npm packages
LIB_DIR, MACHINE_TYPE = get_lib_dir_and_machine_type()
# Note: LIB_DIR already includes machine_type (e.g., data/lib/arm64-darwin)
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
NPM_PREFIX = LIB_DIR / 'npm'
# Chromium install location (relative to DATA_DIR)
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
"""Get environment with NODE_MODULES_DIR and CHROME_BINARY set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
env['LIB_DIR'] = str(LIB_DIR)
env['MACHINE_TYPE'] = MACHINE_TYPE
# Ensure CHROME_BINARY is set to Chromium
if 'CHROME_BINARY' not in env:
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
return env
def find_chromium_binary():
"""Find the Chromium binary installed by @puppeteer/browsers."""
if not CHROMIUM_INSTALL_DIR.exists():
return None
# Look for versioned directories
for version_dir in sorted(CHROMIUM_INSTALL_DIR.iterdir(), reverse=True):
if not version_dir.is_dir():
continue
# macOS ARM
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
if mac_arm.exists():
return str(mac_arm)
# macOS x64
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
if mac_x64.exists():
return str(mac_x64)
# Linux
linux = version_dir / 'chrome-linux' / 'chrome'
if linux.exists():
return str(linux)
return None
@pytest.fixture(scope="session", autouse=True)
def ensure_puppeteer_installed():
"""Ensure puppeteer is installed in LIB_DIR before running tests."""
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
def ensure_chromium_and_puppeteer_installed():
"""Ensure Chromium and puppeteer are installed before running tests."""
from abx_pkg import Binary, NpmProvider
# Rebuild pydantic models
NpmProvider.model_rebuild()
# Check if puppeteer-core is already available
# Install puppeteer-core if not available
puppeteer_core_path = NODE_MODULES_DIR / 'puppeteer-core'
if puppeteer_core_path.exists():
return # Already installed
if not puppeteer_core_path.exists():
print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...")
NPM_PREFIX.mkdir(parents=True, exist_ok=True)
print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...")
NPM_PREFIX.mkdir(parents=True, exist_ok=True)
provider = NpmProvider(npm_prefix=NPM_PREFIX)
try:
binary = Binary(
name='puppeteer',
binproviders=[provider],
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
)
binary.install()
print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}")
except Exception as e:
pytest.skip(f"Failed to install puppeteer: {e}")
# Install puppeteer using NpmProvider with custom prefix
provider = NpmProvider(npm_prefix=NPM_PREFIX)
try:
binary = Binary(
name='puppeteer',
binproviders=[provider],
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
# Install Chromium via @puppeteer/browsers if not available
chromium_binary = find_chromium_binary()
if not chromium_binary:
print(f"\n[*] Installing Chromium to {CHROMIUM_INSTALL_DIR}...")
CHROMIUM_INSTALL_DIR.mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
cwd=str(CHROMIUM_INSTALL_DIR.parent),
capture_output=True,
text=True,
timeout=300
)
binary.install()
print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}")
except Exception as e:
pytest.skip(f"Failed to install puppeteer: {e}")
if result.returncode != 0:
pytest.skip(f"Failed to install Chromium: {result.stderr}")
chromium_binary = find_chromium_binary()
if not chromium_binary:
pytest.skip("Chromium installed but binary not found")
print(f"[*] Chromium installed: {chromium_binary}")
# Set CHROME_BINARY env var for tests
os.environ['CHROME_BINARY'] = chromium_binary
def test_hook_scripts_exist():
@@ -92,26 +151,22 @@ def test_hook_scripts_exist():
assert CHROME_NAVIGATE_HOOK.exists(), f"Hook not found: {CHROME_NAVIGATE_HOOK}"
def test_verify_deps_with_abx_pkg():
"""Verify chrome is available via abx-pkg."""
from abx_pkg import Binary, NpmProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
def test_verify_chromium_available():
"""Verify Chromium is available via CHROME_BINARY env var."""
chromium_binary = os.environ.get('CHROME_BINARY') or find_chromium_binary()
NpmProvider.model_rebuild()
AptProvider.model_rebuild()
BrewProvider.model_rebuild()
EnvProvider.model_rebuild()
assert chromium_binary, "Chromium binary should be available (set by fixture or found)"
assert Path(chromium_binary).exists(), f"Chromium binary should exist at {chromium_binary}"
# Try to find chrome using same config as install hook
chrome_binary = Binary(
name='chrome',
binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()],
overrides={'npm': {'packages': ['@puppeteer/browsers']}}
# Verify it's actually Chromium by checking version
result = subprocess.run(
[chromium_binary, '--version'],
capture_output=True,
text=True,
timeout=10
)
chrome_loaded = chrome_binary.load()
# Chrome should be available (either found by install hook or at explicit path)
assert chrome_loaded and chrome_loaded.abspath, "Chrome should be available via abx-pkg after install hook runs"
assert Path(chrome_loaded.abspath).exists(), f"Chrome binary should exist at {chrome_loaded.abspath}"
assert result.returncode == 0, f"Failed to get Chromium version: {result.stderr}"
assert 'Chromium' in result.stdout or 'Chrome' in result.stdout, f"Unexpected version output: {result.stdout}"
def test_chrome_launch_and_tab_creation():
@@ -121,7 +176,7 @@ def test_chrome_launch_and_tab_creation():
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
# Get test environment with NODE_PATH set
# Get test environment with NODE_MODULES_DIR set
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'

View File

@@ -12,6 +12,8 @@
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'consolelog';

View File

@@ -40,7 +40,11 @@ if (!getEnvBool('DOM_ENABLED', true)) {
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const { findChromium } = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'dom';
@@ -96,33 +100,6 @@ function getCdpUrl() {
return null;
}
// Find Chrome binary
function findChrome() {
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary && fs.existsSync(chromeBinary)) {
return chromeBinary;
}
const candidates = [
// Linux
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
// macOS
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
];
for (const candidate of candidates) {
if (candidate.startsWith('/') && fs.existsSync(candidate)) {
return candidate;
}
}
return null;
}
// Parse resolution string
function parseResolution(resolution) {
const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
@@ -175,7 +152,7 @@ async function dumpDom(url) {
// Fall back to launching new browser
if (!browser) {
const executablePath = findChrome();
const executablePath = findChromium();
if (!executablePath) {
return { success: false, error: 'Chrome binary not found' };
}

View File

@@ -27,7 +27,7 @@ DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
TEST_URL = 'https://example.com'
# Get LIB_DIR for NODE_PATH
# Get LIB_DIR for NODE_MODULES_DIR
def get_lib_dir():
"""Get LIB_DIR for tests."""
from archivebox.config.common import STORAGE_CONFIG
@@ -37,9 +37,9 @@ LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
"""Get environment with NODE_MODULES_DIR set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
env['LIB_DIR'] = str(LIB_DIR)
return env

View File

@@ -45,6 +45,8 @@ if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'infiniscroll';

View File

@@ -34,9 +34,9 @@ TEST_URL = 'https://www.singsing.movie/'
def get_node_modules_dir():
"""Get NODE_MODULES_DIR for tests, checking env first."""
# Check if NODE_PATH is already set in environment
if os.environ.get('NODE_PATH'):
return Path(os.environ['NODE_PATH'])
# Check if NODE_MODULES_DIR is already set in environment
if os.environ.get('NODE_MODULES_DIR'):
return Path(os.environ['NODE_MODULES_DIR'])
# Otherwise compute from LIB_DIR
from archivebox.config.common import STORAGE_CONFIG
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
@@ -47,9 +47,9 @@ NODE_MODULES_DIR = get_node_modules_dir()
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
"""Get environment with NODE_MODULES_DIR set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
return env

View File

@@ -21,7 +21,7 @@ const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome/chrome_extension_utils.js');
const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {

View File

@@ -6,8 +6,10 @@ Tests invoke the plugin hook as an external process and verify outputs/side effe
import json
import os
import signal
import subprocess
import tempfile
import time
from pathlib import Path
import pytest
@@ -120,3 +122,435 @@ def test_no_configuration_required():
# Should not require any API keys or configuration
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
def setup_test_lib_dirs(tmpdir: Path) -> dict:
    """Create isolated lib directories for tests and return env dict.

    Sets up:
        LIB_DIR: tmpdir/lib/<arch>
        NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
        NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
        PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
        PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin
    """
    import platform
    machine_label = f"{platform.machine()}-{platform.system().lower()}"

    lib_dir = tmpdir / 'lib' / machine_label
    npm_dir = lib_dir / 'npm'
    node_modules_dir = npm_dir / 'node_modules'
    npm_bin_dir = npm_dir / 'bin'
    pip_venv_dir = lib_dir / 'pip' / 'venv'
    pip_bin_dir = pip_venv_dir / 'bin'

    # Create the full directory tree up front
    for directory in (node_modules_dir, npm_bin_dir, pip_bin_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # The embedded node test scripts need puppeteer-core; install it into the
    # isolated prefix once per tmpdir if it is not already present.
    if not (node_modules_dir / 'puppeteer-core').exists():
        result = subprocess.run(
            ['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")

    return {
        'LIB_DIR': str(lib_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'NPM_BIN_DIR': str(npm_bin_dir),
        'PIP_VENV_DIR': str(pip_venv_dir),
        'PIP_BIN_DIR': str(pip_bin_dir),
    }
def find_chromium_binary():
    """Locate the Chromium executable installed by @puppeteer/browsers.

    Scans ``<DATA_DIR>/chromium/<version>/`` directories (newest first) for
    the platform-specific binary layout and returns the binary path as a
    string, or ``None`` if no installed binary is found.
    """
    install_root = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
    if not install_root.exists():
        return None

    # Relative location of the chromium binary for each platform layout,
    # checked in this order within every version directory.
    candidate_layouts = (
        Path('chrome-mac') / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium',      # macOS ARM
        Path('chrome-mac-x64') / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium',  # macOS x64
        Path('chrome-linux') / 'chrome',                                              # Linux
    )

    # Newest version directories sort last lexicographically, so iterate reversed
    for version_dir in sorted(install_root.iterdir(), reverse=True):
        if not version_dir.is_dir():
            continue
        for relative_path in candidate_layouts:
            candidate = version_dir / relative_path
            if candidate.exists():
                return str(candidate)
    return None
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
TEST_URL = 'https://www.filmin.es/'
def test_extension_loads_in_chromium():
"""Verify extension loads in Chromium by visiting its options page.
Uses Chromium with --load-extension to load the extension, then navigates
to chrome-extension://<id>/options.html and checks that the extension name
appears in the page content.
"""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
# Step 1: Install the extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
cwd=str(crawl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
assert cdp_url, "Chromium CDP URL not found after 20s"
print(f"Chromium launched with CDP URL: {cdp_url}")
# Check that extensions were loaded
extensions_file = chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
try:
# Step 3: Connect to Chromium and verify extension loaded via options page
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
// Wait for extension to initialize
await new Promise(r => setTimeout(r, 2000));
// Find extension targets to get the extension ID
const targets = browser.targets();
const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
// Filter out Chrome's built-in extensions
const builtinIds = ['nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf',
'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai'];
const customExtTargets = extTargets.filter(t => {{
const url = t.url();
if (!url.startsWith('chrome-extension://')) return false;
const extId = url.split('://')[1].split('/')[0];
return !builtinIds.includes(extId);
}});
console.error('Custom extension targets found:', customExtTargets.length);
customExtTargets.forEach(t => console.error(' -', t.type(), t.url()));
if (customExtTargets.length === 0) {{
console.log(JSON.stringify({{ loaded: false, error: 'No custom extension targets found' }}));
browser.disconnect();
return;
}}
// Get the extension ID from the first custom extension target
const extUrl = customExtTargets[0].url();
const extId = extUrl.split('://')[1].split('/')[0];
console.error('Extension ID:', extId);
// Try to navigate to the extension's options.html page
const page = await browser.newPage();
const optionsUrl = 'chrome-extension://' + extId + '/options.html';
console.error('Navigating to options page:', optionsUrl);
try {{
await page.goto(optionsUrl, {{ waitUntil: 'domcontentloaded', timeout: 10000 }});
const pageContent = await page.content();
const pageTitle = await page.title();
// Check if extension name appears in the page
const hasExtensionName = pageContent.toLowerCase().includes('cookie') ||
pageContent.toLowerCase().includes('idontcareaboutcookies') ||
pageTitle.toLowerCase().includes('cookie');
console.log(JSON.stringify({{
loaded: true,
extensionId: extId,
optionsPageLoaded: true,
pageTitle: pageTitle,
hasExtensionName: hasExtensionName,
contentLength: pageContent.length
}}));
}} catch (e) {{
// options.html may not exist, but extension is still loaded
console.log(JSON.stringify({{
loaded: true,
extensionId: extId,
optionsPageLoaded: false,
error: e.message
}}));
}}
browser.disconnect();
}})();
'''
script_path = tmpdir / 'test_extension.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=90
)
print(f"stderr: {result.stderr}")
print(f"stdout: {result.stdout}")
assert result.returncode == 0, f"Test failed: {result.stderr}"
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
assert output_lines, f"No JSON output: {result.stdout}"
test_result = json.loads(output_lines[-1])
assert test_result.get('loaded'), \
f"Extension should be loaded in Chromium. Result: {test_result}"
print(f"Extension loaded successfully: {test_result}")
finally:
# Clean up Chromium
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
def test_hides_cookie_consent_on_filmin():
"""Live test: verify extension hides cookie consent popup on filmin.es.
Uses Chromium with extensions loaded automatically via chrome hook.
"""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
# Step 1: Install the extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
cwd=str(crawl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
assert cdp_url, "Chromium CDP URL not found after 20s"
print(f"Chromium launched with CDP URL: {cdp_url}")
try:
# Step 3: Connect to Chromium and test cookie consent hiding
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
// Wait for extension to initialize
await new Promise(r => setTimeout(r, 2000));
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
console.error('Navigating to {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
// Wait for extension content script to process page
await new Promise(r => setTimeout(r, 5000));
// Check cookie consent visibility
const result = await page.evaluate(() => {{
const selectors = ['.cky-consent-container', '.cky-popup-center', '.cky-overlay'];
for (const sel of selectors) {{
const el = document.querySelector(sel);
if (el) {{
const style = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const visible = style.display !== 'none' &&
style.visibility !== 'hidden' &&
rect.width > 0 && rect.height > 0;
if (visible) return {{ visible: true, selector: sel }};
}}
}}
return {{ visible: false }};
}});
console.error('Cookie consent:', JSON.stringify(result));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
script_path = tmpdir / 'test_extension.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=90
)
print(f"stderr: {result.stderr}")
print(f"stdout: {result.stdout}")
assert result.returncode == 0, f"Test failed: {result.stderr}"
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
assert output_lines, f"No JSON output: {result.stdout}"
test_result = json.loads(output_lines[-1])
assert not test_result['visible'], \
f"Cookie consent should be hidden by extension. Result: {test_result}"
finally:
# Clean up Chromium
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass

View File

@@ -44,6 +44,8 @@ if (!getEnvBool('MODALCLOSER_ENABLED', true)) {
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'modalcloser';
@@ -156,22 +158,59 @@ async function closeModals(page) {
// Generic fallback - hide unrecognized modals with CSS
const genericSelectors = [
// CookieYes (cky) - popular cookie consent library
'.cky-consent-container',
'.cky-popup-center',
'.cky-overlay',
'.cky-modal',
'#ckyPreferenceCenter',
// CookieYes (cky)
'.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal', '#ckyPreferenceCenter',
// OneTrust
'#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter', '#onetrust-pc-sdk',
// CookieBot
'#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay', '#CookiebotWidget',
// Quantcast / CMP
'.qc-cmp-ui-container', '#qc-cmp2-container', '.qc-cmp2-summary-buttons',
// TrustArc / TrustE
'#truste-consent-track', '.truste-banner', '#truste-consent-content',
// Osano
'.osano-cm-window', '.osano-cm-dialog',
// Klaro
'.klaro .cookie-modal', '.klaro .cookie-notice',
// Tarteaucitron
'#tarteaucitronRoot', '#tarteaucitronAlertBig',
// Complianz (WordPress)
'.cmplz-cookiebanner', '#cmplz-cookiebanner-container',
// GDPR Cookie Consent (WordPress)
'#gdpr-cookie-consent-bar', '.gdpr-cookie-consent-popup',
// Cookie Notice (WordPress)
'#cookie-notice', '.cookie-notice-container',
// EU Cookie Law
'.eupopup', '#eu-cookie-law',
// Didomi
'#didomi-popup', '#didomi-host', '.didomi-popup-container',
// Usercentrics
'#usercentrics-root', '.uc-banner',
// Axeptio
'#axeptio_overlay', '#axeptio_btn',
// iubenda
'#iubenda-cs-banner', '.iubenda-cs-container',
// Termly
'.termly-consent-banner', '#termly-code-snippet-support',
// Borlabs Cookie (WordPress)
'#BorlabsCookieBox', '.BorlabsCookie',
// CookieFirst
'.cookiefirst-root', '#cookiefirst-root',
// CookieScript
'#cookiescript_injected', '.cookiescript_injected_wrapper',
// Civic Cookie Control
'#ccc', '#ccc-overlay',
// Generic patterns
'#cookie-consent', '.cookie-banner', '.cookie-notice',
'#cookieConsent', '.cookie-consent', '.cookies-banner',
'[class*="cookie"][class*="banner"]', '[class*="cookie"][class*="notice"]',
'[class*="cookie"][class*="popup"]', '[class*="cookie"][class*="modal"]',
'[class*="consent"][class*="banner"]', '[class*="consent"][class*="popup"]',
'[class*="gdpr"]', '[class*="privacy"][class*="banner"]',
// Modal overlays and backdrops
'.modal-overlay:not([style*="display: none"])',
'.modal-backdrop:not([style*="display: none"])',
'.overlay-visible',
// Cookie consent banners
'#cookie-consent', '.cookie-banner', '.cookie-notice',
'#cookieConsent', '.cookie-consent', '.cookies-banner',
'[class*="cookie"][class*="banner"]',
'[class*="cookie"][class*="notice"]',
'[class*="gdpr"]',
// Popup overlays
'.popup-overlay', '.newsletter-popup', '.age-gate',
'.subscribe-popup', '.subscription-modal',

View File

@@ -35,9 +35,9 @@ COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/'
def get_node_modules_dir():
"""Get NODE_MODULES_DIR for tests, checking env first."""
# Check if NODE_PATH is already set in environment
if os.environ.get('NODE_PATH'):
return Path(os.environ['NODE_PATH'])
# Check if NODE_MODULES_DIR is already set in environment
if os.environ.get('NODE_MODULES_DIR'):
return Path(os.environ['NODE_MODULES_DIR'])
# Otherwise compute from LIB_DIR
from archivebox.config.common import STORAGE_CONFIG
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
@@ -48,9 +48,9 @@ NODE_MODULES_DIR = get_node_modules_dir()
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
"""Get environment with NODE_MODULES_DIR set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
return env

View File

@@ -90,6 +90,32 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
}
print(json.dumps(record))
# Emit PATH update if npm bin dir not already in PATH
npm_bin_dir = str(npm_prefix / 'bin')
current_path = os.environ.get('PATH', '')
# Check if npm_bin_dir is already in PATH
path_dirs = current_path.split(':')
if npm_bin_dir not in path_dirs:
# Prepend npm_bin_dir to PATH
new_path = f"{npm_bin_dir}:{current_path}" if current_path else npm_bin_dir
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/PATH',
'value': new_path,
}))
click.echo(f" Added {npm_bin_dir} to PATH", err=True)
# Also emit NODE_MODULES_DIR for JS module resolution
node_modules_dir = str(npm_prefix / 'node_modules')
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/NODE_MODULES_DIR',
'value': node_modules_dir,
}))
# Log human-readable info to stderr
click.echo(f"Installed {name} at {binary.abspath}", err=True)
click.echo(f" version: {binary.version}", err=True)

View File

@@ -20,6 +20,8 @@
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Extractor metadata

View File

@@ -40,7 +40,10 @@ if (!getEnvBool('PDF_ENABLED', true)) {
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const { findChromium } = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'pdf';
@@ -96,33 +99,6 @@ function getCdpUrl() {
return null;
}
// Find Chrome binary
function findChrome() {
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary && fs.existsSync(chromeBinary)) {
return chromeBinary;
}
const candidates = [
// Linux
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
// macOS
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
];
for (const candidate of candidates) {
if (candidate.startsWith('/') && fs.existsSync(candidate)) {
return candidate;
}
}
return null;
}
// Parse resolution string
function parseResolution(resolution) {
const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
@@ -175,7 +151,7 @@ async function printToPdf(url) {
// Fall back to launching new browser
if (!browser) {
const executablePath = findChrome();
const executablePath = findChromium();
if (!executablePath) {
return { success: false, error: 'Chrome binary not found' };
}

View File

@@ -28,7 +28,7 @@ PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
TEST_URL = 'https://example.com'
# Get LIB_DIR for NODE_PATH
# Get LIB_DIR for NODE_MODULES_DIR
def get_lib_dir():
"""Get LIB_DIR for tests."""
from archivebox.config.common import STORAGE_CONFIG
@@ -38,9 +38,9 @@ LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
"""Get environment with NODE_MODULES_DIR set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
env['LIB_DIR'] = str(LIB_DIR)
return env

View File

@@ -15,7 +15,7 @@ import sys
from pathlib import Path
import rich_click as click
from abx_pkg import Binary, PipProvider
from abx_pkg import Binary, PipProvider, BinProviderOverrides
# Fix pydantic forward reference issue
PipProvider.model_rebuild()
@@ -87,6 +87,23 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override
}
print(json.dumps(record))
# Emit PATH update if pip bin dir not already in PATH
pip_bin_dir = str(pip_venv_path / 'bin')
current_path = os.environ.get('PATH', '')
# Check if pip_bin_dir is already in PATH
path_dirs = current_path.split(':')
if pip_bin_dir not in path_dirs:
# Prepend pip_bin_dir to PATH
new_path = f"{pip_bin_dir}:{current_path}" if current_path else pip_bin_dir
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/PATH',
'value': new_path,
}))
click.echo(f" Added {pip_bin_dir} to PATH", err=True)
# Log human-readable info to stderr
click.echo(f"Installed {name} at {binary.abspath}", err=True)
click.echo(f" version: {binary.version}", err=True)

View File

@@ -12,6 +12,8 @@
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'redirects';

View File

@@ -13,6 +13,8 @@
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'responses';

View File

@@ -40,7 +40,10 @@ if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const { findChromium } = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'screenshot';
@@ -96,36 +99,6 @@ function getCdpUrl() {
return null;
}
// Find Chrome binary
function findChrome() {
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary && fs.existsSync(chromeBinary)) {
return chromeBinary;
}
const candidates = [
// Linux
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
// macOS
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
// Common paths
'google-chrome',
'chromium',
];
for (const candidate of candidates) {
if (candidate.startsWith('/') && fs.existsSync(candidate)) {
return candidate;
}
}
return null;
}
// Parse resolution string
function parseResolution(resolution) {
const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
@@ -178,7 +151,7 @@ async function takeScreenshot(url) {
// Fall back to launching new browser
if (!browser) {
const executablePath = findChrome();
const executablePath = findChromium();
if (!executablePath) {
return { success: false, error: 'Chrome binary not found' };
}

View File

@@ -26,7 +26,7 @@ PLUGINS_ROOT = PLUGIN_DIR.parent
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
TEST_URL = 'https://example.com'
# Get LIB_DIR for NODE_PATH
# Get LIB_DIR for NODE_MODULES_DIR
def get_lib_dir():
"""Get LIB_DIR for tests."""
from archivebox.config.common import STORAGE_CONFIG
@@ -36,9 +36,9 @@ LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
"""Get environment with NODE_MODULES_DIR set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
env['LIB_DIR'] = str(LIB_DIR)
return env

View File

@@ -17,6 +17,8 @@
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Extractor metadata

View File

@@ -25,7 +25,7 @@ const { exec } = require('child_process');
const execAsync = promisify(exec);
// Import extension utilities
const extensionUtils = require('../chrome/chrome_extension_utils.js');
const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {

View File

@@ -12,6 +12,8 @@
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'ssl';

View File

@@ -12,6 +12,8 @@
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'staticfile';

View File

@@ -22,7 +22,7 @@ const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome/chrome_extension_utils.js');
const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {

View File

@@ -155,3 +155,461 @@ def test_large_extension_size():
# uBlock Origin with filter lists is typically 2-5 MB
size_bytes = crx_file.stat().st_size
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
def setup_test_lib_dirs(tmpdir: Path) -> dict:
    """Create isolated lib directories for tests and return env dict.

    Layout created under tmpdir:
        LIB_DIR:          tmpdir/lib/<arch>
        NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
        NPM_BIN_DIR:      tmpdir/lib/<arch>/npm/bin
        PIP_VENV_DIR:     tmpdir/lib/<arch>/pip/venv
        PIP_BIN_DIR:      tmpdir/lib/<arch>/pip/venv/bin

    Installs puppeteer-core into the npm prefix on first use; skips the
    calling test if that install fails.
    """
    import platform
    machine_label = f"{platform.machine()}-{platform.system().lower()}"
    lib_root = tmpdir / 'lib' / machine_label
    npm_prefix = lib_root / 'npm'
    modules_dir = npm_prefix / 'node_modules'
    npm_bins = npm_prefix / 'bin'
    venv_dir = lib_root / 'pip' / 'venv'
    venv_bins = venv_dir / 'bin'
    # Materialize the directory tree (idempotent)
    for directory in (modules_dir, npm_bins, venv_bins):
        directory.mkdir(parents=True, exist_ok=True)
    # Install puppeteer-core into the isolated node_modules if missing
    if not (modules_dir / 'puppeteer-core').exists():
        proc = subprocess.run(
            ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core'],
            capture_output=True,
            text=True,
            timeout=120
        )
        if proc.returncode != 0:
            pytest.skip(f"Failed to install puppeteer-core: {proc.stderr}")
    return {
        'LIB_DIR': str(lib_root),
        'NODE_MODULES_DIR': str(modules_dir),
        'NPM_BIN_DIR': str(npm_bins),
        'PIP_VENV_DIR': str(venv_dir),
        'PIP_BIN_DIR': str(venv_bins),
    }
def find_chromium_binary():
    """Find the Chromium binary installed by @puppeteer/browsers.

    Scans DATA_DIR/chromium/<version>/ directories (newest first by
    reverse-sorted name) for known per-platform binary layouts and returns
    the first match as a string path, or None when nothing is found.
    """
    # Relative binary locations, probed in order: macOS ARM, macOS x64, Linux
    candidates = (
        ('chrome-mac', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),
        ('chrome-mac-x64', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),
        ('chrome-linux', 'chrome'),
    )
    root = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
    if not root.exists():
        return None
    for version_dir in sorted(root.iterdir(), reverse=True):
        if not version_dir.is_dir():
            continue
        for parts in candidates:
            binary = version_dir.joinpath(*parts)
            if binary.exists():
                return str(binary)
    return None
# Root directory containing all plugin subdirectories (chrome, ublock, ...)
PLUGINS_ROOT = PLUGIN_DIR.parent
# Background hook that launches Chromium for a crawl and writes chrome/cdp_url.txt
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
# Test URL: ad blocker test page that shows if ads are blocked
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
def test_extension_loads_in_chromium():
    """Verify uBlock extension loads in Chromium by visiting its dashboard page.

    Uses Chromium with --load-extension to load the extension, then navigates
    to chrome-extension://<id>/dashboard.html and checks that "uBlock" appears
    in the page content.

    Flow:
      1. Install the uBlock extension via INSTALL_SCRIPT (cached in
         CHROME_EXTENSIONS_DIR as ublock.extension.json).
      2. Launch Chromium through CHROME_LAUNCH_HOOK, which auto-loads cached
         extensions and writes chrome/cdp_url.txt once CDP is up.
      3. Connect over CDP with puppeteer-core, find a non-builtin
         chrome-extension:// target, and load its dashboard.html.
    """
    import signal
    import time
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set up isolated lib directories for this test
        lib_env = setup_test_lib_dirs(tmpdir)
        # Set up extensions directory
        ext_dir = tmpdir / 'chrome_extensions'
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env.update(lib_env)
        env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
        env['CHROME_HEADLESS'] = 'true'
        # Ensure CHROME_BINARY points to Chromium
        chromium = find_chromium_binary()
        if chromium:
            env['CHROME_BINARY'] = chromium
        # Step 1: Install the uBlock extension
        result = subprocess.run(
            ['node', str(INSTALL_SCRIPT)],
            cwd=str(tmpdir),
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )
        assert result.returncode == 0, f"Extension install failed: {result.stderr}"
        # Verify extension cache was created
        cache_file = ext_dir / 'ublock.extension.json'
        assert cache_file.exists(), "Extension cache not created"
        ext_data = json.loads(cache_file.read_text())
        print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
        # Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
        crawl_dir = tmpdir / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
            cwd=str(crawl_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )
        # Wait for Chromium to launch and CDP URL to be available
        cdp_url = None
        for i in range(20):
            if chrome_launch_process.poll() is not None:
                stdout, stderr = chrome_launch_process.communicate()
                raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
            cdp_file = chrome_dir / 'cdp_url.txt'
            if cdp_file.exists():
                cdp_url = cdp_file.read_text().strip()
                break
            time.sleep(1)
        assert cdp_url, "Chromium CDP URL not found after 20s"
        print(f"Chromium launched with CDP URL: {cdp_url}")
        # Print chrome hook stderr for debugging.
        # Use a bounded os.read() instead of .read(): the launcher is still
        # running, so an unbounded .read() would block until EOF and hang here.
        import select
        if select.select([chrome_launch_process.stderr], [], [], 0.1)[0]:
            chrome_stderr = os.read(chrome_launch_process.stderr.fileno(), 65536).decode('utf-8', 'replace')
            print(f"Chrome hook stderr:\n{chrome_stderr}")
        # Check what extensions were loaded by chrome hook
        extensions_file = chrome_dir / 'extensions.json'
        if extensions_file.exists():
            loaded_exts = json.loads(extensions_file.read_text())
            print(f"Extensions loaded by chrome hook: {[e.get('name') for e in loaded_exts]}")
        else:
            print("Warning: extensions.json not found")
        # Get the unpacked extension ID - Chrome computes this from the path
        unpacked_path = ext_data.get('unpacked_path', '')
        print(f"Extension unpacked path: {unpacked_path}")
        try:
            # Step 3: Connect to Chromium and verify extension loads
            # First use CDP to get all targets and find extension ID
            test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
    // Wait for extension to initialize
    await new Promise(r => setTimeout(r, 3000));
    // Use CDP to get all targets including service workers
    const pages = await browser.pages();
    const page = pages[0] || await browser.newPage();
    const client = await page.createCDPSession();
    const {{ targetInfos }} = await client.send('Target.getTargets');
    console.error('All CDP targets:');
    targetInfos.forEach(t => console.error('  -', t.type, t.url.slice(0, 100)));
    // Find any chrome-extension:// URLs
    const extTargets = targetInfos.filter(t => t.url.startsWith('chrome-extension://'));
    console.error('Extension targets:', extTargets.length);
    // Filter out built-in extensions
    const builtinIds = ['nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf',
                        'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai'];
    const customExts = extTargets.filter(t => {{
        const extId = t.url.split('://')[1].split('/')[0];
        return !builtinIds.includes(extId);
    }});
    if (customExts.length === 0) {{
        console.log(JSON.stringify({{ loaded: false, error: 'No custom extension found via CDP' }}));
        browser.disconnect();
        return;
    }}
    // Get extension ID from first custom extension
    const extId = customExts[0].url.split('://')[1].split('/')[0];
    console.error('Found extension ID:', extId);
    // Try to load dashboard.html
    const newPage = await browser.newPage();
    const dashboardUrl = 'chrome-extension://' + extId + '/dashboard.html';
    console.error('Loading:', dashboardUrl);
    try {{
        await newPage.goto(dashboardUrl, {{ waitUntil: 'domcontentloaded', timeout: 15000 }});
        const title = await newPage.title();
        const content = await newPage.content();
        const hasUblock = content.toLowerCase().includes('ublock') || title.toLowerCase().includes('ublock');
        console.log(JSON.stringify({{
            loaded: true,
            extensionId: extId,
            pageTitle: title,
            hasExtensionName: hasUblock,
            contentLength: content.length
        }}));
    }} catch (e) {{
        console.error('Dashboard load failed:', e.message);
        console.log(JSON.stringify({{ loaded: true, extensionId: extId, dashboardError: e.message }}));
    }}
    browser.disconnect();
}})();
'''
            script_path = tmpdir / 'test_ublock.js'
            script_path.write_text(test_script)
            result = subprocess.run(
                ['node', str(script_path)],
                cwd=str(tmpdir),
                capture_output=True,
                text=True,
                env=env,
                timeout=90
            )
            print(f"stderr: {result.stderr}")
            print(f"stdout: {result.stdout}")
            assert result.returncode == 0, f"Test failed: {result.stderr}"
            output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
            assert output_lines, f"No JSON output: {result.stdout}"
            test_result = json.loads(output_lines[-1])
            assert test_result.get('loaded'), \
                f"uBlock extension should be loaded in Chromium. Result: {test_result}"
            print(f"Extension loaded successfully: {test_result}")
        finally:
            # Clean up Chromium: first ask the launcher to shut down, then kill
            # the browser directly via its pid file if it is still alive.
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:
                # Best-effort teardown; bare `except:` would also swallow
                # KeyboardInterrupt/SystemExit, which we must not do here.
                pass
            chrome_pid_file = chrome_dir / 'chrome.pid'
            if chrome_pid_file.exists():
                try:
                    chrome_pid = int(chrome_pid_file.read_text().strip())
                    os.kill(chrome_pid, signal.SIGKILL)
                except (OSError, ValueError):
                    pass
def test_blocks_ads_on_test_page():
    """Live test: verify uBlock Origin blocks ads on a test page.

    Uses Chromium with extensions loaded automatically via chrome hook.
    Tests against d3ward's ad blocker test page which checks ad domains.

    Flow:
      1. Install the uBlock extension via INSTALL_SCRIPT.
      2. Launch Chromium through CHROME_LAUNCH_HOOK (writes chrome/cdp_url.txt).
      3. Connect over CDP, navigate to TEST_URL, and assert that at least 50%
         of the listed ad domains were blocked.
    """
    import signal
    import time
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set up isolated lib directories for this test
        lib_env = setup_test_lib_dirs(tmpdir)
        # Set up extensions directory
        ext_dir = tmpdir / 'chrome_extensions'
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env.update(lib_env)
        env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
        env['CHROME_HEADLESS'] = 'true'
        # Ensure CHROME_BINARY points to Chromium
        chromium = find_chromium_binary()
        if chromium:
            env['CHROME_BINARY'] = chromium
        # Step 1: Install the uBlock extension
        result = subprocess.run(
            ['node', str(INSTALL_SCRIPT)],
            cwd=str(tmpdir),
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )
        assert result.returncode == 0, f"Extension install failed: {result.stderr}"
        # Verify extension cache was created
        cache_file = ext_dir / 'ublock.extension.json'
        assert cache_file.exists(), "Extension cache not created"
        ext_data = json.loads(cache_file.read_text())
        print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
        # Step 2: Launch Chrome using the chrome hook (loads extensions automatically)
        crawl_dir = tmpdir / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
            cwd=str(crawl_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )
        # Wait for Chrome to launch and CDP URL to be available
        cdp_url = None
        for i in range(20):
            if chrome_launch_process.poll() is not None:
                stdout, stderr = chrome_launch_process.communicate()
                raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
            cdp_file = chrome_dir / 'cdp_url.txt'
            if cdp_file.exists():
                cdp_url = cdp_file.read_text().strip()
                break
            time.sleep(1)
        assert cdp_url, "Chrome CDP URL not found after 20s"
        print(f"Chrome launched with CDP URL: {cdp_url}")
        # Check that extensions were loaded
        extensions_file = chrome_dir / 'extensions.json'
        if extensions_file.exists():
            loaded_exts = json.loads(extensions_file.read_text())
            print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
        try:
            # Step 3: Connect to Chrome and test ad blocking
            test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
    // Wait for extension to initialize
    await new Promise(r => setTimeout(r, 3000));
    // Check extension loaded by looking at targets
    const targets = browser.targets();
    const extTargets = targets.filter(t =>
        t.url().startsWith('chrome-extension://') ||
        t.type() === 'service_worker' ||
        t.type() === 'background_page'
    );
    console.error('Extension targets found:', extTargets.length);
    extTargets.forEach(t => console.error('  -', t.type(), t.url().substring(0, 60)));
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
    await page.setViewport({{ width: 1440, height: 900 }});
    console.error('Navigating to {TEST_URL}...');
    await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }});
    // Wait for the test page to run its checks
    await new Promise(r => setTimeout(r, 5000));
    // The d3ward test page shows blocked percentage
    const result = await page.evaluate(() => {{
        const scoreEl = document.querySelector('#score');
        const score = scoreEl ? scoreEl.textContent : null;
        const blockedItems = document.querySelectorAll('.blocked').length;
        const totalItems = document.querySelectorAll('.testlist li').length;
        return {{
            score,
            blockedItems,
            totalItems,
            percentBlocked: totalItems > 0 ? Math.round((blockedItems / totalItems) * 100) : 0
        }};
    }});
    console.error('Ad blocking result:', JSON.stringify(result));
    browser.disconnect();
    console.log(JSON.stringify(result));
}})();
'''
            script_path = tmpdir / 'test_ublock.js'
            script_path.write_text(test_script)
            result = subprocess.run(
                ['node', str(script_path)],
                cwd=str(tmpdir),
                capture_output=True,
                text=True,
                env=env,
                timeout=90
            )
            print(f"stderr: {result.stderr}")
            print(f"stdout: {result.stdout}")
            assert result.returncode == 0, f"Test failed: {result.stderr}"
            output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
            assert output_lines, f"No JSON output: {result.stdout}"
            test_result = json.loads(output_lines[-1])
            # uBlock should block most ad domains on the test page
            assert test_result['percentBlocked'] >= 50, \
                f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}"
        finally:
            # Clean up Chrome: first ask the launcher to shut down, then kill
            # the browser directly via its pid file if it is still alive.
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:
                # Best-effort teardown; bare `except:` would also swallow
                # KeyboardInterrupt/SystemExit, which we must not do here.
                pass
            chrome_pid_file = chrome_dir / 'chrome.pid'
            if chrome_pid_file.exists():
                try:
                    chrome_pid = int(chrome_pid_file.read_text().strip())
                    os.kill(chrome_pid, signal.SIGKILL)
                except (OSError, ValueError):
                    pass