mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-02 17:05:38 +10:00
fix extension loading and consolidate chromium logic
This commit is contained in:
@@ -328,6 +328,21 @@ def run_hook(
|
||||
env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
|
||||
env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
|
||||
|
||||
# Use Machine.config.PATH if set (includes pip/npm bin dirs from providers)
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
machine = Machine.current()
|
||||
if machine and machine.config:
|
||||
machine_path = machine.config.get('config/PATH')
|
||||
if machine_path:
|
||||
env['PATH'] = machine_path
|
||||
# Also set NODE_MODULES_DIR if configured
|
||||
node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
|
||||
if node_modules_dir:
|
||||
env['NODE_MODULES_DIR'] = node_modules_dir
|
||||
except Exception:
|
||||
pass # Fall back to system PATH if Machine not available
|
||||
|
||||
# Export all config values to environment (already merged by get_config())
|
||||
for key, value in config.items():
|
||||
if value is None:
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
|
||||
@@ -20,7 +20,7 @@ const path = require('path');
|
||||
const fs = require('fs');
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome/chrome_extension_utils.js');
|
||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
|
||||
@@ -15,6 +15,8 @@
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Get crawl's chrome directory from environment variable set by hooks.py
|
||||
|
||||
@@ -1,483 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Chrome Extension Management Utilities
|
||||
*
|
||||
* Handles downloading, installing, and managing Chrome extensions for browser automation.
|
||||
* Ported from the TypeScript implementation in archivebox.ts
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const crypto = require('crypto');
|
||||
const { exec } = require('child_process');
|
||||
const { promisify } = require('util');
|
||||
const { Readable } = require('stream');
|
||||
const { finished } = require('stream/promises');
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// Optional dependency: the 'unzipper' package.
// When it is installed, `unzip` becomes an async helper that extracts an
// archive into a directory; when it is missing, `unzip` stays null and
// callers fall back to the system `unzip` binary instead.
let unzip = null;
try {
  const unzipper = require('unzipper');
  unzip = async (sourcePath, destPath) => {
    const extraction = fs
      .createReadStream(sourcePath)
      .pipe(unzipper.Extract({ path: destPath }));
    return extraction.promise();
  };
} catch (err) {
  // 'unzipper' is not installed; the system unzip command will be used instead.
}
|
||||
|
||||
/**
 * Compute the extension ID for an unpacked extension directory.
 *
 * Chrome derives a dynamic ID for unpacked extensions by hashing the
 * directory path with SHA-256, then re-encoding the first 32 hex digits
 * as the letters 'a'..'p' (one letter per hex nibble).
 *
 * @param {string} unpacked_path - Path to the unpacked extension directory
 * @returns {string} - 32-character extension ID
 */
function getExtensionId(unpacked_path) {
  const digest = crypto
    .createHash('sha256')
    .update(Buffer.from(unpacked_path, 'utf-8'))
    .digest('hex');

  // Re-encode each of the first 32 hex digits into the 'a'-'p' alphabet
  let extension_id = '';
  for (const nibble of digest.slice(0, 32)) {
    extension_id += String.fromCharCode('a'.charCodeAt(0) + parseInt(nibble, 16));
  }

  return extension_id;
}
|
||||
|
||||
/**
 * Download and install a Chrome extension from the Chrome Web Store.
 *
 * Downloads the CRX archive (only if neither the unpacked manifest nor the
 * CRX file already exist on disk), then extracts it into
 * `extension.unpacked_path` using the system `unzip` binary, falling back to
 * the optional 'unzipper' library when the system command fails.
 *
 * @param {Object} extension - Extension metadata object
 * @param {string} extension.webstore_id - Chrome Web Store extension ID
 * @param {string} extension.name - Human-readable extension name
 * @param {string} extension.crx_url - URL to download the CRX file
 * @param {string} extension.crx_path - Local path to save the CRX file
 * @param {string} extension.unpacked_path - Path to extract the extension
 * @returns {Promise<boolean>} - True if installation succeeded
 */
async function installExtension(extension) {
  const manifest_path = path.join(extension.unpacked_path, 'manifest.json');

  // Download CRX file if not already downloaded
  if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
    console.log(`[🛠️] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`);

    try {
      // Ensure parent directory exists
      const crxDir = path.dirname(extension.crx_path);
      if (!fs.existsSync(crxDir)) {
        fs.mkdirSync(crxDir, { recursive: true });
      }

      // Download CRX file from Chrome Web Store
      const response = await fetch(extension.crx_url);

      if (!response.ok) {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: HTTP ${response.status}`);
        return false;
      }

      if (response.body) {
        // Stream the web-standard response body straight to disk
        const crx_file = fs.createWriteStream(extension.crx_path);
        const crx_stream = Readable.fromWeb(response.body);
        await finished(crx_stream.pipe(crx_file));
      } else {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: No response body`);
        return false;
      }
    } catch (err) {
      console.error(`[❌] Failed to download extension ${extension.name}:`, err);
      return false;
    }
  }

  // Unzip CRX file to unpacked_path
  await fs.promises.mkdir(extension.unpacked_path, { recursive: true });

  try {
    // Try system unzip command first.
    // BUGFIX: quote both paths so directories containing spaces or shell
    // metacharacters don't break the command (or allow shell injection).
    await execAsync(`/usr/bin/unzip -o "${extension.crx_path}" -d "${extension.unpacked_path}"`);
  } catch (err1) {
    if (unzip) {
      // Fallback to unzipper library
      try {
        await unzip(extension.crx_path, extension.unpacked_path);
      } catch (err2) {
        // BUGFIX: report the unzipper error (err2) rather than the earlier
        // system-unzip error (err1), so the logged cause is the actual failure.
        console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err2.message);
        return false;
      }
    } else {
      console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message);
      return false;
    }
  }

  // Installation only counts as successful if a manifest.json appeared
  if (!fs.existsSync(manifest_path)) {
    console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`);
    return false;
  }

  return true;
}
|
||||
|
||||
/**
 * Load or install a Chrome extension, computing all metadata.
 *
 * Fills in derivable fields (webstore URL, CRX download URL, local paths),
 * attaches manifest-reading helpers, installs the extension if it is not
 * already unpacked, and derives the dynamic extension ID from its path.
 *
 * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path)
 * @param {string} [ext.webstore_id] - Chrome Web Store extension ID
 * @param {string} [ext.name] - Human-readable extension name
 * @param {string} [ext.unpacked_path] - Path to unpacked extension
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Object>} - Complete extension metadata object
 */
async function loadOrInstallExtension(ext, extensions_dir = null) {
  if (!ext.webstore_id && !ext.unpacked_path) {
    throw new Error('Extension must have either {webstore_id} or {unpacked_path}');
  }

  // Resolve the directory where extensions are cached
  const extensionsDir = extensions_dir || process.env.CHROME_EXTENSIONS_DIR || './data/chrome_extensions';

  // Fill in any statically-derivable metadata that wasn't provided.
  // NOTE: order matters — crx_path/unpacked_path depend on webstore_id and name.
  ext.webstore_id = ext.webstore_id || ext.id;
  ext.name = ext.name || ext.webstore_id;
  ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`;
  ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`;
  ext.crx_path = ext.crx_path || path.join(extensionsDir, `${ext.webstore_id}__${ext.name}.crx`);
  ext.unpacked_path = ext.unpacked_path || path.join(extensionsDir, `${ext.webstore_id}__${ext.name}`);

  // Helpers that read the on-disk manifest lazily
  const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
  ext.read_manifest = () => JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
  ext.read_version = () => fs.existsSync(manifestPath) && ext.read_manifest()?.version || null;

  // No readable version on disk means the extension isn't installed yet
  if (!ext.read_version()) {
    await installExtension(ext);
  }

  // Unpacked extensions don't have stable IDs: derive the ID from the path
  ext.id = getExtensionId(ext.unpacked_path);
  ext.version = ext.read_version();

  if (ext.version) {
    console.log(`[➕] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`);
  } else {
    console.warn(`[❌] Unable to detect ID and version of installed extension ${ext.unpacked_path}`);
  }

  return ext;
}
|
||||
|
||||
/**
 * Check if a Puppeteer target is an extension background page/service worker.
 *
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object>} - Object with target_is_bg, extension_id, manifest_version, etc.
 */
async function isTargetExtension(target) {
  let target_type;
  let target_ctx;
  let target_url;

  try {
    target_type = target.type();
    target_ctx = (await target.worker()) || (await target.page()) || null;
    target_url = target.url() || target_ctx?.url() || null;
  } catch (err) {
    // A target can disappear between enumeration and inspection; treat that
    // harmless race as a "closed" target instead of propagating the error.
    if (!String(err).includes('No target with given id found')) {
      throw err;
    }
    target_type = 'closed';
    target_ctx = null;
    target_url = 'about:closed';
  }

  // A background context is either an MV2 background page or an MV3 service worker
  const is_chrome_extension = target_url?.startsWith('chrome-extension://');
  const target_is_bg = is_chrome_extension
    && (target_type === 'background_page' || target_type === 'service_worker');
  const target_is_extension = is_chrome_extension || target_is_bg;

  let extension_id = null;
  let manifest_version = null;

  if (target_is_extension) {
    try {
      // The extension ID is the host portion of the chrome-extension:// URL
      extension_id = target_url?.split('://')[1]?.split('/')[0] || null;

      if (target_ctx) {
        const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
        manifest_version = manifest?.manifest_version || null;
      }
    } catch (err) {
      // Metadata is best-effort; leave extension_id/manifest_version as null
    }
  }

  return {
    target_is_extension,
    target_is_bg,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  };
}
|
||||
|
||||
/**
 * Load extension metadata and connection handlers from a browser target.
 *
 * If `target` is the background context of an extension listed in
 * `extensions`, attaches its manifest and dispatch* helper methods to the
 * matching entry (mutating it in place) and returns the enriched object.
 *
 * @param {Array} extensions - Array of extension metadata objects to update
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object|null>} - Updated extension object or null if not an extension
 */
async function loadExtensionFromTarget(extensions, target) {
  const info = await isTargetExtension(target);
  const { target_is_bg, target_type, target_ctx, target_url, extension_id, manifest_version } = info;

  // Only usable if this is a background context with a known ID and a live JS context
  if (!target_is_bg || !extension_id || !target_ctx) {
    return null;
  }

  // Match the runtime target against our configured extension list
  const extension = extensions.find((candidate) => candidate.id === extension_id);
  if (!extension) {
    console.warn(`[⚠️] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`);
    return null;
  }

  // Read the manifest from inside the extension's own JS context
  let manifest = null;
  try {
    manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
  } catch (err) {
    console.error(`[❌] Failed to read manifest for extension ${extension_id}:`, err);
    return null;
  }

  // Build the enriched extension object with dispatch helpers that run
  // inside the extension's background context via evaluate()
  const new_extension = {
    ...extension,
    target,
    target_type,
    target_url,
    manifest,
    manifest_version,

    // Trigger extension toolbar button click
    dispatchAction: async (tab) => {
      return await target_ctx.evaluate((tabId) => {
        return new Promise((resolve) => {
          chrome.action.onClicked.addListener((tab) => {
            resolve({ success: true, tab });
          });
          chrome.action.openPopup();
        });
      }, tab?.id || null);
    },

    // Send message to extension
    dispatchMessage: async (message, options = {}) => {
      return await target_ctx.evaluate((msg, opts) => {
        return new Promise((resolve) => {
          chrome.runtime.sendMessage(msg, opts, (response) => {
            resolve(response);
          });
        });
      }, message, options);
    },

    // Trigger extension command (keyboard shortcut)
    dispatchCommand: async (command) => {
      return await target_ctx.evaluate((cmd) => {
        return new Promise((resolve) => {
          chrome.commands.onCommand.addListener((receivedCommand) => {
            if (receivedCommand === cmd) {
              resolve({ success: true, command: receivedCommand });
            }
          });
          // Note: Actually triggering commands programmatically is not directly supported
          // This would need to be done via CDP or keyboard simulation
        });
      }, command);
    },
  };

  // Mutate the caller's entry so the list reflects the connected state
  Object.assign(extension, new_extension);

  console.log(`[🔌] Connected to extension ${extension.name} (${extension.version})`);

  return new_extension;
}
|
||||
|
||||
/**
 * Install all extensions in the list if not already installed.
 *
 * @param {Array} extensions - Array of extension metadata objects
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Array>} - Array of installed extension objects
 */
async function installAllExtensions(extensions, extensions_dir = null) {
  console.log(`[⚙️] Installing ${extensions.length} chrome extensions...`);

  // Install one at a time, in list order
  for (const ext of extensions) {
    await loadOrInstallExtension(ext, extensions_dir);
  }

  return extensions;
}
|
||||
|
||||
/**
 * Load and connect to all extensions from a running browser.
 *
 * @param {Object} browser - Puppeteer browser instance
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Promise<Array>} - Array of loaded extension objects with connection handlers
 */
async function loadAllExtensionsFromBrowser(browser, extensions) {
  console.log(`[⚙️] Loading ${extensions.length} chrome extensions from browser...`);

  // Inspect every open target; extension background contexts are matched
  // against the configured list and wired up with dispatch helpers
  for (const browserTarget of browser.targets()) {
    await loadExtensionFromTarget(extensions, browserTarget);
  }

  return extensions;
}
|
||||
|
||||
/**
 * Load extension manifest.json file
 *
 * @param {string} unpacked_path - Path to unpacked extension directory
 * @returns {object|null} - Parsed manifest object or null if not found/invalid
 */
function loadExtensionManifest(unpacked_path) {
  const manifestFile = path.join(unpacked_path, 'manifest.json');

  if (!fs.existsSync(manifestFile)) {
    return null;
  }

  try {
    return JSON.parse(fs.readFileSync(manifestFile, 'utf-8'));
  } catch (error) {
    // Unreadable file or malformed JSON
    return null;
  }
}
|
||||
|
||||
/**
 * Generate Chrome launch arguments for loading extensions.
 *
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Array<string>} - Chrome CLI arguments for loading extensions
 */
function getExtensionLaunchArgs(extensions) {
  if (!extensions?.length) {
    return [];
  }

  // Ignore entries that were never unpacked to disk
  const installed = extensions.filter((ext) => ext.unpacked_path);
  const unpackedPaths = installed.map((ext) => ext.unpacked_path);
  const extensionIds = installed.map((ext) => ext.webstore_id || ext.id);

  return [
    `--load-extension=${unpackedPaths.join(',')}`,
    `--allowlisted-extension-id=${extensionIds.join(',')}`,
    '--allow-legacy-extension-manifests',
    '--disable-extensions-auto-update',
  ];
}
|
||||
|
||||
// Export all functions
|
||||
module.exports = {
|
||||
getExtensionId,
|
||||
loadExtensionManifest,
|
||||
installExtension,
|
||||
loadOrInstallExtension,
|
||||
isTargetExtension,
|
||||
loadExtensionFromTarget,
|
||||
installAllExtensions,
|
||||
loadAllExtensionsFromBrowser,
|
||||
getExtensionLaunchArgs,
|
||||
};
|
||||
|
||||
// CLI usage: allow invoking individual helpers directly from the shell
if (require.main === module) {
  const argv = process.argv.slice(2);

  if (argv.length === 0) {
    console.log('Usage: chrome_extension_utils.js <command> [args...]');
    console.log('');
    console.log('Commands:');
    console.log(' getExtensionId <path>');
    console.log(' loadExtensionManifest <path>');
    console.log(' getExtensionLaunchArgs <extensions_json>');
    console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
    process.exit(1);
  }

  const [command, ...rest] = argv;

  (async () => {
    try {
      if (command === 'getExtensionId') {
        const [unpacked_path] = rest;
        console.log(getExtensionId(unpacked_path));
      } else if (command === 'loadExtensionManifest') {
        const [unpacked_path] = rest;
        console.log(JSON.stringify(loadExtensionManifest(unpacked_path)));
      } else if (command === 'getExtensionLaunchArgs') {
        const [extensions_json] = rest;
        console.log(JSON.stringify(getExtensionLaunchArgs(JSON.parse(extensions_json))));
      } else if (command === 'loadOrInstallExtension') {
        const [webstore_id, name, extensions_dir] = rest;
        const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir);
        console.log(JSON.stringify(ext, null, 2));
      } else {
        console.error(`Unknown command: ${command}`);
        process.exit(1);
      }
    } catch (error) {
      console.error(`Error: ${error.message}`);
      process.exit(1);
    }
  })();
}
|
||||
1150
archivebox/plugins/chrome/chrome_utils.js
Executable file
1150
archivebox/plugins/chrome/chrome_utils.js
Executable file
File diff suppressed because it is too large
Load Diff
@@ -2,10 +2,14 @@
|
||||
"""
|
||||
Install hook for Chrome/Chromium binary.
|
||||
|
||||
Runs at crawl start to verify Chrome is available.
|
||||
Runs at crawl start to verify Chromium is available.
|
||||
Outputs JSONL for Binary and Machine config updates.
|
||||
Respects CHROME_BINARY env var for custom binary paths.
|
||||
Falls back to `npx @puppeteer/browsers install chrome@stable` if not found.
|
||||
Falls back to `npx @puppeteer/browsers install chromium@latest` if not found.
|
||||
|
||||
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
--load-extension and --disable-extensions-except flags, which are needed for
|
||||
loading unpacked extensions in headless mode.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -14,24 +18,24 @@ import json
|
||||
import subprocess
|
||||
|
||||
|
||||
def install_chromium_via_puppeteer() -> bool:
    """Install Chromium using @puppeteer/browsers.

    Runs ``npx @puppeteer/browsers install chromium@latest`` with a 5-minute
    timeout and reports success via the subprocess exit code.

    Returns:
        bool: True if the install command completed with exit code 0.
    """
    try:
        print("Chromium not found, attempting to install via @puppeteer/browsers...", file=sys.stderr)
        result = subprocess.run(
            ['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
            capture_output=True,
            text=True,
            timeout=300,
        )
        return result.returncode == 0
    except Exception as e:
        # `Exception` already covers TimeoutExpired and FileNotFoundError, so
        # the previous (TimeoutExpired, FileNotFoundError, Exception) tuple was
        # redundant; keep the same best-effort fallback behavior.
        print(f"Failed to install Chromium: {e}", file=sys.stderr)
        return False
||||
|
||||
|
||||
def find_chrome() -> dict | None:
|
||||
"""Find Chrome/Chromium binary, respecting CHROME_BINARY env var."""
|
||||
def find_chromium() -> dict | None:
|
||||
"""Find Chromium binary, respecting CHROME_BINARY env var."""
|
||||
# Quick check: if CHROME_BINARY is set and exists, skip expensive lookup
|
||||
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
|
||||
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
|
||||
@@ -41,9 +45,10 @@ def find_chrome() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BrewProvider, AptProvider
|
||||
|
||||
# Try to find chrome using abx-pkg
|
||||
# Try to find chromium using abx-pkg
|
||||
# Prefer chromium over chrome because Chrome 137+ removed --load-extension support
|
||||
binary = Binary(
|
||||
name='chrome',
|
||||
name='chromium',
|
||||
binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()],
|
||||
overrides={'npm': {'packages': ['@puppeteer/browsers']}}
|
||||
)
|
||||
@@ -51,7 +56,7 @@ def find_chrome() -> dict | None:
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'chrome',
|
||||
'name': 'chromium',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
@@ -59,12 +64,12 @@ def find_chrome() -> dict | None:
|
||||
}
|
||||
|
||||
# If not found, try to install via @puppeteer/browsers
|
||||
if install_chrome_via_puppeteer():
|
||||
if install_chromium_via_puppeteer():
|
||||
# Try loading again after install
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'chrome',
|
||||
'name': 'chromium',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
@@ -77,7 +82,7 @@ def find_chrome() -> dict | None:
|
||||
|
||||
|
||||
def main():
|
||||
result = find_chrome()
|
||||
result = find_chromium()
|
||||
|
||||
if result and result.get('abspath'):
|
||||
print(json.dumps({
|
||||
@@ -99,13 +104,13 @@ def main():
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/CHROME_VERSION',
|
||||
'key': 'config/CHROMIUM_VERSION',
|
||||
'value': result['version'],
|
||||
}))
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(f"Chrome/Chromium binary not found", file=sys.stderr)
|
||||
print(f"Chromium binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
|
||||
@@ -1,55 +1,57 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Launch a shared Chrome browser session for the entire crawl.
|
||||
* Launch a shared Chromium browser session for the entire crawl.
|
||||
*
|
||||
* This runs once per crawl and keeps Chrome alive for all snapshots to share.
|
||||
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
|
||||
* Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
|
||||
*
|
||||
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
* --load-extension and --disable-extensions-except flags.
|
||||
*
|
||||
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Output: Creates chrome/ directory under crawl output dir with:
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - pid.txt: Chrome process ID (for cleanup)
|
||||
* - chrome.pid: Chromium process ID (for cleanup)
|
||||
* - port.txt: Debug port number
|
||||
* - extensions.json: Loaded extensions metadata
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_BINARY: Path to Chrome/Chromium binary
|
||||
* NODE_MODULES_DIR: Path to node_modules directory for module resolution
|
||||
* CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
|
||||
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
|
||||
*/
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { spawn } = require('child_process');
|
||||
const http = require('http');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const {
|
||||
findChromium,
|
||||
launchChromium,
|
||||
killChrome,
|
||||
getEnv,
|
||||
writePidWithMtime,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'chrome_launch';
|
||||
const OUTPUT_DIR = 'chrome';
|
||||
|
||||
// Helpers for PID file creation
|
||||
// Write `pid` to a file and stamp the file's atime/mtime with the process
// start time (given in seconds).
function writePidWithMtime(filePath, pid, startTimeSeconds) {
  fs.writeFileSync(filePath, String(pid));
  const startedAt = new Date(startTimeSeconds * 1000);
  fs.utimesSync(filePath, startedAt, startedAt);
}
|
||||
|
||||
// Write an executable bash script that re-runs `binary` with `args`.
// Args containing spaces, double quotes, or '$' are wrapped in double quotes
// (with embedded quotes backslash-escaped).
function writeCmdScript(filePath, binary, args) {
  const quoteIfNeeded = (arg) => {
    const needsQuoting = arg.includes(' ') || arg.includes('"') || arg.includes('$');
    return needsQuoting ? `"${arg.replace(/"/g, '\\"')}"` : arg;
  };
  const script = `#!/bin/bash\n${binary} ${args.map(quoteIfNeeded).join(' ')}\n`;
  fs.writeFileSync(filePath, script);
  fs.chmodSync(filePath, 0o755);
}
|
||||
|
||||
// Global state for cleanup
|
||||
let chromePid = null;
|
||||
let browserInstance = null;
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
process.argv.slice(2).forEach((arg) => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
@@ -58,63 +60,27 @@ function parseArgs() {
|
||||
return args;
|
||||
}
|
||||
|
||||
// Read an environment variable as a trimmed string, with an optional default
// used when the variable is unset or empty.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
|
||||
|
||||
// Parse a boolean-ish environment variable: 'true'/'1'/'yes'/'on' -> true,
// 'false'/'0'/'no'/'off' -> false, anything else -> defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
|
||||
|
||||
// Cleanup handler for SIGTERM - kill Chrome and all child processes
|
||||
// Cleanup handler for SIGTERM
|
||||
async function cleanup() {
|
||||
if (!chromePid) {
|
||||
process.exit(0);
|
||||
return;
|
||||
}
|
||||
console.error('[*] Cleaning up Chrome session...');
|
||||
|
||||
console.log(`[*] Killing Chrome process tree (PID ${chromePid})...`);
|
||||
|
||||
try {
|
||||
// Try to kill the entire process group
|
||||
process.kill(-chromePid, 'SIGTERM');
|
||||
} catch (e) {
|
||||
// Fall back to killing just the process
|
||||
// Try graceful browser close first
|
||||
if (browserInstance) {
|
||||
try {
|
||||
process.kill(chromePid, 'SIGTERM');
|
||||
} catch (e2) {
|
||||
// Already dead
|
||||
console.error('[*] Closing browser gracefully...');
|
||||
await browserInstance.close();
|
||||
browserInstance = null;
|
||||
console.error('[+] Browser closed gracefully');
|
||||
} catch (e) {
|
||||
console.error(`[!] Graceful close failed: ${e.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait 2 seconds for graceful shutdown
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
|
||||
// Force kill with SIGKILL
|
||||
try {
|
||||
process.kill(-chromePid, 'SIGKILL');
|
||||
} catch (e) {
|
||||
try {
|
||||
process.kill(chromePid, 'SIGKILL');
|
||||
} catch (e2) {
|
||||
// Already dead
|
||||
}
|
||||
// Kill Chrome process
|
||||
if (chromePid) {
|
||||
await killChrome(chromePid, OUTPUT_DIR);
|
||||
}
|
||||
|
||||
console.log('[*] Chrome process tree killed');
|
||||
|
||||
// Delete PID files to prevent PID reuse issues
|
||||
try {
|
||||
fs.unlinkSync(path.join(OUTPUT_DIR, 'chrome.pid'));
|
||||
} catch (e) {}
|
||||
try {
|
||||
fs.unlinkSync(path.join(OUTPUT_DIR, 'hook.pid'));
|
||||
} catch (e) {}
|
||||
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
@@ -122,379 +88,158 @@ async function cleanup() {
|
||||
process.on('SIGTERM', cleanup);
|
||||
process.on('SIGINT', cleanup);
|
||||
|
||||
// Locate a Chrome/Chromium binary: the CHROME_BINARY env var wins if it
// points at an existing file, otherwise well-known install paths are probed.
function findChrome() {
  const configured = getEnv('CHROME_BINARY');
  if (configured && fs.existsSync(configured)) {
    return configured;
  }

  const knownLocations = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];

  return knownLocations.find((candidate) => fs.existsSync(candidate)) || null;
}
|
||||
|
||||
// Parse a "width,height" resolution string; missing or invalid components
// fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',').map((part) => parseInt(part.trim(), 10));
  return {
    width: parts[0] || 1440,
    height: parts[1] || 2000,
  };
}
|
||||
|
||||
// Ask the OS for an unused TCP port by binding a throwaway server to port 0
// and reading back the assigned port number.
function findFreePort() {
  return new Promise((resolve, reject) => {
    const probe = require('net').createServer();
    probe.unref();
    probe.on('error', reject);
    probe.listen(0, () => {
      const { port } = probe.address();
      probe.close(() => resolve(port));
    });
  });
}
|
||||
|
||||
// Wait for Chrome's DevTools port to be ready
|
||||
function waitForDebugPort(port, timeout = 30000) {
|
||||
const startTime = Date.now();
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
const tryConnect = () => {
|
||||
if (Date.now() - startTime > timeout) {
|
||||
reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
|
||||
return;
|
||||
}
|
||||
|
||||
const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
|
||||
let data = '';
|
||||
res.on('data', chunk => data += chunk);
|
||||
res.on('end', () => {
|
||||
try {
|
||||
const info = JSON.parse(data);
|
||||
resolve(info);
|
||||
} catch (e) {
|
||||
setTimeout(tryConnect, 100);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
req.on('error', () => {
|
||||
setTimeout(tryConnect, 100);
|
||||
});
|
||||
|
||||
req.setTimeout(1000, () => {
|
||||
req.destroy();
|
||||
setTimeout(tryConnect, 100);
|
||||
});
|
||||
};
|
||||
|
||||
tryConnect();
|
||||
});
|
||||
}
|
||||
|
||||
// Kill zombie Chrome processes from stale crawls
|
||||
function killZombieChrome() {
|
||||
const dataDir = getEnv('DATA_DIR', '.');
|
||||
const crawlsDir = path.join(dataDir, 'crawls');
|
||||
const now = Date.now();
|
||||
const fiveMinutesAgo = now - 300000;
|
||||
let killed = 0;
|
||||
|
||||
console.error('[*] Checking for zombie Chrome processes...');
|
||||
|
||||
if (!fs.existsSync(crawlsDir)) {
|
||||
console.error('[+] No crawls directory found');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Only scan data/crawls/*/chrome/*.pid - no recursion into archive dirs
|
||||
const crawls = fs.readdirSync(crawlsDir, { withFileTypes: true });
|
||||
|
||||
for (const crawl of crawls) {
|
||||
if (!crawl.isDirectory()) continue;
|
||||
|
||||
const crawlDir = path.join(crawlsDir, crawl.name);
|
||||
const chromeDir = path.join(crawlDir, 'chrome');
|
||||
|
||||
if (!fs.existsSync(chromeDir)) continue;
|
||||
|
||||
// Check if crawl was modified recently (still active)
|
||||
try {
|
||||
const crawlStats = fs.statSync(crawlDir);
|
||||
if (crawlStats.mtimeMs > fiveMinutesAgo) {
|
||||
continue; // Crawl modified recently, likely still active
|
||||
}
|
||||
} catch (e) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Crawl is stale (> 5 minutes since modification), check for PIDs
|
||||
try {
|
||||
const pidFiles = fs.readdirSync(chromeDir).filter(f => f.endsWith('.pid'));
|
||||
|
||||
for (const pidFileName of pidFiles) {
|
||||
const pidFile = path.join(chromeDir, pidFileName);
|
||||
|
||||
try {
|
||||
const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
|
||||
if (isNaN(pid) || pid <= 0) continue;
|
||||
|
||||
// Check if process exists (simple check, Python will validate properly)
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
} catch (e) {
|
||||
// Process dead, remove stale PID file
|
||||
try { fs.unlinkSync(pidFile); } catch (e) {}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Process alive and crawl is stale - zombie!
|
||||
console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
|
||||
|
||||
try {
|
||||
// Kill process group
|
||||
try {
|
||||
process.kill(-pid, 'SIGKILL');
|
||||
} catch (e) {
|
||||
process.kill(pid, 'SIGKILL');
|
||||
}
|
||||
|
||||
killed++;
|
||||
console.error(`[+] Killed zombie (PID ${pid})`);
|
||||
try { fs.unlinkSync(pidFile); } catch (e) {}
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
|
||||
}
|
||||
} catch (e) {
|
||||
// Skip invalid PID files
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
// Skip if can't read chrome dir
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(`[!] Error scanning crawls: ${e.message}`);
|
||||
}
|
||||
|
||||
if (killed > 0) {
|
||||
console.error(`[+] Killed ${killed} zombie process(es)`);
|
||||
} else {
|
||||
console.error('[+] No zombies found');
|
||||
}
|
||||
}
|
||||
|
||||
async function launchChrome(binary) {
|
||||
// First, kill any zombie Chrome from crashed crawls
|
||||
killZombieChrome();
|
||||
|
||||
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
|
||||
const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
|
||||
const headless = getEnvBool('CHROME_HEADLESS', true);
|
||||
|
||||
const { width, height } = parseResolution(resolution);
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
|
||||
// Find a free port for Chrome DevTools
|
||||
const debugPort = await findFreePort();
|
||||
console.error(`[*] Using debug port: ${debugPort}`);
|
||||
|
||||
// Load any installed extensions
|
||||
const extensionUtils = require('./chrome_extension_utils.js');
|
||||
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
|
||||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
|
||||
|
||||
const installedExtensions = [];
|
||||
if (fs.existsSync(extensionsDir)) {
|
||||
const files = fs.readdirSync(extensionsDir);
|
||||
for (const file of files) {
|
||||
if (file.endsWith('.extension.json')) {
|
||||
try {
|
||||
const extPath = path.join(extensionsDir, file);
|
||||
const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
|
||||
if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
|
||||
installedExtensions.push(extData);
|
||||
console.error(`[*] Loading extension: ${extData.name || file}`);
|
||||
}
|
||||
} catch (e) {
|
||||
// Skip invalid cache files
|
||||
console.warn(`[!] Skipping invalid extension cache: ${file}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Get extension launch arguments
|
||||
const extensionArgs = extensionUtils.getExtensionLaunchArgs(installedExtensions);
|
||||
if (extensionArgs.length > 0) {
|
||||
console.error(`[+] Loaded ${installedExtensions.length} extension(s)`);
|
||||
// Write extensions metadata for config hooks to use
|
||||
fs.writeFileSync(
|
||||
path.join(OUTPUT_DIR, 'extensions.json'),
|
||||
JSON.stringify(installedExtensions, null, 2)
|
||||
);
|
||||
}
|
||||
|
||||
// Build Chrome arguments
|
||||
const chromeArgs = [
|
||||
`--remote-debugging-port=${debugPort}`,
|
||||
'--remote-debugging-address=127.0.0.1',
|
||||
'--no-sandbox',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-gpu',
|
||||
'--disable-sync',
|
||||
'--no-first-run',
|
||||
'--no-default-browser-check',
|
||||
'--disable-default-apps',
|
||||
'--disable-infobars',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--disable-component-update',
|
||||
'--disable-domain-reliability',
|
||||
'--disable-breakpad',
|
||||
'--disable-background-networking',
|
||||
'--disable-background-timer-throttling',
|
||||
'--disable-backgrounding-occluded-windows',
|
||||
'--disable-renderer-backgrounding',
|
||||
'--disable-ipc-flooding-protection',
|
||||
'--password-store=basic',
|
||||
'--use-mock-keychain',
|
||||
'--font-render-hinting=none',
|
||||
'--force-color-profile=srgb',
|
||||
`--window-size=${width},${height}`,
|
||||
...extensionArgs, // Load extensions
|
||||
...(headless ? ['--headless=new'] : []),
|
||||
...(checkSsl ? [] : ['--ignore-certificate-errors']),
|
||||
'about:blank', // Start with blank page
|
||||
];
|
||||
|
||||
// Launch Chrome as a detached process group leader
|
||||
// This allows us to kill Chrome and all its child processes as a group
|
||||
const chromeProcess = spawn(binary, chromeArgs, {
|
||||
detached: true,
|
||||
stdio: ['ignore', 'ignore', 'ignore'],
|
||||
});
|
||||
chromeProcess.unref(); // Don't keep Node.js process running
|
||||
|
||||
chromePid = chromeProcess.pid;
|
||||
const chromeStartTime = Date.now() / 1000; // Unix epoch seconds
|
||||
console.error(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
|
||||
|
||||
// Write Chrome PID with mtime set to start time for validation
|
||||
writePidWithMtime(path.join(OUTPUT_DIR, 'chrome.pid'), chromePid, chromeStartTime);
|
||||
|
||||
// Write command script for validation
|
||||
writeCmdScript(path.join(OUTPUT_DIR, 'cmd.sh'), binary, chromeArgs);
|
||||
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, 'port.txt'), String(debugPort));
|
||||
|
||||
// Write hook's own PID with mtime for validation
|
||||
const hookStartTime = Date.now() / 1000;
|
||||
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
||||
|
||||
try {
|
||||
// Wait for Chrome to be ready
|
||||
const versionInfo = await waitForDebugPort(debugPort, 30000);
|
||||
console.error(`[+] Chrome ready: ${versionInfo.Browser}`);
|
||||
|
||||
// Build WebSocket URL
|
||||
const wsUrl = versionInfo.webSocketDebuggerUrl;
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl);
|
||||
|
||||
return { success: true, cdpUrl: wsUrl, pid: chromePid, port: debugPort };
|
||||
|
||||
} catch (e) {
|
||||
// Kill Chrome if setup failed
|
||||
try {
|
||||
process.kill(chromePid, 'SIGTERM');
|
||||
} catch (killErr) {
|
||||
// Ignore
|
||||
}
|
||||
return { success: false, error: `${e.name}: ${e.message}` };
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const args = parseArgs();
|
||||
const crawlId = args.crawl_id;
|
||||
|
||||
const startTs = new Date();
|
||||
let status = 'failed';
|
||||
let output = null;
|
||||
let error = '';
|
||||
let version = '';
|
||||
|
||||
try {
|
||||
const binary = findChrome();
|
||||
const binary = findChromium();
|
||||
if (!binary) {
|
||||
console.error('ERROR: Chrome/Chromium binary not found');
|
||||
console.error('DEPENDENCY_NEEDED=chrome');
|
||||
console.error('ERROR: Chromium binary not found');
|
||||
console.error('DEPENDENCY_NEEDED=chromium');
|
||||
console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
|
||||
console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
|
||||
console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Get Chrome version
|
||||
// Get Chromium version
|
||||
let version = '';
|
||||
try {
|
||||
const { execSync } = require('child_process');
|
||||
version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
|
||||
} catch (e) {
|
||||
version = '';
|
||||
version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 })
|
||||
.trim()
|
||||
.slice(0, 64);
|
||||
} catch (e) {}
|
||||
|
||||
console.error(`[*] Using browser: ${binary}`);
|
||||
if (version) console.error(`[*] Version: ${version}`);
|
||||
|
||||
// Load installed extensions
|
||||
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
|
||||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
|
||||
|
||||
const installedExtensions = [];
|
||||
const extensionPaths = [];
|
||||
if (fs.existsSync(extensionsDir)) {
|
||||
const files = fs.readdirSync(extensionsDir);
|
||||
for (const file of files) {
|
||||
if (file.endsWith('.extension.json')) {
|
||||
try {
|
||||
const extPath = path.join(extensionsDir, file);
|
||||
const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
|
||||
if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
|
||||
installedExtensions.push(extData);
|
||||
extensionPaths.push(extData.unpacked_path);
|
||||
console.error(`[*] Loading extension: ${extData.name || file}`);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn(`[!] Skipping invalid extension cache: ${file}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const result = await launchChrome(binary);
|
||||
|
||||
if (result.success) {
|
||||
status = 'succeeded';
|
||||
output = OUTPUT_DIR;
|
||||
console.error(`[+] Chrome session started for crawl ${crawlId}`);
|
||||
console.error(`[+] CDP URL: ${result.cdpUrl}`);
|
||||
console.error(`[+] PID: ${result.pid}`);
|
||||
} else {
|
||||
status = 'failed';
|
||||
error = result.error;
|
||||
if (installedExtensions.length > 0) {
|
||||
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
|
||||
}
|
||||
|
||||
// Write hook's own PID
|
||||
const hookStartTime = Date.now() / 1000;
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
||||
|
||||
// Launch Chromium using consolidated function
|
||||
const result = await launchChromium({
|
||||
binary,
|
||||
outputDir: OUTPUT_DIR,
|
||||
extensionPaths,
|
||||
});
|
||||
|
||||
if (!result.success) {
|
||||
console.error(`ERROR: ${result.error}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
chromePid = result.pid;
|
||||
const cdpUrl = result.cdpUrl;
|
||||
|
||||
// Write extensions metadata
|
||||
if (installedExtensions.length > 0) {
|
||||
fs.writeFileSync(
|
||||
path.join(OUTPUT_DIR, 'extensions.json'),
|
||||
JSON.stringify(installedExtensions, null, 2)
|
||||
);
|
||||
}
|
||||
|
||||
// Connect puppeteer for extension verification
|
||||
console.error(`[*] Connecting puppeteer to CDP...`);
|
||||
const browser = await puppeteer.connect({
|
||||
browserWSEndpoint: cdpUrl,
|
||||
defaultViewport: null,
|
||||
});
|
||||
browserInstance = browser;
|
||||
|
||||
// Verify extensions loaded
|
||||
if (extensionPaths.length > 0) {
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
const targets = browser.targets();
|
||||
console.error(`[*] All browser targets (${targets.length}):`);
|
||||
for (const t of targets) {
|
||||
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
|
||||
}
|
||||
|
||||
const extTargets = targets.filter(t =>
|
||||
t.url().startsWith('chrome-extension://') ||
|
||||
t.type() === 'service_worker' ||
|
||||
t.type() === 'background_page'
|
||||
);
|
||||
|
||||
// Filter out built-in extensions
|
||||
const builtinIds = [
|
||||
'nkeimhogjdpnpccoofpliimaahmaaome',
|
||||
'fignfifoniblkonapihmkfakmlgkbkcf',
|
||||
'ahfgeienlihckogmohjhadlkjgocpleb',
|
||||
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
|
||||
];
|
||||
const customExtTargets = extTargets.filter(t => {
|
||||
const url = t.url();
|
||||
if (!url.startsWith('chrome-extension://')) return false;
|
||||
const extId = url.split('://')[1].split('/')[0];
|
||||
return !builtinIds.includes(extId);
|
||||
});
|
||||
|
||||
console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`);
|
||||
|
||||
for (const target of customExtTargets) {
|
||||
const url = target.url();
|
||||
const extId = url.split('://')[1].split('/')[0];
|
||||
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
|
||||
}
|
||||
|
||||
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
|
||||
console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
|
||||
console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
|
||||
}
|
||||
}
|
||||
|
||||
console.error(`[+] Chromium session started for crawl ${crawlId}`);
|
||||
console.error(`[+] CDP URL: ${cdpUrl}`);
|
||||
console.error(`[+] PID: ${chromePid}`);
|
||||
|
||||
// Stay alive to handle cleanup on SIGTERM
|
||||
console.log('[*] Chromium launch hook staying alive to handle cleanup...');
|
||||
setInterval(() => {}, 1000000);
|
||||
|
||||
} catch (e) {
|
||||
error = `${e.name}: ${e.message}`;
|
||||
status = 'failed';
|
||||
}
|
||||
|
||||
const endTs = new Date();
|
||||
const duration = (endTs - startTs) / 1000;
|
||||
|
||||
if (error) {
|
||||
console.error(`ERROR: ${error}`);
|
||||
console.error(`ERROR: ${e.name}: ${e.message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Background hook - stay running to handle cleanup on SIGTERM
|
||||
console.log('[*] Chrome launch hook staying alive to handle cleanup...');
|
||||
|
||||
// Keep process alive by setting an interval (won't actually do anything)
|
||||
// This allows us to receive SIGTERM when crawl ends
|
||||
setInterval(() => {}, 1000000);
|
||||
}
|
||||
|
||||
main().catch(e => {
|
||||
main().catch((e) => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
@@ -26,7 +26,11 @@ const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { spawn } = require('child_process');
|
||||
const http = require('http');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const { findChromium } = require('./chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'chrome_tab';
|
||||
@@ -87,31 +91,6 @@ async function cleanup() {
|
||||
process.on('SIGTERM', cleanup);
|
||||
process.on('SIGINT', cleanup);
|
||||
|
||||
// Find Chrome binary (for fallback)
|
||||
function findChrome() {
|
||||
const chromeBinary = getEnv('CHROME_BINARY');
|
||||
if (chromeBinary && fs.existsSync(chromeBinary)) {
|
||||
return chromeBinary;
|
||||
}
|
||||
|
||||
const candidates = [
|
||||
'/usr/bin/google-chrome',
|
||||
'/usr/bin/google-chrome-stable',
|
||||
'/usr/bin/chromium',
|
||||
'/usr/bin/chromium-browser',
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
];
|
||||
|
||||
for (const candidate of candidates) {
|
||||
if (fs.existsSync(candidate)) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// Parse resolution string
|
||||
function parseResolution(resolution) {
|
||||
const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
|
||||
@@ -367,7 +346,7 @@ async function main() {
|
||||
let version = '';
|
||||
|
||||
try {
|
||||
const binary = findChrome();
|
||||
const binary = findChromium();
|
||||
if (!binary) {
|
||||
console.error('ERROR: Chrome/Chromium binary not found');
|
||||
console.error('DEPENDENCY_NEEDED=chrome');
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'chrome_navigate';
|
||||
|
||||
@@ -2,14 +2,18 @@
|
||||
Integration tests for chrome plugin
|
||||
|
||||
Tests verify:
|
||||
1. Chrome install hook checks for Chrome/Chromium binary
|
||||
1. Chromium install via @puppeteer/browsers
|
||||
2. Verify deps with abx-pkg
|
||||
3. Chrome hooks exist
|
||||
4. Chrome launches at crawl level
|
||||
4. Chromium launches at crawl level
|
||||
5. Tab creation at snapshot level
|
||||
6. Tab navigation works
|
||||
7. Tab cleanup on SIGTERM
|
||||
8. Chrome cleanup on crawl end
|
||||
8. Chromium cleanup on crawl end
|
||||
|
||||
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
--load-extension and --disable-extensions-except flags, which are needed for
|
||||
loading unpacked extensions in headless mode.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -40,49 +44,104 @@ def get_lib_dir_and_machine_type():
|
||||
|
||||
return Path(lib_dir), machine_type
|
||||
|
||||
# Setup NODE_PATH to find npm packages
|
||||
# Setup NODE_MODULES_DIR to find npm packages
|
||||
LIB_DIR, MACHINE_TYPE = get_lib_dir_and_machine_type()
|
||||
# Note: LIB_DIR already includes machine_type (e.g., data/lib/arm64-darwin)
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
NPM_PREFIX = LIB_DIR / 'npm'
|
||||
|
||||
# Chromium install location (relative to DATA_DIR)
|
||||
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_PATH set correctly."""
|
||||
"""Get environment with NODE_MODULES_DIR and CHROME_BINARY set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_PATH'] = str(NODE_MODULES_DIR)
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
env['MACHINE_TYPE'] = MACHINE_TYPE
|
||||
# Ensure CHROME_BINARY is set to Chromium
|
||||
if 'CHROME_BINARY' not in env:
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
return env
|
||||
|
||||
|
||||
def find_chromium_binary():
|
||||
"""Find the Chromium binary installed by @puppeteer/browsers."""
|
||||
if not CHROMIUM_INSTALL_DIR.exists():
|
||||
return None
|
||||
|
||||
# Look for versioned directories
|
||||
for version_dir in sorted(CHROMIUM_INSTALL_DIR.iterdir(), reverse=True):
|
||||
if not version_dir.is_dir():
|
||||
continue
|
||||
# macOS ARM
|
||||
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
||||
if mac_arm.exists():
|
||||
return str(mac_arm)
|
||||
# macOS x64
|
||||
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
||||
if mac_x64.exists():
|
||||
return str(mac_x64)
|
||||
# Linux
|
||||
linux = version_dir / 'chrome-linux' / 'chrome'
|
||||
if linux.exists():
|
||||
return str(linux)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def ensure_puppeteer_installed():
|
||||
"""Ensure puppeteer is installed in LIB_DIR before running tests."""
|
||||
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
|
||||
def ensure_chromium_and_puppeteer_installed():
|
||||
"""Ensure Chromium and puppeteer are installed before running tests."""
|
||||
from abx_pkg import Binary, NpmProvider
|
||||
|
||||
# Rebuild pydantic models
|
||||
NpmProvider.model_rebuild()
|
||||
|
||||
# Check if puppeteer-core is already available
|
||||
# Install puppeteer-core if not available
|
||||
puppeteer_core_path = NODE_MODULES_DIR / 'puppeteer-core'
|
||||
if puppeteer_core_path.exists():
|
||||
return # Already installed
|
||||
if not puppeteer_core_path.exists():
|
||||
print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...")
|
||||
NPM_PREFIX.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...")
|
||||
NPM_PREFIX.mkdir(parents=True, exist_ok=True)
|
||||
provider = NpmProvider(npm_prefix=NPM_PREFIX)
|
||||
try:
|
||||
binary = Binary(
|
||||
name='puppeteer',
|
||||
binproviders=[provider],
|
||||
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
|
||||
)
|
||||
binary.install()
|
||||
print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}")
|
||||
except Exception as e:
|
||||
pytest.skip(f"Failed to install puppeteer: {e}")
|
||||
|
||||
# Install puppeteer using NpmProvider with custom prefix
|
||||
provider = NpmProvider(npm_prefix=NPM_PREFIX)
|
||||
try:
|
||||
binary = Binary(
|
||||
name='puppeteer',
|
||||
binproviders=[provider],
|
||||
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
|
||||
# Install Chromium via @puppeteer/browsers if not available
|
||||
chromium_binary = find_chromium_binary()
|
||||
if not chromium_binary:
|
||||
print(f"\n[*] Installing Chromium to {CHROMIUM_INSTALL_DIR}...")
|
||||
CHROMIUM_INSTALL_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = subprocess.run(
|
||||
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
|
||||
cwd=str(CHROMIUM_INSTALL_DIR.parent),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
binary.install()
|
||||
print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}")
|
||||
except Exception as e:
|
||||
pytest.skip(f"Failed to install puppeteer: {e}")
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Failed to install Chromium: {result.stderr}")
|
||||
|
||||
chromium_binary = find_chromium_binary()
|
||||
if not chromium_binary:
|
||||
pytest.skip("Chromium installed but binary not found")
|
||||
|
||||
print(f"[*] Chromium installed: {chromium_binary}")
|
||||
|
||||
# Set CHROME_BINARY env var for tests
|
||||
os.environ['CHROME_BINARY'] = chromium_binary
|
||||
|
||||
|
||||
def test_hook_scripts_exist():
|
||||
@@ -92,26 +151,22 @@ def test_hook_scripts_exist():
|
||||
assert CHROME_NAVIGATE_HOOK.exists(), f"Hook not found: {CHROME_NAVIGATE_HOOK}"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify chrome is available via abx-pkg."""
|
||||
from abx_pkg import Binary, NpmProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
def test_verify_chromium_available():
|
||||
"""Verify Chromium is available via CHROME_BINARY env var."""
|
||||
chromium_binary = os.environ.get('CHROME_BINARY') or find_chromium_binary()
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
assert chromium_binary, "Chromium binary should be available (set by fixture or found)"
|
||||
assert Path(chromium_binary).exists(), f"Chromium binary should exist at {chromium_binary}"
|
||||
|
||||
# Try to find chrome using same config as install hook
|
||||
chrome_binary = Binary(
|
||||
name='chrome',
|
||||
binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()],
|
||||
overrides={'npm': {'packages': ['@puppeteer/browsers']}}
|
||||
# Verify it's actually Chromium by checking version
|
||||
result = subprocess.run(
|
||||
[chromium_binary, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
)
|
||||
chrome_loaded = chrome_binary.load()
|
||||
|
||||
# Chrome should be available (either found by install hook or at explicit path)
|
||||
assert chrome_loaded and chrome_loaded.abspath, "Chrome should be available via abx-pkg after install hook runs"
|
||||
assert Path(chrome_loaded.abspath).exists(), f"Chrome binary should exist at {chrome_loaded.abspath}"
|
||||
assert result.returncode == 0, f"Failed to get Chromium version: {result.stderr}"
|
||||
assert 'Chromium' in result.stdout or 'Chrome' in result.stdout, f"Unexpected version output: {result.stdout}"
|
||||
|
||||
|
||||
def test_chrome_launch_and_tab_creation():
|
||||
@@ -121,7 +176,7 @@ def test_chrome_launch_and_tab_creation():
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
|
||||
# Get test environment with NODE_PATH set
|
||||
# Get test environment with NODE_MODULES_DIR set
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'consolelog';
|
||||
|
||||
@@ -40,7 +40,11 @@ if (!getEnvBool('DOM_ENABLED', true)) {
|
||||
// Now safe to require puppeteer
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const { findChromium } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'dom';
|
||||
@@ -96,33 +100,6 @@ function getCdpUrl() {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Find Chrome binary
|
||||
function findChrome() {
|
||||
const chromeBinary = getEnv('CHROME_BINARY');
|
||||
if (chromeBinary && fs.existsSync(chromeBinary)) {
|
||||
return chromeBinary;
|
||||
}
|
||||
|
||||
const candidates = [
|
||||
// Linux
|
||||
'/usr/bin/google-chrome',
|
||||
'/usr/bin/google-chrome-stable',
|
||||
'/usr/bin/chromium',
|
||||
'/usr/bin/chromium-browser',
|
||||
// macOS
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
];
|
||||
|
||||
for (const candidate of candidates) {
|
||||
if (candidate.startsWith('/') && fs.existsSync(candidate)) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// Parse resolution string
|
||||
function parseResolution(resolution) {
|
||||
const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
|
||||
@@ -175,7 +152,7 @@ async function dumpDom(url) {
|
||||
|
||||
// Fall back to launching new browser
|
||||
if (!browser) {
|
||||
const executablePath = findChrome();
|
||||
const executablePath = findChromium();
|
||||
if (!executablePath) {
|
||||
return { success: false, error: 'Chrome binary not found' };
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
|
||||
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_PATH
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
@@ -37,9 +37,9 @@ LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_PATH set correctly."""
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_PATH'] = str(NODE_MODULES_DIR)
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
@@ -45,6 +45,8 @@ if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'infiniscroll';
|
||||
|
||||
@@ -34,9 +34,9 @@ TEST_URL = 'https://www.singsing.movie/'
|
||||
|
||||
def get_node_modules_dir():
|
||||
"""Get NODE_MODULES_DIR for tests, checking env first."""
|
||||
# Check if NODE_PATH is already set in environment
|
||||
if os.environ.get('NODE_PATH'):
|
||||
return Path(os.environ['NODE_PATH'])
|
||||
# Check if NODE_MODULES_DIR is already set in environment
|
||||
if os.environ.get('NODE_MODULES_DIR'):
|
||||
return Path(os.environ['NODE_MODULES_DIR'])
|
||||
# Otherwise compute from LIB_DIR
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
@@ -47,9 +47,9 @@ NODE_MODULES_DIR = get_node_modules_dir()
|
||||
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_PATH set correctly."""
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_PATH'] = str(NODE_MODULES_DIR)
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
return env
|
||||
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ const path = require('path');
|
||||
const fs = require('fs');
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome/chrome_extension_utils.js');
|
||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
|
||||
@@ -6,8 +6,10 @@ Tests invoke the plugin hook as an external process and verify outputs/side effe
|
||||
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
@@ -120,3 +122,435 @@ def test_no_configuration_required():
|
||||
|
||||
# Should not require any API keys or configuration
|
||||
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
|
||||
|
||||
|
||||
def setup_test_lib_dirs(tmpdir: Path) -> dict:
|
||||
"""Create isolated lib directories for tests and return env dict.
|
||||
|
||||
Sets up:
|
||||
LIB_DIR: tmpdir/lib/<arch>
|
||||
NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
|
||||
NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
|
||||
PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
|
||||
PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin
|
||||
"""
|
||||
import platform
|
||||
arch = platform.machine()
|
||||
system = platform.system().lower()
|
||||
arch_dir = f"{arch}-{system}"
|
||||
|
||||
lib_dir = tmpdir / 'lib' / arch_dir
|
||||
npm_dir = lib_dir / 'npm'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
npm_bin_dir = npm_dir / 'bin'
|
||||
pip_venv_dir = lib_dir / 'pip' / 'venv'
|
||||
pip_bin_dir = pip_venv_dir / 'bin'
|
||||
|
||||
# Create directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
pip_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Install puppeteer-core to the test node_modules if not present
|
||||
if not (node_modules_dir / 'puppeteer-core').exists():
|
||||
result = subprocess.run(
|
||||
['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")
|
||||
|
||||
return {
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'PIP_VENV_DIR': str(pip_venv_dir),
|
||||
'PIP_BIN_DIR': str(pip_bin_dir),
|
||||
}
|
||||
|
||||
|
||||
def find_chromium_binary():
|
||||
"""Find the Chromium binary installed by @puppeteer/browsers."""
|
||||
chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
||||
if not chromium_dir.exists():
|
||||
return None
|
||||
|
||||
for version_dir in sorted(chromium_dir.iterdir(), reverse=True):
|
||||
if not version_dir.is_dir():
|
||||
continue
|
||||
# macOS ARM
|
||||
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
||||
if mac_arm.exists():
|
||||
return str(mac_arm)
|
||||
# macOS x64
|
||||
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
||||
if mac_x64.exists():
|
||||
return str(mac_x64)
|
||||
# Linux
|
||||
linux = version_dir / 'chrome-linux' / 'chrome'
|
||||
if linux.exists():
|
||||
return str(linux)
|
||||
return None
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
|
||||
TEST_URL = 'https://www.filmin.es/'
|
||||
|
||||
|
||||
def test_extension_loads_in_chromium():
|
||||
"""Verify extension loads in Chromium by visiting its options page.
|
||||
|
||||
Uses Chromium with --load-extension to load the extension, then navigates
|
||||
to chrome-extension://<id>/options.html and checks that the extension name
|
||||
appears in the page content.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated lib directories for this test
|
||||
lib_env = setup_test_lib_dirs(tmpdir)
|
||||
|
||||
# Set up extensions directory
|
||||
ext_dir = tmpdir / 'chrome_extensions'
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update(lib_env)
|
||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Ensure CHROME_BINARY points to Chromium
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
|
||||
# Step 1: Install the extension
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
# Verify extension cache was created
|
||||
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
|
||||
assert cache_file.exists(), "Extension cache not created"
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
crawl_dir = tmpdir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
|
||||
cwd=str(crawl_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
assert cdp_url, "Chromium CDP URL not found after 20s"
|
||||
print(f"Chromium launched with CDP URL: {cdp_url}")
|
||||
|
||||
# Check that extensions were loaded
|
||||
extensions_file = chrome_dir / 'extensions.json'
|
||||
if extensions_file.exists():
|
||||
loaded_exts = json.loads(extensions_file.read_text())
|
||||
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
|
||||
|
||||
try:
|
||||
# Step 3: Connect to Chromium and verify extension loaded via options page
|
||||
test_script = f'''
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
|
||||
// Wait for extension to initialize
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
// Find extension targets to get the extension ID
|
||||
const targets = browser.targets();
|
||||
const extTargets = targets.filter(t =>
|
||||
t.url().startsWith('chrome-extension://') ||
|
||||
t.type() === 'service_worker' ||
|
||||
t.type() === 'background_page'
|
||||
);
|
||||
|
||||
// Filter out Chrome's built-in extensions
|
||||
const builtinIds = ['nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf',
|
||||
'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai'];
|
||||
const customExtTargets = extTargets.filter(t => {{
|
||||
const url = t.url();
|
||||
if (!url.startsWith('chrome-extension://')) return false;
|
||||
const extId = url.split('://')[1].split('/')[0];
|
||||
return !builtinIds.includes(extId);
|
||||
}});
|
||||
|
||||
console.error('Custom extension targets found:', customExtTargets.length);
|
||||
customExtTargets.forEach(t => console.error(' -', t.type(), t.url()));
|
||||
|
||||
if (customExtTargets.length === 0) {{
|
||||
console.log(JSON.stringify({{ loaded: false, error: 'No custom extension targets found' }}));
|
||||
browser.disconnect();
|
||||
return;
|
||||
}}
|
||||
|
||||
// Get the extension ID from the first custom extension target
|
||||
const extUrl = customExtTargets[0].url();
|
||||
const extId = extUrl.split('://')[1].split('/')[0];
|
||||
console.error('Extension ID:', extId);
|
||||
|
||||
// Try to navigate to the extension's options.html page
|
||||
const page = await browser.newPage();
|
||||
const optionsUrl = 'chrome-extension://' + extId + '/options.html';
|
||||
console.error('Navigating to options page:', optionsUrl);
|
||||
|
||||
try {{
|
||||
await page.goto(optionsUrl, {{ waitUntil: 'domcontentloaded', timeout: 10000 }});
|
||||
const pageContent = await page.content();
|
||||
const pageTitle = await page.title();
|
||||
|
||||
// Check if extension name appears in the page
|
||||
const hasExtensionName = pageContent.toLowerCase().includes('cookie') ||
|
||||
pageContent.toLowerCase().includes('idontcareaboutcookies') ||
|
||||
pageTitle.toLowerCase().includes('cookie');
|
||||
|
||||
console.log(JSON.stringify({{
|
||||
loaded: true,
|
||||
extensionId: extId,
|
||||
optionsPageLoaded: true,
|
||||
pageTitle: pageTitle,
|
||||
hasExtensionName: hasExtensionName,
|
||||
contentLength: pageContent.length
|
||||
}}));
|
||||
}} catch (e) {{
|
||||
// options.html may not exist, but extension is still loaded
|
||||
console.log(JSON.stringify({{
|
||||
loaded: true,
|
||||
extensionId: extId,
|
||||
optionsPageLoaded: false,
|
||||
error: e.message
|
||||
}}));
|
||||
}}
|
||||
|
||||
browser.disconnect();
|
||||
}})();
|
||||
'''
|
||||
script_path = tmpdir / 'test_extension.js'
|
||||
script_path.write_text(test_script)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(script_path)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
)
|
||||
|
||||
print(f"stderr: {result.stderr}")
|
||||
print(f"stdout: {result.stdout}")
|
||||
|
||||
assert result.returncode == 0, f"Test failed: {result.stderr}"
|
||||
|
||||
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
|
||||
assert output_lines, f"No JSON output: {result.stdout}"
|
||||
|
||||
test_result = json.loads(output_lines[-1])
|
||||
assert test_result.get('loaded'), \
|
||||
f"Extension should be loaded in Chromium. Result: {test_result}"
|
||||
print(f"Extension loaded successfully: {test_result}")
|
||||
|
||||
finally:
|
||||
# Clean up Chromium
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
def test_hides_cookie_consent_on_filmin():
|
||||
"""Live test: verify extension hides cookie consent popup on filmin.es.
|
||||
|
||||
Uses Chromium with extensions loaded automatically via chrome hook.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated lib directories for this test
|
||||
lib_env = setup_test_lib_dirs(tmpdir)
|
||||
|
||||
# Set up extensions directory
|
||||
ext_dir = tmpdir / 'chrome_extensions'
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update(lib_env)
|
||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Ensure CHROME_BINARY points to Chromium
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
|
||||
# Step 1: Install the extension
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
# Verify extension cache was created
|
||||
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
|
||||
assert cache_file.exists(), "Extension cache not created"
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
crawl_dir = tmpdir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
|
||||
cwd=str(crawl_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
assert cdp_url, "Chromium CDP URL not found after 20s"
|
||||
print(f"Chromium launched with CDP URL: {cdp_url}")
|
||||
|
||||
try:
|
||||
# Step 3: Connect to Chromium and test cookie consent hiding
|
||||
test_script = f'''
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
|
||||
// Wait for extension to initialize
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
|
||||
await page.setViewport({{ width: 1440, height: 900 }});
|
||||
|
||||
console.error('Navigating to {TEST_URL}...');
|
||||
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
|
||||
|
||||
// Wait for extension content script to process page
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
|
||||
// Check cookie consent visibility
|
||||
const result = await page.evaluate(() => {{
|
||||
const selectors = ['.cky-consent-container', '.cky-popup-center', '.cky-overlay'];
|
||||
for (const sel of selectors) {{
|
||||
const el = document.querySelector(sel);
|
||||
if (el) {{
|
||||
const style = window.getComputedStyle(el);
|
||||
const rect = el.getBoundingClientRect();
|
||||
const visible = style.display !== 'none' &&
|
||||
style.visibility !== 'hidden' &&
|
||||
rect.width > 0 && rect.height > 0;
|
||||
if (visible) return {{ visible: true, selector: sel }};
|
||||
}}
|
||||
}}
|
||||
return {{ visible: false }};
|
||||
}});
|
||||
|
||||
console.error('Cookie consent:', JSON.stringify(result));
|
||||
browser.disconnect();
|
||||
console.log(JSON.stringify(result));
|
||||
}})();
|
||||
'''
|
||||
script_path = tmpdir / 'test_extension.js'
|
||||
script_path.write_text(test_script)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(script_path)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
)
|
||||
|
||||
print(f"stderr: {result.stderr}")
|
||||
print(f"stdout: {result.stdout}")
|
||||
|
||||
assert result.returncode == 0, f"Test failed: {result.stderr}"
|
||||
|
||||
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
|
||||
assert output_lines, f"No JSON output: {result.stdout}"
|
||||
|
||||
test_result = json.loads(output_lines[-1])
|
||||
assert not test_result['visible'], \
|
||||
f"Cookie consent should be hidden by extension. Result: {test_result}"
|
||||
|
||||
finally:
|
||||
# Clean up Chromium
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
@@ -44,6 +44,8 @@ if (!getEnvBool('MODALCLOSER_ENABLED', true)) {
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'modalcloser';
|
||||
@@ -156,22 +158,59 @@ async function closeModals(page) {
|
||||
|
||||
// Generic fallback - hide unrecognized modals with CSS
|
||||
const genericSelectors = [
|
||||
// CookieYes (cky) - popular cookie consent library
|
||||
'.cky-consent-container',
|
||||
'.cky-popup-center',
|
||||
'.cky-overlay',
|
||||
'.cky-modal',
|
||||
'#ckyPreferenceCenter',
|
||||
// CookieYes (cky)
|
||||
'.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal', '#ckyPreferenceCenter',
|
||||
// OneTrust
|
||||
'#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter', '#onetrust-pc-sdk',
|
||||
// CookieBot
|
||||
'#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay', '#CookiebotWidget',
|
||||
// Quantcast / CMP
|
||||
'.qc-cmp-ui-container', '#qc-cmp2-container', '.qc-cmp2-summary-buttons',
|
||||
// TrustArc / TrustE
|
||||
'#truste-consent-track', '.truste-banner', '#truste-consent-content',
|
||||
// Osano
|
||||
'.osano-cm-window', '.osano-cm-dialog',
|
||||
// Klaro
|
||||
'.klaro .cookie-modal', '.klaro .cookie-notice',
|
||||
// Tarteaucitron
|
||||
'#tarteaucitronRoot', '#tarteaucitronAlertBig',
|
||||
// Complianz (WordPress)
|
||||
'.cmplz-cookiebanner', '#cmplz-cookiebanner-container',
|
||||
// GDPR Cookie Consent (WordPress)
|
||||
'#gdpr-cookie-consent-bar', '.gdpr-cookie-consent-popup',
|
||||
// Cookie Notice (WordPress)
|
||||
'#cookie-notice', '.cookie-notice-container',
|
||||
// EU Cookie Law
|
||||
'.eupopup', '#eu-cookie-law',
|
||||
// Didomi
|
||||
'#didomi-popup', '#didomi-host', '.didomi-popup-container',
|
||||
// Usercentrics
|
||||
'#usercentrics-root', '.uc-banner',
|
||||
// Axeptio
|
||||
'#axeptio_overlay', '#axeptio_btn',
|
||||
// iubenda
|
||||
'#iubenda-cs-banner', '.iubenda-cs-container',
|
||||
// Termly
|
||||
'.termly-consent-banner', '#termly-code-snippet-support',
|
||||
// Borlabs Cookie (WordPress)
|
||||
'#BorlabsCookieBox', '.BorlabsCookie',
|
||||
// CookieFirst
|
||||
'.cookiefirst-root', '#cookiefirst-root',
|
||||
// CookieScript
|
||||
'#cookiescript_injected', '.cookiescript_injected_wrapper',
|
||||
// Civic Cookie Control
|
||||
'#ccc', '#ccc-overlay',
|
||||
// Generic patterns
|
||||
'#cookie-consent', '.cookie-banner', '.cookie-notice',
|
||||
'#cookieConsent', '.cookie-consent', '.cookies-banner',
|
||||
'[class*="cookie"][class*="banner"]', '[class*="cookie"][class*="notice"]',
|
||||
'[class*="cookie"][class*="popup"]', '[class*="cookie"][class*="modal"]',
|
||||
'[class*="consent"][class*="banner"]', '[class*="consent"][class*="popup"]',
|
||||
'[class*="gdpr"]', '[class*="privacy"][class*="banner"]',
|
||||
// Modal overlays and backdrops
|
||||
'.modal-overlay:not([style*="display: none"])',
|
||||
'.modal-backdrop:not([style*="display: none"])',
|
||||
'.overlay-visible',
|
||||
// Cookie consent banners
|
||||
'#cookie-consent', '.cookie-banner', '.cookie-notice',
|
||||
'#cookieConsent', '.cookie-consent', '.cookies-banner',
|
||||
'[class*="cookie"][class*="banner"]',
|
||||
'[class*="cookie"][class*="notice"]',
|
||||
'[class*="gdpr"]',
|
||||
// Popup overlays
|
||||
'.popup-overlay', '.newsletter-popup', '.age-gate',
|
||||
'.subscribe-popup', '.subscription-modal',
|
||||
|
||||
@@ -35,9 +35,9 @@ COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/'
|
||||
|
||||
def get_node_modules_dir():
|
||||
"""Get NODE_MODULES_DIR for tests, checking env first."""
|
||||
# Check if NODE_PATH is already set in environment
|
||||
if os.environ.get('NODE_PATH'):
|
||||
return Path(os.environ['NODE_PATH'])
|
||||
# Check if NODE_MODULES_DIR is already set in environment
|
||||
if os.environ.get('NODE_MODULES_DIR'):
|
||||
return Path(os.environ['NODE_MODULES_DIR'])
|
||||
# Otherwise compute from LIB_DIR
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
@@ -48,9 +48,9 @@ NODE_MODULES_DIR = get_node_modules_dir()
|
||||
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_PATH set correctly."""
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_PATH'] = str(NODE_MODULES_DIR)
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
return env
|
||||
|
||||
|
||||
|
||||
@@ -90,6 +90,32 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
# Emit PATH update if npm bin dir not already in PATH
|
||||
npm_bin_dir = str(npm_prefix / 'bin')
|
||||
current_path = os.environ.get('PATH', '')
|
||||
|
||||
# Check if npm_bin_dir is already in PATH
|
||||
path_dirs = current_path.split(':')
|
||||
if npm_bin_dir not in path_dirs:
|
||||
# Prepend npm_bin_dir to PATH
|
||||
new_path = f"{npm_bin_dir}:{current_path}" if current_path else npm_bin_dir
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/PATH',
|
||||
'value': new_path,
|
||||
}))
|
||||
click.echo(f" Added {npm_bin_dir} to PATH", err=True)
|
||||
|
||||
# Also emit NODE_MODULES_DIR for JS module resolution
|
||||
node_modules_dir = str(npm_prefix / 'node_modules')
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/NODE_MODULES_DIR',
|
||||
'value': node_modules_dir,
|
||||
}))
|
||||
|
||||
# Log human-readable info to stderr
|
||||
click.echo(f"Installed {name} at {binary.abspath}", err=True)
|
||||
click.echo(f" version: {binary.version}", err=True)
|
||||
|
||||
@@ -20,6 +20,8 @@
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
|
||||
@@ -40,7 +40,10 @@ if (!getEnvBool('PDF_ENABLED', true)) {
|
||||
// Now safe to require puppeteer
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const { findChromium } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'pdf';
|
||||
@@ -96,33 +99,6 @@ function getCdpUrl() {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Find Chrome binary
|
||||
function findChrome() {
|
||||
const chromeBinary = getEnv('CHROME_BINARY');
|
||||
if (chromeBinary && fs.existsSync(chromeBinary)) {
|
||||
return chromeBinary;
|
||||
}
|
||||
|
||||
const candidates = [
|
||||
// Linux
|
||||
'/usr/bin/google-chrome',
|
||||
'/usr/bin/google-chrome-stable',
|
||||
'/usr/bin/chromium',
|
||||
'/usr/bin/chromium-browser',
|
||||
// macOS
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
];
|
||||
|
||||
for (const candidate of candidates) {
|
||||
if (candidate.startsWith('/') && fs.existsSync(candidate)) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// Parse resolution string
|
||||
function parseResolution(resolution) {
|
||||
const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
|
||||
@@ -175,7 +151,7 @@ async function printToPdf(url) {
|
||||
|
||||
// Fall back to launching new browser
|
||||
if (!browser) {
|
||||
const executablePath = findChrome();
|
||||
const executablePath = findChromium();
|
||||
if (!executablePath) {
|
||||
return { success: false, error: 'Chrome binary not found' };
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_PATH
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
@@ -38,9 +38,9 @@ LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_PATH set correctly."""
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_PATH'] = str(NODE_MODULES_DIR)
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ import sys
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, PipProvider
|
||||
from abx_pkg import Binary, PipProvider, BinProviderOverrides
|
||||
|
||||
# Fix pydantic forward reference issue
|
||||
PipProvider.model_rebuild()
|
||||
@@ -87,6 +87,23 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
# Emit PATH update if pip bin dir not already in PATH
|
||||
pip_bin_dir = str(pip_venv_path / 'bin')
|
||||
current_path = os.environ.get('PATH', '')
|
||||
|
||||
# Check if pip_bin_dir is already in PATH
|
||||
path_dirs = current_path.split(':')
|
||||
if pip_bin_dir not in path_dirs:
|
||||
# Prepend pip_bin_dir to PATH
|
||||
new_path = f"{pip_bin_dir}:{current_path}" if current_path else pip_bin_dir
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/PATH',
|
||||
'value': new_path,
|
||||
}))
|
||||
click.echo(f" Added {pip_bin_dir} to PATH", err=True)
|
||||
|
||||
# Log human-readable info to stderr
|
||||
click.echo(f"Installed {name} at {binary.abspath}", err=True)
|
||||
click.echo(f" version: {binary.version}", err=True)
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'redirects';
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const crypto = require('crypto');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'responses';
|
||||
|
||||
@@ -40,7 +40,10 @@ if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
|
||||
// Now safe to require puppeteer
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const { findChromium } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'screenshot';
|
||||
@@ -96,36 +99,6 @@ function getCdpUrl() {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Find Chrome binary
|
||||
function findChrome() {
|
||||
const chromeBinary = getEnv('CHROME_BINARY');
|
||||
if (chromeBinary && fs.existsSync(chromeBinary)) {
|
||||
return chromeBinary;
|
||||
}
|
||||
|
||||
const candidates = [
|
||||
// Linux
|
||||
'/usr/bin/google-chrome',
|
||||
'/usr/bin/google-chrome-stable',
|
||||
'/usr/bin/chromium',
|
||||
'/usr/bin/chromium-browser',
|
||||
// macOS
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
// Common paths
|
||||
'google-chrome',
|
||||
'chromium',
|
||||
];
|
||||
|
||||
for (const candidate of candidates) {
|
||||
if (candidate.startsWith('/') && fs.existsSync(candidate)) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// Parse resolution string
|
||||
function parseResolution(resolution) {
|
||||
const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10));
|
||||
@@ -178,7 +151,7 @@ async function takeScreenshot(url) {
|
||||
|
||||
// Fall back to launching new browser
|
||||
if (!browser) {
|
||||
const executablePath = findChrome();
|
||||
const executablePath = findChromium();
|
||||
if (!executablePath) {
|
||||
return { success: false, error: 'Chrome binary not found' };
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_PATH
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
@@ -36,9 +36,9 @@ LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_PATH set correctly."""
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_PATH'] = str(NODE_MODULES_DIR)
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
@@ -17,6 +17,8 @@
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
|
||||
@@ -25,7 +25,7 @@ const { exec } = require('child_process');
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome/chrome_extension_utils.js');
|
||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'ssl';
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'staticfile';
|
||||
|
||||
@@ -22,7 +22,7 @@ const path = require('path');
|
||||
const fs = require('fs');
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome/chrome_extension_utils.js');
|
||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
|
||||
@@ -155,3 +155,461 @@ def test_large_extension_size():
|
||||
# uBlock Origin with filter lists is typically 2-5 MB
|
||||
size_bytes = crx_file.stat().st_size
|
||||
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
||||
|
||||
|
||||
def setup_test_lib_dirs(tmpdir: Path) -> dict:
    """Create isolated lib directories for tests and return env dict.

    Sets up:
        LIB_DIR: tmpdir/lib/<arch>
        NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
        NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
        PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
        PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin

    Installs puppeteer-core into the isolated node_modules if missing, and
    skips (rather than errors) the calling test when npm is unavailable,
    times out, or fails.
    """
    import platform
    arch = platform.machine()
    system = platform.system().lower()
    arch_dir = f"{arch}-{system}"

    lib_dir = tmpdir / 'lib' / arch_dir
    npm_dir = lib_dir / 'npm'
    node_modules_dir = npm_dir / 'node_modules'
    npm_bin_dir = npm_dir / 'bin'
    pip_venv_dir = lib_dir / 'pip' / 'venv'
    pip_bin_dir = pip_venv_dir / 'bin'

    # Create directories
    node_modules_dir.mkdir(parents=True, exist_ok=True)
    npm_bin_dir.mkdir(parents=True, exist_ok=True)
    pip_bin_dir.mkdir(parents=True, exist_ok=True)

    # Install puppeteer-core to the test node_modules if not present.
    # Any environmental failure (no npm on PATH, slow network) should skip
    # the test instead of erroring it out.
    if not (node_modules_dir / 'puppeteer-core').exists():
        try:
            result = subprocess.run(
                ['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
                capture_output=True,
                text=True,
                timeout=120
            )
        except FileNotFoundError:
            pytest.skip("npm not available on PATH")
        except subprocess.TimeoutExpired:
            pytest.skip("npm install of puppeteer-core timed out")
        if result.returncode != 0:
            pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")

    return {
        'LIB_DIR': str(lib_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'NPM_BIN_DIR': str(npm_bin_dir),
        'PIP_VENV_DIR': str(pip_venv_dir),
        'PIP_BIN_DIR': str(pip_bin_dir),
    }
|
||||
|
||||
|
||||
def find_chromium_binary():
    """Locate the Chromium executable installed by @puppeteer/browsers.

    Scans DATA_DIR/chromium/<version>/ (version dirs in reverse name order,
    so newest-named first) for each known per-platform binary layout and
    returns the first existing binary as a string, or None if none found.
    """
    # Relative location of the browser binary inside a version dir, per platform.
    candidate_layouts = (
        ('chrome-mac', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),      # macOS ARM
        ('chrome-mac-x64', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),  # macOS x64
        ('chrome-linux', 'chrome'),                                           # Linux
    )

    base_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
    if not base_dir.exists():
        return None

    for version_dir in sorted(base_dir.iterdir(), reverse=True):
        if not version_dir.is_dir():
            continue
        for layout in candidate_layouts:
            binary = version_dir.joinpath(*layout)
            if binary.exists():
                return str(binary)
    return None
|
||||
|
||||
|
||||
# Root of the plugins tree; PLUGIN_DIR is presumably this plugin's own
# directory (defined earlier in this module) — TODO confirm.
PLUGINS_ROOT = PLUGIN_DIR.parent
# Background hook script from the sibling `chrome` plugin that launches
# Chromium for a crawl; the tests below spawn it with node.
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'

# Test URL: ad blocker test page that shows if ads are blocked
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
|
||||
|
||||
|
||||
def test_extension_loads_in_chromium():
    """Verify uBlock extension loads in Chromium by visiting its dashboard page.

    Uses Chromium with --load-extension to load the extension, then navigates
    to chrome-extension://<id>/dashboard.html and checks that "uBlock" appears
    in the page content.

    Flow: (1) install extension via INSTALL_SCRIPT, (2) launch Chromium via
    the chrome plugin's launch hook, (3) connect over CDP with a generated
    puppeteer-core script and inspect extension targets.
    """
    import signal
    import time

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set up isolated lib directories for this test
        lib_env = setup_test_lib_dirs(tmpdir)

        # Set up extensions directory
        ext_dir = tmpdir / 'chrome_extensions'
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env.update(lib_env)
        env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
        env['CHROME_HEADLESS'] = 'true'

        # Ensure CHROME_BINARY points to Chromium
        chromium = find_chromium_binary()
        if chromium:
            env['CHROME_BINARY'] = chromium

        # Step 1: Install the uBlock extension
        result = subprocess.run(
            ['node', str(INSTALL_SCRIPT)],
            cwd=str(tmpdir),
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )
        assert result.returncode == 0, f"Extension install failed: {result.stderr}"

        # Verify extension cache was created
        cache_file = ext_dir / 'ublock.extension.json'
        assert cache_file.exists(), "Extension cache not created"
        ext_data = json.loads(cache_file.read_text())
        print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")

        # Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
        crawl_dir = tmpdir / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'

        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
            cwd=str(crawl_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )

        # Wait for Chromium to launch and CDP URL to be available
        cdp_url = None
        for i in range(20):
            if chrome_launch_process.poll() is not None:
                stdout, stderr = chrome_launch_process.communicate()
                raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
            cdp_file = chrome_dir / 'cdp_url.txt'
            if cdp_file.exists():
                cdp_url = cdp_file.read_text().strip()
                break
            time.sleep(1)

        assert cdp_url, "Chromium CDP URL not found after 20s"
        print(f"Chromium launched with CDP URL: {cdp_url}")

        # Print chrome hook stderr for debugging
        # Read what's available without blocking
        import select
        if select.select([chrome_launch_process.stderr], [], [], 0.1)[0]:
            # NOTE(review): .read() on a live pipe blocks until EOF even after
            # select() reports it readable; if this ever hangs, switch to
            # readline()/os.read(). Left as-is to preserve current behavior.
            chrome_stderr = chrome_launch_process.stderr.read()
            print(f"Chrome hook stderr:\n{chrome_stderr}")

        # Check what extensions were loaded by chrome hook
        extensions_file = chrome_dir / 'extensions.json'
        if extensions_file.exists():
            loaded_exts = json.loads(extensions_file.read_text())
            print(f"Extensions loaded by chrome hook: {[e.get('name') for e in loaded_exts]}")
        else:
            print("Warning: extensions.json not found")

        # Get the unpacked extension ID - Chrome computes this from the path
        unpacked_path = ext_data.get('unpacked_path', '')
        print(f"Extension unpacked path: {unpacked_path}")

        try:
            # Step 3: Connect to Chromium and verify extension loads
            # First use CDP to get all targets and find extension ID
            test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');

(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});

    // Wait for extension to initialize
    await new Promise(r => setTimeout(r, 3000));

    // Use CDP to get all targets including service workers
    const pages = await browser.pages();
    const page = pages[0] || await browser.newPage();
    const client = await page.createCDPSession();

    const {{ targetInfos }} = await client.send('Target.getTargets');
    console.error('All CDP targets:');
    targetInfos.forEach(t => console.error(' -', t.type, t.url.slice(0, 100)));

    // Find any chrome-extension:// URLs
    const extTargets = targetInfos.filter(t => t.url.startsWith('chrome-extension://'));
    console.error('Extension targets:', extTargets.length);

    // Filter out built-in extensions
    const builtinIds = ['nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf',
                        'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai'];
    const customExts = extTargets.filter(t => {{
        const extId = t.url.split('://')[1].split('/')[0];
        return !builtinIds.includes(extId);
    }});

    if (customExts.length === 0) {{
        console.log(JSON.stringify({{ loaded: false, error: 'No custom extension found via CDP' }}));
        browser.disconnect();
        return;
    }}

    // Get extension ID from first custom extension
    const extId = customExts[0].url.split('://')[1].split('/')[0];
    console.error('Found extension ID:', extId);

    // Try to load dashboard.html
    const newPage = await browser.newPage();
    const dashboardUrl = 'chrome-extension://' + extId + '/dashboard.html';
    console.error('Loading:', dashboardUrl);

    try {{
        await newPage.goto(dashboardUrl, {{ waitUntil: 'domcontentloaded', timeout: 15000 }});
        const title = await newPage.title();
        const content = await newPage.content();
        const hasUblock = content.toLowerCase().includes('ublock') || title.toLowerCase().includes('ublock');

        console.log(JSON.stringify({{
            loaded: true,
            extensionId: extId,
            pageTitle: title,
            hasExtensionName: hasUblock,
            contentLength: content.length
        }}));
    }} catch (e) {{
        console.error('Dashboard load failed:', e.message);
        console.log(JSON.stringify({{ loaded: true, extensionId: extId, dashboardError: e.message }}));
    }}

    browser.disconnect();
}})();
'''
            script_path = tmpdir / 'test_ublock.js'
            script_path.write_text(test_script)

            result = subprocess.run(
                ['node', str(script_path)],
                cwd=str(tmpdir),
                capture_output=True,
                text=True,
                env=env,
                timeout=90
            )

            print(f"stderr: {result.stderr}")
            print(f"stdout: {result.stdout}")

            assert result.returncode == 0, f"Test failed: {result.stderr}"

            # The node script prints debug info to stderr and a single JSON
            # result object to stdout; take the last JSON-looking line.
            output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
            assert output_lines, f"No JSON output: {result.stdout}"

            test_result = json.loads(output_lines[-1])
            assert test_result.get('loaded'), \
                f"uBlock extension should be loaded in Chromium. Result: {test_result}"
            print(f"Extension loaded successfully: {test_result}")

        finally:
            # Clean up Chromium: best-effort — the launch hook may already
            # have exited. Catch Exception (not bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:
                pass
            chrome_pid_file = chrome_dir / 'chrome.pid'
            if chrome_pid_file.exists():
                try:
                    chrome_pid = int(chrome_pid_file.read_text().strip())
                    os.kill(chrome_pid, signal.SIGKILL)
                except (OSError, ValueError):
                    pass
|
||||
|
||||
|
||||
def test_blocks_ads_on_test_page():
    """Live test: verify uBlock Origin blocks ads on a test page.

    Uses Chromium with extensions loaded automatically via chrome hook.
    Tests against d3ward's ad blocker test page which checks ad domains.

    Flow mirrors test_extension_loads_in_chromium: install extension,
    launch Chromium via the chrome launch hook, then drive it over CDP
    with a generated puppeteer-core script and read the page's blocked
    percentage.
    """
    import signal
    import time

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set up isolated lib directories for this test
        lib_env = setup_test_lib_dirs(tmpdir)

        # Set up extensions directory
        ext_dir = tmpdir / 'chrome_extensions'
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env.update(lib_env)
        env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
        env['CHROME_HEADLESS'] = 'true'

        # Ensure CHROME_BINARY points to Chromium
        chromium = find_chromium_binary()
        if chromium:
            env['CHROME_BINARY'] = chromium

        # Step 1: Install the uBlock extension
        result = subprocess.run(
            ['node', str(INSTALL_SCRIPT)],
            cwd=str(tmpdir),
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )
        assert result.returncode == 0, f"Extension install failed: {result.stderr}"

        # Verify extension cache was created
        cache_file = ext_dir / 'ublock.extension.json'
        assert cache_file.exists(), "Extension cache not created"
        ext_data = json.loads(cache_file.read_text())
        print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")

        # Step 2: Launch Chrome using the chrome hook (loads extensions automatically)
        crawl_dir = tmpdir / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'

        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
            cwd=str(crawl_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )

        # Wait for Chrome to launch and CDP URL to be available
        cdp_url = None
        for i in range(20):
            if chrome_launch_process.poll() is not None:
                stdout, stderr = chrome_launch_process.communicate()
                raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
            cdp_file = chrome_dir / 'cdp_url.txt'
            if cdp_file.exists():
                cdp_url = cdp_file.read_text().strip()
                break
            time.sleep(1)

        assert cdp_url, "Chrome CDP URL not found after 20s"
        print(f"Chrome launched with CDP URL: {cdp_url}")

        # Check that extensions were loaded
        extensions_file = chrome_dir / 'extensions.json'
        if extensions_file.exists():
            loaded_exts = json.loads(extensions_file.read_text())
            print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")

        try:
            # Step 3: Connect to Chrome and test ad blocking
            test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');

(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});

    // Wait for extension to initialize
    await new Promise(r => setTimeout(r, 3000));

    // Check extension loaded by looking at targets
    const targets = browser.targets();
    const extTargets = targets.filter(t =>
        t.url().startsWith('chrome-extension://') ||
        t.type() === 'service_worker' ||
        t.type() === 'background_page'
    );
    console.error('Extension targets found:', extTargets.length);
    extTargets.forEach(t => console.error(' -', t.type(), t.url().substring(0, 60)));

    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
    await page.setViewport({{ width: 1440, height: 900 }});

    console.error('Navigating to {TEST_URL}...');
    await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }});

    // Wait for the test page to run its checks
    await new Promise(r => setTimeout(r, 5000));

    // The d3ward test page shows blocked percentage
    const result = await page.evaluate(() => {{
        const scoreEl = document.querySelector('#score');
        const score = scoreEl ? scoreEl.textContent : null;
        const blockedItems = document.querySelectorAll('.blocked').length;
        const totalItems = document.querySelectorAll('.testlist li').length;
        return {{
            score,
            blockedItems,
            totalItems,
            percentBlocked: totalItems > 0 ? Math.round((blockedItems / totalItems) * 100) : 0
        }};
    }});

    console.error('Ad blocking result:', JSON.stringify(result));
    browser.disconnect();
    console.log(JSON.stringify(result));
}})();
'''
            script_path = tmpdir / 'test_ublock.js'
            script_path.write_text(test_script)

            result = subprocess.run(
                ['node', str(script_path)],
                cwd=str(tmpdir),
                capture_output=True,
                text=True,
                env=env,
                timeout=90
            )

            print(f"stderr: {result.stderr}")
            print(f"stdout: {result.stdout}")

            assert result.returncode == 0, f"Test failed: {result.stderr}"

            # Debug info goes to stderr; a single JSON result object goes to
            # stdout — take the last JSON-looking line.
            output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
            assert output_lines, f"No JSON output: {result.stdout}"

            test_result = json.loads(output_lines[-1])

            # uBlock should block most ad domains on the test page
            assert test_result['percentBlocked'] >= 50, \
                f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}"

        finally:
            # Clean up Chrome: best-effort — the launch hook may already have
            # exited. Catch Exception (not bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:
                pass
            chrome_pid_file = chrome_dir / 'chrome.pid'
            if chrome_pid_file.exists():
                try:
                    chrome_pid = int(chrome_pid_file.read_text().strip())
                    os.kill(chrome_pid, signal.SIGKILL)
                except (OSError, ValueError):
                    pass
|
||||
|
||||
Reference in New Issue
Block a user