mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-04 09:55:33 +10:00
messing with chrome install process to reuse cached chromium with pinned version
This commit is contained in:
@@ -433,6 +433,103 @@ async function killChrome(pid, outputDir = null) {
|
|||||||
console.error('[*] Chrome process killed');
|
console.error('[*] Chrome process killed');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Install Chromium using @puppeteer/browsers programmatic API.
|
||||||
|
* Uses puppeteer's default cache location, returns the binary path.
|
||||||
|
*
|
||||||
|
* @param {Object} options - Install options
|
||||||
|
* @returns {Promise<Object>} - {success, binary, version, error}
|
||||||
|
*/
|
||||||
|
async function installChromium(options = {}) {
|
||||||
|
// Check if CHROME_BINARY is already set and valid
|
||||||
|
const configuredBinary = getEnv('CHROME_BINARY');
|
||||||
|
if (configuredBinary && fs.existsSync(configuredBinary)) {
|
||||||
|
console.error(`[+] Using configured CHROME_BINARY: ${configuredBinary}`);
|
||||||
|
return { success: true, binary: configuredBinary, version: null };
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to load @puppeteer/browsers from NODE_MODULES_DIR or system
|
||||||
|
let puppeteerBrowsers;
|
||||||
|
try {
|
||||||
|
if (process.env.NODE_MODULES_DIR) {
|
||||||
|
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||||
|
}
|
||||||
|
puppeteerBrowsers = require('@puppeteer/browsers');
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`[!] @puppeteer/browsers not found. Install it first with installPuppeteerCore.`);
|
||||||
|
return { success: false, error: '@puppeteer/browsers not installed' };
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error(`[*] Installing Chromium via @puppeteer/browsers...`);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const result = await puppeteerBrowsers.install({
|
||||||
|
browser: 'chromium',
|
||||||
|
buildId: 'latest',
|
||||||
|
});
|
||||||
|
|
||||||
|
const binary = result.executablePath;
|
||||||
|
const version = result.buildId;
|
||||||
|
|
||||||
|
if (!binary || !fs.existsSync(binary)) {
|
||||||
|
console.error(`[!] Chromium binary not found at: ${binary}`);
|
||||||
|
return { success: false, error: `Chromium binary not found at: ${binary}` };
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error(`[+] Chromium installed: ${binary}`);
|
||||||
|
return { success: true, binary, version };
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`[!] Failed to install Chromium: ${e.message}`);
|
||||||
|
return { success: false, error: e.message };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Install puppeteer-core npm package.
|
||||||
|
*
|
||||||
|
* @param {Object} options - Install options
|
||||||
|
* @param {string} [options.npmPrefix] - npm prefix directory (default: DATA_DIR/lib/<arch>/npm or ./node_modules parent)
|
||||||
|
* @param {number} [options.timeout=60000] - Timeout in milliseconds
|
||||||
|
* @returns {Promise<Object>} - {success, path, error}
|
||||||
|
*/
|
||||||
|
async function installPuppeteerCore(options = {}) {
|
||||||
|
const arch = `${process.arch}-${process.platform}`;
|
||||||
|
const defaultPrefix = path.join(getEnv('LIB_DIR', getEnv('DATA_DIR', '.')), 'npm');
|
||||||
|
const {
|
||||||
|
npmPrefix = defaultPrefix,
|
||||||
|
timeout = 60000,
|
||||||
|
} = options;
|
||||||
|
|
||||||
|
const nodeModulesDir = path.join(npmPrefix, 'node_modules');
|
||||||
|
const puppeteerPath = path.join(nodeModulesDir, 'puppeteer-core');
|
||||||
|
|
||||||
|
// Check if already installed
|
||||||
|
if (fs.existsSync(puppeteerPath)) {
|
||||||
|
console.error(`[+] puppeteer-core already installed: ${puppeteerPath}`);
|
||||||
|
return { success: true, path: puppeteerPath };
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error(`[*] Installing puppeteer-core to ${npmPrefix}...`);
|
||||||
|
|
||||||
|
// Create directory
|
||||||
|
if (!fs.existsSync(npmPrefix)) {
|
||||||
|
fs.mkdirSync(npmPrefix, { recursive: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const { execSync } = require('child_process');
|
||||||
|
execSync(
|
||||||
|
`npm install --prefix "${npmPrefix}" puppeteer-core`,
|
||||||
|
{ encoding: 'utf8', timeout, stdio: ['pipe', 'pipe', 'pipe'] }
|
||||||
|
);
|
||||||
|
console.error(`[+] puppeteer-core installed successfully`);
|
||||||
|
return { success: true, path: puppeteerPath };
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`[!] Failed to install puppeteer-core: ${e.message}`);
|
||||||
|
return { success: false, error: e.message };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Try to import unzipper, fallback to system unzip if not available
|
// Try to import unzipper, fallback to system unzip if not available
|
||||||
let unzip = null;
|
let unzip = null;
|
||||||
try {
|
try {
|
||||||
@@ -932,78 +1029,88 @@ function getExtensionTargets(browser) {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Find Chromium/Chrome binary path.
|
* Find Chromium/Chrome binary path.
|
||||||
* Prefers Chromium over Chrome because Chrome 137+ removed --load-extension support.
|
* Checks CHROME_BINARY env var first, then falls back to system locations.
|
||||||
*
|
*
|
||||||
* @param {string} [dataDir] - Data directory to check for puppeteer installs
|
|
||||||
* @returns {string|null} - Absolute path to browser binary or null if not found
|
* @returns {string|null} - Absolute path to browser binary or null if not found
|
||||||
*/
|
*/
|
||||||
function findChromium(dataDir = null) {
|
function findChromium() {
|
||||||
// Check CHROME_BINARY env var first
|
const { execSync } = require('child_process');
|
||||||
const chromeBinary = (process.env.CHROME_BINARY || '').trim();
|
|
||||||
if (chromeBinary && fs.existsSync(chromeBinary)) {
|
// Helper to validate a binary by running --version
|
||||||
// Ensure absolute path
|
const validateBinary = (binaryPath) => {
|
||||||
return path.resolve(chromeBinary);
|
if (!binaryPath || !fs.existsSync(binaryPath)) return false;
|
||||||
|
try {
|
||||||
|
execSync(`"${binaryPath}" --version`, { encoding: 'utf8', timeout: 5000, stdio: 'pipe' });
|
||||||
|
return true;
|
||||||
|
} catch (e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// 1. Check CHROME_BINARY env var first
|
||||||
|
const chromeBinary = getEnv('CHROME_BINARY');
|
||||||
|
if (chromeBinary) {
|
||||||
|
const absPath = path.resolve(chromeBinary);
|
||||||
|
if (validateBinary(absPath)) {
|
||||||
|
return absPath;
|
||||||
|
}
|
||||||
|
console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Warn that no CHROME_BINARY is configured, searching fallbacks
|
||||||
|
if (!chromeBinary) {
|
||||||
|
console.error('[!] Warning: CHROME_BINARY not set, searching system locations...');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper to find Chromium in @puppeteer/browsers directory structure
|
// Helper to find Chromium in @puppeteer/browsers directory structure
|
||||||
// Always returns absolute paths
|
|
||||||
const findInPuppeteerDir = (baseDir) => {
|
const findInPuppeteerDir = (baseDir) => {
|
||||||
const absBaseDir = path.resolve(baseDir);
|
if (!fs.existsSync(baseDir)) return null;
|
||||||
if (!fs.existsSync(absBaseDir)) return null;
|
|
||||||
try {
|
try {
|
||||||
const versions = fs.readdirSync(absBaseDir);
|
const versions = fs.readdirSync(baseDir);
|
||||||
for (const version of versions.sort().reverse()) {
|
for (const version of versions.sort().reverse()) {
|
||||||
const versionDir = path.join(absBaseDir, version);
|
const versionDir = path.join(baseDir, version);
|
||||||
// Check for macOS ARM structure
|
const candidates = [
|
||||||
const macArmBinary = path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium');
|
path.join(versionDir, 'chrome-mac-arm64/Chromium.app/Contents/MacOS/Chromium'),
|
||||||
if (fs.existsSync(macArmBinary)) return macArmBinary;
|
path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
|
||||||
// Check for macOS x64 structure
|
path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'),
|
||||||
const macX64Binary = path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium');
|
path.join(versionDir, 'chrome-linux64/chrome'),
|
||||||
if (fs.existsSync(macX64Binary)) return macX64Binary;
|
path.join(versionDir, 'chrome-linux/chrome'),
|
||||||
// Check for Linux structure
|
];
|
||||||
const linuxBinary = path.join(versionDir, 'chrome-linux/chrome');
|
for (const c of candidates) {
|
||||||
if (fs.existsSync(linuxBinary)) return linuxBinary;
|
if (fs.existsSync(c)) return c;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {}
|
||||||
// Continue
|
|
||||||
}
|
|
||||||
return null;
|
return null;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Check @puppeteer/browsers install locations
|
// 3. Search fallback locations (Chromium first, then Chrome)
|
||||||
const puppeteerDirs = [
|
const fallbackLocations = [
|
||||||
// Local project install (from npx @puppeteer/browsers install)
|
// System Chromium
|
||||||
path.join(dataDir || process.env.DATA_DIR || '.', 'chromium'),
|
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||||
path.join(process.cwd(), 'chromium'),
|
|
||||||
// User cache locations
|
|
||||||
path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
|
|
||||||
];
|
|
||||||
|
|
||||||
for (const puppeteerDir of puppeteerDirs) {
|
|
||||||
const binary = findInPuppeteerDir(puppeteerDir);
|
|
||||||
if (binary) return binary;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check standard system locations
|
|
||||||
const candidates = [
|
|
||||||
// Linux Chromium
|
|
||||||
'/usr/bin/chromium',
|
'/usr/bin/chromium',
|
||||||
'/usr/bin/chromium-browser',
|
'/usr/bin/chromium-browser',
|
||||||
// macOS Chromium (Homebrew or manual install)
|
// Puppeteer cache
|
||||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
|
||||||
// Fallback to Chrome (extension loading may not work in Chrome 137+)
|
path.join(process.env.HOME || '', '.cache/puppeteer'),
|
||||||
|
// Chrome (fallback - extensions may not work in 137+)
|
||||||
|
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||||
'/usr/bin/google-chrome',
|
'/usr/bin/google-chrome',
|
||||||
'/usr/bin/google-chrome-stable',
|
'/usr/bin/google-chrome-stable',
|
||||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
|
||||||
];
|
];
|
||||||
|
|
||||||
for (const candidate of candidates) {
|
for (const loc of fallbackLocations) {
|
||||||
if (fs.existsSync(candidate)) {
|
// Check if it's a puppeteer cache dir
|
||||||
// Warn if falling back to Chrome
|
if (loc.includes('.cache/puppeteer')) {
|
||||||
if (candidate.includes('google-chrome') || candidate.includes('Google Chrome')) {
|
const binary = findInPuppeteerDir(loc);
|
||||||
|
if (binary && validateBinary(binary)) {
|
||||||
|
return binary;
|
||||||
|
}
|
||||||
|
} else if (validateBinary(loc)) {
|
||||||
|
if (loc.includes('Google Chrome') || loc.includes('google-chrome')) {
|
||||||
console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
|
console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
|
||||||
}
|
}
|
||||||
return candidate;
|
return loc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1028,6 +1135,9 @@ module.exports = {
|
|||||||
// Chrome launching
|
// Chrome launching
|
||||||
launchChromium,
|
launchChromium,
|
||||||
killChrome,
|
killChrome,
|
||||||
|
// Chrome/Chromium install
|
||||||
|
installChromium,
|
||||||
|
installPuppeteerCore,
|
||||||
// Chrome/Chromium binary finding
|
// Chrome/Chromium binary finding
|
||||||
findChromium,
|
findChromium,
|
||||||
// Extension utilities
|
// Extension utilities
|
||||||
@@ -1055,7 +1165,9 @@ if (require.main === module) {
|
|||||||
console.log('Usage: chrome_utils.js <command> [args...]');
|
console.log('Usage: chrome_utils.js <command> [args...]');
|
||||||
console.log('');
|
console.log('');
|
||||||
console.log('Commands:');
|
console.log('Commands:');
|
||||||
console.log(' findChromium [data_dir]');
|
console.log(' findChromium');
|
||||||
|
console.log(' installChromium');
|
||||||
|
console.log(' installPuppeteerCore [npm_prefix]');
|
||||||
console.log(' launchChromium [output_dir] [extension_paths_json]');
|
console.log(' launchChromium [output_dir] [extension_paths_json]');
|
||||||
console.log(' killChrome <pid> [output_dir]');
|
console.log(' killChrome <pid> [output_dir]');
|
||||||
console.log(' killZombieChrome [data_dir]');
|
console.log(' killZombieChrome [data_dir]');
|
||||||
@@ -1072,8 +1184,7 @@ if (require.main === module) {
|
|||||||
try {
|
try {
|
||||||
switch (command) {
|
switch (command) {
|
||||||
case 'findChromium': {
|
case 'findChromium': {
|
||||||
const [dataDir] = commandArgs;
|
const binary = findChromium();
|
||||||
const binary = findChromium(dataDir);
|
|
||||||
if (binary) {
|
if (binary) {
|
||||||
console.log(binary);
|
console.log(binary);
|
||||||
} else {
|
} else {
|
||||||
@@ -1083,6 +1194,32 @@ if (require.main === module) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case 'installChromium': {
|
||||||
|
const result = await installChromium();
|
||||||
|
if (result.success) {
|
||||||
|
console.log(JSON.stringify({
|
||||||
|
binary: result.binary,
|
||||||
|
version: result.version,
|
||||||
|
}));
|
||||||
|
} else {
|
||||||
|
console.error(result.error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'installPuppeteerCore': {
|
||||||
|
const [npmPrefix] = commandArgs;
|
||||||
|
const result = await installPuppeteerCore({ npmPrefix: npmPrefix || undefined });
|
||||||
|
if (result.success) {
|
||||||
|
console.log(JSON.stringify({ path: result.path }));
|
||||||
|
} else {
|
||||||
|
console.error(result.error);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case 'launchChromium': {
|
case 'launchChromium': {
|
||||||
const [outputDir, extensionPathsJson] = commandArgs;
|
const [outputDir, extensionPathsJson] = commandArgs;
|
||||||
const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : [];
|
const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : [];
|
||||||
|
|||||||
@@ -1,11 +1,11 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Install hook for Chrome/Chromium binary.
|
Install hook for Chrome/Chromium and puppeteer-core.
|
||||||
|
|
||||||
Runs at crawl start to verify Chromium is available.
|
Runs at crawl start to install/find Chromium and puppeteer-core.
|
||||||
Outputs JSONL for Binary and Machine config updates.
|
Outputs JSONL for Binary and Machine config updates.
|
||||||
Respects CHROME_BINARY env var for custom binary paths.
|
Respects CHROME_BINARY env var for custom binary paths.
|
||||||
Falls back to `npx @puppeteer/browsers install chromium@latest` if not found.
|
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
|
||||||
|
|
||||||
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||||
--load-extension and --disable-extensions-except flags, which are needed for
|
--load-extension and --disable-extensions-except flags, which are needed for
|
||||||
@@ -16,73 +16,139 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
import subprocess
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def install_chromium_via_puppeteer() -> bool:
|
def get_chrome_version(binary_path: str) -> str | None:
|
||||||
"""Install Chromium using @puppeteer/browsers."""
|
"""Get Chrome/Chromium version string."""
|
||||||
try:
|
try:
|
||||||
print("Chromium not found, attempting to install via @puppeteer/browsers...", file=sys.stderr)
|
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
|
[binary_path, '--version'],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=300
|
timeout=5
|
||||||
)
|
)
|
||||||
return result.returncode == 0
|
if result.returncode == 0:
|
||||||
except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e:
|
return result.stdout.strip()
|
||||||
print(f"Failed to install Chromium: {e}", file=sys.stderr)
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def install_puppeteer_core() -> bool:
|
||||||
|
"""Install puppeteer-core to NODE_MODULES_DIR if not present."""
|
||||||
|
node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
|
||||||
|
if not node_modules_dir:
|
||||||
|
# No isolated node_modules, skip (will use global)
|
||||||
|
return True
|
||||||
|
|
||||||
|
node_modules_path = Path(node_modules_dir)
|
||||||
|
if (node_modules_path / 'puppeteer-core').exists():
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Get npm prefix from NODE_MODULES_DIR (parent of node_modules)
|
||||||
|
npm_prefix = node_modules_path.parent
|
||||||
|
|
||||||
|
try:
|
||||||
|
print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
|
||||||
|
result = subprocess.run(
|
||||||
|
['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=60
|
||||||
|
)
|
||||||
|
if result.returncode == 0:
|
||||||
|
print(f"[+] puppeteer-core installed", file=sys.stderr)
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr)
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def find_chromium() -> dict | None:
|
def install_chromium() -> dict | None:
|
||||||
"""Find Chromium binary, respecting CHROME_BINARY env var."""
|
"""Install Chromium using @puppeteer/browsers and parse output for binary path.
|
||||||
# Quick check: if CHROME_BINARY is set and exists, skip expensive lookup
|
|
||||||
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
|
|
||||||
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
|
|
||||||
# Binary is already configured and valid - exit immediately
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
|
Output format: "chromium@<version> <path_to_binary>"
|
||||||
|
e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"
|
||||||
|
|
||||||
|
Note: npx is fast when chromium is already cached - it returns the path without re-downloading.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BrewProvider, AptProvider
|
print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
|
||||||
|
|
||||||
# Try to find chromium using abx-pkg
|
# Use --path to install to puppeteer's standard cache location
|
||||||
# Prefer chromium over chrome because Chrome 137+ removed --load-extension support
|
cache_path = os.path.expanduser('~/.cache/puppeteer')
|
||||||
binary = Binary(
|
|
||||||
name='chromium',
|
result = subprocess.run(
|
||||||
binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()],
|
['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
|
||||||
overrides={'npm': {'packages': ['@puppeteer/browsers']}}
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
stdin=subprocess.DEVNULL,
|
||||||
|
timeout=300
|
||||||
)
|
)
|
||||||
|
|
||||||
loaded = binary.load()
|
if result.returncode != 0:
|
||||||
if loaded and loaded.abspath:
|
print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
|
||||||
return {
|
return None
|
||||||
'name': 'chromium',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
|
||||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
|
||||||
}
|
|
||||||
|
|
||||||
# If not found, try to install via @puppeteer/browsers
|
# Parse output: "chromium@1563294 /path/to/Chromium"
|
||||||
if install_chromium_via_puppeteer():
|
output = result.stdout.strip()
|
||||||
# Try loading again after install
|
parts = output.split(' ', 1)
|
||||||
loaded = binary.load()
|
if len(parts) != 2:
|
||||||
if loaded and loaded.abspath:
|
print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
|
||||||
return {
|
return None
|
||||||
'name': 'chromium',
|
|
||||||
'abspath': str(loaded.abspath),
|
version_str = parts[0] # "chromium@1563294"
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
binary_path = parts[1].strip()
|
||||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
|
||||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'npm',
|
if not binary_path or not os.path.exists(binary_path):
|
||||||
}
|
print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
|
||||||
except Exception:
|
return None
|
||||||
pass
|
|
||||||
|
# Extract version number
|
||||||
|
version = version_str.split('@')[1] if '@' in version_str else None
|
||||||
|
|
||||||
|
print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'name': 'chromium',
|
||||||
|
'abspath': binary_path,
|
||||||
|
'version': version,
|
||||||
|
'binprovider': 'puppeteer',
|
||||||
|
}
|
||||||
|
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
print("[!] Chromium install timed out", file=sys.stderr)
|
||||||
|
except FileNotFoundError:
|
||||||
|
print("[!] npx not found - is Node.js installed?", file=sys.stderr)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
result = find_chromium()
|
# Install puppeteer-core if NODE_MODULES_DIR is set
|
||||||
|
install_puppeteer_core()
|
||||||
|
|
||||||
|
# Check if CHROME_BINARY is already set and valid
|
||||||
|
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
|
||||||
|
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
|
||||||
|
version = get_chrome_version(configured_binary)
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Binary',
|
||||||
|
'name': 'chromium',
|
||||||
|
'abspath': configured_binary,
|
||||||
|
'version': version,
|
||||||
|
'binprovider': 'env',
|
||||||
|
}))
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# Install/find Chromium via puppeteer
|
||||||
|
result = install_chromium()
|
||||||
|
|
||||||
if result and result.get('abspath'):
|
if result and result.get('abspath'):
|
||||||
print(json.dumps({
|
print(json.dumps({
|
||||||
@@ -110,7 +176,7 @@ def main():
|
|||||||
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
else:
|
else:
|
||||||
print(f"Chromium binary not found", file=sys.stderr)
|
print("Chromium binary not found", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -67,28 +67,29 @@ def get_test_env():
|
|||||||
return env
|
return env
|
||||||
|
|
||||||
|
|
||||||
def find_chromium_binary():
|
def find_chromium_binary(data_dir=None):
|
||||||
"""Find the Chromium binary installed by @puppeteer/browsers."""
|
"""Find the Chromium binary using chrome_utils.js findChromium().
|
||||||
if not CHROMIUM_INSTALL_DIR.exists():
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Look for versioned directories
|
This uses the centralized findChromium() function which checks:
|
||||||
for version_dir in sorted(CHROMIUM_INSTALL_DIR.iterdir(), reverse=True):
|
- CHROME_BINARY env var
|
||||||
if not version_dir.is_dir():
|
- @puppeteer/browsers install locations (in data_dir/chromium)
|
||||||
continue
|
- System Chromium locations
|
||||||
# macOS ARM
|
- Falls back to Chrome (with warning)
|
||||||
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
|
||||||
if mac_arm.exists():
|
|
||||||
return str(mac_arm)
|
|
||||||
# macOS x64
|
|
||||||
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
|
||||||
if mac_x64.exists():
|
|
||||||
return str(mac_x64)
|
|
||||||
# Linux
|
|
||||||
linux = version_dir / 'chrome-linux' / 'chrome'
|
|
||||||
if linux.exists():
|
|
||||||
return str(linux)
|
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data_dir: Directory where chromium was installed (contains chromium/ subdir)
|
||||||
|
"""
|
||||||
|
chrome_utils = PLUGIN_DIR / 'chrome_utils.js'
|
||||||
|
# Use provided data_dir, or fall back to env var, or current dir
|
||||||
|
search_dir = data_dir or os.environ.get('DATA_DIR', '.')
|
||||||
|
result = subprocess.run(
|
||||||
|
['node', str(chrome_utils), 'findChromium', str(search_dir)],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=10
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout.strip():
|
||||||
|
return result.stdout.strip()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -171,31 +171,30 @@ def setup_test_lib_dirs(tmpdir: Path) -> dict:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def find_chromium_binary():
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
"""Find the Chromium binary installed by @puppeteer/browsers."""
|
|
||||||
chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
|
||||||
if not chromium_dir.exists():
|
|
||||||
return None
|
|
||||||
|
|
||||||
for version_dir in sorted(chromium_dir.iterdir(), reverse=True):
|
|
||||||
if not version_dir.is_dir():
|
def find_chromium_binary():
|
||||||
continue
|
"""Find the Chromium binary using chrome_utils.js findChromium().
|
||||||
# macOS ARM
|
|
||||||
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
This uses the centralized findChromium() function which checks:
|
||||||
if mac_arm.exists():
|
- CHROME_BINARY env var
|
||||||
return str(mac_arm)
|
- @puppeteer/browsers install locations
|
||||||
# macOS x64
|
- System Chromium locations
|
||||||
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
- Falls back to Chrome (with warning)
|
||||||
if mac_x64.exists():
|
"""
|
||||||
return str(mac_x64)
|
chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js'
|
||||||
# Linux
|
result = subprocess.run(
|
||||||
linux = version_dir / 'chrome-linux' / 'chrome'
|
['node', str(chrome_utils), 'findChromium'],
|
||||||
if linux.exists():
|
capture_output=True,
|
||||||
return str(linux)
|
text=True,
|
||||||
|
timeout=10
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout.strip():
|
||||||
|
return result.stdout.strip()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
|
||||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||||
|
|
||||||
TEST_URL = 'https://www.filmin.es/'
|
TEST_URL = 'https://www.filmin.es/'
|
||||||
|
|||||||
@@ -157,54 +157,94 @@ def test_large_extension_size():
|
|||||||
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
||||||
|
|
||||||
|
|
||||||
def setup_test_lib_dirs(tmpdir: Path) -> dict:
|
|
||||||
"""Get lib directories for tests, using project's existing node_modules.
|
|
||||||
|
|
||||||
Uses the project's node_modules to avoid slow npm install during tests.
|
|
||||||
"""
|
|
||||||
# Use project's existing node_modules (puppeteer-core already installed)
|
|
||||||
project_root = Path(__file__).parent.parent.parent.parent.parent
|
|
||||||
node_modules_dir = project_root / 'node_modules'
|
|
||||||
|
|
||||||
if not (node_modules_dir / 'puppeteer-core').exists():
|
|
||||||
pytest.skip("puppeteer-core not installed in project node_modules")
|
|
||||||
|
|
||||||
return {
|
|
||||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def find_chromium_binary():
|
|
||||||
"""Find the Chromium binary installed by @puppeteer/browsers."""
|
|
||||||
chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
|
||||||
if not chromium_dir.exists():
|
|
||||||
return None
|
|
||||||
|
|
||||||
for version_dir in sorted(chromium_dir.iterdir(), reverse=True):
|
|
||||||
if not version_dir.is_dir():
|
|
||||||
continue
|
|
||||||
# macOS ARM
|
|
||||||
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
|
||||||
if mac_arm.exists():
|
|
||||||
return str(mac_arm)
|
|
||||||
# macOS x64
|
|
||||||
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
|
||||||
if mac_x64.exists():
|
|
||||||
return str(mac_x64)
|
|
||||||
# Linux
|
|
||||||
linux = version_dir / 'chrome-linux' / 'chrome'
|
|
||||||
if linux.exists():
|
|
||||||
return str(linux)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
|
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
|
||||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||||
|
|
||||||
|
|
||||||
|
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure like:
        <tmpdir>/data/
            lib/
                arm64-darwin/   (or x86_64-linux, etc.)
                    npm/
                        bin/
                        node_modules/
            chrome_extensions/

    Calls chrome install hook which handles puppeteer-core and chromium installation.
    The calling test is skipped (via ``pytest.skip``) when the hook times out,
    exits non-zero, or does not report an existing Chromium binary.

    Args:
        tmpdir: Directory under which the isolated ``data/`` tree is created.

    Returns:
        A copy of ``os.environ`` extended with DATA_DIR, LIB_DIR, MACHINE_TYPE,
        NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_EXTENSIONS_DIR and CHROME_BINARY.
    """
    import platform
    import sys

    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"

    # Create proper directory structure
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / 'bin'
    node_modules_dir = npm_dir / 'node_modules'
    chrome_extensions_dir = data_dir / 'chrome_extensions'

    # Create all directories
    for directory in (node_modules_dir, npm_bin_dir, chrome_extensions_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
    })

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # sys.executable runs the hook under the same interpreter as the tests
    # instead of whatever "python" happens to resolve to on PATH.
    try:
        result = subprocess.run(
            [sys.executable, str(CHROME_INSTALL_HOOK)],
            capture_output=True, text=True, timeout=10, env=env
        )
    except subprocess.TimeoutExpired as err:
        # A slow install is an environment problem, not a test failure:
        # skip the same way the other hook failures below do.
        pytest.skip(f"Chrome install hook timed out after {err.timeout}s")
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY (first Binary record with an abspath)
    chrome_binary = None
    for line in result.stdout.splitlines():
        if not line.strip():
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (list, string, number) has no .get
        if isinstance(data, dict) and data.get('type') == 'Binary' and data.get('abspath'):
            chrome_binary = data['abspath']
            break

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
|
||||||
|
|
||||||
|
|
||||||
# Test URL: ad blocker test page that shows if ads are blocked
|
# Test URL: ad blocker test page that shows if ads are blocked
|
||||||
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
|
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.timeout(15)
|
||||||
def test_extension_loads_in_chromium():
|
def test_extension_loads_in_chromium():
|
||||||
"""Verify uBlock extension loads in Chromium by visiting its dashboard page.
|
"""Verify uBlock extension loads in Chromium by visiting its dashboard page.
|
||||||
|
|
||||||
@@ -214,35 +254,30 @@ def test_extension_loads_in_chromium():
|
|||||||
"""
|
"""
|
||||||
import signal
|
import signal
|
||||||
import time
|
import time
|
||||||
|
print("[test] Starting test_extension_loads_in_chromium", flush=True)
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
tmpdir = Path(tmpdir)
|
tmpdir = Path(tmpdir)
|
||||||
|
print(f"[test] tmpdir={tmpdir}", flush=True)
|
||||||
|
|
||||||
# Set up isolated lib directories for this test
|
# Set up isolated env with proper directory structure
|
||||||
lib_env = setup_test_lib_dirs(tmpdir)
|
env = setup_test_env(tmpdir)
|
||||||
|
env.setdefault('CHROME_HEADLESS', 'true')
|
||||||
|
print(f"[test] DATA_DIR={env.get('DATA_DIR')}", flush=True)
|
||||||
|
print(f"[test] CHROME_BINARY={env.get('CHROME_BINARY')}", flush=True)
|
||||||
|
|
||||||
# Set up extensions directory
|
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
|
||||||
ext_dir = tmpdir / 'chrome_extensions'
|
|
||||||
ext_dir.mkdir(parents=True)
|
|
||||||
|
|
||||||
env = os.environ.copy()
|
|
||||||
env.update(lib_env)
|
|
||||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
|
||||||
env['CHROME_HEADLESS'] = 'true'
|
|
||||||
|
|
||||||
# Ensure CHROME_BINARY points to Chromium
|
|
||||||
chromium = find_chromium_binary()
|
|
||||||
if chromium:
|
|
||||||
env['CHROME_BINARY'] = chromium
|
|
||||||
|
|
||||||
# Step 1: Install the uBlock extension
|
# Step 1: Install the uBlock extension
|
||||||
|
print("[test] Installing uBlock extension...", flush=True)
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['node', str(INSTALL_SCRIPT)],
|
['node', str(INSTALL_SCRIPT)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
env=env,
|
env=env,
|
||||||
timeout=15
|
timeout=5
|
||||||
)
|
)
|
||||||
|
print(f"[test] Extension install rc={result.returncode}", flush=True)
|
||||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||||
|
|
||||||
# Verify extension cache was created
|
# Verify extension cache was created
|
||||||
@@ -252,7 +287,8 @@ def test_extension_loads_in_chromium():
|
|||||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||||
|
|
||||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||||
crawl_dir = tmpdir / 'crawl'
|
data_dir = Path(env['DATA_DIR'])
|
||||||
|
crawl_dir = data_dir / 'crawl'
|
||||||
crawl_dir.mkdir()
|
crawl_dir.mkdir()
|
||||||
chrome_dir = crawl_dir / 'chrome'
|
chrome_dir = crawl_dir / 'chrome'
|
||||||
|
|
||||||
@@ -422,22 +458,11 @@ def test_blocks_ads_on_test_page():
|
|||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
tmpdir = Path(tmpdir)
|
tmpdir = Path(tmpdir)
|
||||||
|
|
||||||
# Set up isolated lib directories for this test
|
# Set up isolated env with proper directory structure
|
||||||
lib_env = setup_test_lib_dirs(tmpdir)
|
env = setup_test_env(tmpdir)
|
||||||
|
|
||||||
# Set up extensions directory
|
|
||||||
ext_dir = tmpdir / 'chrome_extensions'
|
|
||||||
ext_dir.mkdir(parents=True)
|
|
||||||
|
|
||||||
env = os.environ.copy()
|
|
||||||
env.update(lib_env)
|
|
||||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
|
||||||
env['CHROME_HEADLESS'] = 'true'
|
env['CHROME_HEADLESS'] = 'true'
|
||||||
|
|
||||||
# Ensure CHROME_BINARY points to Chromium
|
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
|
||||||
chromium = find_chromium_binary()
|
|
||||||
if chromium:
|
|
||||||
env['CHROME_BINARY'] = chromium
|
|
||||||
|
|
||||||
# Step 1: Install the uBlock extension
|
# Step 1: Install the uBlock extension
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
@@ -455,8 +480,9 @@ def test_blocks_ads_on_test_page():
|
|||||||
ext_data = json.loads(cache_file.read_text())
|
ext_data = json.loads(cache_file.read_text())
|
||||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||||
|
|
||||||
# Step 2: Launch Chrome using the chrome hook (loads extensions automatically)
|
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||||
crawl_dir = tmpdir / 'crawl'
|
data_dir = Path(env['DATA_DIR'])
|
||||||
|
crawl_dir = data_dir / 'crawl'
|
||||||
crawl_dir.mkdir()
|
crawl_dir.mkdir()
|
||||||
chrome_dir = crawl_dir / 'chrome'
|
chrome_dir = crawl_dir / 'chrome'
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user