mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 01:15:57 +10:00
Merge branch 'dev' into claude/tags-editor-widget-0Dq7f
This commit is contained in:
@@ -433,6 +433,103 @@ async function killChrome(pid, outputDir = null) {
|
||||
console.error('[*] Chrome process killed');
|
||||
}
|
||||
|
||||
/**
 * Install Chromium using the @puppeteer/browsers programmatic API.
 *
 * If CHROME_BINARY is configured and exists on disk, it is returned as-is and
 * nothing is downloaded. Otherwise Chromium is downloaded into the cache
 * directory and the path of the installed executable is returned.
 *
 * @param {Object} [options] - Install options
 * @param {string} [options.cacheDir] - Directory to install browsers into
 *     (default: ~/.cache/puppeteer)
 * @param {string} [options.buildId='latest'] - Chromium revision number, or a
 *     tag such as 'latest' that is resolved to a revision before downloading
 * @returns {Promise<Object>} - {success, binary, version, error}
 */
async function installChromium(options = {}) {
    const os = require('os');
    const {
        cacheDir = path.join(os.homedir(), '.cache', 'puppeteer'),
        buildId = 'latest',
    } = options;

    // Check if CHROME_BINARY is already set and valid
    const configuredBinary = getEnv('CHROME_BINARY');
    if (configuredBinary && fs.existsSync(configuredBinary)) {
        console.error(`[+] Using configured CHROME_BINARY: ${configuredBinary}`);
        return { success: true, binary: configuredBinary, version: null };
    }

    // Try to load @puppeteer/browsers from NODE_MODULES_DIR or system
    let puppeteerBrowsers;
    try {
        const nodeModulesDir = process.env.NODE_MODULES_DIR;
        // Only register the directory once so repeated calls do not grow module.paths
        if (nodeModulesDir && !module.paths.includes(nodeModulesDir)) {
            module.paths.unshift(nodeModulesDir);
        }
        puppeteerBrowsers = require('@puppeteer/browsers');
    } catch (e) {
        console.error(`[!] @puppeteer/browsers not found. Install it first with installPuppeteerCore.`);
        return { success: false, error: '@puppeteer/browsers not installed' };
    }

    console.error(`[*] Installing Chromium via @puppeteer/browsers...`);

    try {
        const platform = puppeteerBrowsers.detectBrowserPlatform();
        if (!platform) {
            throw new Error(`Unsupported platform: ${process.platform}/${process.arch}`);
        }

        // install() needs a concrete revision; tags like 'latest' must be resolved first
        const resolvedBuildId = /^\d+$/.test(String(buildId))
            ? String(buildId)
            : await puppeteerBrowsers.resolveBuildId('chromium', platform, buildId);

        // cacheDir is a required option of install(); omitting it makes the call throw
        const result = await puppeteerBrowsers.install({
            browser: 'chromium',
            buildId: resolvedBuildId,
            cacheDir,
            platform,
        });

        const binary = result.executablePath;
        const version = result.buildId;

        if (!binary || !fs.existsSync(binary)) {
            console.error(`[!] Chromium binary not found at: ${binary}`);
            return { success: false, error: `Chromium binary not found at: ${binary}` };
        }

        console.error(`[+] Chromium installed: ${binary}`);
        return { success: true, binary, version };
    } catch (e) {
        console.error(`[!] Failed to install Chromium: ${e.message}`);
        return { success: false, error: e.message };
    }
}
|
||||
|
||||
/**
 * Install the puppeteer-core npm package.
 *
 * Returns immediately when <npmPrefix>/node_modules/puppeteer-core already
 * exists; otherwise creates the prefix directory and runs `npm install` in it.
 *
 * @param {Object} [options] - Install options
 * @param {string} [options.npmPrefix] - npm prefix directory
 *     (default: <LIB_DIR>/npm, falling back to <DATA_DIR>/npm, then ./npm)
 * @param {number} [options.timeout=60000] - Timeout in milliseconds
 * @returns {Promise<Object>} - {success, path, error}
 */
async function installPuppeteerCore(options = {}) {
    const defaultPrefix = path.join(getEnv('LIB_DIR', getEnv('DATA_DIR', '.')), 'npm');
    const {
        npmPrefix = defaultPrefix,
        timeout = 60000,
    } = options;

    const nodeModulesDir = path.join(npmPrefix, 'node_modules');
    const puppeteerPath = path.join(nodeModulesDir, 'puppeteer-core');

    // Check if already installed
    if (fs.existsSync(puppeteerPath)) {
        console.error(`[+] puppeteer-core already installed: ${puppeteerPath}`);
        return { success: true, path: puppeteerPath };
    }

    console.error(`[*] Installing puppeteer-core to ${npmPrefix}...`);

    try {
        // recursive mkdir is a no-op when the directory already exists
        fs.mkdirSync(npmPrefix, { recursive: true });

        // Pass arguments as an array (no shell) so quotes, spaces or `$` in the
        // prefix path cannot break or inject into the command line.
        const { execFileSync } = require('child_process');
        execFileSync(
            'npm',
            ['install', '--prefix', npmPrefix, 'puppeteer-core'],
            { encoding: 'utf8', timeout, stdio: ['pipe', 'pipe', 'pipe'] }
        );
        console.error(`[+] puppeteer-core installed successfully`);
        return { success: true, path: puppeteerPath };
    } catch (e) {
        console.error(`[!] Failed to install puppeteer-core: ${e.message}`);
        return { success: false, error: e.message };
    }
}
|
||||
|
||||
// Try to import unzipper, fallback to system unzip if not available
|
||||
let unzip = null;
|
||||
try {
|
||||
@@ -932,78 +1029,88 @@ function getExtensionTargets(browser) {
|
||||
|
||||
/**
|
||||
* Find Chromium/Chrome binary path.
|
||||
* Prefers Chromium over Chrome because Chrome 137+ removed --load-extension support.
|
||||
* Checks CHROME_BINARY env var first, then falls back to system locations.
|
||||
*
|
||||
* @param {string} [dataDir] - Data directory to check for puppeteer installs
|
||||
* @returns {string|null} - Absolute path to browser binary or null if not found
|
||||
*/
|
||||
function findChromium(dataDir = null) {
|
||||
// Check CHROME_BINARY env var first
|
||||
const chromeBinary = (process.env.CHROME_BINARY || '').trim();
|
||||
if (chromeBinary && fs.existsSync(chromeBinary)) {
|
||||
// Ensure absolute path
|
||||
return path.resolve(chromeBinary);
|
||||
function findChromium() {
|
||||
const { execSync } = require('child_process');
|
||||
|
||||
// Helper to validate a binary by running --version
|
||||
const validateBinary = (binaryPath) => {
|
||||
if (!binaryPath || !fs.existsSync(binaryPath)) return false;
|
||||
try {
|
||||
execSync(`"${binaryPath}" --version`, { encoding: 'utf8', timeout: 5000, stdio: 'pipe' });
|
||||
return true;
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
// 1. Check CHROME_BINARY env var first
|
||||
const chromeBinary = getEnv('CHROME_BINARY');
|
||||
if (chromeBinary) {
|
||||
const absPath = path.resolve(chromeBinary);
|
||||
if (validateBinary(absPath)) {
|
||||
return absPath;
|
||||
}
|
||||
console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`);
|
||||
}
|
||||
|
||||
// 2. Warn that no CHROME_BINARY is configured, searching fallbacks
|
||||
if (!chromeBinary) {
|
||||
console.error('[!] Warning: CHROME_BINARY not set, searching system locations...');
|
||||
}
|
||||
|
||||
// Helper to find Chromium in @puppeteer/browsers directory structure
|
||||
// Always returns absolute paths
|
||||
const findInPuppeteerDir = (baseDir) => {
|
||||
const absBaseDir = path.resolve(baseDir);
|
||||
if (!fs.existsSync(absBaseDir)) return null;
|
||||
if (!fs.existsSync(baseDir)) return null;
|
||||
try {
|
||||
const versions = fs.readdirSync(absBaseDir);
|
||||
const versions = fs.readdirSync(baseDir);
|
||||
for (const version of versions.sort().reverse()) {
|
||||
const versionDir = path.join(absBaseDir, version);
|
||||
// Check for macOS ARM structure
|
||||
const macArmBinary = path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium');
|
||||
if (fs.existsSync(macArmBinary)) return macArmBinary;
|
||||
// Check for macOS x64 structure
|
||||
const macX64Binary = path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium');
|
||||
if (fs.existsSync(macX64Binary)) return macX64Binary;
|
||||
// Check for Linux structure
|
||||
const linuxBinary = path.join(versionDir, 'chrome-linux/chrome');
|
||||
if (fs.existsSync(linuxBinary)) return linuxBinary;
|
||||
const versionDir = path.join(baseDir, version);
|
||||
const candidates = [
|
||||
path.join(versionDir, 'chrome-mac-arm64/Chromium.app/Contents/MacOS/Chromium'),
|
||||
path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'),
|
||||
path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'),
|
||||
path.join(versionDir, 'chrome-linux64/chrome'),
|
||||
path.join(versionDir, 'chrome-linux/chrome'),
|
||||
];
|
||||
for (const c of candidates) {
|
||||
if (fs.existsSync(c)) return c;
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
// Continue
|
||||
}
|
||||
} catch (e) {}
|
||||
return null;
|
||||
};
|
||||
|
||||
// Check @puppeteer/browsers install locations
|
||||
const puppeteerDirs = [
|
||||
// Local project install (from npx @puppeteer/browsers install)
|
||||
path.join(dataDir || process.env.DATA_DIR || '.', 'chromium'),
|
||||
path.join(process.cwd(), 'chromium'),
|
||||
// User cache locations
|
||||
path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
|
||||
];
|
||||
|
||||
for (const puppeteerDir of puppeteerDirs) {
|
||||
const binary = findInPuppeteerDir(puppeteerDir);
|
||||
if (binary) return binary;
|
||||
}
|
||||
|
||||
// Check standard system locations
|
||||
const candidates = [
|
||||
// Linux Chromium
|
||||
// 3. Search fallback locations (Chromium first, then Chrome)
|
||||
const fallbackLocations = [
|
||||
// System Chromium
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
'/usr/bin/chromium',
|
||||
'/usr/bin/chromium-browser',
|
||||
// macOS Chromium (Homebrew or manual install)
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
// Fallback to Chrome (extension loading may not work in Chrome 137+)
|
||||
// Puppeteer cache
|
||||
path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
|
||||
path.join(process.env.HOME || '', '.cache/puppeteer'),
|
||||
// Chrome (fallback - extensions may not work in 137+)
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/usr/bin/google-chrome',
|
||||
'/usr/bin/google-chrome-stable',
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
];
|
||||
|
||||
for (const candidate of candidates) {
|
||||
if (fs.existsSync(candidate)) {
|
||||
// Warn if falling back to Chrome
|
||||
if (candidate.includes('google-chrome') || candidate.includes('Google Chrome')) {
|
||||
for (const loc of fallbackLocations) {
|
||||
// Check if it's a puppeteer cache dir
|
||||
if (loc.includes('.cache/puppeteer')) {
|
||||
const binary = findInPuppeteerDir(loc);
|
||||
if (binary && validateBinary(binary)) {
|
||||
return binary;
|
||||
}
|
||||
} else if (validateBinary(loc)) {
|
||||
if (loc.includes('Google Chrome') || loc.includes('google-chrome')) {
|
||||
console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
|
||||
}
|
||||
return candidate;
|
||||
return loc;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1028,6 +1135,9 @@ module.exports = {
|
||||
// Chrome launching
|
||||
launchChromium,
|
||||
killChrome,
|
||||
// Chrome/Chromium install
|
||||
installChromium,
|
||||
installPuppeteerCore,
|
||||
// Chrome/Chromium binary finding
|
||||
findChromium,
|
||||
// Extension utilities
|
||||
@@ -1055,7 +1165,9 @@ if (require.main === module) {
|
||||
console.log('Usage: chrome_utils.js <command> [args...]');
|
||||
console.log('');
|
||||
console.log('Commands:');
|
||||
console.log(' findChromium [data_dir]');
|
||||
console.log(' findChromium');
|
||||
console.log(' installChromium');
|
||||
console.log(' installPuppeteerCore [npm_prefix]');
|
||||
console.log(' launchChromium [output_dir] [extension_paths_json]');
|
||||
console.log(' killChrome <pid> [output_dir]');
|
||||
console.log(' killZombieChrome [data_dir]');
|
||||
@@ -1072,8 +1184,7 @@ if (require.main === module) {
|
||||
try {
|
||||
switch (command) {
|
||||
case 'findChromium': {
|
||||
const [dataDir] = commandArgs;
|
||||
const binary = findChromium(dataDir);
|
||||
const binary = findChromium();
|
||||
if (binary) {
|
||||
console.log(binary);
|
||||
} else {
|
||||
@@ -1083,6 +1194,32 @@ if (require.main === module) {
|
||||
break;
|
||||
}
|
||||
|
||||
case 'installChromium': {
|
||||
const result = await installChromium();
|
||||
if (result.success) {
|
||||
console.log(JSON.stringify({
|
||||
binary: result.binary,
|
||||
version: result.version,
|
||||
}));
|
||||
} else {
|
||||
console.error(result.error);
|
||||
process.exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case 'installPuppeteerCore': {
|
||||
const [npmPrefix] = commandArgs;
|
||||
const result = await installPuppeteerCore({ npmPrefix: npmPrefix || undefined });
|
||||
if (result.success) {
|
||||
console.log(JSON.stringify({ path: result.path }));
|
||||
} else {
|
||||
console.error(result.error);
|
||||
process.exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case 'launchChromium': {
|
||||
const [outputDir, extensionPathsJson] = commandArgs;
|
||||
const extensionPaths = extensionPathsJson ? JSON.parse(extensionPathsJson) : [];
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install hook for Chrome/Chromium binary.
|
||||
Install hook for Chrome/Chromium and puppeteer-core.
|
||||
|
||||
Runs at crawl start to verify Chromium is available.
|
||||
Runs at crawl start to install/find Chromium and puppeteer-core.
|
||||
Outputs JSONL for Binary and Machine config updates.
|
||||
Respects CHROME_BINARY env var for custom binary paths.
|
||||
Falls back to `npx @puppeteer/browsers install chromium@latest` if not found.
|
||||
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
|
||||
|
||||
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
--load-extension and --disable-extensions-except flags, which are needed for
|
||||
@@ -16,73 +16,139 @@ import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def install_chromium_via_puppeteer() -> bool:
|
||||
"""Install Chromium using @puppeteer/browsers."""
|
||||
def get_chrome_version(binary_path: str) -> str | None:
|
||||
"""Get Chrome/Chromium version string."""
|
||||
try:
|
||||
print("Chromium not found, attempting to install via @puppeteer/browsers...", file=sys.stderr)
|
||||
result = subprocess.run(
|
||||
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
|
||||
[binary_path, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
timeout=5
|
||||
)
|
||||
return result.returncode == 0
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e:
|
||||
print(f"Failed to install Chromium: {e}", file=sys.stderr)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def install_puppeteer_core() -> bool:
    """Ensure puppeteer-core is present under NODE_MODULES_DIR.

    Returns True when NODE_MODULES_DIR is unset or blank (nothing to do), when
    a puppeteer-core entry already exists inside it, or when `npm install`
    exits with status 0. Returns False when npm fails or cannot be run.
    """
    configured_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
    if not configured_dir:
        # No isolated node_modules configured, so there is nothing to install here
        return True

    modules_root = Path(configured_dir)
    if (modules_root / 'puppeteer-core').exists():
        return True

    # npm's --prefix is the directory that contains node_modules
    prefix_dir = modules_root.parent
    npm_command = [
        'npm', 'install',
        '--prefix', str(prefix_dir),
        'puppeteer-core', '@puppeteer/browsers',
    ]

    try:
        print(f"[*] Installing puppeteer-core to {prefix_dir}...", file=sys.stderr)
        completed = subprocess.run(
            npm_command,
            capture_output=True,
            text=True,
            timeout=60,
        )
        if completed.returncode != 0:
            print(f"[!] Failed to install puppeteer-core: {completed.stderr}", file=sys.stderr)
            return False
        print("[+] puppeteer-core installed", file=sys.stderr)
        return True
    except Exception as e:
        print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
        return False
|
||||
|
||||
|
||||
def find_chromium() -> dict | None:
|
||||
"""Find Chromium binary, respecting CHROME_BINARY env var."""
|
||||
# Quick check: if CHROME_BINARY is set and exists, skip expensive lookup
|
||||
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
|
||||
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
|
||||
# Binary is already configured and valid - exit immediately
|
||||
sys.exit(0)
|
||||
def install_chromium() -> dict | None:
|
||||
"""Install Chromium using @puppeteer/browsers and parse output for binary path.
|
||||
|
||||
Output format: "chromium@<version> <path_to_binary>"
|
||||
e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"
|
||||
|
||||
Note: npx is fast when chromium is already cached - it returns the path without re-downloading.
|
||||
"""
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BrewProvider, AptProvider
|
||||
print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
|
||||
|
||||
# Try to find chromium using abx-pkg
|
||||
# Prefer chromium over chrome because Chrome 137+ removed --load-extension support
|
||||
binary = Binary(
|
||||
name='chromium',
|
||||
binproviders=[NpmProvider(), EnvProvider(), BrewProvider(), AptProvider()],
|
||||
overrides={'npm': {'packages': ['@puppeteer/browsers']}}
|
||||
# Use --path to install to puppeteer's standard cache location
|
||||
cache_path = os.path.expanduser('~/.cache/puppeteer')
|
||||
|
||||
result = subprocess.run(
|
||||
['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
stdin=subprocess.DEVNULL,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'chromium',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
if result.returncode != 0:
|
||||
print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# If not found, try to install via @puppeteer/browsers
|
||||
if install_chromium_via_puppeteer():
|
||||
# Try loading again after install
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'chromium',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'npm',
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
# Parse output: "chromium@1563294 /path/to/Chromium"
|
||||
output = result.stdout.strip()
|
||||
parts = output.split(' ', 1)
|
||||
if len(parts) != 2:
|
||||
print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
version_str = parts[0] # "chromium@1563294"
|
||||
binary_path = parts[1].strip()
|
||||
|
||||
if not binary_path or not os.path.exists(binary_path):
|
||||
print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Extract version number
|
||||
version = version_str.split('@')[1] if '@' in version_str else None
|
||||
|
||||
print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
|
||||
|
||||
return {
|
||||
'name': 'chromium',
|
||||
'abspath': binary_path,
|
||||
'version': version,
|
||||
'binprovider': 'puppeteer',
|
||||
}
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
print("[!] Chromium install timed out", file=sys.stderr)
|
||||
except FileNotFoundError:
|
||||
print("[!] npx not found - is Node.js installed?", file=sys.stderr)
|
||||
except Exception as e:
|
||||
print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
result = find_chromium()
|
||||
# Install puppeteer-core if NODE_MODULES_DIR is set
|
||||
install_puppeteer_core()
|
||||
|
||||
# Check if CHROME_BINARY is already set and valid
|
||||
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
|
||||
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
|
||||
version = get_chrome_version(configured_binary)
|
||||
print(json.dumps({
|
||||
'type': 'Binary',
|
||||
'name': 'chromium',
|
||||
'abspath': configured_binary,
|
||||
'version': version,
|
||||
'binprovider': 'env',
|
||||
}))
|
||||
sys.exit(0)
|
||||
|
||||
# Install/find Chromium via puppeteer
|
||||
result = install_chromium()
|
||||
|
||||
if result and result.get('abspath'):
|
||||
print(json.dumps({
|
||||
@@ -110,7 +176,7 @@ def main():
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(f"Chromium binary not found", file=sys.stderr)
|
||||
print("Chromium binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
|
||||
@@ -67,28 +67,29 @@ def get_test_env():
|
||||
return env
|
||||
|
||||
|
||||
def find_chromium_binary():
|
||||
"""Find the Chromium binary installed by @puppeteer/browsers."""
|
||||
if not CHROMIUM_INSTALL_DIR.exists():
|
||||
return None
|
||||
def find_chromium_binary(data_dir=None):
|
||||
"""Find the Chromium binary using chrome_utils.js findChromium().
|
||||
|
||||
# Look for versioned directories
|
||||
for version_dir in sorted(CHROMIUM_INSTALL_DIR.iterdir(), reverse=True):
|
||||
if not version_dir.is_dir():
|
||||
continue
|
||||
# macOS ARM
|
||||
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
||||
if mac_arm.exists():
|
||||
return str(mac_arm)
|
||||
# macOS x64
|
||||
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
||||
if mac_x64.exists():
|
||||
return str(mac_x64)
|
||||
# Linux
|
||||
linux = version_dir / 'chrome-linux' / 'chrome'
|
||||
if linux.exists():
|
||||
return str(linux)
|
||||
This uses the centralized findChromium() function which checks:
|
||||
- CHROME_BINARY env var
|
||||
- @puppeteer/browsers install locations (in data_dir/chromium)
|
||||
- System Chromium locations
|
||||
- Falls back to Chrome (with warning)
|
||||
|
||||
Args:
|
||||
data_dir: Directory where chromium was installed (contains chromium/ subdir)
|
||||
"""
|
||||
chrome_utils = PLUGIN_DIR / 'chrome_utils.js'
|
||||
# Use provided data_dir, or fall back to env var, or current dir
|
||||
search_dir = data_dir or os.environ.get('DATA_DIR', '.')
|
||||
result = subprocess.run(
|
||||
['node', str(chrome_utils), 'findChromium', str(search_dir)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
return result.stdout.strip()
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@@ -171,31 +171,30 @@ def setup_test_lib_dirs(tmpdir: Path) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def find_chromium_binary():
|
||||
"""Find the Chromium binary installed by @puppeteer/browsers."""
|
||||
chromium_dir = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
||||
if not chromium_dir.exists():
|
||||
return None
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
|
||||
for version_dir in sorted(chromium_dir.iterdir(), reverse=True):
|
||||
if not version_dir.is_dir():
|
||||
continue
|
||||
# macOS ARM
|
||||
mac_arm = version_dir / 'chrome-mac' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
||||
if mac_arm.exists():
|
||||
return str(mac_arm)
|
||||
# macOS x64
|
||||
mac_x64 = version_dir / 'chrome-mac-x64' / 'Chromium.app' / 'Contents' / 'MacOS' / 'Chromium'
|
||||
if mac_x64.exists():
|
||||
return str(mac_x64)
|
||||
# Linux
|
||||
linux = version_dir / 'chrome-linux' / 'chrome'
|
||||
if linux.exists():
|
||||
return str(linux)
|
||||
|
||||
def find_chromium_binary():
|
||||
"""Find the Chromium binary using chrome_utils.js findChromium().
|
||||
|
||||
This uses the centralized findChromium() function which checks:
|
||||
- CHROME_BINARY env var
|
||||
- @puppeteer/browsers install locations
|
||||
- System Chromium locations
|
||||
- Falls back to Chrome (with warning)
|
||||
"""
|
||||
chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js'
|
||||
result = subprocess.run(
|
||||
['node', str(chrome_utils), 'findChromium'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
return result.stdout.strip()
|
||||
return None
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
|
||||
TEST_URL = 'https://www.filmin.es/'
|
||||
|
||||
@@ -157,54 +157,94 @@ def test_large_extension_size():
|
||||
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
||||
|
||||
|
||||
def setup_test_lib_dirs(tmpdir: Path) -> dict:
    """Return the lib-dir env overrides for tests, reusing the project's node_modules.

    Pointing NODE_MODULES_DIR at the existing project-level node_modules avoids
    a slow npm install during tests. The test is skipped when puppeteer-core is
    not present there. `tmpdir` is accepted but not used.
    """
    # Walk five levels up from this test file to reach the project root
    repo_root = Path(__file__)
    for _ in range(5):
        repo_root = repo_root.parent

    shared_modules = repo_root / 'node_modules'
    has_puppeteer = (shared_modules / 'puppeteer-core').exists()
    if not has_puppeteer:
        pytest.skip("puppeteer-core not installed in project node_modules")

    return {'NODE_MODULES_DIR': str(shared_modules)}
|
||||
|
||||
|
||||
def find_chromium_binary():
    """Locate a Chromium executable under <DATA_DIR>/chromium.

    DATA_DIR is read from the environment (default '.'). Version directories
    are visited in reverse-sorted order, and within each one the known layouts
    are probed in a fixed order. Returns the first existing path as a string,
    or None when nothing is found.
    """
    install_root = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
    if not install_root.exists():
        return None

    # Relative executable locations, in probe order
    layouts = (
        ('chrome-mac', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),      # macOS ARM
        ('chrome-mac-x64', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'),  # macOS x64
        ('chrome-linux', 'chrome'),                                           # Linux
    )

    for entry in sorted(install_root.iterdir(), reverse=True):
        if not entry.is_dir():
            continue
        for parts in layouts:
            candidate = entry.joinpath(*parts)
            if candidate.exists():
                return str(candidate)
    return None
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
    """Set up an isolated data/lib directory structure for tests.

    Creates structure like:
        <tmpdir>/data/
            lib/
                arm64-darwin/   (or x86_64-linux, etc.)
                    npm/
                        bin/
                        node_modules/
            chrome_extensions/

    Then runs the chrome install hook with that environment and reads the
    first `Binary` record with an `abspath` from its JSONL stdout.

    The calling test is skipped (not failed) when the hook exits non-zero,
    times out, or reports a binary that does not exist on disk.

    Args:
        tmpdir: Temporary directory under which `data/` is created.

    Returns:
        A copy of os.environ extended with DATA_DIR, LIB_DIR, MACHINE_TYPE,
        NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_EXTENSIONS_DIR and CHROME_BINARY.
    """
    import platform
    import sys

    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"

    # Create proper directory structure
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / 'bin'
    node_modules_dir = npm_dir / 'node_modules'
    chrome_extensions_dir = data_dir / 'chrome_extensions'

    # Create all directories
    for directory in (node_modules_dir, npm_bin_dir, chrome_extensions_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
    })

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # Use the interpreter running the tests: a bare 'python' may be missing from
    # PATH or point at an interpreter outside the active virtualenv.
    try:
        result = subprocess.run(
            [sys.executable, str(CHROME_INSTALL_HOOK)],
            capture_output=True, text=True, timeout=10, env=env
        )
    except subprocess.TimeoutExpired:
        # A slow download is an environment problem, not a test failure
        pytest.skip("Chrome install hook timed out")
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            # Ignore any non-JSON noise the hook printed on stdout
            continue
        if data.get('type') == 'Binary' and data.get('abspath'):
            chrome_binary = data['abspath']
            break

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
|
||||
|
||||
|
||||
# Test URL: ad blocker test page that shows if ads are blocked
|
||||
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
|
||||
|
||||
|
||||
@pytest.mark.timeout(15)
|
||||
def test_extension_loads_in_chromium():
|
||||
"""Verify uBlock extension loads in Chromium by visiting its dashboard page.
|
||||
|
||||
@@ -214,45 +254,44 @@ def test_extension_loads_in_chromium():
|
||||
"""
|
||||
import signal
|
||||
import time
|
||||
print("[test] Starting test_extension_loads_in_chromium", flush=True)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
print(f"[test] tmpdir={tmpdir}", flush=True)
|
||||
|
||||
# Set up isolated lib directories for this test
|
||||
lib_env = setup_test_lib_dirs(tmpdir)
|
||||
# Set up isolated env with proper directory structure
|
||||
env = setup_test_env(tmpdir)
|
||||
env.setdefault('CHROME_HEADLESS', 'true')
|
||||
print(f"[test] DATA_DIR={env.get('DATA_DIR')}", flush=True)
|
||||
print(f"[test] CHROME_BINARY={env.get('CHROME_BINARY')}", flush=True)
|
||||
|
||||
# Set up extensions directory
|
||||
ext_dir = tmpdir / 'chrome_extensions'
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update(lib_env)
|
||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Ensure CHROME_BINARY points to Chromium
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
|
||||
|
||||
# Step 1: Install the uBlock extension
|
||||
print("[test] Installing uBlock extension...", flush=True)
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=15
|
||||
timeout=5
|
||||
)
|
||||
print(f"[test] Extension install rc={result.returncode}", flush=True)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
# Verify extension cache was created
|
||||
cache_file = ext_dir / 'ublock.extension.json'
|
||||
assert cache_file.exists(), "Extension cache not created"
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
print(f"[test] Extension installed: {ext_data.get('name')} v{ext_data.get('version')}", flush=True)
|
||||
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
crawl_dir = tmpdir / 'crawl'
|
||||
print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True)
|
||||
print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True)
|
||||
print("[test] Launching Chromium...", flush=True)
|
||||
data_dir = Path(env['DATA_DIR'])
|
||||
crawl_dir = data_dir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
|
||||
@@ -264,28 +303,32 @@ def test_extension_loads_in_chromium():
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
print("[test] Chrome hook started, waiting for CDP...", flush=True)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(10):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
import select
|
||||
for i in range(20):
|
||||
poll_result = chrome_launch_process.poll()
|
||||
if poll_result is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
raise RuntimeError(f"Chromium launch failed (exit={poll_result}):\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
print(f"[test] CDP URL found after {i+1} attempts", flush=True)
|
||||
break
|
||||
time.sleep(0.5)
|
||||
# Read any available stderr
|
||||
while select.select([chrome_launch_process.stderr], [], [], 0)[0]:
|
||||
line = chrome_launch_process.stderr.readline()
|
||||
if not line:
|
||||
break
|
||||
print(f"[hook] {line.strip()}", flush=True)
|
||||
time.sleep(0.3)
|
||||
|
||||
assert cdp_url, "Chromium CDP URL not found after 20s"
|
||||
print(f"Chromium launched with CDP URL: {cdp_url}")
|
||||
|
||||
# Print chrome hook stderr for debugging
|
||||
# Read what's available without blocking
|
||||
import select
|
||||
if select.select([chrome_launch_process.stderr], [], [], 0.1)[0]:
|
||||
chrome_stderr = chrome_launch_process.stderr.read()
|
||||
print(f"Chrome hook stderr:\n{chrome_stderr}")
|
||||
print(f"[test] Chromium launched with CDP URL: {cdp_url}", flush=True)
|
||||
print("[test] Reading hook stderr...", flush=True)
|
||||
|
||||
# Check what extensions were loaded by chrome hook
|
||||
extensions_file = chrome_dir / 'extensions.json'
|
||||
@@ -297,7 +340,8 @@ def test_extension_loads_in_chromium():
|
||||
|
||||
# Get the unpacked extension ID - Chrome computes this from the path
|
||||
unpacked_path = ext_data.get('unpacked_path', '')
|
||||
print(f"Extension unpacked path: {unpacked_path}")
|
||||
print(f"[test] Extension unpacked path: {unpacked_path}", flush=True)
|
||||
print("[test] Running puppeteer test script...", flush=True)
|
||||
|
||||
try:
|
||||
# Step 3: Connect to Chromium and verify extension loads
|
||||
@@ -310,7 +354,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
|
||||
// Wait for extension to initialize
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
|
||||
// Use CDP to get all targets including service workers
|
||||
const pages = await browser.pages();
|
||||
@@ -422,22 +466,11 @@ def test_blocks_ads_on_test_page():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated lib directories for this test
|
||||
lib_env = setup_test_lib_dirs(tmpdir)
|
||||
|
||||
# Set up extensions directory
|
||||
ext_dir = tmpdir / 'chrome_extensions'
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update(lib_env)
|
||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
# Set up isolated env with proper directory structure
|
||||
env = setup_test_env(tmpdir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Ensure CHROME_BINARY points to Chromium
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
|
||||
|
||||
# Step 1: Install the uBlock extension
|
||||
result = subprocess.run(
|
||||
@@ -455,8 +488,9 @@ def test_blocks_ads_on_test_page():
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# Step 2: Launch Chrome using the chrome hook (loads extensions automatically)
|
||||
crawl_dir = tmpdir / 'crawl'
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
data_dir = Path(env['DATA_DIR'])
|
||||
crawl_dir = data_dir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
|
||||
@@ -500,7 +534,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
|
||||
// Wait for extension to initialize
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
|
||||
// Check extension loaded by looking at targets
|
||||
const targets = browser.targets();
|
||||
|
||||
Reference in New Issue
Block a user