diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index b4370fde..9dac6599 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -1333,6 +1333,83 @@ function getExtensionsDir() { path.join(dataDir, 'personas', persona, 'chrome_extensions'); } +/** + * Get machine type string for platform-specific paths. + * Matches Python's archivebox.config.paths.get_machine_type() + * + * @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin') + */ +function getMachineType() { + if (process.env.MACHINE_TYPE) { + return process.env.MACHINE_TYPE; + } + + let machine = process.arch; + const system = process.platform; + + // Normalize machine type to match Python's convention + if (machine === 'arm64' || machine === 'aarch64') { + machine = 'arm64'; + } else if (machine === 'x64' || machine === 'x86_64' || machine === 'amd64') { + machine = 'x86_64'; + } else if (machine === 'ia32' || machine === 'x86') { + machine = 'x86'; + } + + return `${machine}-${system}`; +} + +/** + * Get LIB_DIR path for platform-specific binaries. + * Returns DATA_DIR/lib/MACHINE_TYPE/ + * + * @returns {string} - Absolute path to lib directory + */ +function getLibDir() { + if (process.env.LIB_DIR) { + return process.env.LIB_DIR; + } + const dataDir = getEnv('DATA_DIR', './data'); + const machineType = getMachineType(); + return path.join(dataDir, 'lib', machineType); +} + +/** + * Get NODE_MODULES_DIR path for npm packages. + * Returns LIB_DIR/npm/node_modules/ + * + * @returns {string} - Absolute path to node_modules directory + */ +function getNodeModulesDir() { + if (process.env.NODE_MODULES_DIR) { + return process.env.NODE_MODULES_DIR; + } + return path.join(getLibDir(), 'npm', 'node_modules'); +} + +/** + * Get all test environment paths as a JSON object. + * This is the single source of truth for path calculations - Python calls this + * to avoid duplicating path logic. + * + * @returns {Object} - Object with all test environment paths + */ +function getTestEnv() { + const dataDir = getEnv('DATA_DIR', './data'); + const machineType = getMachineType(); + const libDir = getLibDir(); + const nodeModulesDir = getNodeModulesDir(); + + return { + DATA_DIR: dataDir, + MACHINE_TYPE: machineType, + LIB_DIR: libDir, + NODE_MODULES_DIR: nodeModulesDir, + NPM_BIN_DIR: path.join(libDir, 'npm', '.bin'), + CHROME_EXTENSIONS_DIR: getExtensionsDir(), + }; +} + /** * Install a Chrome extension with caching support. * @@ -1442,8 +1519,13 @@ module.exports = { getExtensionPaths, waitForExtensionTarget, getExtensionTargets, - // Shared extension installer utilities + // Shared path utilities (single source of truth for Python/JS) + getMachineType, + getLibDir, + getNodeModulesDir, getExtensionsDir, + getTestEnv, + // Shared extension installer utilities installExtensionWithCache, // Deprecated - use enableExtensions option instead getExtensionLaunchArgs, @@ -1457,18 +1539,31 @@ if (require.main === module) { console.log('Usage: chrome_utils.js [args...]'); console.log(''); console.log('Commands:'); - console.log(' findChromium'); - console.log(' installChromium'); - console.log(' installPuppeteerCore [npm_prefix]'); - console.log(' launchChromium [output_dir] [extension_paths_json]'); - console.log(' killChrome [output_dir]'); - console.log(' killZombieChrome [data_dir]'); - console.log(' getExtensionId '); - console.log(' loadExtensionManifest '); - console.log(' getExtensionLaunchArgs '); - console.log(' loadOrInstallExtension [extensions_dir]'); - console.log(' getExtensionsDir'); - console.log(' installExtensionWithCache '); + console.log(' findChromium Find Chrome/Chromium binary'); + console.log(' installChromium Install Chromium via @puppeteer/browsers'); + console.log(' installPuppeteerCore Install puppeteer-core npm package'); + console.log(' launchChromium Launch Chrome with CDP debugging'); + console.log(' killChrome Kill Chrome process by PID'); + console.log(' killZombieChrome Clean up zombie Chrome processes'); + console.log(''); + console.log(' getMachineType Get machine type (e.g., x86_64-linux)'); + console.log(' getLibDir Get LIB_DIR path'); + console.log(' getNodeModulesDir Get NODE_MODULES_DIR path'); + console.log(' getExtensionsDir Get Chrome extensions directory'); + console.log(' getTestEnv Get all paths as JSON (for tests)'); + console.log(''); + console.log(' getExtensionId Get extension ID from unpacked path'); + console.log(' loadExtensionManifest Load extension manifest.json'); + console.log(' loadOrInstallExtension Load or install an extension'); + console.log(' installExtensionWithCache Install extension with caching'); + console.log(''); + console.log('Environment variables:'); + console.log(' DATA_DIR Base data directory'); + console.log(' LIB_DIR Library directory (computed if not set)'); + console.log(' MACHINE_TYPE Machine type override'); + console.log(' NODE_MODULES_DIR Node modules directory'); + console.log(' CHROME_BINARY Chrome binary path'); + console.log(' CHROME_EXTENSIONS_DIR Extensions directory'); process.exit(1); } @@ -1581,11 +1676,31 @@ if (require.main === module) { break; } + case 'getMachineType': { + console.log(getMachineType()); + break; + } + + case 'getLibDir': { + console.log(getLibDir()); + break; + } + + case 'getNodeModulesDir': { + console.log(getNodeModulesDir()); + break; + } + case 'getExtensionsDir': { console.log(getExtensionsDir()); break; } + case 'getTestEnv': { + console.log(JSON.stringify(getTestEnv(), null, 2)); + break; + } + case 'installExtensionWithCache': { const [webstore_id, name] = commandArgs; if (!webstore_id || !name) { diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py index 4de09796..ee28cf4d 100644 --- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py +++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py @@ -321,6 +321,51 @@ def run_hook_and_parse( return returncode, result, stderr +def call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]: + """Call chrome_utils.js CLI command. + + This is the central dispatch for calling the JS utilities from Python. + All path calculations and Chrome operations are centralized in chrome_utils.js + to ensure consistency between Python and JavaScript code. + + Args: + command: The CLI command (e.g., 'findChromium', 'getTestEnv') + *args: Additional command arguments + env: Environment dict (default: current env) + + Returns: + Tuple of (returncode, stdout, stderr) + """ + cmd = ['node', str(CHROME_UTILS), command] + list(args) + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=30, + env=env or os.environ.copy() + ) + return result.returncode, result.stdout, result.stderr + + +def get_test_env_from_js() -> Optional[Dict[str, str]]: + """Get test environment paths from chrome_utils.js getTestEnv(). + + This is the single source of truth for path calculations. + Python calls JS to get all paths to avoid duplicating logic. + + Returns: + Dict with DATA_DIR, MACHINE_TYPE, LIB_DIR, NODE_MODULES_DIR, etc. + or None if the JS call fails + """ + returncode, stdout, stderr = call_chrome_utils('getTestEnv') + if returncode == 0 and stdout.strip(): + try: + return json.loads(stdout) + except json.JSONDecodeError: + pass + return None + + def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]: """Find the Chromium binary using chrome_utils.js findChromium(). @@ -336,15 +381,12 @@ def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]: Returns: Path to Chromium binary or None if not found """ - search_dir = data_dir or os.environ.get('DATA_DIR', '.') - result = subprocess.run( - ['node', str(CHROME_UTILS), 'findChromium', str(search_dir)], - capture_output=True, - text=True, - timeout=10 - ) - if result.returncode == 0 and result.stdout.strip(): - return result.stdout.strip() + env = os.environ.copy() + if data_dir: + env['DATA_DIR'] = str(data_dir) + returncode, stdout, stderr = call_chrome_utils('findChromium', env=env) + if returncode == 0 and stdout.strip(): + return stdout.strip() return None @@ -358,21 +400,52 @@ def get_extensions_dir() -> str: Returns: Path to extensions directory """ - result = subprocess.run( - ['node', str(CHROME_UTILS), 'getExtensionsDir'], - capture_output=True, - text=True, - timeout=10, - env=get_test_env() - ) - if result.returncode == 0 and result.stdout.strip(): - return result.stdout.strip() + returncode, stdout, stderr = call_chrome_utils('getExtensionsDir') + if returncode == 0 and stdout.strip(): + return stdout.strip() # Fallback to default computation if JS call fails data_dir = os.environ.get('DATA_DIR', './data') persona = os.environ.get('ACTIVE_PERSONA', 'Default') return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions') +def get_machine_type_from_js() -> Optional[str]: + """Get machine type from chrome_utils.js getMachineType(). + + This is the single source of truth for machine type calculation. + Returns values like 'x86_64-linux', 'arm64-darwin'. + + Returns: + Machine type string or None if the JS call fails + """ + returncode, stdout, stderr = call_chrome_utils('getMachineType') + if returncode == 0 and stdout.strip(): + return stdout.strip() + return None + + +def kill_chrome_via_js(pid: int, output_dir: Optional[str] = None) -> bool: + """Kill a Chrome process using chrome_utils.js killChrome(). + + This uses the centralized kill logic which handles: + - SIGTERM then SIGKILL + - Process group killing + - Zombie process cleanup + + Args: + pid: Process ID to kill + output_dir: Optional chrome output directory for PID file cleanup + + Returns: + True if the kill command succeeded + """ + args = [str(pid)] + if output_dir: + args.append(str(output_dir)) + returncode, stdout, stderr = call_chrome_utils('killChrome', *args) + return returncode == 0 + + # ============================================================================= # Extension Test Helpers # Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha) @@ -535,21 +608,26 @@ def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None: """Clean up Chromium process launched by launch_chromium_session. + Uses chrome_utils.js killChrome for proper process group handling. + Args: chrome_launch_process: The Popen object from launch_chromium_session chrome_dir: The chrome directory containing chrome.pid """ + # First try to terminate the launch process gracefully try: chrome_launch_process.send_signal(signal.SIGTERM) chrome_launch_process.wait(timeout=5) except Exception: pass + + # Read PID and use JS to kill with proper cleanup chrome_pid_file = chrome_dir / 'chrome.pid' if chrome_pid_file.exists(): try: chrome_pid = int(chrome_pid_file.read_text().strip()) - os.kill(chrome_pid, signal.SIGKILL) - except (OSError, ValueError): + kill_chrome_via_js(chrome_pid, str(chrome_dir)) + except (ValueError, FileNotFoundError): pass @@ -683,25 +761,28 @@ def setup_chrome_session( return chrome_launch_process, chrome_pid, snapshot_chrome_dir -def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int) -> None: - """Clean up Chrome processes. +def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None: + """Clean up Chrome processes using chrome_utils.js killChrome. - Sends SIGTERM to the chrome_launch_process and SIGKILL to the Chrome PID. - Ignores errors if processes are already dead. + Uses the centralized kill logic from chrome_utils.js which handles: + - SIGTERM then SIGKILL + - Process group killing + - Zombie process cleanup Args: chrome_launch_process: The Popen object for the chrome launch hook chrome_pid: The PID of the Chrome process + chrome_dir: Optional path to chrome output directory """ + # First try to terminate the launch process gracefully try: chrome_launch_process.send_signal(signal.SIGTERM) chrome_launch_process.wait(timeout=5) except Exception: pass - try: - os.kill(chrome_pid, signal.SIGKILL) - except OSError: - pass + + # Use JS to kill Chrome with proper process group handling + kill_chrome_via_js(chrome_pid, str(chrome_dir) if chrome_dir else None) @contextmanager diff --git a/archivebox/plugins/favicon/tests/test_favicon.py b/archivebox/plugins/favicon/tests/test_favicon.py index 88af5059..4434d1a8 100644 --- a/archivebox/plugins/favicon/tests/test_favicon.py +++ b/archivebox/plugins/favicon/tests/test_favicon.py @@ -2,7 +2,6 @@ Integration tests for favicon plugin Tests verify: - pass 1. Plugin script exists 2. requests library is available 3. Favicon extraction works for real example.com @@ -21,9 +20,15 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, + get_hook_script, + parse_jsonl_output, +) -PLUGIN_DIR = Path(__file__).parent.parent -FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None) + +PLUGIN_DIR = get_plugin_dir(__file__) +FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*') TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/mercury/tests/test_mercury.py b/archivebox/plugins/mercury/tests/test_mercury.py index 87aff58a..242eb5db 100644 --- a/archivebox/plugins/mercury/tests/test_mercury.py +++ b/archivebox/plugins/mercury/tests/test_mercury.py @@ -2,7 +2,6 @@ Integration tests for mercury plugin Tests verify: - pass 1. Hook script exists 2. Dependencies installed via validation hooks 3. Verify deps with abx-pkg @@ -19,9 +18,15 @@ import tempfile from pathlib import Path import pytest -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None) +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, + get_hook_script, + PLUGINS_ROOT, +) + + +PLUGIN_DIR = get_plugin_dir(__file__) +MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*') TEST_URL = 'https://example.com' def test_hook_script_exists(): diff --git a/archivebox/plugins/readability/tests/test_readability.py b/archivebox/plugins/readability/tests/test_readability.py index 80eafffd..b416169e 100644 --- a/archivebox/plugins/readability/tests/test_readability.py +++ b/archivebox/plugins/readability/tests/test_readability.py @@ -2,7 +2,6 @@ Integration tests for readability plugin Tests verify: - pass 1. Validate hook checks for readability-extractor binary 2. Verify deps with abx-pkg 3. Plugin reports missing dependency correctly @@ -18,10 +17,15 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, + get_hook_script, + PLUGINS_ROOT, +) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*')) + +PLUGIN_DIR = get_plugin_dir(__file__) +READABILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_readability.*') TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py index c7dc1686..aa19b82c 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py @@ -77,27 +77,9 @@ def has_staticfile_output() -> bool: return staticfile_dir.exists() and any(staticfile_dir.iterdir()) -# Chrome binary search paths -CHROMIUM_BINARY_NAMES_LINUX = [ - 'chromium', 'chromium-browser', 'chromium-browser-beta', - 'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev', -] -CHROME_BINARY_NAMES_LINUX = [ - 'google-chrome', 'google-chrome-stable', 'google-chrome-beta', - 'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome', -] -CHROME_BINARY_NAMES_MACOS = [ - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary', -] -CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium'] - -ALL_CHROME_BINARIES = ( - CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX + - CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS -) - - +# Chrome session directory (relative to extractor output dir) +# Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for. +# The centralized Chrome binary search is in chrome_utils.js findChromium(). CHROME_SESSION_DIR = '../chrome' diff --git a/archivebox/plugins/title/tests/test_title.py b/archivebox/plugins/title/tests/test_title.py index 2054d22d..285f7309 100644 --- a/archivebox/plugins/title/tests/test_title.py +++ b/archivebox/plugins/title/tests/test_title.py @@ -2,7 +2,6 @@ Integration tests for title plugin Tests verify: - pass 1. Plugin script exists 2. Node.js is available 3. Title extraction works for real example.com @@ -20,9 +19,15 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, + get_hook_script, + parse_jsonl_output, +) -PLUGIN_DIR = Path(__file__).parent.parent -TITLE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_title.*'), None) + +PLUGIN_DIR = get_plugin_dir(__file__) +TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*') TEST_URL = 'https://example.com'