diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index b4370fde..9dac6599 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -1333,6 +1333,83 @@ function getExtensionsDir() { path.join(dataDir, 'personas', persona, 'chrome_extensions'); } +/** + * Get machine type string for platform-specific paths. + * Matches Python's archivebox.config.paths.get_machine_type() + * + * @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin') + */ +function getMachineType() { + if (process.env.MACHINE_TYPE) { + return process.env.MACHINE_TYPE; + } + + let machine = process.arch; + const system = process.platform; + + // Normalize machine type to match Python's convention + if (machine === 'arm64' || machine === 'aarch64') { + machine = 'arm64'; + } else if (machine === 'x64' || machine === 'x86_64' || machine === 'amd64') { + machine = 'x86_64'; + } else if (machine === 'ia32' || machine === 'x86') { + machine = 'x86'; + } + + return `${machine}-${system}`; +} + +/** + * Get LIB_DIR path for platform-specific binaries. + * Returns DATA_DIR/lib/MACHINE_TYPE/ + * + * @returns {string} - Absolute path to lib directory + */ +function getLibDir() { + if (process.env.LIB_DIR) { + return process.env.LIB_DIR; + } + const dataDir = getEnv('DATA_DIR', './data'); + const machineType = getMachineType(); + return path.join(dataDir, 'lib', machineType); +} + +/** + * Get NODE_MODULES_DIR path for npm packages. + * Returns LIB_DIR/npm/node_modules/ + * + * @returns {string} - Absolute path to node_modules directory + */ +function getNodeModulesDir() { + if (process.env.NODE_MODULES_DIR) { + return process.env.NODE_MODULES_DIR; + } + return path.join(getLibDir(), 'npm', 'node_modules'); +} + +/** + * Get all test environment paths as a JSON object. + * This is the single source of truth for path calculations - Python calls this + * to avoid duplicating path logic. + * + * @returns {Object} - Object with all test environment paths + */ +function getTestEnv() { + const dataDir = getEnv('DATA_DIR', './data'); + const machineType = getMachineType(); + const libDir = getLibDir(); + const nodeModulesDir = getNodeModulesDir(); + + return { + DATA_DIR: dataDir, + MACHINE_TYPE: machineType, + LIB_DIR: libDir, + NODE_MODULES_DIR: nodeModulesDir, + NPM_BIN_DIR: path.join(libDir, 'npm', '.bin'), + CHROME_EXTENSIONS_DIR: getExtensionsDir(), + }; +} + /** * Install a Chrome extension with caching support. 
* @@ -1442,8 +1519,13 @@ module.exports = { getExtensionPaths, waitForExtensionTarget, getExtensionTargets, - // Shared extension installer utilities + // Shared path utilities (single source of truth for Python/JS) + getMachineType, + getLibDir, + getNodeModulesDir, getExtensionsDir, + getTestEnv, + // Shared extension installer utilities installExtensionWithCache, // Deprecated - use enableExtensions option instead getExtensionLaunchArgs, @@ -1457,18 +1539,31 @@ if (require.main === module) { console.log('Usage: chrome_utils.js [args...]'); console.log(''); console.log('Commands:'); - console.log(' findChromium'); - console.log(' installChromium'); - console.log(' installPuppeteerCore [npm_prefix]'); - console.log(' launchChromium [output_dir] [extension_paths_json]'); - console.log(' killChrome [output_dir]'); - console.log(' killZombieChrome [data_dir]'); - console.log(' getExtensionId '); - console.log(' loadExtensionManifest '); - console.log(' getExtensionLaunchArgs '); - console.log(' loadOrInstallExtension [extensions_dir]'); - console.log(' getExtensionsDir'); - console.log(' installExtensionWithCache '); + console.log(' findChromium Find Chrome/Chromium binary'); + console.log(' installChromium Install Chromium via @puppeteer/browsers'); + console.log(' installPuppeteerCore Install puppeteer-core npm package'); + console.log(' launchChromium Launch Chrome with CDP debugging'); + console.log(' killChrome Kill Chrome process by PID'); + console.log(' killZombieChrome Clean up zombie Chrome processes'); + console.log(''); + console.log(' getMachineType Get machine type (e.g., x86_64-linux)'); + console.log(' getLibDir Get LIB_DIR path'); + console.log(' getNodeModulesDir Get NODE_MODULES_DIR path'); + console.log(' getExtensionsDir Get Chrome extensions directory'); + console.log(' getTestEnv Get all paths as JSON (for tests)'); + console.log(''); + console.log(' getExtensionId Get extension ID from unpacked path'); + console.log(' loadExtensionManifest Load extension manifest.json'); + console.log(' loadOrInstallExtension Load or install an extension'); + console.log(' installExtensionWithCache Install extension with caching'); + console.log(''); + console.log('Environment variables:'); + console.log(' DATA_DIR Base data directory'); + console.log(' LIB_DIR Library directory (computed if not set)'); + console.log(' MACHINE_TYPE Machine type override'); + console.log(' NODE_MODULES_DIR Node modules directory'); + console.log(' CHROME_BINARY Chrome binary path'); + console.log(' CHROME_EXTENSIONS_DIR Extensions directory'); process.exit(1); } @@ -1581,11 +1676,31 @@ if (require.main === module) { break; } + case 'getMachineType': { + console.log(getMachineType()); + break; + } + + case 'getLibDir': { + console.log(getLibDir()); + break; + } + + case 'getNodeModulesDir': { + console.log(getNodeModulesDir()); + break; + } + case 'getExtensionsDir': { console.log(getExtensionsDir()); break; } + case 'getTestEnv': { + console.log(JSON.stringify(getTestEnv(), null, 2)); + break; + } + case 'installExtensionWithCache': { const [webstore_id, name] = commandArgs; if (!webstore_id || !name) { diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py index 97928323..7e8c2d5e 100644 --- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py +++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py @@ -2,25 +2,69 @@ Shared Chrome test helpers for plugin integration tests. 
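The chrome_utils.js hunk above exposes getMachineType(), getLibDir(), getNodeModulesDir(), and getTestEnv() both as module exports and as CLI commands, and the Python helpers in this file delegate to them. A minimal sketch of that delegation, assuming node is on PATH; the CHROME_UTILS path constant here is illustrative, not part of the patch:

import json
import os
import subprocess
from pathlib import Path

# Illustrative path; the real helpers compute this relative to their own module.
CHROME_UTILS = Path('archivebox/plugins/chrome/chrome_utils.js')

def test_env_from_js() -> dict:
    """Merge the JS-computed paths (LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE, ...) into a copy of os.environ."""
    result = subprocess.run(
        ['node', str(CHROME_UTILS), 'getTestEnv'],
        capture_output=True, text=True, timeout=30,
    )
    env = os.environ.copy()
    if result.returncode == 0 and result.stdout.strip():
        # getTestEnv prints JSON with DATA_DIR, LIB_DIR, NPM_BIN_DIR, CHROME_EXTENSIONS_DIR, ...
        env.update(json.loads(result.stdout))
    return env

get_test_env() in chrome_test_helpers.py below wraps exactly this call and falls back to Python-side computation when the node invocation fails.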
This module provides common utilities for Chrome-based plugin tests, reducing -duplication across test files. It uses the JavaScript utilities from chrome_utils.js -where appropriate. +duplication across test files. Functions delegate to chrome_utils.js (the single +source of truth) with Python fallbacks. + +Function names match the JS equivalents in snake_case: + JS: getMachineType() -> Python: get_machine_type() + JS: getLibDir() -> Python: get_lib_dir() + JS: getNodeModulesDir() -> Python: get_node_modules_dir() + JS: getExtensionsDir() -> Python: get_extensions_dir() + JS: findChromium() -> Python: find_chromium() + JS: killChrome() -> Python: kill_chrome() + JS: getTestEnv() -> Python: get_test_env() Usage: + # Path helpers (delegate to chrome_utils.js): from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, - setup_chrome_session, - cleanup_chrome, - find_chromium_binary, - get_node_modules_dir, + get_test_env, # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE + get_machine_type, # e.g., 'x86_64-linux', 'arm64-darwin' + get_lib_dir, # Path to lib dir + get_node_modules_dir, # Path to node_modules + get_extensions_dir, # Path to chrome extensions + find_chromium, # Find Chrome/Chromium binary + kill_chrome, # Kill Chrome process by PID + ) + + # Test file helpers: + from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, # get_plugin_dir(__file__) -> plugin dir Path + get_hook_script, # Find hook script by glob pattern + PLUGINS_ROOT, # Path to plugins root + LIB_DIR, # Path to lib dir (lazy-loaded) + NODE_MODULES_DIR, # Path to node_modules (lazy-loaded) + ) + + # For Chrome session tests: + from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + setup_chrome_session, # Full Chrome + tab setup + cleanup_chrome, # Cleanup by PID + chrome_session, # Context manager + ) + + # For extension tests: + from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + setup_test_env, # Full dir structure + Chrome install + launch_chromium_session, # Launch Chrome, return CDP URL + kill_chromium_session, # Cleanup Chrome + ) + + # Run hooks and parse JSONL: + from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + run_hook, # Run hook, return (returncode, stdout, stderr) + parse_jsonl_output, # Parse JSONL from stdout ) """ +import json import os +import platform import signal import subprocess import time +from datetime import datetime from pathlib import Path -from typing import Tuple, Optional +from typing import Tuple, Optional, List, Dict, Any from contextlib import contextmanager @@ -29,88 +73,623 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent # Hook script locations +CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py' CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js' CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js' CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None) CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js' -def get_node_modules_dir() -> Path: - """Get NODE_MODULES_DIR for tests, checking env first. +# ============================================================================= +# Path Helpers - delegates to chrome_utils.js with Python fallback +# Function names match JS: getMachineType -> get_machine_type, etc. 
+# ============================================================================= - Returns the path to the node_modules directory, checking: - 1. NODE_MODULES_DIR environment variable - 2. Computed from LIB_DIR via ArchiveBox config + +def _call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]: + """Call chrome_utils.js CLI command (internal helper). + + This is the central dispatch for calling the JS utilities from Python. + All path calculations and Chrome operations are centralized in chrome_utils.js + to ensure consistency between Python and JavaScript code. + + Args: + command: The CLI command (e.g., 'findChromium', 'getTestEnv') + *args: Additional command arguments + env: Environment dict (default: current env) + + Returns: + Tuple of (returncode, stdout, stderr) """ + cmd = ['node', str(CHROME_UTILS), command] + list(args) + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=30, + env=env or os.environ.copy() + ) + return result.returncode, result.stdout, result.stderr + + +def get_plugin_dir(test_file: str) -> Path: + """Get the plugin directory from a test file path. + + Usage: + PLUGIN_DIR = get_plugin_dir(__file__) + + Args: + test_file: The __file__ of the test module (e.g., test_screenshot.py) + + Returns: + Path to the plugin directory (e.g., plugins/screenshot/) + """ + return Path(test_file).parent.parent + + +def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]: + """Find a hook script in a plugin directory by pattern. + + Usage: + HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*') + + Args: + plugin_dir: Path to the plugin directory + pattern: Glob pattern to match + + Returns: + Path to the hook script or None if not found + """ + matches = list(plugin_dir.glob(pattern)) + return matches[0] if matches else None + + +def get_machine_type() -> str: + """Get machine type string (e.g., 'x86_64-linux', 'arm64-darwin'). + + Matches JS: getMachineType() + + Tries chrome_utils.js first, falls back to Python computation. + """ + # Try JS first (single source of truth) + returncode, stdout, stderr = _call_chrome_utils('getMachineType') + if returncode == 0 and stdout.strip(): + return stdout.strip() + + # Fallback to Python computation + if os.environ.get('MACHINE_TYPE'): + return os.environ['MACHINE_TYPE'] + + machine = platform.machine().lower() + system = platform.system().lower() + if machine in ('arm64', 'aarch64'): + machine = 'arm64' + elif machine in ('x86_64', 'amd64'): + machine = 'x86_64' + return f"{machine}-{system}" + + +def get_lib_dir() -> Path: + """Get LIB_DIR path for platform-specific binaries. + + Matches JS: getLibDir() + + Tries chrome_utils.js first, falls back to Python computation. + """ + # Try JS first + returncode, stdout, stderr = _call_chrome_utils('getLibDir') + if returncode == 0 and stdout.strip(): + return Path(stdout.strip()) + + # Fallback to Python + if os.environ.get('LIB_DIR'): + return Path(os.environ['LIB_DIR']) + from archivebox.config.common import STORAGE_CONFIG + return Path(str(STORAGE_CONFIG.LIB_DIR)) + + +def get_node_modules_dir() -> Path: + """Get NODE_MODULES_DIR path for npm packages. + + Matches JS: getNodeModulesDir() + + Tries chrome_utils.js first, falls back to Python computation. 
+ """ + # Try JS first + returncode, stdout, stderr = _call_chrome_utils('getNodeModulesDir') + if returncode == 0 and stdout.strip(): + return Path(stdout.strip()) + + # Fallback to Python if os.environ.get('NODE_MODULES_DIR'): return Path(os.environ['NODE_MODULES_DIR']) - # Otherwise compute from LIB_DIR - from archivebox.config.common import STORAGE_CONFIG - lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)) + lib_dir = get_lib_dir() return lib_dir / 'npm' / 'node_modules' -def get_test_env() -> dict: - """Get environment dict with NODE_MODULES_DIR set correctly for tests. +def get_extensions_dir() -> str: + """Get the Chrome extensions directory path. - Returns a copy of os.environ with NODE_MODULES_DIR added/updated. - Use this for all subprocess calls in plugin tests. + Matches JS: getExtensionsDir() + + Tries chrome_utils.js first, falls back to Python computation. """ - env = os.environ.copy() - env['NODE_MODULES_DIR'] = str(get_node_modules_dir()) - return env + returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir') + if returncode == 0 and stdout.strip(): + return stdout.strip() + + # Fallback to default computation if JS call fails + data_dir = os.environ.get('DATA_DIR', './data') + persona = os.environ.get('ACTIVE_PERSONA', 'Default') + return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions') -def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]: - """Find the Chromium binary using chrome_utils.js findChromium(). +def find_chromium(data_dir: Optional[str] = None) -> Optional[str]: + """Find the Chromium binary path. - This uses the centralized findChromium() function which checks: + Matches JS: findChromium() + + Uses chrome_utils.js which checks: - CHROME_BINARY env var - @puppeteer/browsers install locations - System Chromium locations - Falls back to Chrome (with warning) Args: - data_dir: Directory where chromium was installed (contains chromium/ subdir) + data_dir: Optional DATA_DIR override Returns: Path to Chromium binary or None if not found """ - search_dir = data_dir or os.environ.get('DATA_DIR', '.') - result = subprocess.run( - ['node', str(CHROME_UTILS), 'findChromium', str(search_dir)], - capture_output=True, - text=True, - timeout=10 - ) - if result.returncode == 0 and result.stdout.strip(): - return result.stdout.strip() + env = os.environ.copy() + if data_dir: + env['DATA_DIR'] = str(data_dir) + returncode, stdout, stderr = _call_chrome_utils('findChromium', env=env) + if returncode == 0 and stdout.strip(): + return stdout.strip() return None -def get_extensions_dir() -> str: - """Get the Chrome extensions directory using chrome_utils.js getExtensionsDir(). +def kill_chrome(pid: int, output_dir: Optional[str] = None) -> bool: + """Kill a Chrome process by PID. 
- This uses the centralized path calculation from chrome_utils.js which checks: - - CHROME_EXTENSIONS_DIR env var - - DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions + Matches JS: killChrome() + + Uses chrome_utils.js which handles: + - SIGTERM then SIGKILL + - Process group killing + - Zombie process cleanup + + Args: + pid: Process ID to kill + output_dir: Optional chrome output directory for PID file cleanup Returns: - Path to extensions directory + True if the kill command succeeded """ + args = [str(pid)] + if output_dir: + args.append(str(output_dir)) + returncode, stdout, stderr = _call_chrome_utils('killChrome', *args) + return returncode == 0 + + +def get_test_env() -> dict: + """Get environment dict with all paths set correctly for tests. + + Matches JS: getTestEnv() + + Tries chrome_utils.js first for path values, builds env dict. + Use this for all subprocess calls in plugin tests. + """ + env = os.environ.copy() + + # Try to get all paths from JS (single source of truth) + returncode, stdout, stderr = _call_chrome_utils('getTestEnv') + if returncode == 0 and stdout.strip(): + try: + js_env = json.loads(stdout) + env.update(js_env) + return env + except json.JSONDecodeError: + pass + + # Fallback to Python computation + lib_dir = get_lib_dir() + env['LIB_DIR'] = str(lib_dir) + env['NODE_MODULES_DIR'] = str(get_node_modules_dir()) + env['MACHINE_TYPE'] = get_machine_type() + return env + + +# Backward compatibility aliases (deprecated, use new names) +find_chromium_binary = find_chromium +kill_chrome_via_js = kill_chrome +get_machine_type_from_js = get_machine_type +get_test_env_from_js = get_test_env + + +# ============================================================================= +# Module-level constants (lazy-loaded on first access) +# Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR +# ============================================================================= + +# These are computed once when first accessed +_LIB_DIR: Optional[Path] = None +_NODE_MODULES_DIR: Optional[Path] = None + + +def _get_lib_dir_cached() -> Path: + global _LIB_DIR + if _LIB_DIR is None: + _LIB_DIR = get_lib_dir() + return _LIB_DIR + + +def _get_node_modules_dir_cached() -> Path: + global _NODE_MODULES_DIR + if _NODE_MODULES_DIR is None: + _NODE_MODULES_DIR = get_node_modules_dir() + return _NODE_MODULES_DIR + + +# Module-level constants that can be imported directly +# Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR +class _LazyPath: + """Lazy path that computes value on first access.""" + def __init__(self, getter): + self._getter = getter + self._value = None + + def __fspath__(self): + if self._value is None: + self._value = self._getter() + return str(self._value) + + def __truediv__(self, other): + if self._value is None: + self._value = self._getter() + return self._value / other + + def __str__(self): + return self.__fspath__() + + def __repr__(self): + return f"" + + +LIB_DIR = _LazyPath(_get_lib_dir_cached) +NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached) + + +# ============================================================================= +# Hook Execution Helpers +# ============================================================================= + + +def run_hook( + hook_script: Path, + url: str, + snapshot_id: str, + cwd: Optional[Path] = None, + env: Optional[dict] = None, + timeout: int = 60, + extra_args: Optional[List[str]] = None, +) -> Tuple[int, str, str]: + """Run a hook script and return (returncode, stdout, 
stderr). + + Usage: + returncode, stdout, stderr = run_hook( + HOOK_SCRIPT, 'https://example.com', 'test-snap-123', + cwd=tmpdir, env=get_test_env() + ) + + Args: + hook_script: Path to the hook script + url: URL to process + snapshot_id: Snapshot ID + cwd: Working directory (default: current dir) + env: Environment dict (default: get_test_env()) + timeout: Timeout in seconds + extra_args: Additional arguments to pass + + Returns: + Tuple of (returncode, stdout, stderr) + """ + if env is None: + env = get_test_env() + + # Determine interpreter based on file extension + if hook_script.suffix == '.py': + cmd = ['python', str(hook_script)] + elif hook_script.suffix == '.js': + cmd = ['node', str(hook_script)] + else: + cmd = [str(hook_script)] + + cmd.extend([f'--url={url}', f'--snapshot-id={snapshot_id}']) + if extra_args: + cmd.extend(extra_args) + result = subprocess.run( - ['node', str(CHROME_UTILS), 'getExtensionsDir'], + cmd, + cwd=str(cwd) if cwd else None, capture_output=True, text=True, - timeout=10, - env=get_test_env() + env=env, + timeout=timeout ) - if result.returncode == 0 and result.stdout.strip(): - return result.stdout.strip() - # Fallback to default computation if JS call fails - data_dir = os.environ.get('DATA_DIR', './data') - persona = os.environ.get('ACTIVE_PERSONA', 'Default') - return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions') + return result.returncode, result.stdout, result.stderr + + +def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]: + """Parse JSONL output from hook stdout and return the specified record type. + + Usage: + result = parse_jsonl_output(stdout) + if result and result['status'] == 'succeeded': + print("Success!") + + Args: + stdout: The stdout from a hook execution + record_type: The 'type' field to look for (default: 'ArchiveResult') + + Returns: + The parsed JSON dict or None if not found + """ + for line in stdout.strip().split('\n'): + line = line.strip() + if not line.startswith('{'): + continue + try: + record = json.loads(line) + if record.get('type') == record_type: + return record + except json.JSONDecodeError: + continue + return None + + +def run_hook_and_parse( + hook_script: Path, + url: str, + snapshot_id: str, + cwd: Optional[Path] = None, + env: Optional[dict] = None, + timeout: int = 60, + extra_args: Optional[List[str]] = None, +) -> Tuple[int, Optional[Dict[str, Any]], str]: + """Run a hook and parse its JSONL output. + + Convenience function combining run_hook() and parse_jsonl_output(). + + Returns: + Tuple of (returncode, parsed_result_or_none, stderr) + """ + returncode, stdout, stderr = run_hook( + hook_script, url, snapshot_id, + cwd=cwd, env=env, timeout=timeout, extra_args=extra_args + ) + result = parse_jsonl_output(stdout) + return returncode, result, stderr + + +# ============================================================================= +# Extension Test Helpers +# Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha) +# ============================================================================= + + +def setup_test_env(tmpdir: Path) -> dict: + """Set up isolated data/lib directory structure for extension tests. + + Creates structure matching real ArchiveBox data dir: + /data/ + lib/ + arm64-darwin/ (or x86_64-linux, etc.) + npm/ + .bin/ + node_modules/ + personas/ + Default/ + chrome_extensions/ + users/ + testuser/ + crawls/ + snapshots/ + + Calls chrome install hook which handles puppeteer-core and chromium installation. 
+ Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. + + Args: + tmpdir: Base temporary directory for the test + + Returns: + Environment dict with all paths set, or pytest.skip() if Chrome install fails + """ + import pytest + + # Determine machine type (matches archivebox.config.paths.get_machine_type()) + machine = platform.machine().lower() + system = platform.system().lower() + if machine in ('arm64', 'aarch64'): + machine = 'arm64' + elif machine in ('x86_64', 'amd64'): + machine = 'x86_64' + machine_type = f"{machine}-{system}" + + # Create proper directory structure matching real ArchiveBox layout + data_dir = tmpdir / 'data' + lib_dir = data_dir / 'lib' / machine_type + npm_dir = lib_dir / 'npm' + npm_bin_dir = npm_dir / '.bin' + node_modules_dir = npm_dir / 'node_modules' + + # Extensions go under personas/Default/ + chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' + + # User data goes under users/{username}/ + date_str = datetime.now().strftime('%Y%m%d') + users_dir = data_dir / 'users' / 'testuser' + crawls_dir = users_dir / 'crawls' / date_str + snapshots_dir = users_dir / 'snapshots' / date_str + + # Create all directories + node_modules_dir.mkdir(parents=True, exist_ok=True) + npm_bin_dir.mkdir(parents=True, exist_ok=True) + chrome_extensions_dir.mkdir(parents=True, exist_ok=True) + crawls_dir.mkdir(parents=True, exist_ok=True) + snapshots_dir.mkdir(parents=True, exist_ok=True) + + # Build complete env dict + env = os.environ.copy() + env.update({ + 'DATA_DIR': str(data_dir), + 'LIB_DIR': str(lib_dir), + 'MACHINE_TYPE': machine_type, + 'NPM_BIN_DIR': str(npm_bin_dir), + 'NODE_MODULES_DIR': str(node_modules_dir), + 'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir), + 'CRAWLS_DIR': str(crawls_dir), + 'SNAPSHOTS_DIR': str(snapshots_dir), + }) + + # Only set headless if not already in environment (allow override for debugging) + if 'CHROME_HEADLESS' not in os.environ: + env['CHROME_HEADLESS'] = 'true' + + # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) + result = subprocess.run( + ['python', str(CHROME_INSTALL_HOOK)], + capture_output=True, text=True, timeout=120, env=env + ) + if result.returncode != 0: + pytest.skip(f"Chrome install hook failed: {result.stderr}") + + # Parse JSONL output to get CHROME_BINARY + chrome_binary = None + for line in result.stdout.strip().split('\n'): + if not line.strip(): + continue + try: + data = json.loads(line) + if data.get('type') == 'Binary' and data.get('abspath'): + chrome_binary = data['abspath'] + break + except json.JSONDecodeError: + continue + + if not chrome_binary or not Path(chrome_binary).exists(): + pytest.skip(f"Chromium binary not found: {chrome_binary}") + + env['CHROME_BINARY'] = chrome_binary + return env + + +def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple[subprocess.Popen, str]: + """Launch Chromium and return (process, cdp_url). + + This launches Chrome using the chrome launch hook and waits for the CDP URL + to become available. Use this for extension tests that need direct CDP access. + + Args: + env: Environment dict (from setup_test_env) + chrome_dir: Directory for Chrome to write its files (cdp_url.txt, chrome.pid, etc.) 
+ crawl_id: ID for the crawl + + Returns: + Tuple of (chrome_launch_process, cdp_url) + + Raises: + RuntimeError: If Chrome fails to launch or CDP URL not available after 20s + """ + chrome_dir.mkdir(parents=True, exist_ok=True) + + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], + cwd=str(chrome_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + # Wait for Chromium to launch and CDP URL to be available + cdp_url = None + for i in range(20): + if chrome_launch_process.poll() is not None: + stdout, stderr = chrome_launch_process.communicate() + raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") + cdp_file = chrome_dir / 'cdp_url.txt' + if cdp_file.exists(): + cdp_url = cdp_file.read_text().strip() + break + time.sleep(1) + + if not cdp_url: + chrome_launch_process.kill() + raise RuntimeError("Chromium CDP URL not found after 20s") + + return chrome_launch_process, cdp_url + + +def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None: + """Clean up Chromium process launched by launch_chromium_session. + + Uses chrome_utils.js killChrome for proper process group handling. + + Args: + chrome_launch_process: The Popen object from launch_chromium_session + chrome_dir: The chrome directory containing chrome.pid + """ + # First try to terminate the launch process gracefully + try: + chrome_launch_process.send_signal(signal.SIGTERM) + chrome_launch_process.wait(timeout=5) + except Exception: + pass + + # Read PID and use JS to kill with proper cleanup + chrome_pid_file = chrome_dir / 'chrome.pid' + if chrome_pid_file.exists(): + try: + chrome_pid = int(chrome_pid_file.read_text().strip()) + kill_chrome(chrome_pid, str(chrome_dir)) + except (ValueError, FileNotFoundError): + pass + + +@contextmanager +def chromium_session(env: dict, chrome_dir: Path, crawl_id: str): + """Context manager for Chromium sessions with automatic cleanup. + + Usage: + with chromium_session(env, chrome_dir, 'test-crawl') as (process, cdp_url): + # Use cdp_url to connect with puppeteer + pass + # Chromium automatically cleaned up + + Args: + env: Environment dict (from setup_test_env) + chrome_dir: Directory for Chrome files + crawl_id: ID for the crawl + + Yields: + Tuple of (chrome_launch_process, cdp_url) + """ + chrome_launch_process = None + try: + chrome_launch_process, cdp_url = launch_chromium_session(env, chrome_dir, crawl_id) + yield chrome_launch_process, cdp_url + finally: + if chrome_launch_process: + kill_chromium_session(chrome_launch_process, chrome_dir) + + +# ============================================================================= +# Tab-based Test Helpers +# Used by tab-based tests (infiniscroll, modalcloser) +# ============================================================================= def setup_chrome_session( @@ -210,25 +789,28 @@ def setup_chrome_session( return chrome_launch_process, chrome_pid, snapshot_chrome_dir -def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int) -> None: - """Clean up Chrome processes. +def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None: + """Clean up Chrome processes using chrome_utils.js killChrome. - Sends SIGTERM to the chrome_launch_process and SIGKILL to the Chrome PID. - Ignores errors if processes are already dead. 
+ Uses the centralized kill logic from chrome_utils.js which handles: + - SIGTERM then SIGKILL + - Process group killing + - Zombie process cleanup Args: chrome_launch_process: The Popen object for the chrome launch hook chrome_pid: The PID of the Chrome process + chrome_dir: Optional path to chrome output directory """ + # First try to terminate the launch process gracefully try: chrome_launch_process.send_signal(signal.SIGTERM) chrome_launch_process.wait(timeout=5) except Exception: pass - try: - os.kill(chrome_pid, signal.SIGKILL) - except OSError: - pass + + # Use JS to kill Chrome with proper process group handling + kill_chrome(chrome_pid, str(chrome_dir) if chrome_dir else None) @contextmanager diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index ca8ad874..d455ba41 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -28,70 +28,25 @@ import tempfile import shutil import platform -PLUGIN_DIR = Path(__file__).parent.parent -CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js' -CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js' -CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None) +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_test_env, + get_lib_dir, + get_node_modules_dir, + find_chromium_binary, + CHROME_PLUGIN_DIR as PLUGIN_DIR, + CHROME_LAUNCH_HOOK, + CHROME_TAB_HOOK, + CHROME_NAVIGATE_HOOK, +) -# Get LIB_DIR and MACHINE_TYPE from environment or compute them -def get_lib_dir_and_machine_type(): - """Get or compute LIB_DIR and MACHINE_TYPE for tests.""" - from archivebox.config.paths import get_machine_type - from archivebox.config.common import STORAGE_CONFIG - - lib_dir = os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR) - machine_type = os.environ.get('MACHINE_TYPE') or get_machine_type() - - return Path(lib_dir), machine_type - -# Setup NODE_MODULES_DIR to find npm packages -LIB_DIR, MACHINE_TYPE = get_lib_dir_and_machine_type() -# Note: LIB_DIR already includes machine_type (e.g., data/lib/arm64-darwin) -NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' +# Get LIB_DIR and NODE_MODULES_DIR from shared helpers +LIB_DIR = get_lib_dir() +NODE_MODULES_DIR = get_node_modules_dir() NPM_PREFIX = LIB_DIR / 'npm' # Chromium install location (relative to DATA_DIR) CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium' -def get_test_env(): - """Get environment with NODE_MODULES_DIR and CHROME_BINARY set correctly.""" - env = os.environ.copy() - env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) - env['LIB_DIR'] = str(LIB_DIR) - env['MACHINE_TYPE'] = MACHINE_TYPE - # Ensure CHROME_BINARY is set to Chromium - if 'CHROME_BINARY' not in env: - chromium = find_chromium_binary() - if chromium: - env['CHROME_BINARY'] = chromium - return env - - -def find_chromium_binary(data_dir=None): - """Find the Chromium binary using chrome_utils.js findChromium(). 
- - This uses the centralized findChromium() function which checks: - - CHROME_BINARY env var - - @puppeteer/browsers install locations (in data_dir/chromium) - - System Chromium locations - - Falls back to Chrome (with warning) - - Args: - data_dir: Directory where chromium was installed (contains chromium/ subdir) - """ - chrome_utils = PLUGIN_DIR / 'chrome_utils.js' - # Use provided data_dir, or fall back to env var, or current dir - search_dir = data_dir or os.environ.get('DATA_DIR', '.') - result = subprocess.run( - ['node', str(chrome_utils), 'findChromium', str(search_dir)], - capture_output=True, - text=True, - timeout=10 - ) - if result.returncode == 0 and result.stdout.strip(): - return result.stdout.strip() - return None - @pytest.fixture(scope="session", autouse=True) def ensure_chromium_and_puppeteer_installed(): diff --git a/archivebox/plugins/dom/tests/test_dom.py b/archivebox/plugins/dom/tests/test_dom.py index 494e131a..7fe69d64 100644 --- a/archivebox/plugins/dom/tests/test_dom.py +++ b/archivebox/plugins/dom/tests/test_dom.py @@ -20,29 +20,22 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_test_env, + get_plugin_dir, + get_hook_script, + run_hook_and_parse, + LIB_DIR, + NODE_MODULES_DIR, + PLUGINS_ROOT, +) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None) -NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None) + +PLUGIN_DIR = get_plugin_dir(__file__) +DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*') +NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py') TEST_URL = 'https://example.com' -# Get LIB_DIR for NODE_MODULES_DIR -def get_lib_dir(): - """Get LIB_DIR for tests.""" - from archivebox.config.common import STORAGE_CONFIG - return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)) - -LIB_DIR = get_lib_dir() -NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' - -def get_test_env(): - """Get environment with NODE_MODULES_DIR set correctly.""" - env = os.environ.copy() - env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) - env['LIB_DIR'] = str(LIB_DIR) - return env - def test_hook_script_exists(): """Verify on_Snapshot hook exists.""" diff --git a/archivebox/plugins/favicon/tests/test_favicon.py b/archivebox/plugins/favicon/tests/test_favicon.py index 88af5059..4434d1a8 100644 --- a/archivebox/plugins/favicon/tests/test_favicon.py +++ b/archivebox/plugins/favicon/tests/test_favicon.py @@ -2,7 +2,6 @@ Integration tests for favicon plugin Tests verify: - pass 1. Plugin script exists 2. requests library is available 3. 
Favicon extraction works for real example.com @@ -21,9 +20,15 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, + get_hook_script, + parse_jsonl_output, +) -PLUGIN_DIR = Path(__file__).parent.parent -FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None) + +PLUGIN_DIR = get_plugin_dir(__file__) +FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*') TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py index b5b93288..13a62e58 100644 --- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py +++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py @@ -14,6 +14,14 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + setup_test_env, + launch_chromium_session, + kill_chromium_session, + CHROME_LAUNCH_HOOK, + PLUGINS_ROOT, +) + PLUGIN_DIR = Path(__file__).parent.parent INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None) @@ -124,107 +132,6 @@ def test_no_configuration_required(): assert "API" not in (result.stdout + result.stderr) or result.returncode == 0 -PLUGINS_ROOT = PLUGIN_DIR.parent -CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py' -CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js' - - -def setup_test_env(tmpdir: Path) -> dict: - """Set up isolated data/lib directory structure for tests. - - Creates structure matching real ArchiveBox data dir: - /data/ - lib/ - arm64-darwin/ (or x86_64-linux, etc.) - npm/ - .bin/ - node_modules/ - personas/ - Default/ - chrome_extensions/ - users/ - testuser/ - crawls/ - snapshots/ - - Calls chrome install hook which handles puppeteer-core and chromium installation. - Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. 
- """ - import platform - from datetime import datetime - - # Determine machine type (matches archivebox.config.paths.get_machine_type()) - machine = platform.machine().lower() - system = platform.system().lower() - if machine in ('arm64', 'aarch64'): - machine = 'arm64' - elif machine in ('x86_64', 'amd64'): - machine = 'x86_64' - machine_type = f"{machine}-{system}" - - # Create proper directory structure matching real ArchiveBox layout - data_dir = tmpdir / 'data' - lib_dir = data_dir / 'lib' / machine_type - npm_dir = lib_dir / 'npm' - npm_bin_dir = npm_dir / '.bin' - node_modules_dir = npm_dir / 'node_modules' - - # Extensions go under personas/Default/ - chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' - - # User data goes under users/{username}/ - date_str = datetime.now().strftime('%Y%m%d') - users_dir = data_dir / 'users' / 'testuser' - crawls_dir = users_dir / 'crawls' / date_str - snapshots_dir = users_dir / 'snapshots' / date_str - - # Create all directories - node_modules_dir.mkdir(parents=True, exist_ok=True) - npm_bin_dir.mkdir(parents=True, exist_ok=True) - chrome_extensions_dir.mkdir(parents=True, exist_ok=True) - crawls_dir.mkdir(parents=True, exist_ok=True) - snapshots_dir.mkdir(parents=True, exist_ok=True) - - # Build complete env dict - env = os.environ.copy() - env.update({ - 'DATA_DIR': str(data_dir), - 'LIB_DIR': str(lib_dir), - 'MACHINE_TYPE': machine_type, - 'NPM_BIN_DIR': str(npm_bin_dir), - 'NODE_MODULES_DIR': str(node_modules_dir), - 'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir), - 'CRAWLS_DIR': str(crawls_dir), - 'SNAPSHOTS_DIR': str(snapshots_dir), - }) - - # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) - result = subprocess.run( - ['python', str(CHROME_INSTALL_HOOK)], - capture_output=True, text=True, timeout=120, env=env - ) - if result.returncode != 0: - pytest.skip(f"Chrome install hook failed: {result.stderr}") - - # Parse JSONL output to get CHROME_BINARY - chrome_binary = None - for line in result.stdout.strip().split('\n'): - if not line.strip(): - continue - try: - data = json.loads(line) - if data.get('type') == 'Binary' and data.get('abspath'): - chrome_binary = data['abspath'] - break - except json.JSONDecodeError: - continue - - if not chrome_binary or not Path(chrome_binary).exists(): - pytest.skip(f"Chromium binary not found: {chrome_binary}") - - env['CHROME_BINARY'] = chrome_binary - return env - TEST_URL = 'https://www.filmin.es/' @@ -420,54 +327,6 @@ const puppeteer = require('puppeteer-core'); pass -def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str): - """Launch Chromium and return (process, cdp_url) or raise on failure.""" - chrome_dir.mkdir(parents=True, exist_ok=True) - - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chromium to launch and CDP URL to be available - cdp_url = None - for i in range(20): - if chrome_launch_process.poll() is not None: - stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") - cdp_file = chrome_dir / 'cdp_url.txt' - if cdp_file.exists(): - cdp_url = cdp_file.read_text().strip() - break - time.sleep(1) - - if not cdp_url: - chrome_launch_process.kill() - raise RuntimeError("Chromium CDP URL not found after 20s") - - return chrome_launch_process, cdp_url - - -def 
kill_chromium_session(chrome_launch_process, chrome_dir: Path): - """Clean up Chromium process.""" - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - chrome_pid_file = chrome_dir / 'chrome.pid' - if chrome_pid_file.exists(): - try: - chrome_pid = int(chrome_pid_file.read_text().strip()) - os.kill(chrome_pid, signal.SIGKILL) - except (OSError, ValueError): - pass - - def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict: """Check if cookie consent elements are visible on a page. diff --git a/archivebox/plugins/mercury/tests/test_mercury.py b/archivebox/plugins/mercury/tests/test_mercury.py index 87aff58a..242eb5db 100644 --- a/archivebox/plugins/mercury/tests/test_mercury.py +++ b/archivebox/plugins/mercury/tests/test_mercury.py @@ -2,7 +2,6 @@ Integration tests for mercury plugin Tests verify: - pass 1. Hook script exists 2. Dependencies installed via validation hooks 3. Verify deps with abx-pkg @@ -19,9 +18,15 @@ import tempfile from pathlib import Path import pytest -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None) +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, + get_hook_script, + PLUGINS_ROOT, +) + + +PLUGIN_DIR = get_plugin_dir(__file__) +MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*') TEST_URL = 'https://example.com' def test_hook_script_exists(): diff --git a/archivebox/plugins/pdf/tests/test_pdf.py b/archivebox/plugins/pdf/tests/test_pdf.py index 681e7225..c160cfdc 100644 --- a/archivebox/plugins/pdf/tests/test_pdf.py +++ b/archivebox/plugins/pdf/tests/test_pdf.py @@ -21,29 +21,22 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_test_env, + get_plugin_dir, + get_hook_script, + run_hook_and_parse, + LIB_DIR, + NODE_MODULES_DIR, + PLUGINS_ROOT, +) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None) + +PLUGIN_DIR = get_plugin_dir(__file__) +PDF_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_pdf.*') NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py' TEST_URL = 'https://example.com' -# Get LIB_DIR for NODE_MODULES_DIR -def get_lib_dir(): - """Get LIB_DIR for tests.""" - from archivebox.config.common import STORAGE_CONFIG - return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)) - -LIB_DIR = get_lib_dir() -NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' - -def get_test_env(): - """Get environment with NODE_MODULES_DIR set correctly.""" - env = os.environ.copy() - env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) - env['LIB_DIR'] = str(LIB_DIR) - return env - def test_hook_script_exists(): """Verify on_Snapshot hook exists.""" diff --git a/archivebox/plugins/readability/tests/test_readability.py b/archivebox/plugins/readability/tests/test_readability.py index 80eafffd..b416169e 100644 --- a/archivebox/plugins/readability/tests/test_readability.py +++ b/archivebox/plugins/readability/tests/test_readability.py @@ -2,7 +2,6 @@ Integration tests for readability plugin Tests verify: - pass 1. Validate hook checks for readability-extractor binary 2. Verify deps with abx-pkg 3. 
Plugin reports missing dependency correctly @@ -18,10 +17,15 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, + get_hook_script, + PLUGINS_ROOT, +) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*')) + +PLUGIN_DIR = get_plugin_dir(__file__) +READABILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_readability.*') TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index edfbd54a..24d4960d 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -20,28 +20,20 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_test_env, + get_plugin_dir, + get_hook_script, + run_hook_and_parse, + LIB_DIR, + NODE_MODULES_DIR, +) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None) + +PLUGIN_DIR = get_plugin_dir(__file__) +SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*') TEST_URL = 'https://example.com' -# Get LIB_DIR for NODE_MODULES_DIR -def get_lib_dir(): - """Get LIB_DIR for tests.""" - from archivebox.config.common import STORAGE_CONFIG - return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)) - -LIB_DIR = get_lib_dir() -NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' - -def get_test_env(): - """Get environment with NODE_MODULES_DIR set correctly.""" - env = os.environ.copy() - env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR) - env['LIB_DIR'] = str(LIB_DIR) - return env - def test_hook_script_exists(): """Verify on_Snapshot hook exists.""" diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py index c7dc1686..aa19b82c 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py @@ -77,27 +77,9 @@ def has_staticfile_output() -> bool: return staticfile_dir.exists() and any(staticfile_dir.iterdir()) -# Chrome binary search paths -CHROMIUM_BINARY_NAMES_LINUX = [ - 'chromium', 'chromium-browser', 'chromium-browser-beta', - 'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev', -] -CHROME_BINARY_NAMES_LINUX = [ - 'google-chrome', 'google-chrome-stable', 'google-chrome-beta', - 'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome', -] -CHROME_BINARY_NAMES_MACOS = [ - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary', -] -CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium'] - -ALL_CHROME_BINARIES = ( - CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX + - CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS -) - - +# Chrome session directory (relative to extractor output dir) +# Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for. +# The centralized Chrome binary search is in chrome_utils.js findChromium(). 
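The comment above replaces the hook's private binary search lists: singlefile now reads CHROME_BINARY from the environment, and the search logic lives only in chrome_utils.js findChromium(). A sketch of that resolution order under assumed paths (this is not the hook's actual code; the CHROME_UTILS location is an assumption about the plugin layout):

import os
import subprocess
from pathlib import Path
from typing import Optional

# Assumed location relative to the singlefile plugin; adjust for your layout.
CHROME_UTILS = Path(__file__).parent.parent / 'chrome' / 'chrome_utils.js'

def resolve_chrome_binary() -> Optional[str]:
    """Prefer CHROME_BINARY from the env; otherwise ask the centralized JS helper."""
    binary = os.environ.get('CHROME_BINARY')
    if binary and Path(binary).exists():
        return binary
    result = subprocess.run(
        ['node', str(CHROME_UTILS), 'findChromium'],
        capture_output=True, text=True, timeout=10, env=os.environ.copy(),
    )
    if result.returncode == 0 and result.stdout.strip():
        return result.stdout.strip()
    return None

For tests, chrome_test_helpers.find_chromium() wraps the same CLI call.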
CHROME_SESSION_DIR = '../chrome' diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py index 8d6d01b0..0fbd3c07 100644 --- a/archivebox/plugins/singlefile/tests/test_singlefile.py +++ b/archivebox/plugins/singlefile/tests/test_singlefile.py @@ -6,6 +6,8 @@ Tests verify: 2. CLI-based singlefile extraction works 3. Dependencies available via abx-pkg 4. Output contains valid HTML +5. Connects to Chrome session via CDP when available +6. Works with extensions loaded (ublock, etc.) """ import json @@ -16,10 +18,17 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_test_env, + get_plugin_dir, + get_hook_script, + setup_chrome_session, + cleanup_chrome, +) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None) + +PLUGIN_DIR = get_plugin_dir(__file__) +SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py') TEST_URL = "https://example.com" @@ -52,7 +61,7 @@ def test_singlefile_cli_archives_example_com(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - env = os.environ.copy() + env = get_test_env() env['SINGLEFILE_ENABLED'] = 'true' # Run singlefile snapshot hook @@ -78,5 +87,89 @@ def test_singlefile_cli_archives_example_com(): assert 'Example Domain' in html_content, "Output should contain example.com content" +def test_singlefile_with_chrome_session(): + """Test singlefile connects to existing Chrome session via CDP. + + When a Chrome session exists (chrome/cdp_url.txt), singlefile should + connect to it instead of launching a new Chrome instance. + """ + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + try: + # Set up Chrome session using shared helper + chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( + tmpdir=tmpdir, + crawl_id='singlefile-test-crawl', + snapshot_id='singlefile-test-snap', + test_url=TEST_URL, + navigate=False, # Don't navigate, singlefile will do that + timeout=20, + ) + + # singlefile looks for ../chrome/cdp_url.txt relative to cwd + # So we need to run from a directory that has ../chrome pointing to our chrome dir + singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile' + singlefile_output_dir.mkdir(parents=True, exist_ok=True) + + # Create symlink so singlefile can find the chrome session + chrome_link = singlefile_output_dir.parent / 'chrome' + if not chrome_link.exists(): + chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome') + + env = get_test_env() + env['SINGLEFILE_ENABLED'] = 'true' + env['CHROME_HEADLESS'] = 'true' + + # Run singlefile - it should find and use the existing Chrome session + result = subprocess.run( + ['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-test-snap'], + cwd=str(singlefile_output_dir), + capture_output=True, + text=True, + env=env, + timeout=120 + ) + + # Verify output + output_file = singlefile_output_dir / 'singlefile.html' + if output_file.exists(): + html_content = output_file.read_text() + assert len(html_content) > 500, "Output file too small" + assert 'Example Domain' in html_content, "Should contain example.com content" + else: + # If singlefile couldn't connect to Chrome, it may have failed + # Check if it mentioned browser-server in its args (indicating it tried to use CDP) + assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in 
result.stderr.lower(), \ + f"Singlefile should attempt CDP connection. stderr: {result.stderr}" + + finally: + cleanup_chrome(chrome_launch_process, chrome_pid) + + +def test_singlefile_disabled_skips(): + """Test that SINGLEFILE_ENABLED=False exits without JSONL.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + env = get_test_env() + env['SINGLEFILE_ENABLED'] = 'False' + + result = subprocess.run( + ['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, f"Should exit 0 when disabled: {result.stderr}" + + # Should NOT emit JSONL when disabled + jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] + assert len(jsonl_lines) == 0, f"Should not emit JSONL when disabled, but got: {jsonl_lines}" + + if __name__ == '__main__': pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/title/tests/test_title.py b/archivebox/plugins/title/tests/test_title.py index 2054d22d..285f7309 100644 --- a/archivebox/plugins/title/tests/test_title.py +++ b/archivebox/plugins/title/tests/test_title.py @@ -2,7 +2,6 @@ Integration tests for title plugin Tests verify: - pass 1. Plugin script exists 2. Node.js is available 3. Title extraction works for real example.com @@ -20,9 +19,15 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + get_plugin_dir, + get_hook_script, + parse_jsonl_output, +) -PLUGIN_DIR = Path(__file__).parent.parent -TITLE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_title.*'), None) + +PLUGIN_DIR = get_plugin_dir(__file__) +TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*') TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py index fd06cde5..f81b55da 100644 --- a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py +++ b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py @@ -16,184 +16,25 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + setup_test_env, + launch_chromium_session, + kill_chromium_session, + CHROME_LAUNCH_HOOK, + PLUGINS_ROOT, +) + PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js' CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js' -CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py' -CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js' TEST_URL = 'https://2captcha.com/demo/recaptcha-v2' -def setup_test_env(tmpdir: Path) -> dict: - """Set up isolated data/lib directory structure for tests. - - Creates structure matching real ArchiveBox data dir: - /data/ - lib/ - arm64-darwin/ (or x86_64-linux, etc.) - npm/ - .bin/ - node_modules/ - personas/ - default/ - chrome_extensions/ - users/ - testuser/ - crawls/ - snapshots/ - - Calls chrome install hook which handles puppeteer-core and chromium installation. - Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. 
- """ - import platform - from datetime import datetime - - # Determine machine type (matches archivebox.config.paths.get_machine_type()) - machine = platform.machine().lower() - system = platform.system().lower() - if machine in ('arm64', 'aarch64'): - machine = 'arm64' - elif machine in ('x86_64', 'amd64'): - machine = 'x86_64' - machine_type = f"{machine}-{system}" - - # Create proper directory structure matching real ArchiveBox layout - data_dir = tmpdir / 'data' - lib_dir = data_dir / 'lib' / machine_type - npm_dir = lib_dir / 'npm' - npm_bin_dir = npm_dir / '.bin' - node_modules_dir = npm_dir / 'node_modules' - - # Extensions go under personas/Default/ - chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' - - # User data goes under users/{username}/ - date_str = datetime.now().strftime('%Y%m%d') - users_dir = data_dir / 'users' / 'testuser' - crawls_dir = users_dir / 'crawls' / date_str - snapshots_dir = users_dir / 'snapshots' / date_str - - # Create all directories - node_modules_dir.mkdir(parents=True, exist_ok=True) - npm_bin_dir.mkdir(parents=True, exist_ok=True) - chrome_extensions_dir.mkdir(parents=True, exist_ok=True) - crawls_dir.mkdir(parents=True, exist_ok=True) - snapshots_dir.mkdir(parents=True, exist_ok=True) - - # Build complete env dict - env = os.environ.copy() - env.update({ - 'DATA_DIR': str(data_dir), - 'LIB_DIR': str(lib_dir), - 'MACHINE_TYPE': machine_type, - 'NPM_BIN_DIR': str(npm_bin_dir), - 'NODE_MODULES_DIR': str(node_modules_dir), - 'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir), - 'CRAWLS_DIR': str(crawls_dir), - 'SNAPSHOTS_DIR': str(snapshots_dir), - }) - - # Only set headless if not already in environment (allow override for debugging) - if 'CHROME_HEADLESS' not in os.environ: - env['CHROME_HEADLESS'] = 'true' - - # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) - result = subprocess.run( - ['python', str(CHROME_INSTALL_HOOK)], - capture_output=True, text=True, timeout=120, env=env - ) - if result.returncode != 0: - pytest.skip(f"Chrome install hook failed: {result.stderr}") - - # Parse JSONL output to get CHROME_BINARY - chrome_binary = None - for line in result.stdout.strip().split('\n'): - if not line.strip(): - continue - try: - data = json.loads(line) - if data.get('type') == 'Binary' and data.get('abspath'): - chrome_binary = data['abspath'] - break - except json.JSONDecodeError: - continue - - if not chrome_binary or not Path(chrome_binary).exists(): - pytest.skip(f"Chromium binary not found: {chrome_binary}") - - env['CHROME_BINARY'] = chrome_binary - return env - - -def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str): - """Launch Chromium and return (process, cdp_url).""" - chrome_dir.mkdir(parents=True, exist_ok=True) - - process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - cdp_url = None - extensions_ready = False - for _ in range(30): - if process.poll() is not None: - stdout, stderr = process.communicate() - raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}") - cdp_file = chrome_dir / 'cdp_url.txt' - ext_file = chrome_dir / 'extensions.json' - if cdp_file.exists() and not cdp_url: - cdp_url = cdp_file.read_text().strip() - if ext_file.exists(): - extensions_ready = True - if cdp_url and extensions_ready: - break - time.sleep(1) - - if not cdp_url: - process.kill() - stdout, stderr = process.communicate() - raise 
RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}") - - # Print chrome launch hook output for debugging - import select - if hasattr(select, 'poll'): - # Read any available stderr without blocking - import fcntl - import os as os_module - fd = process.stderr.fileno() - fl = fcntl.fcntl(fd, fcntl.F_GETFL) - fcntl.fcntl(fd, fcntl.F_SETFL, fl | os_module.O_NONBLOCK) - try: - stderr_output = process.stderr.read() - if stderr_output: - print(f"[Chrome Launch Hook Output]\n{stderr_output}") - except: - pass - - return process, cdp_url - - -def kill_chrome(process, chrome_dir: Path): - """Kill Chromium process.""" - try: - process.send_signal(signal.SIGTERM) - process.wait(timeout=5) - except: - pass - pid_file = chrome_dir / 'chrome.pid' - if pid_file.exists(): - try: - os.kill(int(pid_file.read_text().strip()), signal.SIGKILL) - except: - pass +# Alias for backward compatibility with existing test names +launch_chrome = launch_chromium_session +kill_chrome = kill_chromium_session class TestTwoCaptcha: diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py index f5acaa52..d295000e 100644 --- a/archivebox/plugins/ublock/tests/test_ublock.py +++ b/archivebox/plugins/ublock/tests/test_ublock.py @@ -12,6 +12,14 @@ from pathlib import Path import pytest +from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + setup_test_env, + launch_chromium_session, + kill_chromium_session, + CHROME_LAUNCH_HOOK, + PLUGINS_ROOT, +) + PLUGIN_DIR = Path(__file__).parent.parent INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None) @@ -157,64 +165,6 @@ def test_large_extension_size(): assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes" -PLUGINS_ROOT = PLUGIN_DIR.parent -CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py' -CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js' - - -def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str): - """Launch Chromium and return (process, cdp_url) or raise on failure.""" - import signal - import time - - chrome_dir.mkdir(parents=True, exist_ok=True) - - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chromium to launch and CDP URL to be available - cdp_url = None - for i in range(20): - if chrome_launch_process.poll() is not None: - stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") - cdp_file = chrome_dir / 'cdp_url.txt' - if cdp_file.exists(): - cdp_url = cdp_file.read_text().strip() - break - time.sleep(1) - - if not cdp_url: - chrome_launch_process.kill() - raise RuntimeError("Chromium CDP URL not found after 20s") - - return chrome_launch_process, cdp_url - - -def kill_chromium_session(chrome_launch_process, chrome_dir: Path): - """Clean up Chromium process.""" - import signal - - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - chrome_pid_file = chrome_dir / 'chrome.pid' - if chrome_pid_file.exists(): - try: - chrome_pid = int(chrome_pid_file.read_text().strip()) - os.kill(chrome_pid, signal.SIGKILL) - except (OSError, ValueError): - pass - - def check_ad_blocking(cdp_url: str, test_url: str, env: dict, 
script_dir: Path) -> dict: """Check ad blocking effectiveness by counting ad elements on page. @@ -350,103 +300,6 @@ const puppeteer = require('puppeteer-core'); return json.loads(output_lines[-1]) -def setup_test_env(tmpdir: Path) -> dict: - """Set up isolated data/lib directory structure for tests. - - Creates structure matching real ArchiveBox data dir: - /data/ - lib/ - arm64-darwin/ (or x86_64-linux, etc.) - npm/ - .bin/ - node_modules/ - personas/ - default/ - chrome_extensions/ - users/ - testuser/ - crawls/ - snapshots/ - - Calls chrome install hook which handles puppeteer-core and chromium installation. - Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. - """ - import platform - from datetime import datetime - - # Determine machine type (matches archivebox.config.paths.get_machine_type()) - machine = platform.machine().lower() - system = platform.system().lower() - if machine in ('arm64', 'aarch64'): - machine = 'arm64' - elif machine in ('x86_64', 'amd64'): - machine = 'x86_64' - machine_type = f"{machine}-{system}" - - # Create proper directory structure matching real ArchiveBox layout - data_dir = tmpdir / 'data' - lib_dir = data_dir / 'lib' / machine_type - npm_dir = lib_dir / 'npm' - npm_bin_dir = npm_dir / '.bin' - node_modules_dir = npm_dir / 'node_modules' - - # Extensions go under personas/Default/ - chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' - - # User data goes under users/{username}/ - date_str = datetime.now().strftime('%Y%m%d') - users_dir = data_dir / 'users' / 'testuser' - crawls_dir = users_dir / 'crawls' / date_str - snapshots_dir = users_dir / 'snapshots' / date_str - - # Create all directories - node_modules_dir.mkdir(parents=True, exist_ok=True) - npm_bin_dir.mkdir(parents=True, exist_ok=True) - chrome_extensions_dir.mkdir(parents=True, exist_ok=True) - crawls_dir.mkdir(parents=True, exist_ok=True) - snapshots_dir.mkdir(parents=True, exist_ok=True) - - # Build complete env dict - env = os.environ.copy() - env.update({ - 'DATA_DIR': str(data_dir), - 'LIB_DIR': str(lib_dir), - 'MACHINE_TYPE': machine_type, - 'NPM_BIN_DIR': str(npm_bin_dir), - 'NODE_MODULES_DIR': str(node_modules_dir), - 'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir), - 'CRAWLS_DIR': str(crawls_dir), - 'SNAPSHOTS_DIR': str(snapshots_dir), - }) - - # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) - result = subprocess.run( - ['python', str(CHROME_INSTALL_HOOK)], - capture_output=True, text=True, timeout=120, env=env - ) - if result.returncode != 0: - pytest.skip(f"Chrome install hook failed: {result.stderr}") - - # Parse JSONL output to get CHROME_BINARY - chrome_binary = None - for line in result.stdout.strip().split('\n'): - if not line.strip(): - continue - try: - data = json.loads(line) - if data.get('type') == 'Binary' and data.get('abspath'): - chrome_binary = data['abspath'] - break - except json.JSONDecodeError: - continue - - if not chrome_binary or not Path(chrome_binary).exists(): - pytest.skip(f"Chromium binary not found: {chrome_binary}") - - env['CHROME_BINARY'] = chrome_binary - return env - - # Test URL: Yahoo has many ads that uBlock should block TEST_URL = 'https://www.yahoo.com/'
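
The refactor above consolidates the previously duplicated Chromium test scaffolding (setup_test_env, launch_chromium_session, kill_chromium_session) into chrome_test_helpers and has each plugin test import it instead. For reference, a minimal sketch of how a plugin test might use the shared helpers end to end; the test name, chrome_dir location, and crawl id below are illustrative assumptions, not part of this patch:

    import tempfile
    from pathlib import Path

    from archivebox.plugins.chrome.tests.chrome_test_helpers import (
        setup_test_env,
        launch_chromium_session,
        kill_chromium_session,
    )


    def test_example_plugin_smoke():
        """Hypothetical example: launch Chromium via the shared helpers, then clean up."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)

            # Builds the isolated DATA_DIR/LIB_DIR layout and runs the chrome install
            # hook; it calls pytest.skip() internally if Chromium cannot be installed,
            # so no extra guard is needed here.
            env = setup_test_env(tmpdir)

            chrome_dir = tmpdir / 'chrome_session'   # illustrative location
            process, cdp_url = launch_chromium_session(env, chrome_dir, 'test-example-crawl')
            try:
                assert cdp_url, "launch_chromium_session should return a CDP URL"
                # ...drive the browser over CDP here...
            finally:
                kill_chromium_session(process, chrome_dir)

The helper signatures mirror the removed per-plugin copies shown above, which is what makes the twocaptcha aliases (launch_chrome / kill_chrome) drop-in compatible with the existing test bodies.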