diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py index 7e8c2d5e..cd90bc40 100644 --- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py +++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py @@ -1,59 +1,8 @@ """ -Shared Chrome test helpers for plugin integration tests. +Chrome test helpers - delegates to chrome_utils.js (single source of truth). -This module provides common utilities for Chrome-based plugin tests, reducing -duplication across test files. Functions delegate to chrome_utils.js (the single -source of truth) with Python fallbacks. - -Function names match the JS equivalents in snake_case: - JS: getMachineType() -> Python: get_machine_type() - JS: getLibDir() -> Python: get_lib_dir() - JS: getNodeModulesDir() -> Python: get_node_modules_dir() - JS: getExtensionsDir() -> Python: get_extensions_dir() - JS: findChromium() -> Python: find_chromium() - JS: killChrome() -> Python: kill_chrome() - JS: getTestEnv() -> Python: get_test_env() - -Usage: - # Path helpers (delegate to chrome_utils.js): - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE - get_machine_type, # e.g., 'x86_64-linux', 'arm64-darwin' - get_lib_dir, # Path to lib dir - get_node_modules_dir, # Path to node_modules - get_extensions_dir, # Path to chrome extensions - find_chromium, # Find Chrome/Chromium binary - kill_chrome, # Kill Chrome process by PID - ) - - # Test file helpers: - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_plugin_dir, # get_plugin_dir(__file__) -> plugin dir Path - get_hook_script, # Find hook script by glob pattern - PLUGINS_ROOT, # Path to plugins root - LIB_DIR, # Path to lib dir (lazy-loaded) - NODE_MODULES_DIR, # Path to node_modules (lazy-loaded) - ) - - # For Chrome session tests: - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - setup_chrome_session, # Full Chrome + tab setup - cleanup_chrome, # Cleanup by PID - chrome_session, # Context manager - ) - - # For extension tests: - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - setup_test_env, # Full dir structure + Chrome install - launch_chromium_session, # Launch Chrome, return CDP URL - kill_chromium_session, # Cleanup Chrome - ) - - # Run hooks and parse JSONL: - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - run_hook, # Run hook, return (returncode, stdout, stderr) - parse_jsonl_output, # Parse JSONL from stdout - ) +Function names match JS equivalents in snake_case: + getMachineType -> get_machine_type, getLibDir -> get_lib_dir, etc. """ import json @@ -81,85 +30,21 @@ CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js' # ============================================================================= -# Path Helpers - delegates to chrome_utils.js with Python fallback -# Function names match JS: getMachineType -> get_machine_type, etc. +# Path Helpers - delegates to chrome_utils.js (single source of truth) # ============================================================================= def _call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]: - """Call chrome_utils.js CLI command (internal helper). - - This is the central dispatch for calling the JS utilities from Python. - All path calculations and Chrome operations are centralized in chrome_utils.js - to ensure consistency between Python and JavaScript code. - - Args: - command: The CLI command (e.g., 'findChromium', 'getTestEnv') - *args: Additional command arguments - env: Environment dict (default: current env) - - Returns: - Tuple of (returncode, stdout, stderr) - """ + """Call chrome_utils.js CLI command. Returns (returncode, stdout, stderr).""" cmd = ['node', str(CHROME_UTILS), command] + list(args) - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=30, - env=env or os.environ.copy() - ) + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30, env=env or os.environ.copy()) return result.returncode, result.stdout, result.stderr -def get_plugin_dir(test_file: str) -> Path: - """Get the plugin directory from a test file path. - - Usage: - PLUGIN_DIR = get_plugin_dir(__file__) - - Args: - test_file: The __file__ of the test module (e.g., test_screenshot.py) - - Returns: - Path to the plugin directory (e.g., plugins/screenshot/) - """ - return Path(test_file).parent.parent - - -def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]: - """Find a hook script in a plugin directory by pattern. - - Usage: - HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*') - - Args: - plugin_dir: Path to the plugin directory - pattern: Glob pattern to match - - Returns: - Path to the hook script or None if not found - """ - matches = list(plugin_dir.glob(pattern)) - return matches[0] if matches else None - - def get_machine_type() -> str: - """Get machine type string (e.g., 'x86_64-linux', 'arm64-darwin'). - - Matches JS: getMachineType() - - Tries chrome_utils.js first, falls back to Python computation. - """ - # Try JS first (single source of truth) - returncode, stdout, stderr = _call_chrome_utils('getMachineType') - if returncode == 0 and stdout.strip(): - return stdout.strip() - - # Fallback to Python computation + """Get machine type (e.g., 'x86_64-linux'). Matches JS getMachineType().""" if os.environ.get('MACHINE_TYPE'): return os.environ['MACHINE_TYPE'] - machine = platform.machine().lower() system = platform.system().lower() if machine in ('arm64', 'aarch64'): @@ -170,103 +55,40 @@ def get_machine_type() -> str: def get_lib_dir() -> Path: - """Get LIB_DIR path for platform-specific binaries. - - Matches JS: getLibDir() - - Tries chrome_utils.js first, falls back to Python computation. - """ - # Try JS first + """Get LIB_DIR path. Matches JS getLibDir().""" returncode, stdout, stderr = _call_chrome_utils('getLibDir') - if returncode == 0 and stdout.strip(): - return Path(stdout.strip()) - - # Fallback to Python - if os.environ.get('LIB_DIR'): - return Path(os.environ['LIB_DIR']) - from archivebox.config.common import STORAGE_CONFIG - return Path(str(STORAGE_CONFIG.LIB_DIR)) + if returncode != 0: + raise RuntimeError(f"getLibDir failed: {stderr}") + return Path(stdout.strip()) def get_node_modules_dir() -> Path: - """Get NODE_MODULES_DIR path for npm packages. - - Matches JS: getNodeModulesDir() - - Tries chrome_utils.js first, falls back to Python computation. - """ - # Try JS first + """Get NODE_MODULES_DIR path. Matches JS getNodeModulesDir().""" returncode, stdout, stderr = _call_chrome_utils('getNodeModulesDir') - if returncode == 0 and stdout.strip(): - return Path(stdout.strip()) - - # Fallback to Python - if os.environ.get('NODE_MODULES_DIR'): - return Path(os.environ['NODE_MODULES_DIR']) - lib_dir = get_lib_dir() - return lib_dir / 'npm' / 'node_modules' + if returncode != 0: + raise RuntimeError(f"getNodeModulesDir failed: {stderr}") + return Path(stdout.strip()) def get_extensions_dir() -> str: - """Get the Chrome extensions directory path. - - Matches JS: getExtensionsDir() - - Tries chrome_utils.js first, falls back to Python computation. - """ + """Get Chrome extensions directory. Matches JS getExtensionsDir().""" returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir') - if returncode == 0 and stdout.strip(): - return stdout.strip() - - # Fallback to default computation if JS call fails - data_dir = os.environ.get('DATA_DIR', './data') - persona = os.environ.get('ACTIVE_PERSONA', 'Default') - return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions') + if returncode != 0: + raise RuntimeError(f"getExtensionsDir failed: {stderr}") + return stdout.strip() def find_chromium(data_dir: Optional[str] = None) -> Optional[str]: - """Find the Chromium binary path. - - Matches JS: findChromium() - - Uses chrome_utils.js which checks: - - CHROME_BINARY env var - - @puppeteer/browsers install locations - - System Chromium locations - - Falls back to Chrome (with warning) - - Args: - data_dir: Optional DATA_DIR override - - Returns: - Path to Chromium binary or None if not found - """ + """Find Chromium binary path. Matches JS findChromium().""" env = os.environ.copy() if data_dir: env['DATA_DIR'] = str(data_dir) returncode, stdout, stderr = _call_chrome_utils('findChromium', env=env) - if returncode == 0 and stdout.strip(): - return stdout.strip() - return None + return stdout.strip() if returncode == 0 and stdout.strip() else None def kill_chrome(pid: int, output_dir: Optional[str] = None) -> bool: - """Kill a Chrome process by PID. - - Matches JS: killChrome() - - Uses chrome_utils.js which handles: - - SIGTERM then SIGKILL - - Process group killing - - Zombie process cleanup - - Args: - pid: Process ID to kill - output_dir: Optional chrome output directory for PID file cleanup - - Returns: - True if the kill command succeeded - """ + """Kill Chrome process by PID. Matches JS killChrome().""" args = [str(pid)] if output_dir: args.append(str(output_dir)) @@ -275,93 +97,15 @@ def kill_chrome(pid: int, output_dir: Optional[str] = None) -> bool: def get_test_env() -> dict: - """Get environment dict with all paths set correctly for tests. - - Matches JS: getTestEnv() - - Tries chrome_utils.js first for path values, builds env dict. - Use this for all subprocess calls in plugin tests. - """ + """Get env dict with all paths set for tests. Matches JS getTestEnv().""" env = os.environ.copy() - - # Try to get all paths from JS (single source of truth) returncode, stdout, stderr = _call_chrome_utils('getTestEnv') - if returncode == 0 and stdout.strip(): - try: - js_env = json.loads(stdout) - env.update(js_env) - return env - except json.JSONDecodeError: - pass - - # Fallback to Python computation - lib_dir = get_lib_dir() - env['LIB_DIR'] = str(lib_dir) - env['NODE_MODULES_DIR'] = str(get_node_modules_dir()) - env['MACHINE_TYPE'] = get_machine_type() + if returncode != 0: + raise RuntimeError(f"getTestEnv failed: {stderr}") + env.update(json.loads(stdout)) return env -# Backward compatibility aliases (deprecated, use new names) -find_chromium_binary = find_chromium -kill_chrome_via_js = kill_chrome -get_machine_type_from_js = get_machine_type -get_test_env_from_js = get_test_env - - -# ============================================================================= -# Module-level constants (lazy-loaded on first access) -# Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR -# ============================================================================= - -# These are computed once when first accessed -_LIB_DIR: Optional[Path] = None -_NODE_MODULES_DIR: Optional[Path] = None - - -def _get_lib_dir_cached() -> Path: - global _LIB_DIR - if _LIB_DIR is None: - _LIB_DIR = get_lib_dir() - return _LIB_DIR - - -def _get_node_modules_dir_cached() -> Path: - global _NODE_MODULES_DIR - if _NODE_MODULES_DIR is None: - _NODE_MODULES_DIR = get_node_modules_dir() - return _NODE_MODULES_DIR - - -# Module-level constants that can be imported directly -# Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR -class _LazyPath: - """Lazy path that computes value on first access.""" - def __init__(self, getter): - self._getter = getter - self._value = None - - def __fspath__(self): - if self._value is None: - self._value = self._getter() - return str(self._value) - - def __truediv__(self, other): - if self._value is None: - self._value = self._getter() - return self._value / other - - def __str__(self): - return self.__fspath__() - - def __repr__(self): - return f"" - - -LIB_DIR = _LazyPath(_get_lib_dir_cached) -NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached) - - # ============================================================================= # Hook Execution Helpers # ============================================================================= @@ -376,30 +120,10 @@ def run_hook( timeout: int = 60, extra_args: Optional[List[str]] = None, ) -> Tuple[int, str, str]: - """Run a hook script and return (returncode, stdout, stderr). - - Usage: - returncode, stdout, stderr = run_hook( - HOOK_SCRIPT, 'https://example.com', 'test-snap-123', - cwd=tmpdir, env=get_test_env() - ) - - Args: - hook_script: Path to the hook script - url: URL to process - snapshot_id: Snapshot ID - cwd: Working directory (default: current dir) - env: Environment dict (default: get_test_env()) - timeout: Timeout in seconds - extra_args: Additional arguments to pass - - Returns: - Tuple of (returncode, stdout, stderr) - """ + """Run a hook script. Returns (returncode, stdout, stderr).""" if env is None: env = get_test_env() - # Determine interpreter based on file extension if hook_script.suffix == '.py': cmd = ['python', str(hook_script)] elif hook_script.suffix == '.js': @@ -411,32 +135,12 @@ def run_hook( if extra_args: cmd.extend(extra_args) - result = subprocess.run( - cmd, - cwd=str(cwd) if cwd else None, - capture_output=True, - text=True, - env=env, - timeout=timeout - ) + result = subprocess.run(cmd, cwd=str(cwd) if cwd else None, capture_output=True, text=True, env=env, timeout=timeout) return result.returncode, result.stdout, result.stderr def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]: - """Parse JSONL output from hook stdout and return the specified record type. - - Usage: - result = parse_jsonl_output(stdout) - if result and result['status'] == 'succeeded': - print("Success!") - - Args: - stdout: The stdout from a hook execution - record_type: The 'type' field to look for (default: 'ArchiveResult') - - Returns: - The parsed JSON dict or None if not found - """ + """Parse JSONL output, return first record matching type.""" for line in stdout.strip().split('\n'): line = line.strip() if not line.startswith('{'): @@ -459,89 +163,39 @@ def run_hook_and_parse( timeout: int = 60, extra_args: Optional[List[str]] = None, ) -> Tuple[int, Optional[Dict[str, Any]], str]: - """Run a hook and parse its JSONL output. - - Convenience function combining run_hook() and parse_jsonl_output(). - - Returns: - Tuple of (returncode, parsed_result_or_none, stderr) - """ - returncode, stdout, stderr = run_hook( - hook_script, url, snapshot_id, - cwd=cwd, env=env, timeout=timeout, extra_args=extra_args - ) - result = parse_jsonl_output(stdout) - return returncode, result, stderr + """Run hook and parse JSONL output. Returns (returncode, parsed_result, stderr).""" + returncode, stdout, stderr = run_hook(hook_script, url, snapshot_id, cwd=cwd, env=env, timeout=timeout, extra_args=extra_args) + return returncode, parse_jsonl_output(stdout), stderr # ============================================================================= -# Extension Test Helpers -# Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha) +# Extension Test Helpers (ublock, istilldontcareaboutcookies, twocaptcha) # ============================================================================= def setup_test_env(tmpdir: Path) -> dict: """Set up isolated data/lib directory structure for extension tests. - - Creates structure matching real ArchiveBox data dir: - /data/ - lib/ - arm64-darwin/ (or x86_64-linux, etc.) - npm/ - .bin/ - node_modules/ - personas/ - Default/ - chrome_extensions/ - users/ - testuser/ - crawls/ - snapshots/ - - Calls chrome install hook which handles puppeteer-core and chromium installation. - Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. - - Args: - tmpdir: Base temporary directory for the test - - Returns: - Environment dict with all paths set, or pytest.skip() if Chrome install fails + Returns env dict with DATA_DIR, LIB_DIR, CHROME_BINARY, etc. """ import pytest - # Determine machine type (matches archivebox.config.paths.get_machine_type()) - machine = platform.machine().lower() - system = platform.system().lower() - if machine in ('arm64', 'aarch64'): - machine = 'arm64' - elif machine in ('x86_64', 'amd64'): - machine = 'x86_64' - machine_type = f"{machine}-{system}" + machine_type = get_machine_type() - # Create proper directory structure matching real ArchiveBox layout + # Create directory structure data_dir = tmpdir / 'data' lib_dir = data_dir / 'lib' / machine_type npm_dir = lib_dir / 'npm' npm_bin_dir = npm_dir / '.bin' node_modules_dir = npm_dir / 'node_modules' - - # Extensions go under personas/Default/ chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' - - # User data goes under users/{username}/ date_str = datetime.now().strftime('%Y%m%d') users_dir = data_dir / 'users' / 'testuser' crawls_dir = users_dir / 'crawls' / date_str snapshots_dir = users_dir / 'snapshots' / date_str - # Create all directories - node_modules_dir.mkdir(parents=True, exist_ok=True) - npm_bin_dir.mkdir(parents=True, exist_ok=True) - chrome_extensions_dir.mkdir(parents=True, exist_ok=True) - crawls_dir.mkdir(parents=True, exist_ok=True) - snapshots_dir.mkdir(parents=True, exist_ok=True) + for d in [node_modules_dir, npm_bin_dir, chrome_extensions_dir, crawls_dir, snapshots_dir]: + d.mkdir(parents=True, exist_ok=True) - # Build complete env dict env = os.environ.copy() env.update({ 'DATA_DIR': str(data_dir), @@ -553,20 +207,14 @@ def setup_test_env(tmpdir: Path) -> dict: 'CRAWLS_DIR': str(crawls_dir), 'SNAPSHOTS_DIR': str(snapshots_dir), }) - - # Only set headless if not already in environment (allow override for debugging) if 'CHROME_HEADLESS' not in os.environ: env['CHROME_HEADLESS'] = 'true' - # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) - result = subprocess.run( - ['python', str(CHROME_INSTALL_HOOK)], - capture_output=True, text=True, timeout=120, env=env - ) + # Install Chrome + result = subprocess.run(['python', str(CHROME_INSTALL_HOOK)], capture_output=True, text=True, timeout=120, env=env) if result.returncode != 0: - pytest.skip(f"Chrome install hook failed: {result.stderr}") + pytest.skip(f"Chrome install failed: {result.stderr}") - # Parse JSONL output to get CHROME_BINARY chrome_binary = None for line in result.stdout.strip().split('\n'): if not line.strip(): @@ -587,39 +235,19 @@ def setup_test_env(tmpdir: Path) -> dict: def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple[subprocess.Popen, str]: - """Launch Chromium and return (process, cdp_url). - - This launches Chrome using the chrome launch hook and waits for the CDP URL - to become available. Use this for extension tests that need direct CDP access. - - Args: - env: Environment dict (from setup_test_env) - chrome_dir: Directory for Chrome to write its files (cdp_url.txt, chrome.pid, etc.) - crawl_id: ID for the crawl - - Returns: - Tuple of (chrome_launch_process, cdp_url) - - Raises: - RuntimeError: If Chrome fails to launch or CDP URL not available after 20s - """ + """Launch Chromium and return (process, cdp_url).""" chrome_dir.mkdir(parents=True, exist_ok=True) chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env ) - # Wait for Chromium to launch and CDP URL to be available cdp_url = None - for i in range(20): + for _ in range(20): if chrome_launch_process.poll() is not None: stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") + raise RuntimeError(f"Chromium launch failed:\n{stdout}\n{stderr}") cdp_file = chrome_dir / 'cdp_url.txt' if cdp_file.exists(): cdp_url = cdp_file.read_text().strip() @@ -634,22 +262,13 @@ def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None: - """Clean up Chromium process launched by launch_chromium_session. - - Uses chrome_utils.js killChrome for proper process group handling. - - Args: - chrome_launch_process: The Popen object from launch_chromium_session - chrome_dir: The chrome directory containing chrome.pid - """ - # First try to terminate the launch process gracefully + """Clean up Chromium process.""" try: chrome_launch_process.send_signal(signal.SIGTERM) chrome_launch_process.wait(timeout=5) except Exception: pass - # Read PID and use JS to kill with proper cleanup chrome_pid_file = chrome_dir / 'chrome.pid' if chrome_pid_file.exists(): try: @@ -661,22 +280,7 @@ def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: P @contextmanager def chromium_session(env: dict, chrome_dir: Path, crawl_id: str): - """Context manager for Chromium sessions with automatic cleanup. - - Usage: - with chromium_session(env, chrome_dir, 'test-crawl') as (process, cdp_url): - # Use cdp_url to connect with puppeteer - pass - # Chromium automatically cleaned up - - Args: - env: Environment dict (from setup_test_env) - chrome_dir: Directory for Chrome files - crawl_id: ID for the crawl - - Yields: - Tuple of (chrome_launch_process, cdp_url) - """ + """Context manager for Chromium sessions with automatic cleanup.""" chrome_launch_process = None try: chrome_launch_process, cdp_url = launch_chromium_session(env, chrome_dir, crawl_id) @@ -687,8 +291,7 @@ def chromium_session(env: dict, chrome_dir: Path, crawl_id: str): # ============================================================================= -# Tab-based Test Helpers -# Used by tab-based tests (infiniscroll, modalcloser) +# Tab-based Test Helpers (infiniscroll, modalcloser) # ============================================================================= @@ -700,25 +303,7 @@ def setup_chrome_session( navigate: bool = True, timeout: int = 15, ) -> Tuple[subprocess.Popen, int, Path]: - """Set up a Chrome session with tab and optional navigation. - - Creates the directory structure, launches Chrome, creates a tab, - and optionally navigates to the test URL. - - Args: - tmpdir: Temporary directory for test files - crawl_id: ID to use for the crawl - snapshot_id: ID to use for the snapshot - test_url: URL to navigate to (if navigate=True) - navigate: Whether to navigate to the URL after creating tab - timeout: Seconds to wait for Chrome to start - - Returns: - Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir) - - Raises: - RuntimeError: If Chrome fails to start or tab creation fails - """ + """Set up Chrome session with tab. Returns (process, pid, snapshot_chrome_dir).""" crawl_dir = Path(tmpdir) / 'crawl' crawl_dir.mkdir(exist_ok=True) chrome_dir = crawl_dir / 'chrome' @@ -727,21 +312,15 @@ def setup_chrome_session( env = get_test_env() env['CHROME_HEADLESS'] = 'true' - # Launch Chrome at crawl level chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env ) - # Wait for Chrome to launch - for i in range(timeout): + for _ in range(timeout): if chrome_launch_process.poll() is not None: stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}") + raise RuntimeError(f"Chrome launch failed:\n{stdout}\n{stderr}") if (chrome_dir / 'cdp_url.txt').exists(): break time.sleep(1) @@ -751,36 +330,25 @@ def setup_chrome_session( chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - # Create snapshot directory structure snapshot_dir = Path(tmpdir) / 'snapshot' snapshot_dir.mkdir(exist_ok=True) snapshot_chrome_dir = snapshot_dir / 'chrome' snapshot_chrome_dir.mkdir(exist_ok=True) - # Create tab tab_env = env.copy() tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) result = subprocess.run( ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=tab_env + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=60, env=tab_env ) if result.returncode != 0: cleanup_chrome(chrome_launch_process, chrome_pid) raise RuntimeError(f"Tab creation failed: {result.stderr}") - # Navigate to URL if requested if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank': result = subprocess.run( ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=env + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=120, env=env ) if result.returncode != 0: cleanup_chrome(chrome_launch_process, chrome_pid) @@ -790,26 +358,12 @@ def setup_chrome_session( def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None: - """Clean up Chrome processes using chrome_utils.js killChrome. - - Uses the centralized kill logic from chrome_utils.js which handles: - - SIGTERM then SIGKILL - - Process group killing - - Zombie process cleanup - - Args: - chrome_launch_process: The Popen object for the chrome launch hook - chrome_pid: The PID of the Chrome process - chrome_dir: Optional path to chrome output directory - """ - # First try to terminate the launch process gracefully + """Clean up Chrome processes.""" try: chrome_launch_process.send_signal(signal.SIGTERM) chrome_launch_process.wait(timeout=5) except Exception: pass - - # Use JS to kill Chrome with proper process group handling kill_chrome(chrome_pid, str(chrome_dir) if chrome_dir else None) @@ -822,35 +376,12 @@ def chrome_session( navigate: bool = True, timeout: int = 15, ): - """Context manager for Chrome sessions with automatic cleanup. - - Usage: - with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir): - # Run tests with chrome session - pass - # Chrome automatically cleaned up - - Args: - tmpdir: Temporary directory for test files - crawl_id: ID to use for the crawl - snapshot_id: ID to use for the snapshot - test_url: URL to navigate to (if navigate=True) - navigate: Whether to navigate to the URL after creating tab - timeout: Seconds to wait for Chrome to start - - Yields: - Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir) - """ + """Context manager for Chrome sessions with automatic cleanup.""" chrome_launch_process = None chrome_pid = None try: chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session( - tmpdir=tmpdir, - crawl_id=crawl_id, - snapshot_id=snapshot_id, - test_url=test_url, - navigate=navigate, - timeout=timeout, + tmpdir=tmpdir, crawl_id=crawl_id, snapshot_id=snapshot_id, test_url=test_url, navigate=navigate, timeout=timeout ) yield chrome_launch_process, chrome_pid, snapshot_chrome_dir finally: diff --git a/archivebox/plugins/dom/tests/test_dom.py b/archivebox/plugins/dom/tests/test_dom.py index 7fe69d64..a6225722 100644 --- a/archivebox/plugins/dom/tests/test_dom.py +++ b/archivebox/plugins/dom/tests/test_dom.py @@ -22,18 +22,16 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - get_plugin_dir, - get_hook_script, + get_lib_dir, + get_node_modules_dir, run_hook_and_parse, - LIB_DIR, - NODE_MODULES_DIR, PLUGINS_ROOT, ) -PLUGIN_DIR = get_plugin_dir(__file__) -DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*') -NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py') +PLUGIN_DIR = Path(__file__).parent.parent +DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None) +NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None) TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/favicon/tests/test_favicon.py b/archivebox/plugins/favicon/tests/test_favicon.py index 4434d1a8..17b12f87 100644 --- a/archivebox/plugins/favicon/tests/test_favicon.py +++ b/archivebox/plugins/favicon/tests/test_favicon.py @@ -20,15 +20,11 @@ from pathlib import Path import pytest -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_plugin_dir, - get_hook_script, - parse_jsonl_output, -) +from archivebox.plugins.chrome.tests.chrome_test_helpers import parse_jsonl_output -PLUGIN_DIR = get_plugin_dir(__file__) -FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*') +PLUGIN_DIR = Path(__file__).parent.parent +FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None) TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/mercury/tests/test_mercury.py b/archivebox/plugins/mercury/tests/test_mercury.py index 242eb5db..8ca67452 100644 --- a/archivebox/plugins/mercury/tests/test_mercury.py +++ b/archivebox/plugins/mercury/tests/test_mercury.py @@ -18,15 +18,11 @@ import tempfile from pathlib import Path import pytest -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_plugin_dir, - get_hook_script, - PLUGINS_ROOT, -) +from archivebox.plugins.chrome.tests.chrome_test_helpers import PLUGINS_ROOT -PLUGIN_DIR = get_plugin_dir(__file__) -MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*') +PLUGIN_DIR = Path(__file__).parent.parent +MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None) TEST_URL = 'https://example.com' def test_hook_script_exists(): diff --git a/archivebox/plugins/pdf/tests/test_pdf.py b/archivebox/plugins/pdf/tests/test_pdf.py index c160cfdc..b5badfca 100644 --- a/archivebox/plugins/pdf/tests/test_pdf.py +++ b/archivebox/plugins/pdf/tests/test_pdf.py @@ -23,17 +23,15 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - get_plugin_dir, - get_hook_script, + get_lib_dir, + get_node_modules_dir, run_hook_and_parse, - LIB_DIR, - NODE_MODULES_DIR, PLUGINS_ROOT, ) -PLUGIN_DIR = get_plugin_dir(__file__) -PDF_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_pdf.*') +PLUGIN_DIR = Path(__file__).parent.parent +PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None) NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py' TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/readability/tests/test_readability.py b/archivebox/plugins/readability/tests/test_readability.py index b416169e..2302d5a3 100644 --- a/archivebox/plugins/readability/tests/test_readability.py +++ b/archivebox/plugins/readability/tests/test_readability.py @@ -17,15 +17,11 @@ from pathlib import Path import pytest -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_plugin_dir, - get_hook_script, - PLUGINS_ROOT, -) +from archivebox.plugins.chrome.tests.chrome_test_helpers import PLUGINS_ROOT -PLUGIN_DIR = get_plugin_dir(__file__) -READABILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_readability.*') +PLUGIN_DIR = Path(__file__).parent.parent +READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*'), None) TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index 24d4960d..856e6809 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -22,16 +22,14 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - get_plugin_dir, - get_hook_script, + get_lib_dir, + get_node_modules_dir, run_hook_and_parse, - LIB_DIR, - NODE_MODULES_DIR, ) -PLUGIN_DIR = get_plugin_dir(__file__) -SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*') +PLUGIN_DIR = Path(__file__).parent.parent +SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None) TEST_URL = 'https://example.com' diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py index 0fbd3c07..3978b047 100644 --- a/archivebox/plugins/singlefile/tests/test_singlefile.py +++ b/archivebox/plugins/singlefile/tests/test_singlefile.py @@ -20,15 +20,13 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - get_plugin_dir, - get_hook_script, setup_chrome_session, cleanup_chrome, ) -PLUGIN_DIR = get_plugin_dir(__file__) -SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py') +PLUGIN_DIR = Path(__file__).parent.parent +SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None) TEST_URL = "https://example.com" diff --git a/archivebox/plugins/title/tests/test_title.py b/archivebox/plugins/title/tests/test_title.py index 285f7309..619c3613 100644 --- a/archivebox/plugins/title/tests/test_title.py +++ b/archivebox/plugins/title/tests/test_title.py @@ -19,15 +19,11 @@ from pathlib import Path import pytest -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_plugin_dir, - get_hook_script, - parse_jsonl_output, -) +from archivebox.plugins.chrome.tests.chrome_test_helpers import parse_jsonl_output -PLUGIN_DIR = get_plugin_dir(__file__) -TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*') +PLUGIN_DIR = Path(__file__).parent.parent +TITLE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_title.*'), None) TEST_URL = 'https://example.com'