From d72ab7c397283f8bc04e01a3a29936ae915a763b Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 31 Dec 2025 09:02:34 +0000 Subject: [PATCH] Add simpler Chrome test helpers and update test files New helpers in chrome_test_helpers.py: - get_plugin_dir(__file__) - get plugin dir from test file path - get_hook_script(dir, pattern) - find hook script by glob pattern - run_hook() - run hook script and return (returncode, stdout, stderr) - parse_jsonl_output() - parse JSONL from hook output - run_hook_and_parse() - convenience combo of above two - LIB_DIR, NODE_MODULES_DIR - lazy-loaded module constants - _LazyPath class for deferred path resolution Updated test files to use simpler patterns: - screenshot/tests/test_screenshot.py - dom/tests/test_dom.py - pdf/tests/test_pdf.py - singlefile/tests/test_singlefile.py Before: PLUGIN_DIR = Path(__file__).parent.parent After: PLUGIN_DIR = get_plugin_dir(__file__) Before: LIB_DIR = get_lib_dir(); NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' After: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR --- .../chrome/tests/chrome_test_helpers.py | 236 +++++++++++++++++- archivebox/plugins/dom/tests/test_dom.py | 17 +- archivebox/plugins/pdf/tests/test_pdf.py | 15 +- .../screenshot/tests/test_screenshot.py | 14 +- .../singlefile/tests/test_singlefile.py | 8 +- 5 files changed, 251 insertions(+), 39 deletions(-) diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py index 935081d5..4de09796 100644 --- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py +++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py @@ -6,25 +6,33 @@ duplication across test files. It uses the JavaScript utilities from chrome_util where appropriate. 
Usage: - # For simple tests (screenshot, dom, pdf, etc.): + # Simplest - just import what you need: from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, - get_lib_dir, - find_chromium_binary, + get_test_env, # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE + get_plugin_dir, # get_plugin_dir(__file__) -> plugin dir Path + LIB_DIR, # Path to lib dir (lazy-loaded) + NODE_MODULES_DIR, # Path to node_modules (lazy-loaded) + PLUGINS_ROOT, # Path to plugins root ) - # For extension tests (ublock, istilldontcareaboutcookies, twocaptcha): + # For Chrome session tests: from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - setup_test_env, - launch_chromium_session, - kill_chromium_session, + setup_chrome_session, # Full Chrome + tab setup + cleanup_chrome, # Cleanup by PID + chrome_session, # Context manager ) - # For tab-based tests (infiniscroll, modalcloser): + # For extension tests: from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - setup_chrome_session, - cleanup_chrome, - chrome_session, + setup_test_env, # Full dir structure + Chrome install + launch_chromium_session, # Launch Chrome, return CDP URL + kill_chromium_session, # Cleanup Chrome + ) + + # Run hooks and parse JSONL: + from archivebox.plugins.chrome.tests.chrome_test_helpers import ( + run_hook, # Run hook, return (returncode, stdout, stderr) + parse_jsonl_output, # Parse JSONL from stdout ) """ @@ -36,7 +44,7 @@ import subprocess import time from datetime import datetime from pathlib import Path -from typing import Tuple, Optional +from typing import Tuple, Optional, List, Dict, Any from contextlib import contextmanager @@ -52,6 +60,43 @@ CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_naviga CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js' +# ============================================================================= +# Path Helpers - use these to avoid boilerplate in test files +# 
# =============================================================================


def get_plugin_dir(test_file: str) -> Path:
    """Return the plugin directory that owns a test module.

    Usage:
        PLUGIN_DIR = get_plugin_dir(__file__)

    Args:
        test_file: The __file__ of the test module (e.g., test_screenshot.py)

    Returns:
        The directory two levels above ``test_file``
        (e.g., plugins/screenshot/ for plugins/screenshot/tests/test_x.py)
    """
    return Path(test_file).parent.parent


def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]:
    """Find a hook script in a plugin directory by glob pattern.

    Usage:
        HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')

    Args:
        plugin_dir: Path to the plugin directory
        pattern: Glob pattern to match

    Returns:
        The lexicographically smallest matching path, or None if nothing
        matches.
    """
    # Path.glob() yields matches in arbitrary (filesystem) order, so pick the
    # smallest path to get the same hook on every machine when several match.
    return min(Path(plugin_dir).glob(pattern), default=None)


# NOTE: get_lib_dir(), get_node_modules_dir() and get_test_env() are defined
# elsewhere in this module; the helpers below only call them.


# =============================================================================
# Module-level constants (lazy-loaded on first access)
# Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
# =============================================================================

# These are computed once when first accessed
_LIB_DIR: Optional[Path] = None
_NODE_MODULES_DIR: Optional[Path] = None


def _get_lib_dir_cached() -> Path:
    """Return get_lib_dir(), calling it only on the first invocation."""
    global _LIB_DIR
    if _LIB_DIR is None:
        _LIB_DIR = get_lib_dir()
    return _LIB_DIR


def _get_node_modules_dir_cached() -> Path:
    """Return get_node_modules_dir(), calling it only on the first invocation."""
    global _NODE_MODULES_DIR
    if _NODE_MODULES_DIR is None:
        _NODE_MODULES_DIR = get_node_modules_dir()
    return _NODE_MODULES_DIR


# Module-level constants that can be imported directly
# Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
class _LazyPath:
    """Path-like proxy whose real value is computed on first use.

    The wrapped ``getter`` is called at most once, the first time the object
    is joined with ``/``, converted via ``str()``/``os.fspath()``, or has a
    public ``Path`` attribute accessed (``.exists()``, ``.name``, ...).
    Importing the constant therefore never triggers the lookup by itself.

    Args:
        getter: Zero-argument callable returning the real ``Path``.
    """

    def __init__(self, getter):
        self._getter = getter
        self._value = None

    def _resolve(self) -> Path:
        """Compute the underlying path on first call and cache it."""
        if self._value is None:
            self._value = self._getter()
        return self._value

    def __fspath__(self):
        return str(self._resolve())

    def __truediv__(self, other):
        return self._resolve() / other

    def __str__(self):
        return self.__fspath__()

    def __getattr__(self, name):
        # Only reached when normal lookup fails. Private names are never
        # forwarded so that a missing ``_value``/``_getter`` (e.g. during
        # copy/unpickle) raises instead of recursing through _resolve().
        if name.startswith('_'):
            raise AttributeError(name)
        return getattr(self._resolve(), name)

    def __repr__(self):
        # Deliberately does not resolve: repr() must stay side-effect free.
        shown = 'not loaded' if self._value is None else self._value
        return f"<LazyPath: {shown}>"


LIB_DIR = _LazyPath(_get_lib_dir_cached)
NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached)


# =============================================================================
# Hook Execution Helpers
# =============================================================================


def run_hook(
    hook_script: Path,
    url: str,
    snapshot_id: str,
    cwd: Optional[Path] = None,
    env: Optional[dict] = None,
    timeout: int = 60,
    extra_args: Optional[List[str]] = None,
) -> Tuple[int, str, str]:
    """Run a hook script and return (returncode, stdout, stderr).

    Usage:
        returncode, stdout, stderr = run_hook(
            HOOK_SCRIPT, 'https://example.com', 'test-snap-123',
            cwd=tmpdir, env=get_test_env()
        )

    Args:
        hook_script: Path to the hook script
        url: URL to process (passed as --url=...)
        snapshot_id: Snapshot ID (passed as --snapshot-id=...)
        cwd: Working directory (default: current dir)
        env: Environment dict (default: get_test_env())
        timeout: Timeout in seconds
        extra_args: Additional arguments appended after the standard ones

    Returns:
        Tuple of (returncode, stdout, stderr)

    Raises:
        subprocess.TimeoutExpired: If the hook runs longer than ``timeout``.
        OSError: If the interpreter or script cannot be executed.
    """
    # Function-scope import keeps this helper self-contained.
    import sys

    if env is None:
        env = get_test_env()

    hook_script = Path(hook_script)

    # Pick the interpreter from the file extension. ``.py`` hooks run under
    # the interpreter executing the tests rather than whatever ``python``
    # resolves to on the PATH of ``env`` (which may be absent or another venv).
    interpreters = {'.py': [sys.executable], '.js': ['node']}
    cmd = [*interpreters.get(hook_script.suffix, []), str(hook_script)]

    cmd.extend([f'--url={url}', f'--snapshot-id={snapshot_id}'])
    if extra_args:
        cmd.extend(extra_args)

    completed = subprocess.run(
        cmd,
        cwd=str(cwd) if cwd else None,
        capture_output=True,
        text=True,
        env=env,
        timeout=timeout,
    )
    return completed.returncode, completed.stdout, completed.stderr
def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]:
    """Parse JSONL output from hook stdout and return the specified record type.

    Usage:
        result = parse_jsonl_output(stdout)
        if result and result['status'] == 'succeeded':
            print("Success!")

    Args:
        stdout: The stdout from a hook execution
        record_type: The 'type' field to look for (default: 'ArchiveResult')

    Returns:
        The first parsed JSON object whose 'type' equals ``record_type``,
        or None if no such line exists.
    """
    # Split on '\n' only: str.splitlines() would also break on characters such
    # as U+2028, which may legally appear unescaped inside a JSON string.
    for raw_line in stdout.split('\n'):
        candidate = raw_line.strip()
        # Hooks may interleave plain log lines with JSONL records; skip them.
        if not candidate.startswith('{'):
            continue
        try:
            record = json.loads(candidate)
        except json.JSONDecodeError:
            # Looked like JSON but was not; treat it as ordinary log noise.
            continue
        if record.get('type') == record_type:
            return record
    return None


def run_hook_and_parse(
    hook_script: Path,
    url: str,
    snapshot_id: str,
    cwd: Optional[Path] = None,
    env: Optional[dict] = None,
    timeout: int = 60,
    extra_args: Optional[List[str]] = None,
    record_type: str = 'ArchiveResult',
) -> Tuple[int, Optional[Dict[str, Any]], str]:
    """Run a hook and parse its JSONL output.

    Convenience function combining run_hook() and parse_jsonl_output().

    Args:
        hook_script: Path to the hook script
        url: URL to process
        snapshot_id: Snapshot ID
        cwd: Working directory forwarded to run_hook()
        env: Environment dict forwarded to run_hook()
        timeout: Timeout in seconds forwarded to run_hook()
        extra_args: Additional arguments forwarded to run_hook()
        record_type: The 'type' field to extract from the hook's stdout
            (default: 'ArchiveResult')

    Returns:
        Tuple of (returncode, parsed_result_or_none, stderr)
    """
    returncode, stdout, stderr = run_hook(
        hook_script, url, snapshot_id,
        cwd=cwd, env=env, timeout=timeout, extra_args=extra_args,
    )
    return returncode, parse_jsonl_output(stdout, record_type), stderr
diff --git a/archivebox/plugins/dom/tests/test_dom.py b/archivebox/plugins/dom/tests/test_dom.py index dcc00212..7fe69d64 100644 --- a/archivebox/plugins/dom/tests/test_dom.py +++ b/archivebox/plugins/dom/tests/test_dom.py @@ -22,19 +22,20 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - get_lib_dir, + get_plugin_dir, + get_hook_script, + run_hook_and_parse, + LIB_DIR, + NODE_MODULES_DIR, + PLUGINS_ROOT, ) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None) -NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None) +PLUGIN_DIR = get_plugin_dir(__file__) +DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*') +NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py') TEST_URL = 'https://example.com' -LIB_DIR = get_lib_dir() -NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' - def test_hook_script_exists(): """Verify on_Snapshot hook exists.""" diff --git a/archivebox/plugins/pdf/tests/test_pdf.py b/archivebox/plugins/pdf/tests/test_pdf.py index 5b909482..c160cfdc 100644 --- a/archivebox/plugins/pdf/tests/test_pdf.py +++ b/archivebox/plugins/pdf/tests/test_pdf.py @@ -23,19 +23,20 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - get_lib_dir, + get_plugin_dir, + get_hook_script, + run_hook_and_parse, + LIB_DIR, + NODE_MODULES_DIR, + PLUGINS_ROOT, ) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None) +PLUGIN_DIR = get_plugin_dir(__file__) +PDF_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_pdf.*') NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py' TEST_URL = 'https://example.com' -LIB_DIR = get_lib_dir() -NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' - 
def test_hook_script_exists(): """Verify on_Snapshot hook exists.""" diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index 378ce13a..24d4960d 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -22,18 +22,18 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, - get_lib_dir, + get_plugin_dir, + get_hook_script, + run_hook_and_parse, + LIB_DIR, + NODE_MODULES_DIR, ) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None) +PLUGIN_DIR = get_plugin_dir(__file__) +SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*') TEST_URL = 'https://example.com' -LIB_DIR = get_lib_dir() -NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' - def test_hook_script_exists(): """Verify on_Snapshot hook exists.""" diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py index 23ecf090..0fbd3c07 100644 --- a/archivebox/plugins/singlefile/tests/test_singlefile.py +++ b/archivebox/plugins/singlefile/tests/test_singlefile.py @@ -20,15 +20,15 @@ import pytest from archivebox.plugins.chrome.tests.chrome_test_helpers import ( get_test_env, + get_plugin_dir, + get_hook_script, setup_chrome_session, cleanup_chrome, - CHROME_PLUGIN_DIR, ) -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None) +PLUGIN_DIR = get_plugin_dir(__file__) +SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py') TEST_URL = "https://example.com"