Add simpler Chrome test helpers and update test files

New helpers in chrome_test_helpers.py: - get_plugin_dir(__file__) - get plugin dir from test file path - get_hook_script(dir, pattern) - find hook script by glob pattern - run_hook() - run hook script and return (returncode, stdout, stderr) - parse_jsonl_output() - parse JSONL from hook output - run_hook_and_parse() - convenience combo of above two - LIB_DIR, NODE_MODULES_DIR - lazy-loaded module constants - _LazyPath class for deferred path resolution Updated test files to use simpler patterns: - screenshot/tests/test_screenshot.py - dom/tests/test_dom.py - pdf/tests/test_pdf.py - singlefile/tests/test_singlefile.py Before: PLUGIN_DIR = Path(__file__).parent.parent After: PLUGIN_DIR = get_plugin_dir(__file__) Before: LIB_DIR = get_lib_dir(); NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules' After: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
2026-01-03 01:15:57 +10:00 · 2025-12-31 09:02:34 +00:00
parent 7d74dd906c
commit d72ab7c397
5 changed files with 251 additions and 39 deletions
--- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py
+++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py
@@ -6,25 +6,33 @@ duplication across test files. It uses the JavaScript utilities from chrome_util
 where appropriate.

 Usage:
-    # For simple tests (screenshot, dom, pdf, etc.):
+    # Simplest - just import what you need:
    from archivebox.plugins.chrome.tests.chrome_test_helpers import (
-        get_test_env,
-        get_lib_dir,
-        find_chromium_binary,
+        get_test_env,           # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE
+        get_plugin_dir,         # get_plugin_dir(__file__) -> plugin dir Path
+        LIB_DIR,                # Path to lib dir (lazy-loaded)
+        NODE_MODULES_DIR,       # Path to node_modules (lazy-loaded)
+        PLUGINS_ROOT,           # Path to plugins root
    )

-    # For extension tests (ublock, istilldontcareaboutcookies, twocaptcha):
+    # For Chrome session tests:
    from archivebox.plugins.chrome.tests.chrome_test_helpers import (
-        setup_test_env,
-        launch_chromium_session,
-        kill_chromium_session,
+        setup_chrome_session,   # Full Chrome + tab setup
+        cleanup_chrome,         # Cleanup by PID
+        chrome_session,         # Context manager
    )

-    # For tab-based tests (infiniscroll, modalcloser):
+    # For extension tests:
    from archivebox.plugins.chrome.tests.chrome_test_helpers import (
-        setup_chrome_session,
-        cleanup_chrome,
-        chrome_session,
+        setup_test_env,         # Full dir structure + Chrome install
+        launch_chromium_session, # Launch Chrome, return CDP URL
+        kill_chromium_session,   # Cleanup Chrome
+    )
+
+    # Run hooks and parse JSONL:
+    from archivebox.plugins.chrome.tests.chrome_test_helpers import (
+        run_hook,               # Run hook, return (returncode, stdout, stderr)
+        parse_jsonl_output,     # Parse JSONL from stdout
    )
 """

@@ -36,7 +44,7 @@ import subprocess
 import time
 from datetime import datetime
 from pathlib import Path
-from typing import Tuple, Optional
+from typing import Tuple, Optional, List, Dict, Any
 from contextlib import contextmanager


@@ -52,6 +60,43 @@ CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_naviga
 CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'


+# =============================================================================
+# Path Helpers - use these to avoid boilerplate in test files
+# =============================================================================
+
+
+def get_plugin_dir(test_file: str) -> Path:
+    """Get the plugin directory from a test file path.
+
+    Usage:
+        PLUGIN_DIR = get_plugin_dir(__file__)
+
+    Args:
+        test_file: The __file__ of the test module (e.g., test_screenshot.py)
+
+    Returns:
+        Path to the plugin directory (e.g., plugins/screenshot/)
+    """
+    return Path(test_file).parent.parent
+
+
+def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]:
+    """Find a hook script in a plugin directory by pattern.
+
+    Usage:
+        HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
+
+    Args:
+        plugin_dir: Path to the plugin directory
+        pattern: Glob pattern to match
+
+    Returns:
+        Path to the hook script or None if not found
+    """
+    matches = list(plugin_dir.glob(pattern))
+    return matches[0] if matches else None
+
+
 def get_lib_dir() -> Path:
    """Get LIB_DIR for tests, checking env first then ArchiveBox config.

@@ -111,6 +156,171 @@ def get_test_env() -> dict:
    return env


+# =============================================================================
+# Module-level constants (lazy-loaded on first access)
+# Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
+# =============================================================================
+
+# These are computed once when first accessed
+_LIB_DIR: Optional[Path] = None
+_NODE_MODULES_DIR: Optional[Path] = None
+
+
+def _get_lib_dir_cached() -> Path:
+    global _LIB_DIR
+    if _LIB_DIR is None:
+        _LIB_DIR = get_lib_dir()
+    return _LIB_DIR
+
+
+def _get_node_modules_dir_cached() -> Path:
+    global _NODE_MODULES_DIR
+    if _NODE_MODULES_DIR is None:
+        _NODE_MODULES_DIR = get_node_modules_dir()
+    return _NODE_MODULES_DIR
+
+
+# Module-level constants that can be imported directly
+# Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
+class _LazyPath:
+    """Lazy path that computes value on first access."""
+    def __init__(self, getter):
+        self._getter = getter
+        self._value = None
+
+    def __fspath__(self):
+        if self._value is None:
+            self._value = self._getter()
+        return str(self._value)
+
+    def __truediv__(self, other):
+        if self._value is None:
+            self._value = self._getter()
+        return self._value / other
+
+    def __str__(self):
+        return self.__fspath__()
+
+    def __repr__(self):
+        return f"<LazyPath: {self.__fspath__()}>"
+
+
+LIB_DIR = _LazyPath(_get_lib_dir_cached)
+NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached)
+
+
+# =============================================================================
+# Hook Execution Helpers
+# =============================================================================
+
+
+def run_hook(
+    hook_script: Path,
+    url: str,
+    snapshot_id: str,
+    cwd: Optional[Path] = None,
+    env: Optional[dict] = None,
+    timeout: int = 60,
+    extra_args: Optional[List[str]] = None,
+) -> Tuple[int, str, str]:
+    """Run a hook script and return (returncode, stdout, stderr).
+
+    Usage:
+        returncode, stdout, stderr = run_hook(
+            HOOK_SCRIPT, 'https://example.com', 'test-snap-123',
+            cwd=tmpdir, env=get_test_env()
+        )
+
+    Args:
+        hook_script: Path to the hook script
+        url: URL to process
+        snapshot_id: Snapshot ID
+        cwd: Working directory (default: current dir)
+        env: Environment dict (default: get_test_env())
+        timeout: Timeout in seconds
+        extra_args: Additional arguments to pass
+
+    Returns:
+        Tuple of (returncode, stdout, stderr)
+    """
+    if env is None:
+        env = get_test_env()
+
+    # Determine interpreter based on file extension
+    if hook_script.suffix == '.py':
+        cmd = ['python', str(hook_script)]
+    elif hook_script.suffix == '.js':
+        cmd = ['node', str(hook_script)]
+    else:
+        cmd = [str(hook_script)]
+
+    cmd.extend([f'--url={url}', f'--snapshot-id={snapshot_id}'])
+    if extra_args:
+        cmd.extend(extra_args)
+
+    result = subprocess.run(
+        cmd,
+        cwd=str(cwd) if cwd else None,
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=timeout
+    )
+    return result.returncode, result.stdout, result.stderr
+
+
+def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]:
+    """Parse JSONL output from hook stdout and return the specified record type.
+
+    Usage:
+        result = parse_jsonl_output(stdout)
+        if result and result['status'] == 'succeeded':
+            print("Success!")
+
+    Args:
+        stdout: The stdout from a hook execution
+        record_type: The 'type' field to look for (default: 'ArchiveResult')
+
+    Returns:
+        The parsed JSON dict or None if not found
+    """
+    for line in stdout.strip().split('\n'):
+        line = line.strip()
+        if not line.startswith('{'):
+            continue
+        try:
+            record = json.loads(line)
+            if record.get('type') == record_type:
+                return record
+        except json.JSONDecodeError:
+            continue
+    return None
+
+
+def run_hook_and_parse(
+    hook_script: Path,
+    url: str,
+    snapshot_id: str,
+    cwd: Optional[Path] = None,
+    env: Optional[dict] = None,
+    timeout: int = 60,
+    extra_args: Optional[List[str]] = None,
+) -> Tuple[int, Optional[Dict[str, Any]], str]:
+    """Run a hook and parse its JSONL output.
+
+    Convenience function combining run_hook() and parse_jsonl_output().
+
+    Returns:
+        Tuple of (returncode, parsed_result_or_none, stderr)
+    """
+    returncode, stdout, stderr = run_hook(
+        hook_script, url, snapshot_id,
+        cwd=cwd, env=env, timeout=timeout, extra_args=extra_args
+    )
+    result = parse_jsonl_output(stdout)
+    return returncode, result, stderr
+
+
 def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]:
    """Find the Chromium binary using chrome_utils.js findChromium().

--- a/archivebox/plugins/dom/tests/test_dom.py
+++ b/archivebox/plugins/dom/tests/test_dom.py
@@ -22,19 +22,20 @@ import pytest

 from archivebox.plugins.chrome.tests.chrome_test_helpers import (
    get_test_env,
-    get_lib_dir,
+    get_plugin_dir,
+    get_hook_script,
+    run_hook_and_parse,
+    LIB_DIR,
+    NODE_MODULES_DIR,
+    PLUGINS_ROOT,
 )


-PLUGIN_DIR = Path(__file__).parent.parent
-PLUGINS_ROOT = PLUGIN_DIR.parent
-DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
-NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
+PLUGIN_DIR = get_plugin_dir(__file__)
+DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*')
+NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py')
 TEST_URL = 'https://example.com'

-LIB_DIR = get_lib_dir()
-NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
-

 def test_hook_script_exists():
    """Verify on_Snapshot hook exists."""
--- a/archivebox/plugins/pdf/tests/test_pdf.py
+++ b/archivebox/plugins/pdf/tests/test_pdf.py
@@ -23,19 +23,20 @@ import pytest

 from archivebox.plugins.chrome.tests.chrome_test_helpers import (
    get_test_env,
-    get_lib_dir,
+    get_plugin_dir,
+    get_hook_script,
+    run_hook_and_parse,
+    LIB_DIR,
+    NODE_MODULES_DIR,
+    PLUGINS_ROOT,
 )


-PLUGIN_DIR = Path(__file__).parent.parent
-PLUGINS_ROOT = PLUGIN_DIR.parent
-PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
+PLUGIN_DIR = get_plugin_dir(__file__)
+PDF_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_pdf.*')
 NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
 TEST_URL = 'https://example.com'

-LIB_DIR = get_lib_dir()
-NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
-

 def test_hook_script_exists():
    """Verify on_Snapshot hook exists."""
--- a/archivebox/plugins/screenshot/tests/test_screenshot.py
+++ b/archivebox/plugins/screenshot/tests/test_screenshot.py
@@ -22,18 +22,18 @@ import pytest

 from archivebox.plugins.chrome.tests.chrome_test_helpers import (
    get_test_env,
-    get_lib_dir,
+    get_plugin_dir,
+    get_hook_script,
+    run_hook_and_parse,
+    LIB_DIR,
+    NODE_MODULES_DIR,
 )


-PLUGIN_DIR = Path(__file__).parent.parent
-PLUGINS_ROOT = PLUGIN_DIR.parent
-SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
+PLUGIN_DIR = get_plugin_dir(__file__)
+SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
 TEST_URL = 'https://example.com'

-LIB_DIR = get_lib_dir()
-NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
-

 def test_hook_script_exists():
    """Verify on_Snapshot hook exists."""
--- a/archivebox/plugins/singlefile/tests/test_singlefile.py
+++ b/archivebox/plugins/singlefile/tests/test_singlefile.py
@@ -20,15 +20,15 @@ import pytest

 from archivebox.plugins.chrome.tests.chrome_test_helpers import (
    get_test_env,
+    get_plugin_dir,
+    get_hook_script,
    setup_chrome_session,
    cleanup_chrome,
-    CHROME_PLUGIN_DIR,
 )


-PLUGIN_DIR = Path(__file__).parent.parent
-PLUGINS_ROOT = PLUGIN_DIR.parent
-SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None)
+PLUGIN_DIR = get_plugin_dir(__file__)
+SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
 TEST_URL = "https://example.com"