Consolidate Chrome test helpers across all plugin tests (#1738)

<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->

# Summary

<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->

# Related issues

<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes to these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
This commit is contained in:
Nick Sweeting
2025-12-31 01:25:39 -08:00
committed by GitHub
15 changed files with 978 additions and 701 deletions

View File

@@ -1333,6 +1333,83 @@ function getExtensionsDir() {
path.join(dataDir, 'personas', persona, 'chrome_extensions');
}
/**
 * Build the machine-type identifier used in platform-specific paths.
 * Intended to mirror Python's archivebox.config.paths.get_machine_type().
 *
 * A non-empty MACHINE_TYPE environment variable takes precedence; otherwise
 * the result is `<arch>-<platform>` with the architecture name normalized.
 *
 * @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin')
 */
function getMachineType() {
    const override = process.env.MACHINE_TYPE;
    if (override) {
        return override;
    }

    // Architecture spellings collapsed onto the names used by the Python side
    const archAliases = new Map([
        ['arm64', 'arm64'],
        ['aarch64', 'arm64'],
        ['x64', 'x86_64'],
        ['x86_64', 'x86_64'],
        ['amd64', 'x86_64'],
        ['ia32', 'x86'],
        ['x86', 'x86'],
    ]);

    const rawArch = process.arch;
    // Unknown architectures are passed through unchanged
    const arch = archAliases.has(rawArch) ? archAliases.get(rawArch) : rawArch;
    return `${arch}-${process.platform}`;
}
/**
 * Resolve the LIB_DIR that holds platform-specific binaries.
 *
 * A non-empty LIB_DIR environment variable is returned as-is. Otherwise the
 * path is built as DATA_DIR/lib/MACHINE_TYPE, with DATA_DIR looked up through
 * getEnv() using './data' as its fallback argument.
 *
 * @returns {string} - Path to the lib directory
 */
function getLibDir() {
    const { LIB_DIR: override } = process.env;
    if (override) {
        return override;
    }
    return path.join(getEnv('DATA_DIR', './data'), 'lib', getMachineType());
}
/**
 * Resolve the NODE_MODULES_DIR where npm packages are installed.
 *
 * A non-empty NODE_MODULES_DIR environment variable is returned as-is;
 * otherwise the path is LIB_DIR/npm/node_modules.
 *
 * @returns {string} - Path to the node_modules directory
 */
function getNodeModulesDir() {
    const override = process.env.NODE_MODULES_DIR;
    return override ? override : path.join(getLibDir(), 'npm', 'node_modules');
}
/**
 * Collect every test-environment path into one plain object.
 * Python test helpers call this through the CLI so the path logic lives in
 * one place instead of being duplicated on both sides.
 *
 * @returns {Object} - Paths keyed by DATA_DIR, MACHINE_TYPE, LIB_DIR,
 *   NODE_MODULES_DIR, NPM_BIN_DIR and CHROME_EXTENSIONS_DIR
 */
function getTestEnv() {
    // Locals are named after the output keys so the result can use shorthand;
    // key order is kept stable because the CLI prints this object as JSON.
    const DATA_DIR = getEnv('DATA_DIR', './data');
    const MACHINE_TYPE = getMachineType();
    const LIB_DIR = getLibDir();
    const NODE_MODULES_DIR = getNodeModulesDir();
    const NPM_BIN_DIR = path.join(LIB_DIR, 'npm', '.bin');
    const CHROME_EXTENSIONS_DIR = getExtensionsDir();

    return {
        DATA_DIR,
        MACHINE_TYPE,
        LIB_DIR,
        NODE_MODULES_DIR,
        NPM_BIN_DIR,
        CHROME_EXTENSIONS_DIR,
    };
}
/**
* Install a Chrome extension with caching support.
*
@@ -1442,8 +1519,13 @@ module.exports = {
getExtensionPaths,
waitForExtensionTarget,
getExtensionTargets,
// Shared extension installer utilities
// Shared path utilities (single source of truth for Python/JS)
getMachineType,
getLibDir,
getNodeModulesDir,
getExtensionsDir,
getTestEnv,
// Shared extension installer utilities
installExtensionWithCache,
// Deprecated - use enableExtensions option instead
getExtensionLaunchArgs,
@@ -1457,18 +1539,31 @@ if (require.main === module) {
console.log('Usage: chrome_utils.js <command> [args...]');
console.log('');
console.log('Commands:');
console.log(' findChromium');
console.log(' installChromium');
console.log(' installPuppeteerCore [npm_prefix]');
console.log(' launchChromium [output_dir] [extension_paths_json]');
console.log(' killChrome <pid> [output_dir]');
console.log(' killZombieChrome [data_dir]');
console.log(' getExtensionId <path>');
console.log(' loadExtensionManifest <path>');
console.log(' getExtensionLaunchArgs <extensions_json>');
console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
console.log(' getExtensionsDir');
console.log(' installExtensionWithCache <webstore_id> <name>');
console.log(' findChromium Find Chrome/Chromium binary');
console.log(' installChromium Install Chromium via @puppeteer/browsers');
console.log(' installPuppeteerCore Install puppeteer-core npm package');
console.log(' launchChromium Launch Chrome with CDP debugging');
console.log(' killChrome <pid> Kill Chrome process by PID');
console.log(' killZombieChrome Clean up zombie Chrome processes');
console.log('');
console.log(' getMachineType Get machine type (e.g., x86_64-linux)');
console.log(' getLibDir Get LIB_DIR path');
console.log(' getNodeModulesDir Get NODE_MODULES_DIR path');
console.log(' getExtensionsDir Get Chrome extensions directory');
console.log(' getTestEnv Get all paths as JSON (for tests)');
console.log('');
console.log(' getExtensionId <path> Get extension ID from unpacked path');
console.log(' loadExtensionManifest Load extension manifest.json');
console.log(' loadOrInstallExtension Load or install an extension');
console.log(' installExtensionWithCache Install extension with caching');
console.log('');
console.log('Environment variables:');
console.log(' DATA_DIR Base data directory');
console.log(' LIB_DIR Library directory (computed if not set)');
console.log(' MACHINE_TYPE Machine type override');
console.log(' NODE_MODULES_DIR Node modules directory');
console.log(' CHROME_BINARY Chrome binary path');
console.log(' CHROME_EXTENSIONS_DIR Extensions directory');
process.exit(1);
}
@@ -1581,11 +1676,31 @@ if (require.main === module) {
break;
}
case 'getMachineType': {
console.log(getMachineType());
break;
}
case 'getLibDir': {
console.log(getLibDir());
break;
}
case 'getNodeModulesDir': {
console.log(getNodeModulesDir());
break;
}
case 'getExtensionsDir': {
console.log(getExtensionsDir());
break;
}
case 'getTestEnv': {
console.log(JSON.stringify(getTestEnv(), null, 2));
break;
}
case 'installExtensionWithCache': {
const [webstore_id, name] = commandArgs;
if (!webstore_id || !name) {

View File

@@ -2,25 +2,69 @@
Shared Chrome test helpers for plugin integration tests.
This module provides common utilities for Chrome-based plugin tests, reducing
duplication across test files. It uses the JavaScript utilities from chrome_utils.js
where appropriate.
duplication across test files. Functions delegate to chrome_utils.js (the single
source of truth) with Python fallbacks.
Function names match the JS equivalents in snake_case:
JS: getMachineType() -> Python: get_machine_type()
JS: getLibDir() -> Python: get_lib_dir()
JS: getNodeModulesDir() -> Python: get_node_modules_dir()
JS: getExtensionsDir() -> Python: get_extensions_dir()
JS: findChromium() -> Python: find_chromium()
JS: killChrome() -> Python: kill_chrome()
JS: getTestEnv() -> Python: get_test_env()
Usage:
# Path helpers (delegate to chrome_utils.js):
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
setup_chrome_session,
cleanup_chrome,
find_chromium_binary,
get_node_modules_dir,
get_test_env, # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE
get_machine_type, # e.g., 'x86_64-linux', 'arm64-darwin'
get_lib_dir, # Path to lib dir
get_node_modules_dir, # Path to node_modules
get_extensions_dir, # Path to chrome extensions
find_chromium, # Find Chrome/Chromium binary
kill_chrome, # Kill Chrome process by PID
)
# Test file helpers:
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir, # get_plugin_dir(__file__) -> plugin dir Path
get_hook_script, # Find hook script by glob pattern
PLUGINS_ROOT, # Path to plugins root
LIB_DIR, # Path to lib dir (lazy-loaded)
NODE_MODULES_DIR, # Path to node_modules (lazy-loaded)
)
# For Chrome session tests:
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_chrome_session, # Full Chrome + tab setup
cleanup_chrome, # Cleanup by PID
chrome_session, # Context manager
)
# For extension tests:
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_test_env, # Full dir structure + Chrome install
launch_chromium_session, # Launch Chrome, return CDP URL
kill_chromium_session, # Cleanup Chrome
)
# Run hooks and parse JSONL:
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
run_hook, # Run hook, return (returncode, stdout, stderr)
parse_jsonl_output, # Parse JSONL from stdout
)
"""
import json
import os
import platform
import signal
import subprocess
import time
from datetime import datetime
from pathlib import Path
from typing import Tuple, Optional
from typing import Tuple, Optional, List, Dict, Any
from contextlib import contextmanager
@@ -29,88 +73,623 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
# Hook script locations
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
def get_node_modules_dir() -> Path:
"""Get NODE_MODULES_DIR for tests, checking env first.
# =============================================================================
# Path Helpers - delegates to chrome_utils.js with Python fallback
# Function names match JS: getMachineType -> get_machine_type, etc.
# =============================================================================
Returns the path to the node_modules directory, checking:
1. NODE_MODULES_DIR environment variable
2. Computed from LIB_DIR via ArchiveBox config
def _call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]:
    """Call a chrome_utils.js CLI command (internal helper).

    This is the central dispatch for calling the JS utilities from Python.
    It runs `node chrome_utils.js <command> [args...]` with a 30 second timeout.

    Failures to run node at all (binary missing / not executable) and timeouts
    are reported as a non-zero return code instead of raising, so that callers
    which check `returncode == 0` can fall back to their Python computation.

    Args:
        command: The CLI command (e.g., 'findChromium', 'getTestEnv')
        *args: Additional command arguments
        env: Environment dict (default: copy of the current environment)

    Returns:
        Tuple of (returncode, stdout, stderr). When the subprocess could not
        be run or timed out, returncode is 1, stdout is '' and stderr holds a
        description of the error.
    """
    cmd = ['node', str(CHROME_UTILS), command, *args]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=30,
            env=env or os.environ.copy(),
        )
    except (OSError, subprocess.TimeoutExpired) as err:
        # OSError covers FileNotFoundError/PermissionError when node cannot be
        # executed; TimeoutExpired means the command hung. Either way the JS
        # answer is unavailable, which callers treat like a failed command.
        return 1, '', f'{type(err).__name__}: {err}'
    return result.returncode, result.stdout, result.stderr
def get_plugin_dir(test_file: str) -> Path:
    """Return the plugin directory that owns a test module.

    The plugin directory is taken to be two levels above the test file
    (the file's directory, then that directory's parent).

    Usage:
        PLUGIN_DIR = get_plugin_dir(__file__)

    Args:
        test_file: The __file__ of the test module (e.g., test_screenshot.py)

    Returns:
        Path to the plugin directory (e.g., plugins/screenshot/)
    """
    tests_dir = Path(test_file).parent
    return tests_dir.parent
def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]:
    """Locate a hook script inside a plugin directory.

    Usage:
        HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')

    Args:
        plugin_dir: Path to the plugin directory
        pattern: Glob pattern to match

    Returns:
        The first path yielded by the glob, or None when nothing matches
    """
    return next(iter(plugin_dir.glob(pattern)), None)
def get_machine_type() -> str:
    """Return the machine type string (e.g., 'x86_64-linux', 'arm64-darwin').

    Matches JS: getMachineType()

    The chrome_utils.js answer is used when the command succeeds with
    non-empty output; otherwise the value is computed here from the
    MACHINE_TYPE env var or the `platform` module.
    """
    # Preferred source: the JS implementation
    code, out, _err = _call_chrome_utils('getMachineType')
    js_value = out.strip()
    if code == 0 and js_value:
        return js_value

    # Python fallback: explicit override first, then platform detection
    override = os.environ.get('MACHINE_TYPE')
    if override:
        return override

    arch_aliases = {
        'arm64': 'arm64',
        'aarch64': 'arm64',
        'x86_64': 'x86_64',
        'amd64': 'x86_64',
    }
    arch = platform.machine().lower()
    system = platform.system().lower()
    return f"{arch_aliases.get(arch, arch)}-{system}"
def get_lib_dir() -> Path:
    """Return the LIB_DIR path for platform-specific binaries.

    Matches JS: getLibDir()

    Resolution order: chrome_utils.js output (when the command succeeds with
    non-empty output), then the LIB_DIR env var, then STORAGE_CONFIG.LIB_DIR
    from the ArchiveBox config.
    """
    code, out, _err = _call_chrome_utils('getLibDir')
    js_value = out.strip()
    if code == 0 and js_value:
        return Path(js_value)

    env_value = os.environ.get('LIB_DIR')
    if env_value:
        return Path(env_value)

    # Imported lazily so the config is only loaded when both sources above fail
    from archivebox.config.common import STORAGE_CONFIG
    return Path(str(STORAGE_CONFIG.LIB_DIR))
def get_node_modules_dir() -> Path:
    """Get NODE_MODULES_DIR path for npm packages.

    Matches JS: getNodeModulesDir()

    Resolution order:
        1. chrome_utils.js output, when the command succeeds with non-empty output
        2. NODE_MODULES_DIR environment variable
        3. <get_lib_dir()>/npm/node_modules

    Returns:
        Path to the node_modules directory
    """
    # Try JS first
    returncode, stdout, _stderr = _call_chrome_utils('getNodeModulesDir')
    if returncode == 0 and stdout.strip():
        return Path(stdout.strip())

    # Fallback to Python
    if os.environ.get('NODE_MODULES_DIR'):
        return Path(os.environ['NODE_MODULES_DIR'])

    # get_lib_dir() already applies the LIB_DIR / STORAGE_CONFIG fallback, so
    # there is no need to compute (and then discard) a second lib_dir here.
    return get_lib_dir() / 'npm' / 'node_modules'
def get_test_env() -> dict:
"""Get environment dict with NODE_MODULES_DIR set correctly for tests.
def get_extensions_dir() -> str:
"""Get the Chrome extensions directory path.
Returns a copy of os.environ with NODE_MODULES_DIR added/updated.
Use this for all subprocess calls in plugin tests.
Matches JS: getExtensionsDir()
Tries chrome_utils.js first, falls back to Python computation.
"""
env = os.environ.copy()
env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
return env
returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir')
if returncode == 0 and stdout.strip():
return stdout.strip()
# Fallback to default computation if JS call fails
data_dir = os.environ.get('DATA_DIR', './data')
persona = os.environ.get('ACTIVE_PERSONA', 'Default')
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]:
"""Find the Chromium binary using chrome_utils.js findChromium().
def find_chromium(data_dir: Optional[str] = None) -> Optional[str]:
"""Find the Chromium binary path.
This uses the centralized findChromium() function which checks:
Matches JS: findChromium()
Uses chrome_utils.js which checks:
- CHROME_BINARY env var
- @puppeteer/browsers install locations
- System Chromium locations
- Falls back to Chrome (with warning)
Args:
data_dir: Directory where chromium was installed (contains chromium/ subdir)
data_dir: Optional DATA_DIR override
Returns:
Path to Chromium binary or None if not found
"""
search_dir = data_dir or os.environ.get('DATA_DIR', '.')
result = subprocess.run(
['node', str(CHROME_UTILS), 'findChromium', str(search_dir)],
capture_output=True,
text=True,
timeout=10
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
env = os.environ.copy()
if data_dir:
env['DATA_DIR'] = str(data_dir)
returncode, stdout, stderr = _call_chrome_utils('findChromium', env=env)
if returncode == 0 and stdout.strip():
return stdout.strip()
return None
def get_extensions_dir() -> str:
"""Get the Chrome extensions directory using chrome_utils.js getExtensionsDir().
def kill_chrome(pid: int, output_dir: Optional[str] = None) -> bool:
"""Kill a Chrome process by PID.
This uses the centralized path calculation from chrome_utils.js which checks:
- CHROME_EXTENSIONS_DIR env var
- DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions
Matches JS: killChrome()
Uses chrome_utils.js which handles:
- SIGTERM then SIGKILL
- Process group killing
- Zombie process cleanup
Args:
pid: Process ID to kill
output_dir: Optional chrome output directory for PID file cleanup
Returns:
Path to extensions directory
True if the kill command succeeded
"""
args = [str(pid)]
if output_dir:
args.append(str(output_dir))
returncode, stdout, stderr = _call_chrome_utils('killChrome', *args)
return returncode == 0
def get_test_env() -> dict:
    """Build the environment dict used for subprocess calls in plugin tests.

    Matches JS: getTestEnv()

    Starts from a copy of os.environ. When chrome_utils.js returns parseable
    JSON, its keys are merged in and the result is returned. Otherwise
    LIB_DIR, NODE_MODULES_DIR and MACHINE_TYPE are filled in from the Python
    helpers.
    """
    env = os.environ.copy()

    # Preferred source: all paths from the JS implementation
    code, out, _err = _call_chrome_utils('getTestEnv')
    if code == 0 and out.strip():
        try:
            env.update(json.loads(out))
        except json.JSONDecodeError:
            pass  # unparseable output: fall through to the Python computation
        else:
            return env

    # Python fallback
    env['LIB_DIR'] = str(get_lib_dir())
    env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
    env['MACHINE_TYPE'] = get_machine_type()
    return env
# Backward compatibility aliases (deprecated, use new names)
# Each alias is bound to the same function object as the new name, so
# behavior is identical whichever name a test imports.
find_chromium_binary = find_chromium
kill_chrome_via_js = kill_chrome
get_machine_type_from_js = get_machine_type
get_test_env_from_js = get_test_env
# =============================================================================
# Module-level constants (lazy-loaded on first access)
# Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
# =============================================================================
# These are computed once when first accessed
_LIB_DIR: Optional[Path] = None
_NODE_MODULES_DIR: Optional[Path] = None
def _get_lib_dir_cached() -> Path:
    """Return get_lib_dir(), computing it once and memoizing it in _LIB_DIR."""
    global _LIB_DIR
    if _LIB_DIR is not None:
        return _LIB_DIR
    _LIB_DIR = get_lib_dir()
    return _LIB_DIR
def _get_node_modules_dir_cached() -> Path:
    """Return get_node_modules_dir(), computing it once and memoizing it in _NODE_MODULES_DIR."""
    global _NODE_MODULES_DIR
    if _NODE_MODULES_DIR is not None:
        return _NODE_MODULES_DIR
    _NODE_MODULES_DIR = get_node_modules_dir()
    return _NODE_MODULES_DIR
# Module-level constants that can be imported directly
# Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
class _LazyPath:
"""Lazy path that computes value on first access."""
def __init__(self, getter):
self._getter = getter
self._value = None
def __fspath__(self):
if self._value is None:
self._value = self._getter()
return str(self._value)
def __truediv__(self, other):
if self._value is None:
self._value = self._getter()
return self._value / other
def __str__(self):
return self.__fspath__()
def __repr__(self):
return f"<LazyPath: {self.__fspath__()}>"
# Importable path constants. Each is a _LazyPath wrapping a cached getter, so
# importing this module does not resolve the path; the getter only runs when
# the constant is first used.
LIB_DIR = _LazyPath(_get_lib_dir_cached)
NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached)
# =============================================================================
# Hook Execution Helpers
# =============================================================================
def run_hook(
hook_script: Path,
url: str,
snapshot_id: str,
cwd: Optional[Path] = None,
env: Optional[dict] = None,
timeout: int = 60,
extra_args: Optional[List[str]] = None,
) -> Tuple[int, str, str]:
"""Run a hook script and return (returncode, stdout, stderr).
Usage:
returncode, stdout, stderr = run_hook(
HOOK_SCRIPT, 'https://example.com', 'test-snap-123',
cwd=tmpdir, env=get_test_env()
)
Args:
hook_script: Path to the hook script
url: URL to process
snapshot_id: Snapshot ID
cwd: Working directory (default: current dir)
env: Environment dict (default: get_test_env())
timeout: Timeout in seconds
extra_args: Additional arguments to pass
Returns:
Tuple of (returncode, stdout, stderr)
"""
if env is None:
env = get_test_env()
# Determine interpreter based on file extension
if hook_script.suffix == '.py':
cmd = ['python', str(hook_script)]
elif hook_script.suffix == '.js':
cmd = ['node', str(hook_script)]
else:
cmd = [str(hook_script)]
cmd.extend([f'--url={url}', f'--snapshot-id={snapshot_id}'])
if extra_args:
cmd.extend(extra_args)
result = subprocess.run(
['node', str(CHROME_UTILS), 'getExtensionsDir'],
cmd,
cwd=str(cwd) if cwd else None,
capture_output=True,
text=True,
timeout=10,
env=get_test_env()
env=env,
timeout=timeout
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
# Fallback to default computation if JS call fails
data_dir = os.environ.get('DATA_DIR', './data')
persona = os.environ.get('ACTIVE_PERSONA', 'Default')
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
return result.returncode, result.stdout, result.stderr
def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]:
    """Extract the first JSONL record of a given type from hook stdout.

    Lines that do not start with '{' (after stripping) or that fail to parse
    as JSON are ignored.

    Usage:
        result = parse_jsonl_output(stdout)
        if result and result['status'] == 'succeeded':
            print("Success!")

    Args:
        stdout: The stdout from a hook execution
        record_type: The 'type' field to look for (default: 'ArchiveResult')

    Returns:
        The first parsed JSON dict whose 'type' equals record_type, or None
    """
    for raw_line in stdout.strip().split('\n'):
        candidate = raw_line.strip()
        if candidate[:1] != '{':
            continue
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if parsed.get('type') == record_type:
            return parsed
    return None
def run_hook_and_parse(
    hook_script: Path,
    url: str,
    snapshot_id: str,
    cwd: Optional[Path] = None,
    env: Optional[dict] = None,
    timeout: int = 60,
    extra_args: Optional[List[str]] = None,
) -> Tuple[int, Optional[Dict[str, Any]], str]:
    """Run a hook, then parse the default record type out of its stdout.

    Thin wrapper that feeds run_hook()'s stdout to parse_jsonl_output().
    All arguments are forwarded to run_hook() unchanged.

    Returns:
        Tuple of (returncode, parsed_result_or_none, stderr)
    """
    code, out, err = run_hook(
        hook_script,
        url,
        snapshot_id,
        cwd=cwd,
        env=env,
        timeout=timeout,
        extra_args=extra_args,
    )
    return code, parse_jsonl_output(out), err
# =============================================================================
# Extension Test Helpers
# Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha)
# =============================================================================
def setup_test_env(tmpdir: Path) -> dict:
    """Create an isolated data/lib tree for extension tests and install Chrome.

    Directory layout created under tmpdir:

        <tmpdir>/data/
            lib/
                <machine_type>/          (e.g. arm64-darwin, x86_64-linux)
                    npm/
                        .bin/
                        node_modules/
            personas/
                Default/
                    chrome_extensions/
            users/
                testuser/
                    crawls/<YYYYMMDD>/
                    snapshots/<YYYYMMDD>/

    After creating the tree, the chrome install hook is run with the new
    environment and its JSONL output is scanned for a 'Binary' record whose
    'abspath' becomes CHROME_BINARY.

    Args:
        tmpdir: Base temporary directory for the test

    Returns:
        Copy of os.environ extended with DATA_DIR, LIB_DIR, MACHINE_TYPE,
        NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_EXTENSIONS_DIR, CRAWLS_DIR,
        SNAPSHOTS_DIR, CHROME_HEADLESS (unless already set) and CHROME_BINARY.
        Calls pytest.skip() if the install hook fails or no existing Chromium
        binary is reported.
    """
    import pytest

    # Machine type, normalized the same way as the other path helpers
    arch_aliases = {
        'arm64': 'arm64',
        'aarch64': 'arm64',
        'x86_64': 'x86_64',
        'amd64': 'x86_64',
    }
    arch = platform.machine().lower()
    system = platform.system().lower()
    machine_type = f"{arch_aliases.get(arch, arch)}-{system}"

    # lib/<machine_type>/npm/{.bin,node_modules}
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'

    # personas/Default/chrome_extensions
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'

    # users/testuser/{crawls,snapshots}/<date>
    today = datetime.now().strftime('%Y%m%d')
    user_root = data_dir / 'users' / 'testuser'
    crawls_dir = user_root / 'crawls' / today
    snapshots_dir = user_root / 'snapshots' / today

    for directory in (
        node_modules_dir,
        npm_bin_dir,
        chrome_extensions_dir,
        crawls_dir,
        snapshots_dir,
    ):
        directory.mkdir(parents=True, exist_ok=True)

    env = os.environ.copy()
    env['DATA_DIR'] = str(data_dir)
    env['LIB_DIR'] = str(lib_dir)
    env['MACHINE_TYPE'] = machine_type
    env['NPM_BIN_DIR'] = str(npm_bin_dir)
    env['NODE_MODULES_DIR'] = str(node_modules_dir)
    env['CHROME_EXTENSIONS_DIR'] = str(chrome_extensions_dir)
    env['CRAWLS_DIR'] = str(crawls_dir)
    env['SNAPSHOTS_DIR'] = str(snapshots_dir)

    # Default to headless, but let a value from the real environment win
    # (useful when debugging with a visible browser)
    env.setdefault('CHROME_HEADLESS', 'true')

    # The install hook installs puppeteer-core and chromium and prints JSONL
    install = subprocess.run(
        ['python', str(CHROME_INSTALL_HOOK)],
        capture_output=True,
        text=True,
        timeout=120,
        env=env,
    )
    if install.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {install.stderr}")

    # First 'Binary' record with a non-empty abspath wins
    chrome_binary = None
    for raw_line in install.stdout.strip().split('\n'):
        if not raw_line.strip():
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'Binary' and record.get('abspath'):
            chrome_binary = record['abspath']
            break

    if not (chrome_binary and Path(chrome_binary).exists()):
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple[subprocess.Popen, str]:
    """Launch Chromium and return (process, cdp_url).

    Starts the chrome launch hook with chrome_dir as its working directory and
    polls once per second (up to 20 times) for a non-empty cdp_url.txt in that
    directory. Use this for extension tests that need direct CDP access.

    Args:
        env: Environment dict (from setup_test_env)
        chrome_dir: Directory for Chrome to write its files (cdp_url.txt, chrome.pid, etc.)
        crawl_id: ID for the crawl

    Returns:
        Tuple of (chrome_launch_process, cdp_url)

    Raises:
        RuntimeError: If the launch hook exits early, or no CDP URL is
            available after 20s
    """
    chrome_dir.mkdir(parents=True, exist_ok=True)

    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Wait for Chromium to launch and CDP URL to be available
    cdp_file = chrome_dir / 'cdp_url.txt'
    cdp_url = None
    for _ in range(20):
        if chrome_launch_process.poll() is not None:
            # Hook exited before publishing a URL: surface its output
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            # The file may exist before the URL has been written to it; an
            # empty read is not a result, so keep polling instead of bailing
            # out early with a misleading timeout error.
            if cdp_url:
                break
        time.sleep(1)

    if not cdp_url:
        chrome_launch_process.kill()
        raise RuntimeError("Chromium CDP URL not found after 20s")

    return chrome_launch_process, cdp_url
def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None:
    """Tear down a Chromium session started by launch_chromium_session.

    Best-effort: the launch process is sent SIGTERM and given 5 seconds to
    exit, then the PID recorded in chrome_dir/chrome.pid (if present and
    parseable) is passed to kill_chrome(), which delegates to chrome_utils.js.

    Args:
        chrome_launch_process: The Popen object from launch_chromium_session
        chrome_dir: The chrome directory containing chrome.pid
    """
    # Step 1: ask the launcher to stop; any failure here is ignored
    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except Exception:
        pass

    # Step 2: kill the browser itself via the recorded PID
    pid_path = chrome_dir / 'chrome.pid'
    if not pid_path.exists():
        return
    try:
        pid_text = pid_path.read_text().strip()
        kill_chrome(int(pid_text), str(chrome_dir))
    except (ValueError, FileNotFoundError):
        # Unparseable PID, or the file vanished between the check and the read
        pass
@contextmanager
def chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
    """Context manager that launches Chromium and always cleans it up.

    Usage:
        with chromium_session(env, chrome_dir, 'test-crawl') as (process, cdp_url):
            # Use cdp_url to connect with puppeteer
            pass
        # Chromium automatically cleaned up

    Args:
        env: Environment dict (from setup_test_env)
        chrome_dir: Directory for Chrome files
        crawl_id: ID for the crawl

    Yields:
        Tuple of (chrome_launch_process, cdp_url)
    """
    launcher = None
    try:
        launcher, url = launch_chromium_session(env, chrome_dir, crawl_id)
        yield launcher, url
    finally:
        # launcher stays None when the launch itself raised
        if launcher:
            kill_chromium_session(launcher, chrome_dir)
# =============================================================================
# Tab-based Test Helpers
# Used by tab-based tests (infiniscroll, modalcloser)
# =============================================================================
def setup_chrome_session(
@@ -210,25 +789,28 @@ def setup_chrome_session(
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int) -> None:
"""Clean up Chrome processes.
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None:
"""Clean up Chrome processes using chrome_utils.js killChrome.
Sends SIGTERM to the chrome_launch_process and SIGKILL to the Chrome PID.
Ignores errors if processes are already dead.
Uses the centralized kill logic from chrome_utils.js which handles:
- SIGTERM then SIGKILL
- Process group killing
- Zombie process cleanup
Args:
chrome_launch_process: The Popen object for the chrome launch hook
chrome_pid: The PID of the Chrome process
chrome_dir: Optional path to chrome output directory
"""
# First try to terminate the launch process gracefully
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except Exception:
pass
try:
os.kill(chrome_pid, signal.SIGKILL)
except OSError:
pass
# Use JS to kill Chrome with proper process group handling
kill_chrome(chrome_pid, str(chrome_dir) if chrome_dir else None)
@contextmanager

View File

@@ -28,70 +28,25 @@ import tempfile
import shutil
import platform
PLUGIN_DIR = Path(__file__).parent.parent
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_lib_dir,
get_node_modules_dir,
find_chromium_binary,
CHROME_PLUGIN_DIR as PLUGIN_DIR,
CHROME_LAUNCH_HOOK,
CHROME_TAB_HOOK,
CHROME_NAVIGATE_HOOK,
)
# Get LIB_DIR and MACHINE_TYPE from environment or compute them
def get_lib_dir_and_machine_type():
    """Return (LIB_DIR as a Path, MACHINE_TYPE) for tests.

    Each value comes from its environment variable when set to a non-empty
    value, otherwise from the ArchiveBox config / paths module.
    """
    from archivebox.config.paths import get_machine_type
    from archivebox.config.common import STORAGE_CONFIG

    env_lib_dir = os.environ.get('LIB_DIR')
    lib_dir = env_lib_dir if env_lib_dir else str(STORAGE_CONFIG.LIB_DIR)

    env_machine_type = os.environ.get('MACHINE_TYPE')
    machine_type = env_machine_type if env_machine_type else get_machine_type()

    return Path(lib_dir), machine_type
# Setup NODE_MODULES_DIR to find npm packages
LIB_DIR, MACHINE_TYPE = get_lib_dir_and_machine_type()
# Note: LIB_DIR already includes machine_type (e.g., data/lib/arm64-darwin)
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
# Get LIB_DIR and NODE_MODULES_DIR from shared helpers
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = get_node_modules_dir()
NPM_PREFIX = LIB_DIR / 'npm'
# Chromium install location (relative to DATA_DIR)
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
def get_test_env():
    """Return a copy of os.environ with NODE_MODULES_DIR, LIB_DIR and MACHINE_TYPE set.

    CHROME_BINARY is added from find_chromium_binary() only when it is not
    already present and a binary was found.
    """
    env = os.environ.copy()
    env.update({
        'NODE_MODULES_DIR': str(NODE_MODULES_DIR),
        'LIB_DIR': str(LIB_DIR),
        'MACHINE_TYPE': MACHINE_TYPE,
    })

    # Respect an explicitly configured browser
    if 'CHROME_BINARY' in env:
        return env

    detected = find_chromium_binary()
    if detected:
        env['CHROME_BINARY'] = detected
    return env
def find_chromium_binary(data_dir=None):
    """Locate the Chromium binary by running chrome_utils.js findChromium.

    The search directory passed to the JS command is data_dir when given,
    else the DATA_DIR env var, else the current directory ('.').

    Args:
        data_dir: Directory where chromium was installed (contains chromium/ subdir)

    Returns:
        The stripped stdout of the command when it exits 0 with non-empty
        output, otherwise None
    """
    utils_script = PLUGIN_DIR / 'chrome_utils.js'
    search_root = data_dir or os.environ.get('DATA_DIR', '.')

    completed = subprocess.run(
        ['node', str(utils_script), 'findChromium', str(search_root)],
        capture_output=True,
        text=True,
        timeout=10
    )
    found = completed.stdout.strip()
    if completed.returncode != 0 or not found:
        return None
    return found
@pytest.fixture(scope="session", autouse=True)
def ensure_chromium_and_puppeteer_installed():

View File

@@ -20,29 +20,22 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_plugin_dir,
get_hook_script,
run_hook_and_parse,
LIB_DIR,
NODE_MODULES_DIR,
PLUGINS_ROOT,
)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
PLUGIN_DIR = get_plugin_dir(__file__)
DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*')
NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py')
TEST_URL = 'https://example.com'
# Get LIB_DIR for NODE_MODULES_DIR
def get_lib_dir():
    """Resolve the lib directory used by tests.

    A non-empty LIB_DIR environment variable takes precedence; otherwise the
    value of STORAGE_CONFIG.LIB_DIR is used.
    """
    from archivebox.config.common import STORAGE_CONFIG

    override = os.environ.get('LIB_DIR')
    if override:
        return Path(override)
    return Path(str(STORAGE_CONFIG.LIB_DIR))
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
def get_test_env():
    """Return a copy of os.environ with NODE_MODULES_DIR and LIB_DIR filled in."""
    return {
        **os.environ,
        'NODE_MODULES_DIR': str(NODE_MODULES_DIR),
        'LIB_DIR': str(LIB_DIR),
    }
def test_hook_script_exists():
"""Verify on_Snapshot hook exists."""

View File

@@ -2,7 +2,6 @@
Integration tests for favicon plugin
Tests verify:
pass
1. Plugin script exists
2. requests library is available
3. Favicon extraction works for real example.com
@@ -21,9 +20,15 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
)
PLUGIN_DIR = Path(__file__).parent.parent
FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None)
PLUGIN_DIR = get_plugin_dir(__file__)
FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
TEST_URL = 'https://example.com'

View File

@@ -14,6 +14,14 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_test_env,
launch_chromium_session,
kill_chromium_session,
CHROME_LAUNCH_HOOK,
PLUGINS_ROOT,
)
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
@@ -124,107 +132,6 @@ def test_no_configuration_required():
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure matching real ArchiveBox data dir:
        <tmpdir>/data/
            lib/
                arm64-darwin/   (or x86_64-linux, etc.)
                    npm/
                        .bin/
                        node_modules/
            personas/
                Default/
                    chrome_extensions/
            users/
                testuser/
                    crawls/<YYYYMMDD>/
                    snapshots/<YYYYMMDD>/

    Calls chrome install hook which handles puppeteer-core and chromium installation.
    The calling test is skipped (pytest.skip) when the hook exits non-zero or
    does not report an existing Chromium binary.

    Args:
        tmpdir: Scratch directory under which the data/ tree is created.

    Returns:
        env dict (a copy of os.environ) with DATA_DIR, LIB_DIR, MACHINE_TYPE,
        NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_EXTENSIONS_DIR, CRAWLS_DIR,
        SNAPSHOTS_DIR and CHROME_BINARY set.
    """
    import platform
    import sys
    from datetime import datetime

    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"

    # Create proper directory structure matching real ArchiveBox layout
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'

    # Extensions go under personas/Default/
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'

    # User data goes under users/{username}/
    date_str = datetime.now().strftime('%Y%m%d')
    users_dir = data_dir / 'users' / 'testuser'
    crawls_dir = users_dir / 'crawls' / date_str
    snapshots_dir = users_dir / 'snapshots' / date_str

    # Create all directories
    for directory in (
        node_modules_dir,
        npm_bin_dir,
        chrome_extensions_dir,
        crawls_dir,
        snapshots_dir,
    ):
        directory.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
        'CRAWLS_DIR': str(crawls_dir),
        'SNAPSHOTS_DIR': str(snapshots_dir),
    })

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # Run it with the interpreter executing the tests: a bare 'python' may be
    # missing from PATH or resolve to a different environment.
    result = subprocess.run(
        [sys.executable, str(CHROME_INSTALL_HOOK)],
        capture_output=True, text=True, timeout=120, env=env
    )
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (number, list, string) has no .get()
        if isinstance(record, dict) and record.get('type') == 'Binary' and record.get('abspath'):
            chrome_binary = record['abspath']
            break

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
TEST_URL = 'https://www.filmin.es/'
@@ -420,54 +327,6 @@ const puppeteer = require('puppeteer-core');
pass
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium and return (process, cdp_url) or raise on failure.

    Starts the chrome launch hook with `node` inside chrome_dir and polls once
    per second (up to 20 times) for a non-empty chrome_dir/cdp_url.txt.

    Args:
        env: Environment passed to the hook process.
        chrome_dir: Working directory for the hook; created if missing.
        crawl_id: Value passed to the hook as --crawl-id.

    Returns:
        Tuple of (subprocess.Popen for the hook, CDP URL string).

    Raises:
        RuntimeError: If the hook exits before a CDP URL is available, or no
            CDP URL appears within the polling window.
    """
    chrome_dir.mkdir(parents=True, exist_ok=True)
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Wait for Chromium to launch and CDP URL to be available
    cdp_file = chrome_dir / 'cdp_url.txt'
    cdp_url = None
    for _ in range(20):
        if chrome_launch_process.poll() is not None:
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            # The file can exist before its content is written; keep polling
            # instead of giving up immediately on an empty read.
            if cdp_url:
                break
        time.sleep(1)

    if not cdp_url:
        chrome_launch_process.kill()
        # Reap the killed hook so it does not linger as a zombie. wait() is
        # used rather than communicate() because a surviving Chromium child
        # could keep the pipes open and block a read indefinitely.
        try:
            chrome_launch_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            pass
        raise RuntimeError("Chromium CDP URL not found after 20s")

    return chrome_launch_process, cdp_url
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
    """Clean up Chromium process.

    Sends SIGTERM to the launch hook and waits up to 5 seconds for it to exit,
    force-kills it if it is still running, then SIGKILLs the PID recorded in
    chrome_dir/chrome.pid (if that file exists). All steps are best-effort:
    failures are ignored so cleanup never masks a test result.

    Args:
        chrome_launch_process: subprocess.Popen returned by the launch helper.
        chrome_dir: Directory that may contain chrome.pid.
    """
    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except Exception:
        # Best-effort, but do not swallow KeyboardInterrupt/SystemExit the way
        # a bare `except:` would.
        pass

    # A hook that ignored SIGTERM would otherwise be left running.
    try:
        if chrome_launch_process.poll() is None:
            chrome_launch_process.kill()
            chrome_launch_process.wait(timeout=5)
    except Exception:
        pass

    chrome_pid_file = chrome_dir / 'chrome.pid'
    if chrome_pid_file.exists():
        try:
            chrome_pid = int(chrome_pid_file.read_text().strip())
            os.kill(chrome_pid, signal.SIGKILL)
        except (OSError, ValueError):
            pass
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
"""Check if cookie consent elements are visible on a page.

View File

@@ -2,7 +2,6 @@
Integration tests for mercury plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
@@ -19,9 +18,15 @@ import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None)
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
PLUGINS_ROOT,
)
PLUGIN_DIR = get_plugin_dir(__file__)
MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*')
TEST_URL = 'https://example.com'
def test_hook_script_exists():

View File

@@ -21,29 +21,22 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_plugin_dir,
get_hook_script,
run_hook_and_parse,
LIB_DIR,
NODE_MODULES_DIR,
PLUGINS_ROOT,
)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
PLUGIN_DIR = get_plugin_dir(__file__)
PDF_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_pdf.*')
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
TEST_URL = 'https://example.com'
# Get LIB_DIR for NODE_MODULES_DIR
def get_lib_dir():
    """Return the lib directory for tests as a Path.

    Uses the LIB_DIR environment variable when it is set to a non-empty
    value, and STORAGE_CONFIG.LIB_DIR otherwise.
    """
    from archivebox.config.common import STORAGE_CONFIG

    env_value = os.environ.get('LIB_DIR')
    lib_dir = env_value if env_value else str(STORAGE_CONFIG.LIB_DIR)
    return Path(lib_dir)
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
def get_test_env():
    """Build a subprocess environment: os.environ plus NODE_MODULES_DIR and LIB_DIR."""
    test_env = dict(os.environ)
    test_env.update(
        NODE_MODULES_DIR=str(NODE_MODULES_DIR),
        LIB_DIR=str(LIB_DIR),
    )
    return test_env
def test_hook_script_exists():
"""Verify on_Snapshot hook exists."""

View File

@@ -2,7 +2,6 @@
Integration tests for readability plugin
Tests verify:
pass
1. Validate hook checks for readability-extractor binary
2. Verify deps with abx-pkg
3. Plugin reports missing dependency correctly
@@ -18,10 +17,15 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
PLUGINS_ROOT,
)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*'))
PLUGIN_DIR = get_plugin_dir(__file__)
READABILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_readability.*')
TEST_URL = 'https://example.com'

View File

@@ -20,28 +20,20 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_plugin_dir,
get_hook_script,
run_hook_and_parse,
LIB_DIR,
NODE_MODULES_DIR,
)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
PLUGIN_DIR = get_plugin_dir(__file__)
SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
TEST_URL = 'https://example.com'
# Get LIB_DIR for NODE_MODULES_DIR
def get_lib_dir():
    """Pick the lib directory for tests.

    Prefers a non-empty LIB_DIR environment variable and falls back to
    STORAGE_CONFIG.LIB_DIR.
    """
    from archivebox.config.common import STORAGE_CONFIG

    from_env = os.environ.get('LIB_DIR')
    if not from_env:
        return Path(str(STORAGE_CONFIG.LIB_DIR))
    return Path(from_env)
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
def get_test_env():
    """Copy the current environment and point it at the test NODE_MODULES_DIR / LIB_DIR."""
    overrides = {
        'NODE_MODULES_DIR': str(NODE_MODULES_DIR),
        'LIB_DIR': str(LIB_DIR),
    }
    merged = os.environ.copy()
    for key, value in overrides.items():
        merged[key] = value
    return merged
def test_hook_script_exists():
"""Verify on_Snapshot hook exists."""

View File

@@ -77,27 +77,9 @@ def has_staticfile_output() -> bool:
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
# Chrome binary search paths
CHROMIUM_BINARY_NAMES_LINUX = [
'chromium', 'chromium-browser', 'chromium-browser-beta',
'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
]
CHROME_BINARY_NAMES_LINUX = [
'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
]
CHROME_BINARY_NAMES_MACOS = [
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
]
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
ALL_CHROME_BINARIES = (
CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
)
# Chrome session directory (relative to extractor output dir)
# Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for.
# The centralized Chrome binary search is in chrome_utils.js findChromium().
CHROME_SESSION_DIR = '../chrome'

View File

@@ -6,6 +6,8 @@ Tests verify:
2. CLI-based singlefile extraction works
3. Dependencies available via abx-pkg
4. Output contains valid HTML
5. Connects to Chrome session via CDP when available
6. Works with extensions loaded (ublock, etc.)
"""
import json
@@ -16,10 +18,17 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_plugin_dir,
get_hook_script,
setup_chrome_session,
cleanup_chrome,
)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None)
PLUGIN_DIR = get_plugin_dir(__file__)
SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
TEST_URL = "https://example.com"
@@ -52,7 +61,7 @@ def test_singlefile_cli_archives_example_com():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
env = get_test_env()
env['SINGLEFILE_ENABLED'] = 'true'
# Run singlefile snapshot hook
@@ -78,5 +87,89 @@ def test_singlefile_cli_archives_example_com():
assert 'Example Domain' in html_content, "Output should contain example.com content"
def test_singlefile_with_chrome_session():
    """Test singlefile connects to existing Chrome session via CDP.

    When a Chrome session exists (chrome/cdp_url.txt), singlefile should
    connect to it instead of launching a new Chrome instance.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Pre-bind so the `finally` block cannot raise NameError (and hide the
        # real error) when setup_chrome_session itself fails.
        chrome_launch_process = None
        chrome_pid = None
        try:
            # Set up Chrome session using shared helper
            chrome_launch_process, chrome_pid, _snapshot_chrome_dir = setup_chrome_session(
                tmpdir=tmpdir,
                crawl_id='singlefile-test-crawl',
                snapshot_id='singlefile-test-snap',
                test_url=TEST_URL,
                navigate=False,  # Don't navigate, singlefile will do that
                timeout=20,
            )

            # singlefile looks for ../chrome/cdp_url.txt relative to cwd
            # So we need to run from a directory that has ../chrome pointing to our chrome dir
            singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
            singlefile_output_dir.mkdir(parents=True, exist_ok=True)

            # Create symlink so singlefile can find the chrome session
            chrome_link = singlefile_output_dir.parent / 'chrome'
            if not chrome_link.exists():
                chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome')

            env = get_test_env()
            env['SINGLEFILE_ENABLED'] = 'true'
            env['CHROME_HEADLESS'] = 'true'

            # Run singlefile - it should find and use the existing Chrome session
            result = subprocess.run(
                ['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-test-snap'],
                cwd=str(singlefile_output_dir),
                capture_output=True,
                text=True,
                env=env,
                timeout=120
            )

            # Verify output
            output_file = singlefile_output_dir / 'singlefile.html'
            if output_file.exists():
                html_content = output_file.read_text()
                assert len(html_content) > 500, "Output file too small"
                assert 'Example Domain' in html_content, "Should contain example.com content"
            else:
                # If singlefile couldn't connect to Chrome, it may have failed
                # Check if it mentioned browser-server in its args (indicating it tried to use CDP)
                assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \
                    f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
        finally:
            # Only clean up a session that was actually started
            if chrome_launch_process is not None:
                cleanup_chrome(chrome_launch_process, chrome_pid)
def test_singlefile_disabled_skips():
    """With SINGLEFILE_ENABLED=False the hook must exit 0 and print no JSONL lines."""
    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        hook_env = get_test_env()
        hook_env['SINGLEFILE_ENABLED'] = 'False'

        command = [
            'python',
            str(SNAPSHOT_HOOK),
            f'--url={TEST_URL}',
            '--snapshot-id=test-disabled',
        ]
        result = subprocess.run(
            command,
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        assert result.returncode == 0, f"Should exit 0 when disabled: {result.stderr}"

        # Should NOT emit JSONL when disabled: collect stdout lines that look like JSON objects
        jsonl_lines = []
        for line in result.stdout.strip().split('\n'):
            if line.strip().startswith('{'):
                jsonl_lines.append(line)
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when disabled, but got: {jsonl_lines}"
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -2,7 +2,6 @@
Integration tests for title plugin
Tests verify:
pass
1. Plugin script exists
2. Node.js is available
3. Title extraction works for real example.com
@@ -20,9 +19,15 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
)
PLUGIN_DIR = Path(__file__).parent.parent
TITLE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_title.*'), None)
PLUGIN_DIR = get_plugin_dir(__file__)
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
TEST_URL = 'https://example.com'

View File

@@ -16,184 +16,25 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_test_env,
launch_chromium_session,
kill_chromium_session,
CHROME_LAUNCH_HOOK,
PLUGINS_ROOT,
)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure matching real ArchiveBox data dir:
        <tmpdir>/data/
            lib/
                arm64-darwin/   (or x86_64-linux, etc.)
                    npm/
                        .bin/
                        node_modules/
            personas/
                Default/
                    chrome_extensions/
            users/
                testuser/
                    crawls/<YYYYMMDD>/
                    snapshots/<YYYYMMDD>/

    Calls chrome install hook which handles puppeteer-core and chromium installation.
    The calling test is skipped (pytest.skip) when the hook exits non-zero or
    does not report an existing Chromium binary.

    Args:
        tmpdir: Scratch directory under which the data/ tree is created.

    Returns:
        env dict (a copy of os.environ) with DATA_DIR, LIB_DIR, MACHINE_TYPE,
        NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_EXTENSIONS_DIR, CRAWLS_DIR,
        SNAPSHOTS_DIR and CHROME_BINARY set, plus CHROME_HEADLESS='true'
        unless CHROME_HEADLESS is already present in os.environ.
    """
    import platform
    import sys
    from datetime import datetime

    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"

    # Create proper directory structure matching real ArchiveBox layout
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'

    # Extensions go under personas/Default/
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'

    # User data goes under users/{username}/
    date_str = datetime.now().strftime('%Y%m%d')
    users_dir = data_dir / 'users' / 'testuser'
    crawls_dir = users_dir / 'crawls' / date_str
    snapshots_dir = users_dir / 'snapshots' / date_str

    # Create all directories
    for directory in (
        node_modules_dir,
        npm_bin_dir,
        chrome_extensions_dir,
        crawls_dir,
        snapshots_dir,
    ):
        directory.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
        'CRAWLS_DIR': str(crawls_dir),
        'SNAPSHOTS_DIR': str(snapshots_dir),
    })

    # Only set headless if not already in environment (allow override for debugging)
    if 'CHROME_HEADLESS' not in os.environ:
        env['CHROME_HEADLESS'] = 'true'

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # Run it with the interpreter executing the tests: a bare 'python' may be
    # missing from PATH or resolve to a different environment.
    result = subprocess.run(
        [sys.executable, str(CHROME_INSTALL_HOOK)],
        capture_output=True, text=True, timeout=120, env=env
    )
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (number, list, string) has no .get()
        if isinstance(record, dict) and record.get('type') == 'Binary' and record.get('abspath'):
            chrome_binary = record['abspath']
            break

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium and return (process, cdp_url).

    Starts the chrome launch hook with `node` inside chrome_dir, then polls
    once per second (up to 30 times) until chrome_dir/cdp_url.txt has been
    read and chrome_dir/extensions.json exists. Only a missing CDP URL is
    treated as fatal; a missing extensions.json after the polling window is
    not an error.

    Args:
        env: Environment passed to the hook process.
        chrome_dir: Working directory for the hook; created if missing.
        crawl_id: Value passed to the hook as --crawl-id.

    Returns:
        Tuple of (subprocess.Popen for the hook, CDP URL string).

    Raises:
        RuntimeError: If the hook exits during polling, or no CDP URL was
            obtained within the polling window.
    """
    chrome_dir.mkdir(parents=True, exist_ok=True)
    process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    cdp_file = chrome_dir / 'cdp_url.txt'
    ext_file = chrome_dir / 'extensions.json'
    cdp_url = None
    extensions_ready = False
    for _ in range(30):
        if process.poll() is not None:
            stdout, stderr = process.communicate()
            raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}")
        if cdp_file.exists() and not cdp_url:
            cdp_url = cdp_file.read_text().strip()
        if ext_file.exists():
            extensions_ready = True
        if cdp_url and extensions_ready:
            break
        time.sleep(1)

    if not cdp_url:
        process.kill()
        stdout, stderr = process.communicate()
        raise RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}")

    # Print chrome launch hook output for debugging
    import select
    if hasattr(select, 'poll'):
        # Read any available stderr without blocking
        import fcntl
        import os as os_module
        fd = process.stderr.fileno()
        fl = fcntl.fcntl(fd, fcntl.F_GETFL)
        # NOTE: the stderr pipe is left in non-blocking mode after this call.
        fcntl.fcntl(fd, fcntl.F_SETFL, fl | os_module.O_NONBLOCK)
        try:
            stderr_output = process.stderr.read()
            if stderr_output:
                print(f"[Chrome Launch Hook Output]\n{stderr_output}")
        except Exception:
            # Debug output is optional; ignore read failures but let
            # KeyboardInterrupt/SystemExit propagate.
            pass

    return process, cdp_url
def kill_chrome(process, chrome_dir: Path):
    """Kill Chromium process.

    Sends SIGTERM to the launch hook and waits up to 5 seconds for it to exit,
    force-kills it if it is still running, then SIGKILLs the PID recorded in
    chrome_dir/chrome.pid (if that file exists). All steps are best-effort:
    failures are ignored so cleanup never masks a test result.

    Args:
        process: subprocess.Popen returned by the launch helper.
        chrome_dir: Directory that may contain chrome.pid.
    """
    try:
        process.send_signal(signal.SIGTERM)
        process.wait(timeout=5)
    except Exception:
        # Best-effort, but do not swallow KeyboardInterrupt/SystemExit the way
        # a bare `except:` would.
        pass

    # A hook that ignored SIGTERM would otherwise be left running.
    try:
        if process.poll() is None:
            process.kill()
            process.wait(timeout=5)
    except Exception:
        pass

    pid_file = chrome_dir / 'chrome.pid'
    if pid_file.exists():
        try:
            os.kill(int(pid_file.read_text().strip()), signal.SIGKILL)
        except Exception:
            # Unreadable/garbled pid file or an already-dead process
            pass
# Alias for backward compatibility with existing test names
launch_chrome = launch_chromium_session
kill_chrome = kill_chromium_session
class TestTwoCaptcha:

View File

@@ -12,6 +12,14 @@ from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
setup_test_env,
launch_chromium_session,
kill_chromium_session,
CHROME_LAUNCH_HOOK,
PLUGINS_ROOT,
)
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
@@ -157,64 +165,6 @@ def test_large_extension_size():
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium and return (process, cdp_url) or raise on failure.

    Starts the chrome launch hook with `node` inside chrome_dir and polls once
    per second (up to 20 times) for a non-empty chrome_dir/cdp_url.txt.

    Args:
        env: Environment passed to the hook process.
        chrome_dir: Working directory for the hook; created if missing.
        crawl_id: Value passed to the hook as --crawl-id.

    Returns:
        Tuple of (subprocess.Popen for the hook, CDP URL string).

    Raises:
        RuntimeError: If the hook exits before a CDP URL is available, or no
            CDP URL appears within the polling window.
    """
    import time

    chrome_dir.mkdir(parents=True, exist_ok=True)
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Wait for Chromium to launch and CDP URL to be available
    cdp_file = chrome_dir / 'cdp_url.txt'
    cdp_url = None
    for _ in range(20):
        if chrome_launch_process.poll() is not None:
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            # The file can exist before its content is written; keep polling
            # instead of giving up immediately on an empty read.
            if cdp_url:
                break
        time.sleep(1)

    if not cdp_url:
        chrome_launch_process.kill()
        # Reap the killed hook so it does not linger as a zombie. wait() is
        # used rather than communicate() because a surviving Chromium child
        # could keep the pipes open and block a read indefinitely.
        try:
            chrome_launch_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            pass
        raise RuntimeError("Chromium CDP URL not found after 20s")

    return chrome_launch_process, cdp_url
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
    """Clean up Chromium process.

    Sends SIGTERM to the launch hook and waits up to 5 seconds for it to exit,
    force-kills it if it is still running, then SIGKILLs the PID recorded in
    chrome_dir/chrome.pid (if that file exists). All steps are best-effort:
    failures are ignored so cleanup never masks a test result.

    Args:
        chrome_launch_process: subprocess.Popen returned by the launch helper.
        chrome_dir: Directory that may contain chrome.pid.
    """
    import signal

    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except Exception:
        # Best-effort, but do not swallow KeyboardInterrupt/SystemExit the way
        # a bare `except:` would.
        pass

    # A hook that ignored SIGTERM would otherwise be left running.
    try:
        if chrome_launch_process.poll() is None:
            chrome_launch_process.kill()
            chrome_launch_process.wait(timeout=5)
    except Exception:
        pass

    chrome_pid_file = chrome_dir / 'chrome.pid'
    if chrome_pid_file.exists():
        try:
            chrome_pid = int(chrome_pid_file.read_text().strip())
            os.kill(chrome_pid, signal.SIGKILL)
        except (OSError, ValueError):
            pass
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
"""Check ad blocking effectiveness by counting ad elements on page.
@@ -350,103 +300,6 @@ const puppeteer = require('puppeteer-core');
return json.loads(output_lines[-1])
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure matching real ArchiveBox data dir:
        <tmpdir>/data/
            lib/
                arm64-darwin/   (or x86_64-linux, etc.)
                    npm/
                        .bin/
                        node_modules/
            personas/
                Default/
                    chrome_extensions/
            users/
                testuser/
                    crawls/<YYYYMMDD>/
                    snapshots/<YYYYMMDD>/

    Calls chrome install hook which handles puppeteer-core and chromium installation.
    The calling test is skipped (pytest.skip) when the hook exits non-zero or
    does not report an existing Chromium binary.

    Args:
        tmpdir: Scratch directory under which the data/ tree is created.

    Returns:
        env dict (a copy of os.environ) with DATA_DIR, LIB_DIR, MACHINE_TYPE,
        NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_EXTENSIONS_DIR, CRAWLS_DIR,
        SNAPSHOTS_DIR and CHROME_BINARY set.
    """
    import platform
    import sys
    from datetime import datetime

    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"

    # Create proper directory structure matching real ArchiveBox layout
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'

    # Extensions go under personas/Default/
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'

    # User data goes under users/{username}/
    date_str = datetime.now().strftime('%Y%m%d')
    users_dir = data_dir / 'users' / 'testuser'
    crawls_dir = users_dir / 'crawls' / date_str
    snapshots_dir = users_dir / 'snapshots' / date_str

    # Create all directories
    for directory in (
        node_modules_dir,
        npm_bin_dir,
        chrome_extensions_dir,
        crawls_dir,
        snapshots_dir,
    ):
        directory.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
        'CRAWLS_DIR': str(crawls_dir),
        'SNAPSHOTS_DIR': str(snapshots_dir),
    })

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # Run it with the interpreter executing the tests: a bare 'python' may be
    # missing from PATH or resolve to a different environment.
    result = subprocess.run(
        [sys.executable, str(CHROME_INSTALL_HOOK)],
        capture_output=True, text=True, timeout=120, env=env
    )
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (number, list, string) has no .get()
        if isinstance(record, dict) and record.get('type') == 'Binary' and record.get('abspath'):
            chrome_binary = record['abspath']
            break

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
# Test URL: Yahoo has many ads that uBlock should block
TEST_URL = 'https://www.yahoo.com/'