mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 01:15:57 +10:00
Consolidate Chrome test helpers across all plugin tests (#1738)
<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes. --> # Summary <!--e.g. This PR fixes ABC or adds the ability to do XYZ...--> # Related issues <!-- e.g. #123 or Roadmap goal # https://github.com/pirate/ArchiveBox/wiki/Roadmap --> # Changes these areas - [ ] Bugfixes - [ ] Feature behavior - [ ] Command line interface - [ ] Configuration options - [ ] Internal architecture - [ ] Snapshot data layout on disk
This commit is contained in:
@@ -1333,6 +1333,83 @@ function getExtensionsDir() {
|
||||
path.join(dataDir, 'personas', persona, 'chrome_extensions');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get machine type string for platform-specific paths.
|
||||
* Matches Python's archivebox.config.paths.get_machine_type()
|
||||
*
|
||||
* @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin')
|
||||
*/
|
||||
function getMachineType() {
|
||||
if (process.env.MACHINE_TYPE) {
|
||||
return process.env.MACHINE_TYPE;
|
||||
}
|
||||
|
||||
let machine = process.arch;
|
||||
const system = process.platform;
|
||||
|
||||
// Normalize machine type to match Python's convention
|
||||
if (machine === 'arm64' || machine === 'aarch64') {
|
||||
machine = 'arm64';
|
||||
} else if (machine === 'x64' || machine === 'x86_64' || machine === 'amd64') {
|
||||
machine = 'x86_64';
|
||||
} else if (machine === 'ia32' || machine === 'x86') {
|
||||
machine = 'x86';
|
||||
}
|
||||
|
||||
return `${machine}-${system}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get LIB_DIR path for platform-specific binaries.
|
||||
* Returns DATA_DIR/lib/MACHINE_TYPE/
|
||||
*
|
||||
* @returns {string} - Absolute path to lib directory
|
||||
*/
|
||||
function getLibDir() {
|
||||
if (process.env.LIB_DIR) {
|
||||
return process.env.LIB_DIR;
|
||||
}
|
||||
const dataDir = getEnv('DATA_DIR', './data');
|
||||
const machineType = getMachineType();
|
||||
return path.join(dataDir, 'lib', machineType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get NODE_MODULES_DIR path for npm packages.
|
||||
* Returns LIB_DIR/npm/node_modules/
|
||||
*
|
||||
* @returns {string} - Absolute path to node_modules directory
|
||||
*/
|
||||
function getNodeModulesDir() {
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
return process.env.NODE_MODULES_DIR;
|
||||
}
|
||||
return path.join(getLibDir(), 'npm', 'node_modules');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all test environment paths as a JSON object.
|
||||
* This is the single source of truth for path calculations - Python calls this
|
||||
* to avoid duplicating path logic.
|
||||
*
|
||||
* @returns {Object} - Object with all test environment paths
|
||||
*/
|
||||
function getTestEnv() {
|
||||
const dataDir = getEnv('DATA_DIR', './data');
|
||||
const machineType = getMachineType();
|
||||
const libDir = getLibDir();
|
||||
const nodeModulesDir = getNodeModulesDir();
|
||||
|
||||
return {
|
||||
DATA_DIR: dataDir,
|
||||
MACHINE_TYPE: machineType,
|
||||
LIB_DIR: libDir,
|
||||
NODE_MODULES_DIR: nodeModulesDir,
|
||||
NPM_BIN_DIR: path.join(libDir, 'npm', '.bin'),
|
||||
CHROME_EXTENSIONS_DIR: getExtensionsDir(),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Install a Chrome extension with caching support.
|
||||
*
|
||||
@@ -1442,8 +1519,13 @@ module.exports = {
|
||||
getExtensionPaths,
|
||||
waitForExtensionTarget,
|
||||
getExtensionTargets,
|
||||
// Shared extension installer utilities
|
||||
// Shared path utilities (single source of truth for Python/JS)
|
||||
getMachineType,
|
||||
getLibDir,
|
||||
getNodeModulesDir,
|
||||
getExtensionsDir,
|
||||
getTestEnv,
|
||||
// Shared extension installer utilities
|
||||
installExtensionWithCache,
|
||||
// Deprecated - use enableExtensions option instead
|
||||
getExtensionLaunchArgs,
|
||||
@@ -1457,18 +1539,31 @@ if (require.main === module) {
|
||||
console.log('Usage: chrome_utils.js <command> [args...]');
|
||||
console.log('');
|
||||
console.log('Commands:');
|
||||
console.log(' findChromium');
|
||||
console.log(' installChromium');
|
||||
console.log(' installPuppeteerCore [npm_prefix]');
|
||||
console.log(' launchChromium [output_dir] [extension_paths_json]');
|
||||
console.log(' killChrome <pid> [output_dir]');
|
||||
console.log(' killZombieChrome [data_dir]');
|
||||
console.log(' getExtensionId <path>');
|
||||
console.log(' loadExtensionManifest <path>');
|
||||
console.log(' getExtensionLaunchArgs <extensions_json>');
|
||||
console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
|
||||
console.log(' getExtensionsDir');
|
||||
console.log(' installExtensionWithCache <webstore_id> <name>');
|
||||
console.log(' findChromium Find Chrome/Chromium binary');
|
||||
console.log(' installChromium Install Chromium via @puppeteer/browsers');
|
||||
console.log(' installPuppeteerCore Install puppeteer-core npm package');
|
||||
console.log(' launchChromium Launch Chrome with CDP debugging');
|
||||
console.log(' killChrome <pid> Kill Chrome process by PID');
|
||||
console.log(' killZombieChrome Clean up zombie Chrome processes');
|
||||
console.log('');
|
||||
console.log(' getMachineType Get machine type (e.g., x86_64-linux)');
|
||||
console.log(' getLibDir Get LIB_DIR path');
|
||||
console.log(' getNodeModulesDir Get NODE_MODULES_DIR path');
|
||||
console.log(' getExtensionsDir Get Chrome extensions directory');
|
||||
console.log(' getTestEnv Get all paths as JSON (for tests)');
|
||||
console.log('');
|
||||
console.log(' getExtensionId <path> Get extension ID from unpacked path');
|
||||
console.log(' loadExtensionManifest Load extension manifest.json');
|
||||
console.log(' loadOrInstallExtension Load or install an extension');
|
||||
console.log(' installExtensionWithCache Install extension with caching');
|
||||
console.log('');
|
||||
console.log('Environment variables:');
|
||||
console.log(' DATA_DIR Base data directory');
|
||||
console.log(' LIB_DIR Library directory (computed if not set)');
|
||||
console.log(' MACHINE_TYPE Machine type override');
|
||||
console.log(' NODE_MODULES_DIR Node modules directory');
|
||||
console.log(' CHROME_BINARY Chrome binary path');
|
||||
console.log(' CHROME_EXTENSIONS_DIR Extensions directory');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
@@ -1581,11 +1676,31 @@ if (require.main === module) {
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getMachineType': {
|
||||
console.log(getMachineType());
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getLibDir': {
|
||||
console.log(getLibDir());
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getNodeModulesDir': {
|
||||
console.log(getNodeModulesDir());
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getExtensionsDir': {
|
||||
console.log(getExtensionsDir());
|
||||
break;
|
||||
}
|
||||
|
||||
case 'getTestEnv': {
|
||||
console.log(JSON.stringify(getTestEnv(), null, 2));
|
||||
break;
|
||||
}
|
||||
|
||||
case 'installExtensionWithCache': {
|
||||
const [webstore_id, name] = commandArgs;
|
||||
if (!webstore_id || !name) {
|
||||
|
||||
@@ -2,25 +2,69 @@
|
||||
Shared Chrome test helpers for plugin integration tests.
|
||||
|
||||
This module provides common utilities for Chrome-based plugin tests, reducing
|
||||
duplication across test files. It uses the JavaScript utilities from chrome_utils.js
|
||||
where appropriate.
|
||||
duplication across test files. Functions delegate to chrome_utils.js (the single
|
||||
source of truth) with Python fallbacks.
|
||||
|
||||
Function names match the JS equivalents in snake_case:
|
||||
JS: getMachineType() -> Python: get_machine_type()
|
||||
JS: getLibDir() -> Python: get_lib_dir()
|
||||
JS: getNodeModulesDir() -> Python: get_node_modules_dir()
|
||||
JS: getExtensionsDir() -> Python: get_extensions_dir()
|
||||
JS: findChromium() -> Python: find_chromium()
|
||||
JS: killChrome() -> Python: kill_chrome()
|
||||
JS: getTestEnv() -> Python: get_test_env()
|
||||
|
||||
Usage:
|
||||
# Path helpers (delegate to chrome_utils.js):
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
setup_chrome_session,
|
||||
cleanup_chrome,
|
||||
find_chromium_binary,
|
||||
get_node_modules_dir,
|
||||
get_test_env, # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE
|
||||
get_machine_type, # e.g., 'x86_64-linux', 'arm64-darwin'
|
||||
get_lib_dir, # Path to lib dir
|
||||
get_node_modules_dir, # Path to node_modules
|
||||
get_extensions_dir, # Path to chrome extensions
|
||||
find_chromium, # Find Chrome/Chromium binary
|
||||
kill_chrome, # Kill Chrome process by PID
|
||||
)
|
||||
|
||||
# Test file helpers:
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir, # get_plugin_dir(__file__) -> plugin dir Path
|
||||
get_hook_script, # Find hook script by glob pattern
|
||||
PLUGINS_ROOT, # Path to plugins root
|
||||
LIB_DIR, # Path to lib dir (lazy-loaded)
|
||||
NODE_MODULES_DIR, # Path to node_modules (lazy-loaded)
|
||||
)
|
||||
|
||||
# For Chrome session tests:
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_chrome_session, # Full Chrome + tab setup
|
||||
cleanup_chrome, # Cleanup by PID
|
||||
chrome_session, # Context manager
|
||||
)
|
||||
|
||||
# For extension tests:
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env, # Full dir structure + Chrome install
|
||||
launch_chromium_session, # Launch Chrome, return CDP URL
|
||||
kill_chromium_session, # Cleanup Chrome
|
||||
)
|
||||
|
||||
# Run hooks and parse JSONL:
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
run_hook, # Run hook, return (returncode, stdout, stderr)
|
||||
parse_jsonl_output, # Parse JSONL from stdout
|
||||
)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Optional
|
||||
from typing import Tuple, Optional, List, Dict, Any
|
||||
from contextlib import contextmanager
|
||||
|
||||
|
||||
@@ -29,88 +73,623 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
|
||||
|
||||
# Hook script locations
|
||||
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
|
||||
|
||||
|
||||
def get_node_modules_dir() -> Path:
|
||||
"""Get NODE_MODULES_DIR for tests, checking env first.
|
||||
# =============================================================================
|
||||
# Path Helpers - delegates to chrome_utils.js with Python fallback
|
||||
# Function names match JS: getMachineType -> get_machine_type, etc.
|
||||
# =============================================================================
|
||||
|
||||
Returns the path to the node_modules directory, checking:
|
||||
1. NODE_MODULES_DIR environment variable
|
||||
2. Computed from LIB_DIR via ArchiveBox config
|
||||
|
||||
def _call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]:
|
||||
"""Call chrome_utils.js CLI command (internal helper).
|
||||
|
||||
This is the central dispatch for calling the JS utilities from Python.
|
||||
All path calculations and Chrome operations are centralized in chrome_utils.js
|
||||
to ensure consistency between Python and JavaScript code.
|
||||
|
||||
Args:
|
||||
command: The CLI command (e.g., 'findChromium', 'getTestEnv')
|
||||
*args: Additional command arguments
|
||||
env: Environment dict (default: current env)
|
||||
|
||||
Returns:
|
||||
Tuple of (returncode, stdout, stderr)
|
||||
"""
|
||||
cmd = ['node', str(CHROME_UTILS), command] + list(args)
|
||||
result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env or os.environ.copy()
|
||||
)
|
||||
return result.returncode, result.stdout, result.stderr
|
||||
|
||||
|
||||
def get_plugin_dir(test_file: str) -> Path:
|
||||
"""Get the plugin directory from a test file path.
|
||||
|
||||
Usage:
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
|
||||
Args:
|
||||
test_file: The __file__ of the test module (e.g., test_screenshot.py)
|
||||
|
||||
Returns:
|
||||
Path to the plugin directory (e.g., plugins/screenshot/)
|
||||
"""
|
||||
return Path(test_file).parent.parent
|
||||
|
||||
|
||||
def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]:
|
||||
"""Find a hook script in a plugin directory by pattern.
|
||||
|
||||
Usage:
|
||||
HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
|
||||
|
||||
Args:
|
||||
plugin_dir: Path to the plugin directory
|
||||
pattern: Glob pattern to match
|
||||
|
||||
Returns:
|
||||
Path to the hook script or None if not found
|
||||
"""
|
||||
matches = list(plugin_dir.glob(pattern))
|
||||
return matches[0] if matches else None
|
||||
|
||||
|
||||
def get_machine_type() -> str:
|
||||
"""Get machine type string (e.g., 'x86_64-linux', 'arm64-darwin').
|
||||
|
||||
Matches JS: getMachineType()
|
||||
|
||||
Tries chrome_utils.js first, falls back to Python computation.
|
||||
"""
|
||||
# Try JS first (single source of truth)
|
||||
returncode, stdout, stderr = _call_chrome_utils('getMachineType')
|
||||
if returncode == 0 and stdout.strip():
|
||||
return stdout.strip()
|
||||
|
||||
# Fallback to Python computation
|
||||
if os.environ.get('MACHINE_TYPE'):
|
||||
return os.environ['MACHINE_TYPE']
|
||||
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
return f"{machine}-{system}"
|
||||
|
||||
|
||||
def get_lib_dir() -> Path:
|
||||
"""Get LIB_DIR path for platform-specific binaries.
|
||||
|
||||
Matches JS: getLibDir()
|
||||
|
||||
Tries chrome_utils.js first, falls back to Python computation.
|
||||
"""
|
||||
# Try JS first
|
||||
returncode, stdout, stderr = _call_chrome_utils('getLibDir')
|
||||
if returncode == 0 and stdout.strip():
|
||||
return Path(stdout.strip())
|
||||
|
||||
# Fallback to Python
|
||||
if os.environ.get('LIB_DIR'):
|
||||
return Path(os.environ['LIB_DIR'])
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
|
||||
def get_node_modules_dir() -> Path:
|
||||
"""Get NODE_MODULES_DIR path for npm packages.
|
||||
|
||||
Matches JS: getNodeModulesDir()
|
||||
|
||||
Tries chrome_utils.js first, falls back to Python computation.
|
||||
"""
|
||||
# Try JS first
|
||||
returncode, stdout, stderr = _call_chrome_utils('getNodeModulesDir')
|
||||
if returncode == 0 and stdout.strip():
|
||||
return Path(stdout.strip())
|
||||
|
||||
# Fallback to Python
|
||||
if os.environ.get('NODE_MODULES_DIR'):
|
||||
return Path(os.environ['NODE_MODULES_DIR'])
|
||||
# Otherwise compute from LIB_DIR
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
lib_dir = get_lib_dir()
|
||||
return lib_dir / 'npm' / 'node_modules'
|
||||
|
||||
|
||||
def get_test_env() -> dict:
|
||||
"""Get environment dict with NODE_MODULES_DIR set correctly for tests.
|
||||
def get_extensions_dir() -> str:
|
||||
"""Get the Chrome extensions directory path.
|
||||
|
||||
Returns a copy of os.environ with NODE_MODULES_DIR added/updated.
|
||||
Use this for all subprocess calls in plugin tests.
|
||||
Matches JS: getExtensionsDir()
|
||||
|
||||
Tries chrome_utils.js first, falls back to Python computation.
|
||||
"""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
|
||||
return env
|
||||
returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir')
|
||||
if returncode == 0 and stdout.strip():
|
||||
return stdout.strip()
|
||||
|
||||
# Fallback to default computation if JS call fails
|
||||
data_dir = os.environ.get('DATA_DIR', './data')
|
||||
persona = os.environ.get('ACTIVE_PERSONA', 'Default')
|
||||
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
|
||||
|
||||
|
||||
def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]:
|
||||
"""Find the Chromium binary using chrome_utils.js findChromium().
|
||||
def find_chromium(data_dir: Optional[str] = None) -> Optional[str]:
|
||||
"""Find the Chromium binary path.
|
||||
|
||||
This uses the centralized findChromium() function which checks:
|
||||
Matches JS: findChromium()
|
||||
|
||||
Uses chrome_utils.js which checks:
|
||||
- CHROME_BINARY env var
|
||||
- @puppeteer/browsers install locations
|
||||
- System Chromium locations
|
||||
- Falls back to Chrome (with warning)
|
||||
|
||||
Args:
|
||||
data_dir: Directory where chromium was installed (contains chromium/ subdir)
|
||||
data_dir: Optional DATA_DIR override
|
||||
|
||||
Returns:
|
||||
Path to Chromium binary or None if not found
|
||||
"""
|
||||
search_dir = data_dir or os.environ.get('DATA_DIR', '.')
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_UTILS), 'findChromium', str(search_dir)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
return result.stdout.strip()
|
||||
env = os.environ.copy()
|
||||
if data_dir:
|
||||
env['DATA_DIR'] = str(data_dir)
|
||||
returncode, stdout, stderr = _call_chrome_utils('findChromium', env=env)
|
||||
if returncode == 0 and stdout.strip():
|
||||
return stdout.strip()
|
||||
return None
|
||||
|
||||
|
||||
def get_extensions_dir() -> str:
|
||||
"""Get the Chrome extensions directory using chrome_utils.js getExtensionsDir().
|
||||
def kill_chrome(pid: int, output_dir: Optional[str] = None) -> bool:
|
||||
"""Kill a Chrome process by PID.
|
||||
|
||||
This uses the centralized path calculation from chrome_utils.js which checks:
|
||||
- CHROME_EXTENSIONS_DIR env var
|
||||
- DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions
|
||||
Matches JS: killChrome()
|
||||
|
||||
Uses chrome_utils.js which handles:
|
||||
- SIGTERM then SIGKILL
|
||||
- Process group killing
|
||||
- Zombie process cleanup
|
||||
|
||||
Args:
|
||||
pid: Process ID to kill
|
||||
output_dir: Optional chrome output directory for PID file cleanup
|
||||
|
||||
Returns:
|
||||
Path to extensions directory
|
||||
True if the kill command succeeded
|
||||
"""
|
||||
args = [str(pid)]
|
||||
if output_dir:
|
||||
args.append(str(output_dir))
|
||||
returncode, stdout, stderr = _call_chrome_utils('killChrome', *args)
|
||||
return returncode == 0
|
||||
|
||||
|
||||
def get_test_env() -> dict:
|
||||
"""Get environment dict with all paths set correctly for tests.
|
||||
|
||||
Matches JS: getTestEnv()
|
||||
|
||||
Tries chrome_utils.js first for path values, builds env dict.
|
||||
Use this for all subprocess calls in plugin tests.
|
||||
"""
|
||||
env = os.environ.copy()
|
||||
|
||||
# Try to get all paths from JS (single source of truth)
|
||||
returncode, stdout, stderr = _call_chrome_utils('getTestEnv')
|
||||
if returncode == 0 and stdout.strip():
|
||||
try:
|
||||
js_env = json.loads(stdout)
|
||||
env.update(js_env)
|
||||
return env
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Fallback to Python computation
|
||||
lib_dir = get_lib_dir()
|
||||
env['LIB_DIR'] = str(lib_dir)
|
||||
env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
|
||||
env['MACHINE_TYPE'] = get_machine_type()
|
||||
return env
|
||||
|
||||
|
||||
# Backward compatibility aliases (deprecated, use new names)
|
||||
find_chromium_binary = find_chromium
|
||||
kill_chrome_via_js = kill_chrome
|
||||
get_machine_type_from_js = get_machine_type
|
||||
get_test_env_from_js = get_test_env
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Module-level constants (lazy-loaded on first access)
|
||||
# Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
|
||||
# =============================================================================
|
||||
|
||||
# These are computed once when first accessed
|
||||
_LIB_DIR: Optional[Path] = None
|
||||
_NODE_MODULES_DIR: Optional[Path] = None
|
||||
|
||||
|
||||
def _get_lib_dir_cached() -> Path:
|
||||
global _LIB_DIR
|
||||
if _LIB_DIR is None:
|
||||
_LIB_DIR = get_lib_dir()
|
||||
return _LIB_DIR
|
||||
|
||||
|
||||
def _get_node_modules_dir_cached() -> Path:
|
||||
global _NODE_MODULES_DIR
|
||||
if _NODE_MODULES_DIR is None:
|
||||
_NODE_MODULES_DIR = get_node_modules_dir()
|
||||
return _NODE_MODULES_DIR
|
||||
|
||||
|
||||
# Module-level constants that can be imported directly
|
||||
# Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
|
||||
class _LazyPath:
|
||||
"""Lazy path that computes value on first access."""
|
||||
def __init__(self, getter):
|
||||
self._getter = getter
|
||||
self._value = None
|
||||
|
||||
def __fspath__(self):
|
||||
if self._value is None:
|
||||
self._value = self._getter()
|
||||
return str(self._value)
|
||||
|
||||
def __truediv__(self, other):
|
||||
if self._value is None:
|
||||
self._value = self._getter()
|
||||
return self._value / other
|
||||
|
||||
def __str__(self):
|
||||
return self.__fspath__()
|
||||
|
||||
def __repr__(self):
|
||||
return f"<LazyPath: {self.__fspath__()}>"
|
||||
|
||||
|
||||
LIB_DIR = _LazyPath(_get_lib_dir_cached)
|
||||
NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Hook Execution Helpers
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def run_hook(
|
||||
hook_script: Path,
|
||||
url: str,
|
||||
snapshot_id: str,
|
||||
cwd: Optional[Path] = None,
|
||||
env: Optional[dict] = None,
|
||||
timeout: int = 60,
|
||||
extra_args: Optional[List[str]] = None,
|
||||
) -> Tuple[int, str, str]:
|
||||
"""Run a hook script and return (returncode, stdout, stderr).
|
||||
|
||||
Usage:
|
||||
returncode, stdout, stderr = run_hook(
|
||||
HOOK_SCRIPT, 'https://example.com', 'test-snap-123',
|
||||
cwd=tmpdir, env=get_test_env()
|
||||
)
|
||||
|
||||
Args:
|
||||
hook_script: Path to the hook script
|
||||
url: URL to process
|
||||
snapshot_id: Snapshot ID
|
||||
cwd: Working directory (default: current dir)
|
||||
env: Environment dict (default: get_test_env())
|
||||
timeout: Timeout in seconds
|
||||
extra_args: Additional arguments to pass
|
||||
|
||||
Returns:
|
||||
Tuple of (returncode, stdout, stderr)
|
||||
"""
|
||||
if env is None:
|
||||
env = get_test_env()
|
||||
|
||||
# Determine interpreter based on file extension
|
||||
if hook_script.suffix == '.py':
|
||||
cmd = ['python', str(hook_script)]
|
||||
elif hook_script.suffix == '.js':
|
||||
cmd = ['node', str(hook_script)]
|
||||
else:
|
||||
cmd = [str(hook_script)]
|
||||
|
||||
cmd.extend([f'--url={url}', f'--snapshot-id={snapshot_id}'])
|
||||
if extra_args:
|
||||
cmd.extend(extra_args)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_UTILS), 'getExtensionsDir'],
|
||||
cmd,
|
||||
cwd=str(cwd) if cwd else None,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
env=get_test_env()
|
||||
env=env,
|
||||
timeout=timeout
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
return result.stdout.strip()
|
||||
# Fallback to default computation if JS call fails
|
||||
data_dir = os.environ.get('DATA_DIR', './data')
|
||||
persona = os.environ.get('ACTIVE_PERSONA', 'Default')
|
||||
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
|
||||
return result.returncode, result.stdout, result.stderr
|
||||
|
||||
|
||||
def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]:
|
||||
"""Parse JSONL output from hook stdout and return the specified record type.
|
||||
|
||||
Usage:
|
||||
result = parse_jsonl_output(stdout)
|
||||
if result and result['status'] == 'succeeded':
|
||||
print("Success!")
|
||||
|
||||
Args:
|
||||
stdout: The stdout from a hook execution
|
||||
record_type: The 'type' field to look for (default: 'ArchiveResult')
|
||||
|
||||
Returns:
|
||||
The parsed JSON dict or None if not found
|
||||
"""
|
||||
for line in stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if not line.startswith('{'):
|
||||
continue
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == record_type:
|
||||
return record
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def run_hook_and_parse(
|
||||
hook_script: Path,
|
||||
url: str,
|
||||
snapshot_id: str,
|
||||
cwd: Optional[Path] = None,
|
||||
env: Optional[dict] = None,
|
||||
timeout: int = 60,
|
||||
extra_args: Optional[List[str]] = None,
|
||||
) -> Tuple[int, Optional[Dict[str, Any]], str]:
|
||||
"""Run a hook and parse its JSONL output.
|
||||
|
||||
Convenience function combining run_hook() and parse_jsonl_output().
|
||||
|
||||
Returns:
|
||||
Tuple of (returncode, parsed_result_or_none, stderr)
|
||||
"""
|
||||
returncode, stdout, stderr = run_hook(
|
||||
hook_script, url, snapshot_id,
|
||||
cwd=cwd, env=env, timeout=timeout, extra_args=extra_args
|
||||
)
|
||||
result = parse_jsonl_output(stdout)
|
||||
return returncode, result, stderr
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Extension Test Helpers
|
||||
# Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for extension tests.
|
||||
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
.bin/
|
||||
node_modules/
|
||||
personas/
|
||||
Default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
|
||||
Args:
|
||||
tmpdir: Base temporary directory for the test
|
||||
|
||||
Returns:
|
||||
Environment dict with all paths set, or pytest.skip() if Chrome install fails
|
||||
"""
|
||||
import pytest
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
# Only set headless if not already in environment (allow override for debugging)
|
||||
if 'CHROME_HEADLESS' not in os.environ:
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
|
||||
|
||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple[subprocess.Popen, str]:
|
||||
"""Launch Chromium and return (process, cdp_url).
|
||||
|
||||
This launches Chrome using the chrome launch hook and waits for the CDP URL
|
||||
to become available. Use this for extension tests that need direct CDP access.
|
||||
|
||||
Args:
|
||||
env: Environment dict (from setup_test_env)
|
||||
chrome_dir: Directory for Chrome to write its files (cdp_url.txt, chrome.pid, etc.)
|
||||
crawl_id: ID for the crawl
|
||||
|
||||
Returns:
|
||||
Tuple of (chrome_launch_process, cdp_url)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If Chrome fails to launch or CDP URL not available after 20s
|
||||
"""
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not cdp_url:
|
||||
chrome_launch_process.kill()
|
||||
raise RuntimeError("Chromium CDP URL not found after 20s")
|
||||
|
||||
return chrome_launch_process, cdp_url
|
||||
|
||||
|
||||
def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None:
|
||||
"""Clean up Chromium process launched by launch_chromium_session.
|
||||
|
||||
Uses chrome_utils.js killChrome for proper process group handling.
|
||||
|
||||
Args:
|
||||
chrome_launch_process: The Popen object from launch_chromium_session
|
||||
chrome_dir: The chrome directory containing chrome.pid
|
||||
"""
|
||||
# First try to terminate the launch process gracefully
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Read PID and use JS to kill with proper cleanup
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
kill_chrome(chrome_pid, str(chrome_dir))
|
||||
except (ValueError, FileNotFoundError):
|
||||
pass
|
||||
|
||||
|
||||
@contextmanager
|
||||
def chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
||||
"""Context manager for Chromium sessions with automatic cleanup.
|
||||
|
||||
Usage:
|
||||
with chromium_session(env, chrome_dir, 'test-crawl') as (process, cdp_url):
|
||||
# Use cdp_url to connect with puppeteer
|
||||
pass
|
||||
# Chromium automatically cleaned up
|
||||
|
||||
Args:
|
||||
env: Environment dict (from setup_test_env)
|
||||
chrome_dir: Directory for Chrome files
|
||||
crawl_id: ID for the crawl
|
||||
|
||||
Yields:
|
||||
Tuple of (chrome_launch_process, cdp_url)
|
||||
"""
|
||||
chrome_launch_process = None
|
||||
try:
|
||||
chrome_launch_process, cdp_url = launch_chromium_session(env, chrome_dir, crawl_id)
|
||||
yield chrome_launch_process, cdp_url
|
||||
finally:
|
||||
if chrome_launch_process:
|
||||
kill_chromium_session(chrome_launch_process, chrome_dir)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tab-based Test Helpers
|
||||
# Used by tab-based tests (infiniscroll, modalcloser)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def setup_chrome_session(
|
||||
@@ -210,25 +789,28 @@ def setup_chrome_session(
|
||||
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
||||
|
||||
|
||||
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int) -> None:
|
||||
"""Clean up Chrome processes.
|
||||
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None:
|
||||
"""Clean up Chrome processes using chrome_utils.js killChrome.
|
||||
|
||||
Sends SIGTERM to the chrome_launch_process and SIGKILL to the Chrome PID.
|
||||
Ignores errors if processes are already dead.
|
||||
Uses the centralized kill logic from chrome_utils.js which handles:
|
||||
- SIGTERM then SIGKILL
|
||||
- Process group killing
|
||||
- Zombie process cleanup
|
||||
|
||||
Args:
|
||||
chrome_launch_process: The Popen object for the chrome launch hook
|
||||
chrome_pid: The PID of the Chrome process
|
||||
chrome_dir: Optional path to chrome output directory
|
||||
"""
|
||||
# First try to terminate the launch process gracefully
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# Use JS to kill Chrome with proper process group handling
|
||||
kill_chrome(chrome_pid, str(chrome_dir) if chrome_dir else None)
|
||||
|
||||
|
||||
@contextmanager
|
||||
|
||||
@@ -28,70 +28,25 @@ import tempfile
|
||||
import shutil
|
||||
import platform
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_lib_dir,
|
||||
get_node_modules_dir,
|
||||
find_chromium_binary,
|
||||
CHROME_PLUGIN_DIR as PLUGIN_DIR,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
CHROME_TAB_HOOK,
|
||||
CHROME_NAVIGATE_HOOK,
|
||||
)
|
||||
|
||||
# Get LIB_DIR and MACHINE_TYPE from environment or compute them
|
||||
def get_lib_dir_and_machine_type():
|
||||
"""Get or compute LIB_DIR and MACHINE_TYPE for tests."""
|
||||
from archivebox.config.paths import get_machine_type
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
lib_dir = os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)
|
||||
machine_type = os.environ.get('MACHINE_TYPE') or get_machine_type()
|
||||
|
||||
return Path(lib_dir), machine_type
|
||||
|
||||
# Setup NODE_MODULES_DIR to find npm packages
|
||||
LIB_DIR, MACHINE_TYPE = get_lib_dir_and_machine_type()
|
||||
# Note: LIB_DIR already includes machine_type (e.g., data/lib/arm64-darwin)
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
# Get LIB_DIR and NODE_MODULES_DIR from shared helpers
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = get_node_modules_dir()
|
||||
NPM_PREFIX = LIB_DIR / 'npm'
|
||||
|
||||
# Chromium install location (relative to DATA_DIR)
|
||||
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR and CHROME_BINARY set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
env['MACHINE_TYPE'] = MACHINE_TYPE
|
||||
# Ensure CHROME_BINARY is set to Chromium
|
||||
if 'CHROME_BINARY' not in env:
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
return env
|
||||
|
||||
|
||||
def find_chromium_binary(data_dir=None):
|
||||
"""Find the Chromium binary using chrome_utils.js findChromium().
|
||||
|
||||
This uses the centralized findChromium() function which checks:
|
||||
- CHROME_BINARY env var
|
||||
- @puppeteer/browsers install locations (in data_dir/chromium)
|
||||
- System Chromium locations
|
||||
- Falls back to Chrome (with warning)
|
||||
|
||||
Args:
|
||||
data_dir: Directory where chromium was installed (contains chromium/ subdir)
|
||||
"""
|
||||
chrome_utils = PLUGIN_DIR / 'chrome_utils.js'
|
||||
# Use provided data_dir, or fall back to env var, or current dir
|
||||
search_dir = data_dir or os.environ.get('DATA_DIR', '.')
|
||||
result = subprocess.run(
|
||||
['node', str(chrome_utils), 'findChromium', str(search_dir)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
return result.stdout.strip()
|
||||
return None
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def ensure_chromium_and_puppeteer_installed():
|
||||
|
||||
@@ -20,29 +20,22 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
run_hook_and_parse,
|
||||
LIB_DIR,
|
||||
NODE_MODULES_DIR,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
|
||||
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*')
|
||||
NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
Integration tests for favicon plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Plugin script exists
|
||||
2. requests library is available
|
||||
3. Favicon extraction works for real example.com
|
||||
@@ -21,9 +20,15 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
parse_jsonl_output,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
|
||||
@@ -14,6 +14,14 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
|
||||
@@ -124,107 +132,6 @@ def test_no_configuration_required():
|
||||
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for tests.
|
||||
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
.bin/
|
||||
node_modules/
|
||||
personas/
|
||||
Default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
"""
|
||||
import platform
|
||||
from datetime import datetime
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
|
||||
TEST_URL = 'https://www.filmin.es/'
|
||||
|
||||
|
||||
@@ -420,54 +327,6 @@ const puppeteer = require('puppeteer-core');
|
||||
pass
|
||||
|
||||
|
||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
||||
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not cdp_url:
|
||||
chrome_launch_process.kill()
|
||||
raise RuntimeError("Chromium CDP URL not found after 20s")
|
||||
|
||||
return chrome_launch_process, cdp_url
|
||||
|
||||
|
||||
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
|
||||
"""Clean up Chromium process."""
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
||||
"""Check if cookie consent elements are visible on a page.
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
Integration tests for mercury plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
@@ -19,9 +18,15 @@ import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None)
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
|
||||
@@ -21,29 +21,22 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
run_hook_and_parse,
|
||||
LIB_DIR,
|
||||
NODE_MODULES_DIR,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
PDF_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_pdf.*')
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
Integration tests for readability plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Validate hook checks for readability-extractor binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Plugin reports missing dependency correctly
|
||||
@@ -18,10 +17,15 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*'))
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
READABILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_readability.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
|
||||
@@ -20,28 +20,20 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
run_hook_and_parse,
|
||||
LIB_DIR,
|
||||
NODE_MODULES_DIR,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
|
||||
@@ -77,27 +77,9 @@ def has_staticfile_output() -> bool:
|
||||
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
|
||||
|
||||
|
||||
# Chrome binary search paths
|
||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||
'chromium', 'chromium-browser', 'chromium-browser-beta',
|
||||
'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
|
||||
]
|
||||
CHROME_BINARY_NAMES_LINUX = [
|
||||
'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
|
||||
'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
|
||||
]
|
||||
CHROME_BINARY_NAMES_MACOS = [
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
||||
]
|
||||
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
|
||||
|
||||
ALL_CHROME_BINARIES = (
|
||||
CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
|
||||
CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
|
||||
)
|
||||
|
||||
|
||||
# Chrome session directory (relative to extractor output dir)
|
||||
# Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for.
|
||||
# The centralized Chrome binary search is in chrome_utils.js findChromium().
|
||||
CHROME_SESSION_DIR = '../chrome'
|
||||
|
||||
|
||||
|
||||
@@ -6,6 +6,8 @@ Tests verify:
|
||||
2. CLI-based singlefile extraction works
|
||||
3. Dependencies available via abx-pkg
|
||||
4. Output contains valid HTML
|
||||
5. Connects to Chrome session via CDP when available
|
||||
6. Works with extensions loaded (ublock, etc.)
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -16,10 +18,17 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
setup_chrome_session,
|
||||
cleanup_chrome,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
|
||||
TEST_URL = "https://example.com"
|
||||
|
||||
|
||||
@@ -52,7 +61,7 @@ def test_singlefile_cli_archives_example_com():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
env = os.environ.copy()
|
||||
env = get_test_env()
|
||||
env['SINGLEFILE_ENABLED'] = 'true'
|
||||
|
||||
# Run singlefile snapshot hook
|
||||
@@ -78,5 +87,89 @@ def test_singlefile_cli_archives_example_com():
|
||||
assert 'Example Domain' in html_content, "Output should contain example.com content"
|
||||
|
||||
|
||||
def test_singlefile_with_chrome_session():
|
||||
"""Test singlefile connects to existing Chrome session via CDP.
|
||||
|
||||
When a Chrome session exists (chrome/cdp_url.txt), singlefile should
|
||||
connect to it instead of launching a new Chrome instance.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
try:
|
||||
# Set up Chrome session using shared helper
|
||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||
tmpdir=tmpdir,
|
||||
crawl_id='singlefile-test-crawl',
|
||||
snapshot_id='singlefile-test-snap',
|
||||
test_url=TEST_URL,
|
||||
navigate=False, # Don't navigate, singlefile will do that
|
||||
timeout=20,
|
||||
)
|
||||
|
||||
# singlefile looks for ../chrome/cdp_url.txt relative to cwd
|
||||
# So we need to run from a directory that has ../chrome pointing to our chrome dir
|
||||
singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
|
||||
singlefile_output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create symlink so singlefile can find the chrome session
|
||||
chrome_link = singlefile_output_dir.parent / 'chrome'
|
||||
if not chrome_link.exists():
|
||||
chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome')
|
||||
|
||||
env = get_test_env()
|
||||
env['SINGLEFILE_ENABLED'] = 'true'
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Run singlefile - it should find and use the existing Chrome session
|
||||
result = subprocess.run(
|
||||
['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-test-snap'],
|
||||
cwd=str(singlefile_output_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=120
|
||||
)
|
||||
|
||||
# Verify output
|
||||
output_file = singlefile_output_dir / 'singlefile.html'
|
||||
if output_file.exists():
|
||||
html_content = output_file.read_text()
|
||||
assert len(html_content) > 500, "Output file too small"
|
||||
assert 'Example Domain' in html_content, "Should contain example.com content"
|
||||
else:
|
||||
# If singlefile couldn't connect to Chrome, it may have failed
|
||||
# Check if it mentioned browser-server in its args (indicating it tried to use CDP)
|
||||
assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \
|
||||
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
|
||||
|
||||
finally:
|
||||
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_singlefile_disabled_skips():
|
||||
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
env = get_test_env()
|
||||
env['SINGLEFILE_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when disabled: {result.stderr}"
|
||||
|
||||
# Should NOT emit JSONL when disabled
|
||||
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
|
||||
assert len(jsonl_lines) == 0, f"Should not emit JSONL when disabled, but got: {jsonl_lines}"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
Integration tests for title plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Plugin script exists
|
||||
2. Node.js is available
|
||||
3. Title extraction works for real example.com
|
||||
@@ -20,9 +19,15 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
parse_jsonl_output,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
TITLE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_title.*'), None)
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
|
||||
@@ -16,184 +16,25 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
|
||||
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
|
||||
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for tests.
|
||||
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
.bin/
|
||||
node_modules/
|
||||
personas/
|
||||
default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
"""
|
||||
import platform
|
||||
from datetime import datetime
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
# Only set headless if not already in environment (allow override for debugging)
|
||||
if 'CHROME_HEADLESS' not in os.environ:
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
|
||||
|
||||
def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium via the chrome plugin's launch hook.

    Spawns the node launch hook inside chrome_dir and polls for up to 30s
    for the hook to write cdp_url.txt (the CDP endpoint URL) and
    extensions.json (marker that extensions finished loading) into chrome_dir.

    Args:
        env: environment dict for the hook (typically from setup_test_env()).
        chrome_dir: working directory the hook writes its state files into.
        crawl_id: forwarded to the hook as --crawl-id.

    Returns:
        (process, cdp_url) tuple; caller is responsible for kill_chrome().

    Raises:
        RuntimeError: if the hook exits early or no CDP URL appears in time.
    """
    chrome_dir.mkdir(parents=True, exist_ok=True)

    process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    cdp_url = None
    extensions_ready = False
    for _ in range(30):
        if process.poll() is not None:
            # Hook died before Chromium was ready — surface its output.
            stdout, stderr = process.communicate()
            raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}")
        cdp_file = chrome_dir / 'cdp_url.txt'
        ext_file = chrome_dir / 'extensions.json'
        if cdp_file.exists() and not cdp_url:
            cdp_url = cdp_file.read_text().strip()
        if ext_file.exists():
            extensions_ready = True
        if cdp_url and extensions_ready:
            break
        time.sleep(1)

    if not cdp_url:
        process.kill()
        stdout, stderr = process.communicate()
        raise RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}")

    # Print chrome launch hook output for debugging
    import select
    if hasattr(select, 'poll'):
        # Read any already-buffered stderr without blocking on the
        # still-running hook process.
        import fcntl
        import os as os_module
        fd = process.stderr.fileno()
        fl = fcntl.fcntl(fd, fcntl.F_GETFL)
        fcntl.fcntl(fd, fcntl.F_SETFL, fl | os_module.O_NONBLOCK)
        try:
            stderr_output = process.stderr.read()
            if stderr_output:
                print(f"[Chrome Launch Hook Output]\n{stderr_output}")
        except (OSError, ValueError, TypeError):
            # Best-effort debug output only — never fail the launch over it.
            # (BlockingIOError is an OSError; a closed stream raises
            # ValueError; a non-blocking text-mode read with no data can
            # surface as TypeError.)  The previous bare `except:` also
            # swallowed KeyboardInterrupt/SystemExit, which was a bug.
            pass

    return process, cdp_url
|
||||
|
||||
|
||||
def kill_chrome(process, chrome_dir: Path):
    """Kill a Chromium launch-hook process and the Chromium it spawned.

    Sends SIGTERM to the hook process and waits briefly for it to exit,
    then force-kills the Chromium PID recorded in chrome_dir/chrome.pid
    (the browser child can outlive the hook). Best-effort and safe to call
    even if everything already exited.

    Args:
        process: the Popen returned by launch_chrome().
        chrome_dir: directory the launch hook wrote chrome.pid into.
    """
    try:
        process.send_signal(signal.SIGTERM)
        process.wait(timeout=5)
    except (subprocess.TimeoutExpired, OSError):
        # Refused to exit in time, or already gone — fall through to the
        # pid-file SIGKILL below.  (Was a bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit.)
        pass
    pid_file = chrome_dir / 'chrome.pid'
    if pid_file.exists():
        try:
            os.kill(int(pid_file.read_text().strip()), signal.SIGKILL)
        except (OSError, ValueError):
            # Stale/garbage pid file or process already dead — best effort,
            # matching kill_chromium_session()'s handler.
            pass
|
||||
# Alias for backward compatibility with existing test names.
# NOTE(review): these rebind the module-level names to the consolidated
# *_chromium_session helpers — confirm launch_chromium_session and
# kill_chromium_session are defined above this point at import time,
# otherwise this raises NameError on import.
launch_chrome = launch_chromium_session
kill_chrome = kill_chromium_session
|
||||
|
||||
|
||||
class TestTwoCaptcha:
|
||||
|
||||
@@ -12,6 +12,14 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
# Plugin root directory (this file is expected to live at <plugin>/tests/<name>.py).
PLUGIN_DIR = Path(__file__).parent.parent
# First matching uBlock install hook in the plugin dir, or None if absent.
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
|
||||
@@ -157,64 +165,6 @@ def test_large_extension_size():
|
||||
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
||||
|
||||
|
||||
# Hooks from the sibling `chrome` plugin used by the launch/install helpers below.
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
|
||||
|
||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium and return (process, cdp_url) or raise on failure.

    Runs the chrome plugin's launch hook inside chrome_dir and polls for up
    to 20s for it to write chrome_dir/cdp_url.txt containing the CDP
    endpoint URL.

    Args:
        env: environment dict for the hook (typically from setup_test_env()).
        chrome_dir: working directory the hook writes cdp_url.txt into.
        crawl_id: forwarded to the hook as --crawl-id.

    Returns:
        (chrome_launch_process, cdp_url); caller must clean up with
        kill_chromium_session().

    Raises:
        RuntimeError: if the hook exits early or no CDP URL appears in 20s.
    """
    # (Removed an unused function-scope `import signal` that was dead code.)
    import time

    chrome_dir.mkdir(parents=True, exist_ok=True)

    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Wait for Chromium to launch and CDP URL to be available
    cdp_url = None
    for i in range(20):
        if chrome_launch_process.poll() is not None:
            # Hook exited before writing the CDP URL — surface its output.
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        cdp_file = chrome_dir / 'cdp_url.txt'
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            break
        time.sleep(1)

    if not cdp_url:
        chrome_launch_process.kill()
        raise RuntimeError("Chromium CDP URL not found after 20s")

    return chrome_launch_process, cdp_url
|
||||
|
||||
|
||||
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
    """Clean up a Chromium launch-hook process and its browser child.

    Sends SIGTERM to the hook process and waits briefly, then force-kills
    the Chromium PID recorded in chrome_dir/chrome.pid if present.
    Best-effort: never raises.

    Args:
        chrome_launch_process: the Popen returned by launch_chromium_session().
        chrome_dir: directory the launch hook wrote chrome.pid into.
    """
    import signal

    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except (subprocess.TimeoutExpired, OSError):
        # Refused to exit in time, or already gone — fall through to the
        # pid-file SIGKILL below.  (Was a bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit.)
        pass
    chrome_pid_file = chrome_dir / 'chrome.pid'
    if chrome_pid_file.exists():
        try:
            chrome_pid = int(chrome_pid_file.read_text().strip())
            os.kill(chrome_pid, signal.SIGKILL)
        except (OSError, ValueError):
            # Stale/garbage pid file or process already dead — best effort.
            pass
|
||||
|
||||
|
||||
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
||||
"""Check ad blocking effectiveness by counting ad elements on page.
|
||||
|
||||
@@ -350,103 +300,6 @@ const puppeteer = require('puppeteer-core');
|
||||
return json.loads(output_lines[-1])
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure matching real ArchiveBox data dir:
        <tmpdir>/data/
            lib/
                arm64-darwin/ (or x86_64-linux, etc.)
                    npm/
                        .bin/
                        node_modules/
            personas/
                Default/
                    chrome_extensions/
            users/
                testuser/
                    crawls/
                    snapshots/

    Calls chrome install hook which handles puppeteer-core and chromium
    installation, then parses its JSONL output to locate the browser binary.

    Args:
        tmpdir: throwaway directory the whole layout is created under.

    Returns:
        env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR,
        CHROME_BINARY, etc. suitable for passing to subprocess hooks.

    Note:
        Calls pytest.skip() (does not return) if the install hook fails or
        no Chromium binary can be found in its output.
    """
    import sys
    import platform
    from datetime import datetime

    # Determine machine type (matches archivebox.config.paths.get_machine_type()).
    # Honor an explicit MACHINE_TYPE override, same as the JS-side
    # getMachineType() helper; also normalize 32-bit x86 like that helper does.
    machine_type = os.environ.get('MACHINE_TYPE')
    if not machine_type:
        machine = platform.machine().lower()
        system = platform.system().lower()
        if machine in ('arm64', 'aarch64'):
            machine = 'arm64'
        elif machine in ('x86_64', 'amd64'):
            machine = 'x86_64'
        elif machine in ('i386', 'i686', 'x86'):
            machine = 'x86'
        machine_type = f"{machine}-{system}"

    # Create proper directory structure matching real ArchiveBox layout
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'

    # Extensions go under personas/Default/
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'

    # User data goes under users/{username}/, partitioned by date
    date_str = datetime.now().strftime('%Y%m%d')
    users_dir = data_dir / 'users' / 'testuser'
    crawls_dir = users_dir / 'crawls' / date_str
    snapshots_dir = users_dir / 'snapshots' / date_str

    # Create all directories
    node_modules_dir.mkdir(parents=True, exist_ok=True)
    npm_bin_dir.mkdir(parents=True, exist_ok=True)
    chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
    crawls_dir.mkdir(parents=True, exist_ok=True)
    snapshots_dir.mkdir(parents=True, exist_ok=True)

    # Build complete env dict (inherit current env so PATH/node stay visible)
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
        'CRAWLS_DIR': str(crawls_dir),
        'SNAPSHOTS_DIR': str(snapshots_dir),
    })

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # Use sys.executable instead of 'python' so the hook runs under the same
    # interpreter/venv as the test run ('python' may be absent or wrong on PATH).
    result = subprocess.run(
        [sys.executable, str(CHROME_INSTALL_HOOK)],
        capture_output=True, text=True, timeout=120, env=env
    )
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
            if data.get('type') == 'Binary' and data.get('abspath'):
                chrome_binary = data['abspath']
                break
        except json.JSONDecodeError:
            # Hook may interleave non-JSON log lines with its JSONL records.
            continue

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
|
||||
|
||||
|
||||
# Test URL: Yahoo has many ads that uBlock should block.
# NOTE(review): this makes the test depend on live network access and on
# Yahoo's current page content — results may vary between runs.
TEST_URL = 'https://www.yahoo.com/'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user