mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-04 09:55:33 +10:00
Consolidate Chrome test helpers across all plugin tests (#1738)
<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes. --> # Summary <!--e.g. This PR fixes ABC or adds the ability to do XYZ...--> # Related issues <!-- e.g. #123 or Roadmap goal # https://github.com/pirate/ArchiveBox/wiki/Roadmap --> # Changes these areas - [ ] Bugfixes - [ ] Feature behavior - [ ] Command line interface - [ ] Configuration options - [ ] Internal architecture - [ ] Snapshot data layout on disk
This commit is contained in:
@@ -1333,6 +1333,83 @@ function getExtensionsDir() {
|
|||||||
path.join(dataDir, 'personas', persona, 'chrome_extensions');
|
path.join(dataDir, 'personas', persona, 'chrome_extensions');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get machine type string for platform-specific paths.
|
||||||
|
* Matches Python's archivebox.config.paths.get_machine_type()
|
||||||
|
*
|
||||||
|
* @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin')
|
||||||
|
*/
|
||||||
|
/**
 * Get machine type string for platform-specific paths.
 * Matches Python's archivebox.config.paths.get_machine_type()
 *
 * NOTE: Python computes the OS half as platform.system().lower(), which is
 * 'windows' on Windows, while Node's process.platform reports 'win32'.
 * Normalize that here too, otherwise the JS- and Python-computed lib dirs
 * diverge on Windows.
 *
 * @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin')
 */
function getMachineType() {
  // Explicit override wins (also used by tests to pin the value)
  if (process.env.MACHINE_TYPE) {
    return process.env.MACHINE_TYPE;
  }

  let machine = process.arch;
  let system = process.platform;

  // Normalize machine type to match Python's convention
  if (machine === 'arm64' || machine === 'aarch64') {
    machine = 'arm64';
  } else if (machine === 'x64' || machine === 'x86_64' || machine === 'amd64') {
    machine = 'x86_64';
  } else if (machine === 'ia32' || machine === 'x86') {
    machine = 'x86';
  }

  // Normalize OS name to match Python's platform.system().lower()
  if (system === 'win32') {
    system = 'windows';
  }

  return `${machine}-${system}`;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get LIB_DIR path for platform-specific binaries.
|
||||||
|
* Returns DATA_DIR/lib/MACHINE_TYPE/
|
||||||
|
*
|
||||||
|
* @returns {string} - Absolute path to lib directory
|
||||||
|
*/
|
||||||
|
/**
 * Get LIB_DIR path for platform-specific binaries.
 * Returns DATA_DIR/lib/MACHINE_TYPE/ unless LIB_DIR is set explicitly.
 *
 * @returns {string} - Absolute path to lib directory
 */
function getLibDir() {
  const override = process.env.LIB_DIR;
  if (override) {
    return override;
  }
  // Derive from DATA_DIR + machine type (matches the Python side)
  return path.join(getEnv('DATA_DIR', './data'), 'lib', getMachineType());
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get NODE_MODULES_DIR path for npm packages.
|
||||||
|
* Returns LIB_DIR/npm/node_modules/
|
||||||
|
*
|
||||||
|
* @returns {string} - Absolute path to node_modules directory
|
||||||
|
*/
|
||||||
|
/**
 * Get NODE_MODULES_DIR path for npm packages.
 * Returns LIB_DIR/npm/node_modules/ unless NODE_MODULES_DIR is set explicitly.
 *
 * @returns {string} - Absolute path to node_modules directory
 */
function getNodeModulesDir() {
  const override = process.env.NODE_MODULES_DIR;
  if (override) {
    return override;
  }
  return path.join(getLibDir(), 'npm', 'node_modules');
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get all test environment paths as a JSON object.
|
||||||
|
* This is the single source of truth for path calculations - Python calls this
|
||||||
|
* to avoid duplicating path logic.
|
||||||
|
*
|
||||||
|
* @returns {Object} - Object with all test environment paths
|
||||||
|
*/
|
||||||
|
/**
 * Get all test environment paths as a JSON-serializable object.
 * This is the single source of truth for path calculations - Python calls
 * this to avoid duplicating path logic.
 *
 * @returns {Object} - Object with all test environment paths
 */
function getTestEnv() {
  const libDir = getLibDir();
  return {
    DATA_DIR: getEnv('DATA_DIR', './data'),
    MACHINE_TYPE: getMachineType(),
    LIB_DIR: libDir,
    NODE_MODULES_DIR: getNodeModulesDir(),
    NPM_BIN_DIR: path.join(libDir, 'npm', '.bin'),
    CHROME_EXTENSIONS_DIR: getExtensionsDir(),
  };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Install a Chrome extension with caching support.
|
* Install a Chrome extension with caching support.
|
||||||
*
|
*
|
||||||
@@ -1442,8 +1519,13 @@ module.exports = {
|
|||||||
getExtensionPaths,
|
getExtensionPaths,
|
||||||
waitForExtensionTarget,
|
waitForExtensionTarget,
|
||||||
getExtensionTargets,
|
getExtensionTargets,
|
||||||
// Shared extension installer utilities
|
// Shared path utilities (single source of truth for Python/JS)
|
||||||
|
getMachineType,
|
||||||
|
getLibDir,
|
||||||
|
getNodeModulesDir,
|
||||||
getExtensionsDir,
|
getExtensionsDir,
|
||||||
|
getTestEnv,
|
||||||
|
// Shared extension installer utilities
|
||||||
installExtensionWithCache,
|
installExtensionWithCache,
|
||||||
// Deprecated - use enableExtensions option instead
|
// Deprecated - use enableExtensions option instead
|
||||||
getExtensionLaunchArgs,
|
getExtensionLaunchArgs,
|
||||||
@@ -1457,18 +1539,31 @@ if (require.main === module) {
|
|||||||
console.log('Usage: chrome_utils.js <command> [args...]');
|
console.log('Usage: chrome_utils.js <command> [args...]');
|
||||||
console.log('');
|
console.log('');
|
||||||
console.log('Commands:');
|
console.log('Commands:');
|
||||||
console.log(' findChromium');
|
console.log(' findChromium Find Chrome/Chromium binary');
|
||||||
console.log(' installChromium');
|
console.log(' installChromium Install Chromium via @puppeteer/browsers');
|
||||||
console.log(' installPuppeteerCore [npm_prefix]');
|
console.log(' installPuppeteerCore Install puppeteer-core npm package');
|
||||||
console.log(' launchChromium [output_dir] [extension_paths_json]');
|
console.log(' launchChromium Launch Chrome with CDP debugging');
|
||||||
console.log(' killChrome <pid> [output_dir]');
|
console.log(' killChrome <pid> Kill Chrome process by PID');
|
||||||
console.log(' killZombieChrome [data_dir]');
|
console.log(' killZombieChrome Clean up zombie Chrome processes');
|
||||||
console.log(' getExtensionId <path>');
|
console.log('');
|
||||||
console.log(' loadExtensionManifest <path>');
|
console.log(' getMachineType Get machine type (e.g., x86_64-linux)');
|
||||||
console.log(' getExtensionLaunchArgs <extensions_json>');
|
console.log(' getLibDir Get LIB_DIR path');
|
||||||
console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
|
console.log(' getNodeModulesDir Get NODE_MODULES_DIR path');
|
||||||
console.log(' getExtensionsDir');
|
console.log(' getExtensionsDir Get Chrome extensions directory');
|
||||||
console.log(' installExtensionWithCache <webstore_id> <name>');
|
console.log(' getTestEnv Get all paths as JSON (for tests)');
|
||||||
|
console.log('');
|
||||||
|
console.log(' getExtensionId <path> Get extension ID from unpacked path');
|
||||||
|
console.log(' loadExtensionManifest Load extension manifest.json');
|
||||||
|
console.log(' loadOrInstallExtension Load or install an extension');
|
||||||
|
console.log(' installExtensionWithCache Install extension with caching');
|
||||||
|
console.log('');
|
||||||
|
console.log('Environment variables:');
|
||||||
|
console.log(' DATA_DIR Base data directory');
|
||||||
|
console.log(' LIB_DIR Library directory (computed if not set)');
|
||||||
|
console.log(' MACHINE_TYPE Machine type override');
|
||||||
|
console.log(' NODE_MODULES_DIR Node modules directory');
|
||||||
|
console.log(' CHROME_BINARY Chrome binary path');
|
||||||
|
console.log(' CHROME_EXTENSIONS_DIR Extensions directory');
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1581,11 +1676,31 @@ if (require.main === module) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case 'getMachineType': {
|
||||||
|
console.log(getMachineType());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'getLibDir': {
|
||||||
|
console.log(getLibDir());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'getNodeModulesDir': {
|
||||||
|
console.log(getNodeModulesDir());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case 'getExtensionsDir': {
|
case 'getExtensionsDir': {
|
||||||
console.log(getExtensionsDir());
|
console.log(getExtensionsDir());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case 'getTestEnv': {
|
||||||
|
console.log(JSON.stringify(getTestEnv(), null, 2));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case 'installExtensionWithCache': {
|
case 'installExtensionWithCache': {
|
||||||
const [webstore_id, name] = commandArgs;
|
const [webstore_id, name] = commandArgs;
|
||||||
if (!webstore_id || !name) {
|
if (!webstore_id || !name) {
|
||||||
|
|||||||
@@ -2,25 +2,69 @@
|
|||||||
Shared Chrome test helpers for plugin integration tests.
|
Shared Chrome test helpers for plugin integration tests.
|
||||||
|
|
||||||
This module provides common utilities for Chrome-based plugin tests, reducing
|
This module provides common utilities for Chrome-based plugin tests, reducing
|
||||||
duplication across test files. It uses the JavaScript utilities from chrome_utils.js
|
duplication across test files. Functions delegate to chrome_utils.js (the single
|
||||||
where appropriate.
|
source of truth) with Python fallbacks.
|
||||||
|
|
||||||
|
Function names match the JS equivalents in snake_case:
|
||||||
|
JS: getMachineType() -> Python: get_machine_type()
|
||||||
|
JS: getLibDir() -> Python: get_lib_dir()
|
||||||
|
JS: getNodeModulesDir() -> Python: get_node_modules_dir()
|
||||||
|
JS: getExtensionsDir() -> Python: get_extensions_dir()
|
||||||
|
JS: findChromium() -> Python: find_chromium()
|
||||||
|
JS: killChrome() -> Python: kill_chrome()
|
||||||
|
JS: getTestEnv() -> Python: get_test_env()
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
|
# Path helpers (delegate to chrome_utils.js):
|
||||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
get_test_env,
|
get_test_env, # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE
|
||||||
setup_chrome_session,
|
get_machine_type, # e.g., 'x86_64-linux', 'arm64-darwin'
|
||||||
cleanup_chrome,
|
get_lib_dir, # Path to lib dir
|
||||||
find_chromium_binary,
|
get_node_modules_dir, # Path to node_modules
|
||||||
get_node_modules_dir,
|
get_extensions_dir, # Path to chrome extensions
|
||||||
|
find_chromium, # Find Chrome/Chromium binary
|
||||||
|
kill_chrome, # Kill Chrome process by PID
|
||||||
|
)
|
||||||
|
|
||||||
|
# Test file helpers:
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_plugin_dir, # get_plugin_dir(__file__) -> plugin dir Path
|
||||||
|
get_hook_script, # Find hook script by glob pattern
|
||||||
|
PLUGINS_ROOT, # Path to plugins root
|
||||||
|
LIB_DIR, # Path to lib dir (lazy-loaded)
|
||||||
|
NODE_MODULES_DIR, # Path to node_modules (lazy-loaded)
|
||||||
|
)
|
||||||
|
|
||||||
|
# For Chrome session tests:
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
setup_chrome_session, # Full Chrome + tab setup
|
||||||
|
cleanup_chrome, # Cleanup by PID
|
||||||
|
chrome_session, # Context manager
|
||||||
|
)
|
||||||
|
|
||||||
|
# For extension tests:
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
setup_test_env, # Full dir structure + Chrome install
|
||||||
|
launch_chromium_session, # Launch Chrome, return CDP URL
|
||||||
|
kill_chromium_session, # Cleanup Chrome
|
||||||
|
)
|
||||||
|
|
||||||
|
# Run hooks and parse JSONL:
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
run_hook, # Run hook, return (returncode, stdout, stderr)
|
||||||
|
parse_jsonl_output, # Parse JSONL from stdout
|
||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
|
import platform
|
||||||
import signal
|
import signal
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Tuple, Optional
|
from typing import Tuple, Optional, List, Dict, Any
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
|
||||||
@@ -29,88 +73,623 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
|
|||||||
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
|
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
|
||||||
|
|
||||||
# Hook script locations
|
# Hook script locations
|
||||||
|
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||||
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
|
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
|
||||||
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||||
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||||
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
|
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
|
||||||
|
|
||||||
|
|
||||||
def get_node_modules_dir() -> Path:
|
# =============================================================================
|
||||||
"""Get NODE_MODULES_DIR for tests, checking env first.
|
# Path Helpers - delegates to chrome_utils.js with Python fallback
|
||||||
|
# Function names match JS: getMachineType -> get_machine_type, etc.
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
Returns the path to the node_modules directory, checking:
|
|
||||||
1. NODE_MODULES_DIR environment variable
|
def _call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]:
    """Call a chrome_utils.js CLI command (internal helper).

    This is the central dispatch for calling the JS utilities from Python.
    All path calculations and Chrome operations are centralized in
    chrome_utils.js to ensure consistency between Python and JavaScript code.

    Args:
        command: The CLI command (e.g., 'findChromium', 'getTestEnv')
        *args: Additional command arguments
        env: Environment dict (default: copy of the current environ)

    Returns:
        Tuple of (returncode, stdout, stderr)
    """
    cmd = ['node', str(CHROME_UTILS), command, *args]
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=30,
        # `env or os.environ.copy()` would silently discard an intentionally
        # empty env dict, so compare against None explicitly.
        env=env if env is not None else os.environ.copy(),
    )
    return result.returncode, result.stdout, result.stderr
|
||||||
|
|
||||||
|
|
||||||
|
def get_plugin_dir(test_file: str) -> Path:
    """Resolve the plugin directory from a test module's file path.

    Usage:
        PLUGIN_DIR = get_plugin_dir(__file__)

    Args:
        test_file: The __file__ of the test module (e.g., test_screenshot.py)

    Returns:
        Path to the plugin directory (e.g., plugins/screenshot/)
    """
    # Tests live at <plugin>/tests/<test_file>, so hop up two levels.
    return Path(test_file).parents[1]
|
||||||
|
|
||||||
|
|
||||||
|
def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]:
    """Find a hook script in a plugin directory by glob pattern.

    Usage:
        HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')

    Args:
        plugin_dir: Path to the plugin directory
        pattern: Glob pattern to match

    Returns:
        Path to the first matching hook script, or None if nothing matches
    """
    # Take the first glob hit (if any) without materializing the whole list.
    return next(iter(plugin_dir.glob(pattern)), None)
|
||||||
|
|
||||||
|
|
||||||
|
def get_machine_type() -> str:
    """Get machine type string (e.g., 'x86_64-linux', 'arm64-darwin').

    Matches JS: getMachineType()

    Asks chrome_utils.js first (single source of truth); if that fails,
    falls back to a pure-Python computation.
    """
    rc, out, _ = _call_chrome_utils('getMachineType')
    value = out.strip()
    if rc == 0 and value:
        return value

    # Python fallback: explicit override first
    override = os.environ.get('MACHINE_TYPE')
    if override:
        return override

    machine = platform.machine().lower()
    system = platform.system().lower()
    # Normalize arch aliases to the canonical names used for lib dirs
    aliases = {'aarch64': 'arm64', 'amd64': 'x86_64'}
    machine = aliases.get(machine, machine)
    return f"{machine}-{system}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_lib_dir() -> Path:
    """Get LIB_DIR path for platform-specific binaries.

    Matches JS: getLibDir()

    Asks chrome_utils.js first (single source of truth); falls back to the
    LIB_DIR env var, then to the ArchiveBox storage config.
    """
    rc, out, _ = _call_chrome_utils('getLibDir')
    value = out.strip()
    if rc == 0 and value:
        return Path(value)

    # Python fallback: explicit override first
    env_value = os.environ.get('LIB_DIR')
    if env_value:
        return Path(env_value)

    # Last resort: read from the ArchiveBox storage config
    from archivebox.config.common import STORAGE_CONFIG
    return Path(str(STORAGE_CONFIG.LIB_DIR))
|
||||||
|
|
||||||
|
|
||||||
|
def get_node_modules_dir() -> Path:
    """Get NODE_MODULES_DIR path for npm packages.

    Matches JS: getNodeModulesDir()

    Asks chrome_utils.js first (single source of truth); falls back to the
    NODE_MODULES_DIR env var, then computes LIB_DIR/npm/node_modules.
    """
    rc, out, _ = _call_chrome_utils('getNodeModulesDir')
    value = out.strip()
    if rc == 0 and value:
        return Path(value)

    # Python fallback: explicit override first
    env_value = os.environ.get('NODE_MODULES_DIR')
    if env_value:
        return Path(env_value)

    # Otherwise compute from LIB_DIR
    return get_lib_dir() / 'npm' / 'node_modules'
|
||||||
|
|
||||||
|
|
||||||
def get_extensions_dir() -> str:
    """Get the Chrome extensions directory path.

    Matches JS: getExtensionsDir()

    Asks chrome_utils.js first (single source of truth); falls back to the
    default layout DATA_DIR/personas/<ACTIVE_PERSONA>/chrome_extensions.
    """
    rc, out, _ = _call_chrome_utils('getExtensionsDir')
    value = out.strip()
    if rc == 0 and value:
        return value

    # Fallback: mirror the default layout computed by chrome_utils.js
    base = Path(os.environ.get('DATA_DIR', './data'))
    persona = os.environ.get('ACTIVE_PERSONA', 'Default')
    return str(base / 'personas' / persona / 'chrome_extensions')
|
||||||
|
|
||||||
|
|
||||||
def find_chromium(data_dir: Optional[str] = None) -> Optional[str]:
    """Find the Chromium binary path.

    Matches JS: findChromium()

    Delegates to chrome_utils.js, which checks:
    - CHROME_BINARY env var
    - @puppeteer/browsers install locations
    - System Chromium locations
    - Falls back to Chrome (with warning)

    Args:
        data_dir: Optional DATA_DIR override

    Returns:
        Path to Chromium binary or None if not found
    """
    env = os.environ.copy()
    if data_dir:
        env['DATA_DIR'] = str(data_dir)

    rc, out, _ = _call_chrome_utils('findChromium', env=env)
    binary = out.strip()
    if rc == 0 and binary:
        return binary
    return None
|
||||||
|
|
||||||
|
|
||||||
def kill_chrome(pid: int, output_dir: Optional[str] = None) -> bool:
    """Kill a Chrome process by PID.

    Matches JS: killChrome()

    Delegates to chrome_utils.js, which handles:
    - SIGTERM then SIGKILL
    - Process group killing
    - Zombie process cleanup

    Args:
        pid: Process ID to kill
        output_dir: Optional chrome output directory for PID file cleanup

    Returns:
        True if the kill command succeeded
    """
    args = [str(pid)]
    if output_dir:
        args.append(str(output_dir))
    rc, _, _ = _call_chrome_utils('killChrome', *args)
    return rc == 0
|
||||||
|
|
||||||
|
|
||||||
|
def get_test_env() -> dict:
    """Get environment dict with all paths set correctly for tests.

    Matches JS: getTestEnv()

    Asks chrome_utils.js for the canonical path values first; if that fails,
    computes them in Python. Use this for all subprocess calls in plugin tests.
    """
    env = os.environ.copy()

    # Preferred: all paths from the JS single source of truth
    rc, out, _ = _call_chrome_utils('getTestEnv')
    if rc == 0 and out.strip():
        try:
            env.update(json.loads(out))
            return env
        except json.JSONDecodeError:
            # Malformed output from JS; fall through to the Python computation
            pass

    # Fallback: compute each path in Python
    env['LIB_DIR'] = str(get_lib_dir())
    env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
    env['MACHINE_TYPE'] = get_machine_type()
    return env
|
||||||
|
|
||||||
|
|
||||||
|
# Backward compatibility aliases (deprecated, use new names)
# Older test files imported these pre-consolidation names; keep them pointing
# at the renamed helpers so existing imports keep working.
find_chromium_binary = find_chromium
kill_chrome_via_js = kill_chrome
get_machine_type_from_js = get_machine_type
get_test_env_from_js = get_test_env
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Module-level constants (lazy-loaded on first access)
|
||||||
|
# Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# These are computed once when first accessed
|
||||||
|
# Cache slots for the lazily-computed module-level paths below.
# Each getter shells out to chrome_utils.js, so compute at most once.
_LIB_DIR: Optional[Path] = None
_NODE_MODULES_DIR: Optional[Path] = None


def _get_lib_dir_cached() -> Path:
    """Return get_lib_dir(), computing it only on first call."""
    global _LIB_DIR
    if _LIB_DIR is None:
        _LIB_DIR = get_lib_dir()
    return _LIB_DIR


def _get_node_modules_dir_cached() -> Path:
    """Return get_node_modules_dir(), computing it only on first call."""
    global _NODE_MODULES_DIR
    if _NODE_MODULES_DIR is None:
        _NODE_MODULES_DIR = get_node_modules_dir()
    return _NODE_MODULES_DIR
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level constants that can be imported directly
|
||||||
|
# Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR
|
||||||
|
class _LazyPath:
|
||||||
|
"""Lazy path that computes value on first access."""
|
||||||
|
def __init__(self, getter):
|
||||||
|
self._getter = getter
|
||||||
|
self._value = None
|
||||||
|
|
||||||
|
def __fspath__(self):
|
||||||
|
if self._value is None:
|
||||||
|
self._value = self._getter()
|
||||||
|
return str(self._value)
|
||||||
|
|
||||||
|
def __truediv__(self, other):
|
||||||
|
if self._value is None:
|
||||||
|
self._value = self._getter()
|
||||||
|
return self._value / other
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return self.__fspath__()
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"<LazyPath: {self.__fspath__()}>"
|
||||||
|
|
||||||
|
|
||||||
|
LIB_DIR = _LazyPath(_get_lib_dir_cached)
|
||||||
|
NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Hook Execution Helpers
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def run_hook(
    hook_script: Path,
    url: str,
    snapshot_id: str,
    cwd: Optional[Path] = None,
    env: Optional[dict] = None,
    timeout: int = 60,
    extra_args: Optional[List[str]] = None,
) -> Tuple[int, str, str]:
    """Run a hook script and return (returncode, stdout, stderr).

    Usage:
        returncode, stdout, stderr = run_hook(
            HOOK_SCRIPT, 'https://example.com', 'test-snap-123',
            cwd=tmpdir, env=get_test_env()
        )

    Args:
        hook_script: Path to the hook script
        url: URL to process
        snapshot_id: Snapshot ID
        cwd: Working directory (default: current dir)
        env: Environment dict (default: get_test_env())
        timeout: Timeout in seconds
        extra_args: Additional arguments to pass

    Returns:
        Tuple of (returncode, stdout, stderr)
    """
    run_env = env if env is not None else get_test_env()

    # Pick interpreter by extension; anything else is executed directly.
    interpreters = {'.py': ['python'], '.js': ['node']}
    cmd = interpreters.get(hook_script.suffix, []) + [str(hook_script)]
    cmd += [f'--url={url}', f'--snapshot-id={snapshot_id}', *(extra_args or [])]

    result = subprocess.run(
        cmd,
        cwd=str(cwd) if cwd else None,
        capture_output=True,
        text=True,
        env=run_env,
        timeout=timeout,
    )
    return result.returncode, result.stdout, result.stderr
|
||||||
return result.stdout.strip()
|
|
||||||
# Fallback to default computation if JS call fails
|
|
||||||
data_dir = os.environ.get('DATA_DIR', './data')
|
def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]:
    """Parse JSONL hook output and return the first record of a given type.

    Usage:
        result = parse_jsonl_output(stdout)
        if result and result['status'] == 'succeeded':
            print("Success!")

    Args:
        stdout: The stdout from a hook execution
        record_type: The 'type' field to look for (default: 'ArchiveResult')

    Returns:
        The parsed JSON dict or None if not found
    """
    for raw in stdout.strip().split('\n'):
        candidate = raw.strip()
        # Skip anything that can't be a JSON object (log lines, blanks, etc.)
        if not candidate.startswith('{'):
            continue
        try:
            record = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if record.get('type') == record_type:
            return record
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def run_hook_and_parse(
    hook_script: Path,
    url: str,
    snapshot_id: str,
    cwd: Optional[Path] = None,
    env: Optional[dict] = None,
    timeout: int = 60,
    extra_args: Optional[List[str]] = None,
) -> Tuple[int, Optional[Dict[str, Any]], str]:
    """Run a hook and parse its JSONL output.

    Convenience wrapper combining run_hook() and parse_jsonl_output().

    Returns:
        Tuple of (returncode, parsed_result_or_none, stderr)
    """
    returncode, stdout, stderr = run_hook(
        hook_script,
        url,
        snapshot_id,
        cwd=cwd,
        env=env,
        timeout=timeout,
        extra_args=extra_args,
    )
    return returncode, parse_jsonl_output(stdout), stderr
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Extension Test Helpers
|
||||||
|
# Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for extension tests.

    Creates structure matching real ArchiveBox data dir:
        <tmpdir>/data/
            lib/
                arm64-darwin/   (or x86_64-linux, etc.)
                    npm/
                        .bin/
                        node_modules/
            personas/
                Default/
                    chrome_extensions/
            users/
                testuser/
                    crawls/
                    snapshots/

    Calls the chrome install hook which handles puppeteer-core and chromium
    installation, then parses its JSONL output to locate the Chromium binary.

    Args:
        tmpdir: Base temporary directory for the test

    Returns:
        Environment dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR,
        CHROME_BINARY, etc. set, or pytest.skip() if Chrome install fails
    """
    import sys
    import pytest

    # Determine machine type (matches archivebox.config.paths.get_machine_type()).
    # Honor an explicit MACHINE_TYPE env override first, for consistency with the
    # JS getMachineType() helper which checks process.env.MACHINE_TYPE.
    machine_type = os.environ.get('MACHINE_TYPE')
    if not machine_type:
        machine = platform.machine().lower()
        system = platform.system().lower()
        if machine in ('arm64', 'aarch64'):
            machine = 'arm64'
        elif machine in ('x86_64', 'amd64'):
            machine = 'x86_64'
        machine_type = f"{machine}-{system}"

    # Create proper directory structure matching real ArchiveBox layout
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'

    # Extensions go under personas/Default/
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'

    # User data goes under users/{username}/
    date_str = datetime.now().strftime('%Y%m%d')
    users_dir = data_dir / 'users' / 'testuser'
    crawls_dir = users_dir / 'crawls' / date_str
    snapshots_dir = users_dir / 'snapshots' / date_str

    # Create all directories
    node_modules_dir.mkdir(parents=True, exist_ok=True)
    npm_bin_dir.mkdir(parents=True, exist_ok=True)
    chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
    crawls_dir.mkdir(parents=True, exist_ok=True)
    snapshots_dir.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
        'CRAWLS_DIR': str(crawls_dir),
        'SNAPSHOTS_DIR': str(snapshots_dir),
    })

    # Only set headless if not already in environment (allow override for debugging)
    if 'CHROME_HEADLESS' not in os.environ:
        env['CHROME_HEADLESS'] = 'true'

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # Use sys.executable rather than bare 'python' so this works on systems where
    # only 'python3' is on PATH.
    result = subprocess.run(
        [sys.executable, str(CHROME_INSTALL_HOOK)],
        capture_output=True, text=True, timeout=120, env=env
    )
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        if data.get('type') == 'Binary' and data.get('abspath'):
            chrome_binary = data['abspath']
            break

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
|
||||||
|
|
||||||
|
|
||||||
|
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple[subprocess.Popen, str]:
    """Launch Chromium via the chrome launch hook and return (process, cdp_url).

    Starts the launch hook as a background process, then polls
    chrome_dir/cdp_url.txt (written by the hook) for up to 20 seconds.
    Use this for extension tests that need direct CDP access.

    Args:
        env: Environment dict (from setup_test_env)
        chrome_dir: Directory for Chrome to write its files (cdp_url.txt, chrome.pid, etc.)
        crawl_id: ID for the crawl

    Returns:
        Tuple of (chrome_launch_process, cdp_url)

    Raises:
        RuntimeError: If Chrome fails to launch or CDP URL not available after 20s
    """
    chrome_dir.mkdir(parents=True, exist_ok=True)

    launch_proc = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )

    # Poll once per second for the CDP URL file; bail out early if the
    # launch hook process dies.
    cdp_file = chrome_dir / 'cdp_url.txt'
    cdp_url = None
    for _ in range(20):
        if launch_proc.poll() is not None:
            stdout, stderr = launch_proc.communicate()
            raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            break
        time.sleep(1)

    if not cdp_url:
        launch_proc.kill()
        raise RuntimeError("Chromium CDP URL not found after 20s")

    return launch_proc, cdp_url
|
||||||
|
|
||||||
|
|
||||||
|
def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None:
    """Clean up Chromium process launched by launch_chromium_session.

    Uses chrome_utils.js killChrome for proper process group handling.

    Args:
        chrome_launch_process: The Popen object from launch_chromium_session
        chrome_dir: The chrome directory containing chrome.pid
    """
    # Gracefully stop the launcher process first; ignore failures if it is
    # already gone.
    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except Exception:
        pass

    # Then kill the Chrome process itself via the JS helper, which handles
    # process groups and zombie cleanup.
    pid_file = chrome_dir / 'chrome.pid'
    if pid_file.exists():
        try:
            pid = int(pid_file.read_text().strip())
            kill_chrome(pid, str(chrome_dir))
        except (ValueError, FileNotFoundError):
            pass
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
    """Context manager for Chromium sessions with automatic cleanup.

    Usage:
        with chromium_session(env, chrome_dir, 'test-crawl') as (process, cdp_url):
            # Use cdp_url to connect with puppeteer
            pass
        # Chromium automatically cleaned up

    Args:
        env: Environment dict (from setup_test_env)
        chrome_dir: Directory for Chrome files
        crawl_id: ID for the crawl

    Yields:
        Tuple of (chrome_launch_process, cdp_url)
    """
    proc = None
    try:
        proc, cdp_url = launch_chromium_session(env, chrome_dir, crawl_id)
        yield proc, cdp_url
    finally:
        # Always tear Chromium down, even if launch partially succeeded or
        # the test body raised.
        if proc:
            kill_chromium_session(proc, chrome_dir)
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
# Tab-based Test Helpers
# Used by tab-based tests (infiniscroll, modalcloser)
# =============================================================================
|
|
||||||
|
|
||||||
def setup_chrome_session(
|
def setup_chrome_session(
|
||||||
@@ -210,25 +789,28 @@ def setup_chrome_session(
|
|||||||
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
||||||
|
|
||||||
|
|
||||||
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int) -> None:
|
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None:
|
||||||
"""Clean up Chrome processes.
|
"""Clean up Chrome processes using chrome_utils.js killChrome.
|
||||||
|
|
||||||
Sends SIGTERM to the chrome_launch_process and SIGKILL to the Chrome PID.
|
Uses the centralized kill logic from chrome_utils.js which handles:
|
||||||
Ignores errors if processes are already dead.
|
- SIGTERM then SIGKILL
|
||||||
|
- Process group killing
|
||||||
|
- Zombie process cleanup
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
chrome_launch_process: The Popen object for the chrome launch hook
|
chrome_launch_process: The Popen object for the chrome launch hook
|
||||||
chrome_pid: The PID of the Chrome process
|
chrome_pid: The PID of the Chrome process
|
||||||
|
chrome_dir: Optional path to chrome output directory
|
||||||
"""
|
"""
|
||||||
|
# First try to terminate the launch process gracefully
|
||||||
try:
|
try:
|
||||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||||
chrome_launch_process.wait(timeout=5)
|
chrome_launch_process.wait(timeout=5)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
try:
|
|
||||||
os.kill(chrome_pid, signal.SIGKILL)
|
# Use JS to kill Chrome with proper process group handling
|
||||||
except OSError:
|
kill_chrome(chrome_pid, str(chrome_dir) if chrome_dir else None)
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
|
|||||||
@@ -28,70 +28,25 @@ import tempfile
|
|||||||
import shutil
|
import shutil
|
||||||
import platform
|
import platform
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
|
get_test_env,
|
||||||
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
get_lib_dir,
|
||||||
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
get_node_modules_dir,
|
||||||
|
find_chromium_binary,
|
||||||
|
CHROME_PLUGIN_DIR as PLUGIN_DIR,
|
||||||
|
CHROME_LAUNCH_HOOK,
|
||||||
|
CHROME_TAB_HOOK,
|
||||||
|
CHROME_NAVIGATE_HOOK,
|
||||||
|
)
|
||||||
|
|
||||||
# Get LIB_DIR and MACHINE_TYPE from environment or compute them
|
# Get LIB_DIR and NODE_MODULES_DIR from shared helpers
|
||||||
def get_lib_dir_and_machine_type():
|
LIB_DIR = get_lib_dir()
|
||||||
"""Get or compute LIB_DIR and MACHINE_TYPE for tests."""
|
NODE_MODULES_DIR = get_node_modules_dir()
|
||||||
from archivebox.config.paths import get_machine_type
|
|
||||||
from archivebox.config.common import STORAGE_CONFIG
|
|
||||||
|
|
||||||
lib_dir = os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)
|
|
||||||
machine_type = os.environ.get('MACHINE_TYPE') or get_machine_type()
|
|
||||||
|
|
||||||
return Path(lib_dir), machine_type
|
|
||||||
|
|
||||||
# Setup NODE_MODULES_DIR to find npm packages
|
|
||||||
LIB_DIR, MACHINE_TYPE = get_lib_dir_and_machine_type()
|
|
||||||
# Note: LIB_DIR already includes machine_type (e.g., data/lib/arm64-darwin)
|
|
||||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
|
||||||
NPM_PREFIX = LIB_DIR / 'npm'
|
NPM_PREFIX = LIB_DIR / 'npm'
|
||||||
|
|
||||||
# Chromium install location (relative to DATA_DIR)
|
# Chromium install location (relative to DATA_DIR)
|
||||||
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
||||||
|
|
||||||
def get_test_env():
|
|
||||||
"""Get environment with NODE_MODULES_DIR and CHROME_BINARY set correctly."""
|
|
||||||
env = os.environ.copy()
|
|
||||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
|
||||||
env['LIB_DIR'] = str(LIB_DIR)
|
|
||||||
env['MACHINE_TYPE'] = MACHINE_TYPE
|
|
||||||
# Ensure CHROME_BINARY is set to Chromium
|
|
||||||
if 'CHROME_BINARY' not in env:
|
|
||||||
chromium = find_chromium_binary()
|
|
||||||
if chromium:
|
|
||||||
env['CHROME_BINARY'] = chromium
|
|
||||||
return env
|
|
||||||
|
|
||||||
|
|
||||||
def find_chromium_binary(data_dir=None):
|
|
||||||
"""Find the Chromium binary using chrome_utils.js findChromium().
|
|
||||||
|
|
||||||
This uses the centralized findChromium() function which checks:
|
|
||||||
- CHROME_BINARY env var
|
|
||||||
- @puppeteer/browsers install locations (in data_dir/chromium)
|
|
||||||
- System Chromium locations
|
|
||||||
- Falls back to Chrome (with warning)
|
|
||||||
|
|
||||||
Args:
|
|
||||||
data_dir: Directory where chromium was installed (contains chromium/ subdir)
|
|
||||||
"""
|
|
||||||
chrome_utils = PLUGIN_DIR / 'chrome_utils.js'
|
|
||||||
# Use provided data_dir, or fall back to env var, or current dir
|
|
||||||
search_dir = data_dir or os.environ.get('DATA_DIR', '.')
|
|
||||||
result = subprocess.run(
|
|
||||||
['node', str(chrome_utils), 'findChromium', str(search_dir)],
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=10
|
|
||||||
)
|
|
||||||
if result.returncode == 0 and result.stdout.strip():
|
|
||||||
return result.stdout.strip()
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session", autouse=True)
|
@pytest.fixture(scope="session", autouse=True)
|
||||||
def ensure_chromium_and_puppeteer_installed():
|
def ensure_chromium_and_puppeteer_installed():
|
||||||
|
|||||||
@@ -20,29 +20,22 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_test_env,
|
||||||
|
get_plugin_dir,
|
||||||
|
get_hook_script,
|
||||||
|
run_hook_and_parse,
|
||||||
|
LIB_DIR,
|
||||||
|
NODE_MODULES_DIR,
|
||||||
|
PLUGINS_ROOT,
|
||||||
|
)
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||||
DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
|
DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*')
|
||||||
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
|
NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py')
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
# Get LIB_DIR for NODE_MODULES_DIR
|
|
||||||
def get_lib_dir():
|
|
||||||
"""Get LIB_DIR for tests."""
|
|
||||||
from archivebox.config.common import STORAGE_CONFIG
|
|
||||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
|
||||||
|
|
||||||
LIB_DIR = get_lib_dir()
|
|
||||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
|
||||||
|
|
||||||
def get_test_env():
|
|
||||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
|
||||||
env = os.environ.copy()
|
|
||||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
|
||||||
env['LIB_DIR'] = str(LIB_DIR)
|
|
||||||
return env
|
|
||||||
|
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
"""Verify on_Snapshot hook exists."""
|
"""Verify on_Snapshot hook exists."""
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
Integration tests for favicon plugin
|
Integration tests for favicon plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
pass
|
|
||||||
1. Plugin script exists
|
1. Plugin script exists
|
||||||
2. requests library is available
|
2. requests library is available
|
||||||
3. Favicon extraction works for real example.com
|
3. Favicon extraction works for real example.com
|
||||||
@@ -21,9 +20,15 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_plugin_dir,
|
||||||
|
get_hook_script,
|
||||||
|
parse_jsonl_output,
|
||||||
|
)
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
|
||||||
FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None)
|
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||||
|
FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,14 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
setup_test_env,
|
||||||
|
launch_chromium_session,
|
||||||
|
kill_chromium_session,
|
||||||
|
CHROME_LAUNCH_HOOK,
|
||||||
|
PLUGINS_ROOT,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
|
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
|
||||||
@@ -124,107 +132,6 @@ def test_no_configuration_required():
|
|||||||
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
|
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
|
||||||
|
|
||||||
|
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
|
||||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
|
||||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
|
||||||
|
|
||||||
|
|
||||||
def setup_test_env(tmpdir: Path) -> dict:
|
|
||||||
"""Set up isolated data/lib directory structure for tests.
|
|
||||||
|
|
||||||
Creates structure matching real ArchiveBox data dir:
|
|
||||||
<tmpdir>/data/
|
|
||||||
lib/
|
|
||||||
arm64-darwin/ (or x86_64-linux, etc.)
|
|
||||||
npm/
|
|
||||||
.bin/
|
|
||||||
node_modules/
|
|
||||||
personas/
|
|
||||||
Default/
|
|
||||||
chrome_extensions/
|
|
||||||
users/
|
|
||||||
testuser/
|
|
||||||
crawls/
|
|
||||||
snapshots/
|
|
||||||
|
|
||||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
|
||||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
|
||||||
"""
|
|
||||||
import platform
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
|
||||||
machine = platform.machine().lower()
|
|
||||||
system = platform.system().lower()
|
|
||||||
if machine in ('arm64', 'aarch64'):
|
|
||||||
machine = 'arm64'
|
|
||||||
elif machine in ('x86_64', 'amd64'):
|
|
||||||
machine = 'x86_64'
|
|
||||||
machine_type = f"{machine}-{system}"
|
|
||||||
|
|
||||||
# Create proper directory structure matching real ArchiveBox layout
|
|
||||||
data_dir = tmpdir / 'data'
|
|
||||||
lib_dir = data_dir / 'lib' / machine_type
|
|
||||||
npm_dir = lib_dir / 'npm'
|
|
||||||
npm_bin_dir = npm_dir / '.bin'
|
|
||||||
node_modules_dir = npm_dir / 'node_modules'
|
|
||||||
|
|
||||||
# Extensions go under personas/Default/
|
|
||||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
|
||||||
|
|
||||||
# User data goes under users/{username}/
|
|
||||||
date_str = datetime.now().strftime('%Y%m%d')
|
|
||||||
users_dir = data_dir / 'users' / 'testuser'
|
|
||||||
crawls_dir = users_dir / 'crawls' / date_str
|
|
||||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
|
||||||
|
|
||||||
# Create all directories
|
|
||||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# Build complete env dict
|
|
||||||
env = os.environ.copy()
|
|
||||||
env.update({
|
|
||||||
'DATA_DIR': str(data_dir),
|
|
||||||
'LIB_DIR': str(lib_dir),
|
|
||||||
'MACHINE_TYPE': machine_type,
|
|
||||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
|
||||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
|
||||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
|
||||||
'CRAWLS_DIR': str(crawls_dir),
|
|
||||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
|
||||||
})
|
|
||||||
|
|
||||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
|
||||||
result = subprocess.run(
|
|
||||||
['python', str(CHROME_INSTALL_HOOK)],
|
|
||||||
capture_output=True, text=True, timeout=120, env=env
|
|
||||||
)
|
|
||||||
if result.returncode != 0:
|
|
||||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
|
||||||
|
|
||||||
# Parse JSONL output to get CHROME_BINARY
|
|
||||||
chrome_binary = None
|
|
||||||
for line in result.stdout.strip().split('\n'):
|
|
||||||
if not line.strip():
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
data = json.loads(line)
|
|
||||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
|
||||||
chrome_binary = data['abspath']
|
|
||||||
break
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not chrome_binary or not Path(chrome_binary).exists():
|
|
||||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
|
||||||
|
|
||||||
env['CHROME_BINARY'] = chrome_binary
|
|
||||||
return env
|
|
||||||
|
|
||||||
TEST_URL = 'https://www.filmin.es/'
|
TEST_URL = 'https://www.filmin.es/'
|
||||||
|
|
||||||
|
|
||||||
@@ -420,54 +327,6 @@ const puppeteer = require('puppeteer-core');
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
|
||||||
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
|
|
||||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
chrome_launch_process = subprocess.Popen(
|
|
||||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
|
||||||
cwd=str(chrome_dir),
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
text=True,
|
|
||||||
env=env
|
|
||||||
)
|
|
||||||
|
|
||||||
# Wait for Chromium to launch and CDP URL to be available
|
|
||||||
cdp_url = None
|
|
||||||
for i in range(20):
|
|
||||||
if chrome_launch_process.poll() is not None:
|
|
||||||
stdout, stderr = chrome_launch_process.communicate()
|
|
||||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
|
||||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
|
||||||
if cdp_file.exists():
|
|
||||||
cdp_url = cdp_file.read_text().strip()
|
|
||||||
break
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
if not cdp_url:
|
|
||||||
chrome_launch_process.kill()
|
|
||||||
raise RuntimeError("Chromium CDP URL not found after 20s")
|
|
||||||
|
|
||||||
return chrome_launch_process, cdp_url
|
|
||||||
|
|
||||||
|
|
||||||
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
|
|
||||||
"""Clean up Chromium process."""
|
|
||||||
try:
|
|
||||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
|
||||||
chrome_launch_process.wait(timeout=5)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
|
||||||
if chrome_pid_file.exists():
|
|
||||||
try:
|
|
||||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
|
||||||
os.kill(chrome_pid, signal.SIGKILL)
|
|
||||||
except (OSError, ValueError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
||||||
"""Check if cookie consent elements are visible on a page.
|
"""Check if cookie consent elements are visible on a page.
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
Integration tests for mercury plugin
|
Integration tests for mercury plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
pass
|
|
||||||
1. Hook script exists
|
1. Hook script exists
|
||||||
2. Dependencies installed via validation hooks
|
2. Dependencies installed via validation hooks
|
||||||
3. Verify deps with abx-pkg
|
3. Verify deps with abx-pkg
|
||||||
@@ -19,9 +18,15 @@ import tempfile
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
get_plugin_dir,
|
||||||
MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None)
|
get_hook_script,
|
||||||
|
PLUGINS_ROOT,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||||
|
MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*')
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
|
|||||||
@@ -21,29 +21,22 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_test_env,
|
||||||
|
get_plugin_dir,
|
||||||
|
get_hook_script,
|
||||||
|
run_hook_and_parse,
|
||||||
|
LIB_DIR,
|
||||||
|
NODE_MODULES_DIR,
|
||||||
|
PLUGINS_ROOT,
|
||||||
|
)
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||||
PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
|
PDF_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_pdf.*')
|
||||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
|
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
# Get LIB_DIR for NODE_MODULES_DIR
|
|
||||||
def get_lib_dir():
|
|
||||||
"""Get LIB_DIR for tests."""
|
|
||||||
from archivebox.config.common import STORAGE_CONFIG
|
|
||||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
|
||||||
|
|
||||||
LIB_DIR = get_lib_dir()
|
|
||||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
|
||||||
|
|
||||||
def get_test_env():
|
|
||||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
|
||||||
env = os.environ.copy()
|
|
||||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
|
||||||
env['LIB_DIR'] = str(LIB_DIR)
|
|
||||||
return env
|
|
||||||
|
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
"""Verify on_Snapshot hook exists."""
|
"""Verify on_Snapshot hook exists."""
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
Integration tests for readability plugin
|
Integration tests for readability plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
pass
|
|
||||||
1. Validate hook checks for readability-extractor binary
|
1. Validate hook checks for readability-extractor binary
|
||||||
2. Verify deps with abx-pkg
|
2. Verify deps with abx-pkg
|
||||||
3. Plugin reports missing dependency correctly
|
3. Plugin reports missing dependency correctly
|
||||||
@@ -18,10 +17,15 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_plugin_dir,
|
||||||
|
get_hook_script,
|
||||||
|
PLUGINS_ROOT,
|
||||||
|
)
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||||
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*'))
|
READABILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_readability.*')
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -20,28 +20,20 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_test_env,
|
||||||
|
get_plugin_dir,
|
||||||
|
get_hook_script,
|
||||||
|
run_hook_and_parse,
|
||||||
|
LIB_DIR,
|
||||||
|
NODE_MODULES_DIR,
|
||||||
|
)
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||||
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
|
SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
# Get LIB_DIR for NODE_MODULES_DIR
|
|
||||||
def get_lib_dir():
|
|
||||||
"""Get LIB_DIR for tests."""
|
|
||||||
from archivebox.config.common import STORAGE_CONFIG
|
|
||||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
|
||||||
|
|
||||||
LIB_DIR = get_lib_dir()
|
|
||||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
|
||||||
|
|
||||||
def get_test_env():
|
|
||||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
|
||||||
env = os.environ.copy()
|
|
||||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
|
||||||
env['LIB_DIR'] = str(LIB_DIR)
|
|
||||||
return env
|
|
||||||
|
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
"""Verify on_Snapshot hook exists."""
|
"""Verify on_Snapshot hook exists."""
|
||||||
|
|||||||
@@ -77,27 +77,9 @@ def has_staticfile_output() -> bool:
|
|||||||
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
|
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
|
||||||
|
|
||||||
|
|
||||||
# Chrome binary search paths
|
# Chrome session directory (relative to extractor output dir)
|
||||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
# Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for.
|
||||||
'chromium', 'chromium-browser', 'chromium-browser-beta',
|
# The centralized Chrome binary search is in chrome_utils.js findChromium().
|
||||||
'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
|
|
||||||
]
|
|
||||||
CHROME_BINARY_NAMES_LINUX = [
|
|
||||||
'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
|
|
||||||
'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
|
|
||||||
]
|
|
||||||
CHROME_BINARY_NAMES_MACOS = [
|
|
||||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
|
||||||
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
|
||||||
]
|
|
||||||
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
|
|
||||||
|
|
||||||
ALL_CHROME_BINARIES = (
|
|
||||||
CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
|
|
||||||
CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
CHROME_SESSION_DIR = '../chrome'
|
CHROME_SESSION_DIR = '../chrome'
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,8 @@ Tests verify:
|
|||||||
2. CLI-based singlefile extraction works
|
2. CLI-based singlefile extraction works
|
||||||
3. Dependencies available via abx-pkg
|
3. Dependencies available via abx-pkg
|
||||||
4. Output contains valid HTML
|
4. Output contains valid HTML
|
||||||
|
5. Connects to Chrome session via CDP when available
|
||||||
|
6. Works with extensions loaded (ublock, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@@ -16,10 +18,17 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_test_env,
|
||||||
|
get_plugin_dir,
|
||||||
|
get_hook_script,
|
||||||
|
setup_chrome_session,
|
||||||
|
cleanup_chrome,
|
||||||
|
)
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||||
SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None)
|
SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py')
|
||||||
TEST_URL = "https://example.com"
|
TEST_URL = "https://example.com"
|
||||||
|
|
||||||
|
|
||||||
@@ -52,7 +61,7 @@ def test_singlefile_cli_archives_example_com():
|
|||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
tmpdir = Path(tmpdir)
|
tmpdir = Path(tmpdir)
|
||||||
|
|
||||||
env = os.environ.copy()
|
env = get_test_env()
|
||||||
env['SINGLEFILE_ENABLED'] = 'true'
|
env['SINGLEFILE_ENABLED'] = 'true'
|
||||||
|
|
||||||
# Run singlefile snapshot hook
|
# Run singlefile snapshot hook
|
||||||
@@ -78,5 +87,89 @@ def test_singlefile_cli_archives_example_com():
|
|||||||
assert 'Example Domain' in html_content, "Output should contain example.com content"
|
assert 'Example Domain' in html_content, "Output should contain example.com content"
|
||||||
|
|
||||||
|
|
||||||
|
def test_singlefile_with_chrome_session():
|
||||||
|
"""Test singlefile connects to existing Chrome session via CDP.
|
||||||
|
|
||||||
|
When a Chrome session exists (chrome/cdp_url.txt), singlefile should
|
||||||
|
connect to it instead of launching a new Chrome instance.
|
||||||
|
"""
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
tmpdir = Path(tmpdir)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Set up Chrome session using shared helper
|
||||||
|
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||||
|
tmpdir=tmpdir,
|
||||||
|
crawl_id='singlefile-test-crawl',
|
||||||
|
snapshot_id='singlefile-test-snap',
|
||||||
|
test_url=TEST_URL,
|
||||||
|
navigate=False, # Don't navigate, singlefile will do that
|
||||||
|
timeout=20,
|
||||||
|
)
|
||||||
|
|
||||||
|
# singlefile looks for ../chrome/cdp_url.txt relative to cwd
|
||||||
|
# So we need to run from a directory that has ../chrome pointing to our chrome dir
|
||||||
|
singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile'
|
||||||
|
singlefile_output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Create symlink so singlefile can find the chrome session
|
||||||
|
chrome_link = singlefile_output_dir.parent / 'chrome'
|
||||||
|
if not chrome_link.exists():
|
||||||
|
chrome_link.symlink_to(tmpdir / 'crawl' / 'chrome')
|
||||||
|
|
||||||
|
env = get_test_env()
|
||||||
|
env['SINGLEFILE_ENABLED'] = 'true'
|
||||||
|
env['CHROME_HEADLESS'] = 'true'
|
||||||
|
|
||||||
|
# Run singlefile - it should find and use the existing Chrome session
|
||||||
|
result = subprocess.run(
|
||||||
|
['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-test-snap'],
|
||||||
|
cwd=str(singlefile_output_dir),
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
env=env,
|
||||||
|
timeout=120
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify output
|
||||||
|
output_file = singlefile_output_dir / 'singlefile.html'
|
||||||
|
if output_file.exists():
|
||||||
|
html_content = output_file.read_text()
|
||||||
|
assert len(html_content) > 500, "Output file too small"
|
||||||
|
assert 'Example Domain' in html_content, "Should contain example.com content"
|
||||||
|
else:
|
||||||
|
# If singlefile couldn't connect to Chrome, it may have failed
|
||||||
|
# Check if it mentioned browser-server in its args (indicating it tried to use CDP)
|
||||||
|
assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \
|
||||||
|
f"Singlefile should attempt CDP connection. stderr: {result.stderr}"
|
||||||
|
|
||||||
|
finally:
|
||||||
|
cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||||
|
|
||||||
|
|
||||||
|
def test_singlefile_disabled_skips():
|
||||||
|
"""Test that SINGLEFILE_ENABLED=False exits without JSONL."""
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
tmpdir = Path(tmpdir)
|
||||||
|
|
||||||
|
env = get_test_env()
|
||||||
|
env['SINGLEFILE_ENABLED'] = 'False'
|
||||||
|
|
||||||
|
result = subprocess.run(
|
||||||
|
['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
|
||||||
|
cwd=tmpdir,
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
env=env,
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.returncode == 0, f"Should exit 0 when disabled: {result.stderr}"
|
||||||
|
|
||||||
|
# Should NOT emit JSONL when disabled
|
||||||
|
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
|
||||||
|
assert len(jsonl_lines) == 0, f"Should not emit JSONL when disabled, but got: {jsonl_lines}"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
pytest.main([__file__, '-v'])
|
pytest.main([__file__, '-v'])
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
Integration tests for title plugin
|
Integration tests for title plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
pass
|
|
||||||
1. Plugin script exists
|
1. Plugin script exists
|
||||||
2. Node.js is available
|
2. Node.js is available
|
||||||
3. Title extraction works for real example.com
|
3. Title extraction works for real example.com
|
||||||
@@ -20,9 +19,15 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_plugin_dir,
|
||||||
|
get_hook_script,
|
||||||
|
parse_jsonl_output,
|
||||||
|
)
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
|
||||||
TITLE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_title.*'), None)
|
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||||
|
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -16,184 +16,25 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
setup_test_env,
|
||||||
|
launch_chromium_session,
|
||||||
|
kill_chromium_session,
|
||||||
|
CHROME_LAUNCH_HOOK,
|
||||||
|
PLUGINS_ROOT,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
|
||||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
|
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
|
||||||
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
|
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
|
||||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
|
||||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
|
||||||
|
|
||||||
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
|
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
|
||||||
|
|
||||||
|
|
||||||
def setup_test_env(tmpdir: Path) -> dict:
|
# Alias for backward compatibility with existing test names
|
||||||
"""Set up isolated data/lib directory structure for tests.
|
launch_chrome = launch_chromium_session
|
||||||
|
kill_chrome = kill_chromium_session
|
||||||
Creates structure matching real ArchiveBox data dir:
|
|
||||||
<tmpdir>/data/
|
|
||||||
lib/
|
|
||||||
arm64-darwin/ (or x86_64-linux, etc.)
|
|
||||||
npm/
|
|
||||||
.bin/
|
|
||||||
node_modules/
|
|
||||||
personas/
|
|
||||||
default/
|
|
||||||
chrome_extensions/
|
|
||||||
users/
|
|
||||||
testuser/
|
|
||||||
crawls/
|
|
||||||
snapshots/
|
|
||||||
|
|
||||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
|
||||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
|
||||||
"""
|
|
||||||
import platform
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
|
||||||
machine = platform.machine().lower()
|
|
||||||
system = platform.system().lower()
|
|
||||||
if machine in ('arm64', 'aarch64'):
|
|
||||||
machine = 'arm64'
|
|
||||||
elif machine in ('x86_64', 'amd64'):
|
|
||||||
machine = 'x86_64'
|
|
||||||
machine_type = f"{machine}-{system}"
|
|
||||||
|
|
||||||
# Create proper directory structure matching real ArchiveBox layout
|
|
||||||
data_dir = tmpdir / 'data'
|
|
||||||
lib_dir = data_dir / 'lib' / machine_type
|
|
||||||
npm_dir = lib_dir / 'npm'
|
|
||||||
npm_bin_dir = npm_dir / '.bin'
|
|
||||||
node_modules_dir = npm_dir / 'node_modules'
|
|
||||||
|
|
||||||
# Extensions go under personas/Default/
|
|
||||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
|
||||||
|
|
||||||
# User data goes under users/{username}/
|
|
||||||
date_str = datetime.now().strftime('%Y%m%d')
|
|
||||||
users_dir = data_dir / 'users' / 'testuser'
|
|
||||||
crawls_dir = users_dir / 'crawls' / date_str
|
|
||||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
|
||||||
|
|
||||||
# Create all directories
|
|
||||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# Build complete env dict
|
|
||||||
env = os.environ.copy()
|
|
||||||
env.update({
|
|
||||||
'DATA_DIR': str(data_dir),
|
|
||||||
'LIB_DIR': str(lib_dir),
|
|
||||||
'MACHINE_TYPE': machine_type,
|
|
||||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
|
||||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
|
||||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
|
||||||
'CRAWLS_DIR': str(crawls_dir),
|
|
||||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
|
||||||
})
|
|
||||||
|
|
||||||
# Only set headless if not already in environment (allow override for debugging)
|
|
||||||
if 'CHROME_HEADLESS' not in os.environ:
|
|
||||||
env['CHROME_HEADLESS'] = 'true'
|
|
||||||
|
|
||||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
|
||||||
result = subprocess.run(
|
|
||||||
['python', str(CHROME_INSTALL_HOOK)],
|
|
||||||
capture_output=True, text=True, timeout=120, env=env
|
|
||||||
)
|
|
||||||
if result.returncode != 0:
|
|
||||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
|
||||||
|
|
||||||
# Parse JSONL output to get CHROME_BINARY
|
|
||||||
chrome_binary = None
|
|
||||||
for line in result.stdout.strip().split('\n'):
|
|
||||||
if not line.strip():
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
data = json.loads(line)
|
|
||||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
|
||||||
chrome_binary = data['abspath']
|
|
||||||
break
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not chrome_binary or not Path(chrome_binary).exists():
|
|
||||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
|
||||||
|
|
||||||
env['CHROME_BINARY'] = chrome_binary
|
|
||||||
return env
|
|
||||||
|
|
||||||
|
|
||||||
def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str):
|
|
||||||
"""Launch Chromium and return (process, cdp_url)."""
|
|
||||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
process = subprocess.Popen(
|
|
||||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
|
||||||
cwd=str(chrome_dir),
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
text=True,
|
|
||||||
env=env
|
|
||||||
)
|
|
||||||
|
|
||||||
cdp_url = None
|
|
||||||
extensions_ready = False
|
|
||||||
for _ in range(30):
|
|
||||||
if process.poll() is not None:
|
|
||||||
stdout, stderr = process.communicate()
|
|
||||||
raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}")
|
|
||||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
|
||||||
ext_file = chrome_dir / 'extensions.json'
|
|
||||||
if cdp_file.exists() and not cdp_url:
|
|
||||||
cdp_url = cdp_file.read_text().strip()
|
|
||||||
if ext_file.exists():
|
|
||||||
extensions_ready = True
|
|
||||||
if cdp_url and extensions_ready:
|
|
||||||
break
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
if not cdp_url:
|
|
||||||
process.kill()
|
|
||||||
stdout, stderr = process.communicate()
|
|
||||||
raise RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}")
|
|
||||||
|
|
||||||
# Print chrome launch hook output for debugging
|
|
||||||
import select
|
|
||||||
if hasattr(select, 'poll'):
|
|
||||||
# Read any available stderr without blocking
|
|
||||||
import fcntl
|
|
||||||
import os as os_module
|
|
||||||
fd = process.stderr.fileno()
|
|
||||||
fl = fcntl.fcntl(fd, fcntl.F_GETFL)
|
|
||||||
fcntl.fcntl(fd, fcntl.F_SETFL, fl | os_module.O_NONBLOCK)
|
|
||||||
try:
|
|
||||||
stderr_output = process.stderr.read()
|
|
||||||
if stderr_output:
|
|
||||||
print(f"[Chrome Launch Hook Output]\n{stderr_output}")
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return process, cdp_url
|
|
||||||
|
|
||||||
|
|
||||||
def kill_chrome(process, chrome_dir: Path):
|
|
||||||
"""Kill Chromium process."""
|
|
||||||
try:
|
|
||||||
process.send_signal(signal.SIGTERM)
|
|
||||||
process.wait(timeout=5)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
pid_file = chrome_dir / 'chrome.pid'
|
|
||||||
if pid_file.exists():
|
|
||||||
try:
|
|
||||||
os.kill(int(pid_file.read_text().strip()), signal.SIGKILL)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class TestTwoCaptcha:
|
class TestTwoCaptcha:
|
||||||
|
|||||||
@@ -12,6 +12,14 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
setup_test_env,
|
||||||
|
launch_chromium_session,
|
||||||
|
kill_chromium_session,
|
||||||
|
CHROME_LAUNCH_HOOK,
|
||||||
|
PLUGINS_ROOT,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
|
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
|
||||||
@@ -157,64 +165,6 @@ def test_large_extension_size():
|
|||||||
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
||||||
|
|
||||||
|
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
|
||||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
|
||||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
|
||||||
|
|
||||||
|
|
||||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
|
||||||
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
|
|
||||||
import signal
|
|
||||||
import time
|
|
||||||
|
|
||||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
chrome_launch_process = subprocess.Popen(
|
|
||||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
|
||||||
cwd=str(chrome_dir),
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
text=True,
|
|
||||||
env=env
|
|
||||||
)
|
|
||||||
|
|
||||||
# Wait for Chromium to launch and CDP URL to be available
|
|
||||||
cdp_url = None
|
|
||||||
for i in range(20):
|
|
||||||
if chrome_launch_process.poll() is not None:
|
|
||||||
stdout, stderr = chrome_launch_process.communicate()
|
|
||||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
|
||||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
|
||||||
if cdp_file.exists():
|
|
||||||
cdp_url = cdp_file.read_text().strip()
|
|
||||||
break
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
if not cdp_url:
|
|
||||||
chrome_launch_process.kill()
|
|
||||||
raise RuntimeError("Chromium CDP URL not found after 20s")
|
|
||||||
|
|
||||||
return chrome_launch_process, cdp_url
|
|
||||||
|
|
||||||
|
|
||||||
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
|
|
||||||
"""Clean up Chromium process."""
|
|
||||||
import signal
|
|
||||||
|
|
||||||
try:
|
|
||||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
|
||||||
chrome_launch_process.wait(timeout=5)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
|
||||||
if chrome_pid_file.exists():
|
|
||||||
try:
|
|
||||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
|
||||||
os.kill(chrome_pid, signal.SIGKILL)
|
|
||||||
except (OSError, ValueError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
||||||
"""Check ad blocking effectiveness by counting ad elements on page.
|
"""Check ad blocking effectiveness by counting ad elements on page.
|
||||||
|
|
||||||
@@ -350,103 +300,6 @@ const puppeteer = require('puppeteer-core');
|
|||||||
return json.loads(output_lines[-1])
|
return json.loads(output_lines[-1])
|
||||||
|
|
||||||
|
|
||||||
def setup_test_env(tmpdir: Path) -> dict:
|
|
||||||
"""Set up isolated data/lib directory structure for tests.
|
|
||||||
|
|
||||||
Creates structure matching real ArchiveBox data dir:
|
|
||||||
<tmpdir>/data/
|
|
||||||
lib/
|
|
||||||
arm64-darwin/ (or x86_64-linux, etc.)
|
|
||||||
npm/
|
|
||||||
.bin/
|
|
||||||
node_modules/
|
|
||||||
personas/
|
|
||||||
default/
|
|
||||||
chrome_extensions/
|
|
||||||
users/
|
|
||||||
testuser/
|
|
||||||
crawls/
|
|
||||||
snapshots/
|
|
||||||
|
|
||||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
|
||||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
|
||||||
"""
|
|
||||||
import platform
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
|
||||||
machine = platform.machine().lower()
|
|
||||||
system = platform.system().lower()
|
|
||||||
if machine in ('arm64', 'aarch64'):
|
|
||||||
machine = 'arm64'
|
|
||||||
elif machine in ('x86_64', 'amd64'):
|
|
||||||
machine = 'x86_64'
|
|
||||||
machine_type = f"{machine}-{system}"
|
|
||||||
|
|
||||||
# Create proper directory structure matching real ArchiveBox layout
|
|
||||||
data_dir = tmpdir / 'data'
|
|
||||||
lib_dir = data_dir / 'lib' / machine_type
|
|
||||||
npm_dir = lib_dir / 'npm'
|
|
||||||
npm_bin_dir = npm_dir / '.bin'
|
|
||||||
node_modules_dir = npm_dir / 'node_modules'
|
|
||||||
|
|
||||||
# Extensions go under personas/Default/
|
|
||||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
|
||||||
|
|
||||||
# User data goes under users/{username}/
|
|
||||||
date_str = datetime.now().strftime('%Y%m%d')
|
|
||||||
users_dir = data_dir / 'users' / 'testuser'
|
|
||||||
crawls_dir = users_dir / 'crawls' / date_str
|
|
||||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
|
||||||
|
|
||||||
# Create all directories
|
|
||||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# Build complete env dict
|
|
||||||
env = os.environ.copy()
|
|
||||||
env.update({
|
|
||||||
'DATA_DIR': str(data_dir),
|
|
||||||
'LIB_DIR': str(lib_dir),
|
|
||||||
'MACHINE_TYPE': machine_type,
|
|
||||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
|
||||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
|
||||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
|
||||||
'CRAWLS_DIR': str(crawls_dir),
|
|
||||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
|
||||||
})
|
|
||||||
|
|
||||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
|
||||||
result = subprocess.run(
|
|
||||||
['python', str(CHROME_INSTALL_HOOK)],
|
|
||||||
capture_output=True, text=True, timeout=120, env=env
|
|
||||||
)
|
|
||||||
if result.returncode != 0:
|
|
||||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
|
||||||
|
|
||||||
# Parse JSONL output to get CHROME_BINARY
|
|
||||||
chrome_binary = None
|
|
||||||
for line in result.stdout.strip().split('\n'):
|
|
||||||
if not line.strip():
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
data = json.loads(line)
|
|
||||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
|
||||||
chrome_binary = data['abspath']
|
|
||||||
break
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not chrome_binary or not Path(chrome_binary).exists():
|
|
||||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
|
||||||
|
|
||||||
env['CHROME_BINARY'] = chrome_binary
|
|
||||||
return env
|
|
||||||
|
|
||||||
|
|
||||||
# Test URL: Yahoo has many ads that uBlock should block
|
# Test URL: Yahoo has many ads that uBlock should block
|
||||||
TEST_URL = 'https://www.yahoo.com/'
|
TEST_URL = 'https://www.yahoo.com/'
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user