Add JS-Python path delegation to reduce Chrome-related duplication

- Add getMachineType, getLibDir, getNodeModulesDir, getTestEnv CLI commands to chrome_utils.js
  These are now the single source of truth for path calculations
- Update chrome_test_helpers.py with call_chrome_utils() dispatcher
- Add get_test_env_from_js(), get_machine_type_from_js(), kill_chrome_via_js() helpers
- Update cleanup_chrome and kill_chromium_session to use JS killChrome
- Remove unused Chrome binary search lists from singlefile hook (~25 lines)
- Update readability, mercury, favicon, title tests to use shared helpers
This commit is contained in:
Claude
2025-12-31 09:11:11 +00:00
parent d72ab7c397
commit adeffb4bc5
7 changed files with 273 additions and 76 deletions

View File

@@ -1333,6 +1333,83 @@ function getExtensionsDir() {
path.join(dataDir, 'personas', persona, 'chrome_extensions'); path.join(dataDir, 'personas', persona, 'chrome_extensions');
} }
/**
 * Build the "<arch>-<os>" identifier used for platform-specific paths.
 * Intended to mirror Python's archivebox.config.paths.get_machine_type().
 *
 * A non-empty MACHINE_TYPE environment variable short-circuits detection
 * and is returned verbatim.
 *
 * @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin')
 */
function getMachineType() {
    const override = process.env.MACHINE_TYPE;
    if (override) {
        return override;
    }

    // Collapse architecture aliases onto one canonical spelling each;
    // any architecture not listed here is passed through unchanged.
    let arch;
    switch (process.arch) {
        case 'arm64':
        case 'aarch64':
            arch = 'arm64';
            break;
        case 'x64':
        case 'x86_64':
        case 'amd64':
            arch = 'x86_64';
            break;
        case 'ia32':
        case 'x86':
            arch = 'x86';
            break;
        default:
            arch = process.arch;
    }

    return `${arch}-${process.platform}`;
}
/**
 * Resolve the directory that holds platform-specific binaries.
 *
 * A non-empty LIB_DIR environment variable wins. Otherwise the result is
 * <DATA_DIR>/lib/<machine type>, where DATA_DIR is looked up through
 * getEnv() with './data' as the fallback argument. The computed path is
 * joined, not resolved, so it is only as absolute as DATA_DIR itself.
 *
 * @returns {string} - Path to lib directory
 */
function getLibDir() {
    const override = process.env.LIB_DIR;
    return override
        ? override
        : path.join(getEnv('DATA_DIR', './data'), 'lib', getMachineType());
}
/**
 * Resolve the node_modules directory used for npm packages.
 *
 * A non-empty NODE_MODULES_DIR environment variable is returned verbatim;
 * otherwise the result is <getLibDir()>/npm/node_modules.
 *
 * @returns {string} - Path to node_modules directory
 */
function getNodeModulesDir() {
    const override = process.env.NODE_MODULES_DIR;
    if (!override) {
        return path.join(getLibDir(), 'npm', 'node_modules');
    }
    return override;
}
/**
 * Collect every path-related setting into one plain object.
 *
 * The CLI serialises this object as JSON so that Python callers can read
 * the paths instead of re-deriving them.
 *
 * @returns {Object} - Object with keys DATA_DIR, MACHINE_TYPE, LIB_DIR,
 *   NODE_MODULES_DIR, NPM_BIN_DIR and CHROME_EXTENSIONS_DIR
 */
function getTestEnv() {
    // Keys are assigned in the order they should appear in the JSON output.
    const env = {};
    env.DATA_DIR = getEnv('DATA_DIR', './data');
    env.MACHINE_TYPE = getMachineType();
    env.LIB_DIR = getLibDir();
    env.NODE_MODULES_DIR = getNodeModulesDir();
    // NPM_BIN_DIR is always derived from LIB_DIR, independent of NODE_MODULES_DIR.
    env.NPM_BIN_DIR = path.join(env.LIB_DIR, 'npm', '.bin');
    env.CHROME_EXTENSIONS_DIR = getExtensionsDir();
    return env;
}
/** /**
* Install a Chrome extension with caching support. * Install a Chrome extension with caching support.
* *
@@ -1442,8 +1519,13 @@ module.exports = {
getExtensionPaths, getExtensionPaths,
waitForExtensionTarget, waitForExtensionTarget,
getExtensionTargets, getExtensionTargets,
// Shared extension installer utilities // Shared path utilities (single source of truth for Python/JS)
getMachineType,
getLibDir,
getNodeModulesDir,
getExtensionsDir, getExtensionsDir,
getTestEnv,
// Shared extension installer utilities
installExtensionWithCache, installExtensionWithCache,
// Deprecated - use enableExtensions option instead // Deprecated - use enableExtensions option instead
getExtensionLaunchArgs, getExtensionLaunchArgs,
@@ -1457,18 +1539,31 @@ if (require.main === module) {
console.log('Usage: chrome_utils.js <command> [args...]'); console.log('Usage: chrome_utils.js <command> [args...]');
console.log(''); console.log('');
console.log('Commands:'); console.log('Commands:');
console.log(' findChromium'); console.log(' findChromium Find Chrome/Chromium binary');
console.log(' installChromium'); console.log(' installChromium Install Chromium via @puppeteer/browsers');
console.log(' installPuppeteerCore [npm_prefix]'); console.log(' installPuppeteerCore Install puppeteer-core npm package');
console.log(' launchChromium [output_dir] [extension_paths_json]'); console.log(' launchChromium Launch Chrome with CDP debugging');
console.log(' killChrome <pid> [output_dir]'); console.log(' killChrome <pid> Kill Chrome process by PID');
console.log(' killZombieChrome [data_dir]'); console.log(' killZombieChrome Clean up zombie Chrome processes');
console.log(' getExtensionId <path>'); console.log('');
console.log(' loadExtensionManifest <path>'); console.log(' getMachineType Get machine type (e.g., x86_64-linux)');
console.log(' getExtensionLaunchArgs <extensions_json>'); console.log(' getLibDir Get LIB_DIR path');
console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]'); console.log(' getNodeModulesDir Get NODE_MODULES_DIR path');
console.log(' getExtensionsDir'); console.log(' getExtensionsDir Get Chrome extensions directory');
console.log(' installExtensionWithCache <webstore_id> <name>'); console.log(' getTestEnv Get all paths as JSON (for tests)');
console.log('');
console.log(' getExtensionId <path> Get extension ID from unpacked path');
console.log(' loadExtensionManifest Load extension manifest.json');
console.log(' loadOrInstallExtension Load or install an extension');
console.log(' installExtensionWithCache Install extension with caching');
console.log('');
console.log('Environment variables:');
console.log(' DATA_DIR Base data directory');
console.log(' LIB_DIR Library directory (computed if not set)');
console.log(' MACHINE_TYPE Machine type override');
console.log(' NODE_MODULES_DIR Node modules directory');
console.log(' CHROME_BINARY Chrome binary path');
console.log(' CHROME_EXTENSIONS_DIR Extensions directory');
process.exit(1); process.exit(1);
} }
@@ -1581,11 +1676,31 @@ if (require.main === module) {
break; break;
} }
case 'getMachineType': {
console.log(getMachineType());
break;
}
case 'getLibDir': {
console.log(getLibDir());
break;
}
case 'getNodeModulesDir': {
console.log(getNodeModulesDir());
break;
}
case 'getExtensionsDir': { case 'getExtensionsDir': {
console.log(getExtensionsDir()); console.log(getExtensionsDir());
break; break;
} }
case 'getTestEnv': {
console.log(JSON.stringify(getTestEnv(), null, 2));
break;
}
case 'installExtensionWithCache': { case 'installExtensionWithCache': {
const [webstore_id, name] = commandArgs; const [webstore_id, name] = commandArgs;
if (!webstore_id || !name) { if (!webstore_id || !name) {

View File

@@ -321,6 +321,51 @@ def run_hook_and_parse(
return returncode, result, stderr return returncode, result, stderr
def call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]:
    """Run a chrome_utils.js CLI command with node and capture its output.

    This is the central dispatch for calling the JS utilities from Python,
    so that path calculations and Chrome operations live in chrome_utils.js
    and stay consistent between Python and JavaScript code.

    Args:
        command: The CLI command (e.g., 'findChromium', 'getTestEnv')
        *args: Additional command arguments, passed through verbatim
        env: Environment mapping for the child process. ``None`` (the default)
            means a copy of the current environment; any mapping that is
            passed, including an empty one, is used as-is.

    Returns:
        Tuple of (returncode, stdout, stderr)

    Raises:
        subprocess.TimeoutExpired: if the command runs longer than 30 seconds.
        OSError: if the ``node`` executable cannot be started.
    """
    cmd = ['node', str(CHROME_UTILS), command, *args]
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=30,
        # Compare against None rather than truthiness so that an explicitly
        # supplied empty environment is not replaced by the inherited one.
        env=os.environ.copy() if env is None else env,
    )
    return result.returncode, result.stdout, result.stderr
def get_test_env_from_js() -> Optional[Dict[str, str]]:
    """Get test environment paths from the chrome_utils.js ``getTestEnv`` command.

    Python calls JS for these paths instead of duplicating the path logic.

    Returns:
        Dict with DATA_DIR, MACHINE_TYPE, LIB_DIR, NODE_MODULES_DIR, etc.,
        or None if the JS call fails for any reason: node cannot be started,
        the call times out, the exit code is non-zero, stdout is empty, or
        stdout is not a JSON object.
    """
    try:
        returncode, stdout, _stderr = call_chrome_utils('getTestEnv')
    except (subprocess.TimeoutExpired, OSError):
        # A missing or hung node process is a failed JS call; honour the
        # documented "None on failure" contract instead of raising.
        return None

    if returncode != 0 or not stdout.strip():
        return None

    try:
        parsed = json.loads(stdout)
    except json.JSONDecodeError:
        return None

    # Valid JSON that is not an object does not satisfy the declared return type.
    return parsed if isinstance(parsed, dict) else None
def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]: def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]:
"""Find the Chromium binary using chrome_utils.js findChromium(). """Find the Chromium binary using chrome_utils.js findChromium().
@@ -336,15 +381,12 @@ def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]:
Returns: Returns:
Path to Chromium binary or None if not found Path to Chromium binary or None if not found
""" """
search_dir = data_dir or os.environ.get('DATA_DIR', '.') env = os.environ.copy()
result = subprocess.run( if data_dir:
['node', str(CHROME_UTILS), 'findChromium', str(search_dir)], env['DATA_DIR'] = str(data_dir)
capture_output=True, returncode, stdout, stderr = call_chrome_utils('findChromium', env=env)
text=True, if returncode == 0 and stdout.strip():
timeout=10 return stdout.strip()
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return None return None
@@ -358,21 +400,52 @@ def get_extensions_dir() -> str:
Returns: Returns:
Path to extensions directory Path to extensions directory
""" """
result = subprocess.run( returncode, stdout, stderr = call_chrome_utils('getExtensionsDir')
['node', str(CHROME_UTILS), 'getExtensionsDir'], if returncode == 0 and stdout.strip():
capture_output=True, return stdout.strip()
text=True,
timeout=10,
env=get_test_env()
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
# Fallback to default computation if JS call fails # Fallback to default computation if JS call fails
data_dir = os.environ.get('DATA_DIR', './data') data_dir = os.environ.get('DATA_DIR', './data')
persona = os.environ.get('ACTIVE_PERSONA', 'Default') persona = os.environ.get('ACTIVE_PERSONA', 'Default')
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions') return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
def get_machine_type_from_js() -> Optional[str]:
    """Get the machine type from the chrome_utils.js ``getMachineType`` command.

    Delegates to JS so the machine type is computed in one place.
    Returns values like 'x86_64-linux', 'arm64-darwin'.

    Returns:
        Machine type string (stdout with surrounding whitespace stripped),
        or None if the JS call fails: node cannot be started, the call times
        out, the exit code is non-zero, or stdout is empty.
    """
    try:
        returncode, stdout, _stderr = call_chrome_utils('getMachineType')
    except (subprocess.TimeoutExpired, OSError):
        # Treat a missing or hung node process like any other failed call.
        return None

    machine_type = stdout.strip()
    if returncode == 0 and machine_type:
        return machine_type
    return None
def kill_chrome_via_js(pid: int, output_dir: Optional[str] = None) -> bool:
    """Kill a Chrome process via the chrome_utils.js ``killChrome`` command.

    The kill strategy itself lives in chrome_utils.js (not shown here); this
    helper only forwards the PID and optional output directory to that command.
    NOTE(review): the original docstring says killChrome does SIGTERM then
    SIGKILL, process group killing and zombie cleanup - confirm against
    chrome_utils.js.

    This is a best-effort cleanup helper: failures are reported through the
    return value and never raised, so teardown code can call it unconditionally.

    Args:
        pid: Process ID to kill
        output_dir: Optional chrome output directory, forwarded as the second
            CLI argument when truthy

    Returns:
        True if the kill command exited with status 0; False if it exited
        non-zero, timed out, or node could not be started.
    """
    cli_args = [str(pid)]
    if output_dir:
        cli_args.append(str(output_dir))

    try:
        returncode, _stdout, _stderr = call_chrome_utils('killChrome', *cli_args)
    except (subprocess.TimeoutExpired, OSError):
        # Cleanup paths previously ignored kill errors; keep them from
        # crashing when node is unavailable or the command hangs.
        return False
    return returncode == 0
# ============================================================================= # =============================================================================
# Extension Test Helpers # Extension Test Helpers
# Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha) # Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha)
@@ -535,21 +608,26 @@ def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple
def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None: def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None:
"""Clean up Chromium process launched by launch_chromium_session. """Clean up Chromium process launched by launch_chromium_session.
Uses chrome_utils.js killChrome for proper process group handling.
Args: Args:
chrome_launch_process: The Popen object from launch_chromium_session chrome_launch_process: The Popen object from launch_chromium_session
chrome_dir: The chrome directory containing chrome.pid chrome_dir: The chrome directory containing chrome.pid
""" """
# First try to terminate the launch process gracefully
try: try:
chrome_launch_process.send_signal(signal.SIGTERM) chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5) chrome_launch_process.wait(timeout=5)
except Exception: except Exception:
pass pass
# Read PID and use JS to kill with proper cleanup
chrome_pid_file = chrome_dir / 'chrome.pid' chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists(): if chrome_pid_file.exists():
try: try:
chrome_pid = int(chrome_pid_file.read_text().strip()) chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL) kill_chrome_via_js(chrome_pid, str(chrome_dir))
except (OSError, ValueError): except (ValueError, FileNotFoundError):
pass pass
@@ -683,25 +761,28 @@ def setup_chrome_session(
return chrome_launch_process, chrome_pid, snapshot_chrome_dir return chrome_launch_process, chrome_pid, snapshot_chrome_dir
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int) -> None: def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None:
"""Clean up Chrome processes. """Clean up Chrome processes using chrome_utils.js killChrome.
Sends SIGTERM to the chrome_launch_process and SIGKILL to the Chrome PID. Uses the centralized kill logic from chrome_utils.js which handles:
Ignores errors if processes are already dead. - SIGTERM then SIGKILL
- Process group killing
- Zombie process cleanup
Args: Args:
chrome_launch_process: The Popen object for the chrome launch hook chrome_launch_process: The Popen object for the chrome launch hook
chrome_pid: The PID of the Chrome process chrome_pid: The PID of the Chrome process
chrome_dir: Optional path to chrome output directory
""" """
# First try to terminate the launch process gracefully
try: try:
chrome_launch_process.send_signal(signal.SIGTERM) chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5) chrome_launch_process.wait(timeout=5)
except Exception: except Exception:
pass pass
try:
os.kill(chrome_pid, signal.SIGKILL) # Use JS to kill Chrome with proper process group handling
except OSError: kill_chrome_via_js(chrome_pid, str(chrome_dir) if chrome_dir else None)
pass
@contextmanager @contextmanager

View File

@@ -2,7 +2,6 @@
Integration tests for favicon plugin Integration tests for favicon plugin
Tests verify: Tests verify:
pass
1. Plugin script exists 1. Plugin script exists
2. requests library is available 2. requests library is available
3. Favicon extraction works for real example.com 3. Favicon extraction works for real example.com
@@ -21,9 +20,15 @@ from pathlib import Path
import pytest import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
)
PLUGIN_DIR = Path(__file__).parent.parent
FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None) PLUGIN_DIR = get_plugin_dir(__file__)
FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
TEST_URL = 'https://example.com' TEST_URL = 'https://example.com'

View File

@@ -2,7 +2,6 @@
Integration tests for mercury plugin Integration tests for mercury plugin
Tests verify: Tests verify:
pass
1. Hook script exists 1. Hook script exists
2. Dependencies installed via validation hooks 2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg 3. Verify deps with abx-pkg
@@ -19,9 +18,15 @@ import tempfile
from pathlib import Path from pathlib import Path
import pytest import pytest
PLUGIN_DIR = Path(__file__).parent.parent from archivebox.plugins.chrome.tests.chrome_test_helpers import (
PLUGINS_ROOT = PLUGIN_DIR.parent get_plugin_dir,
MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None) get_hook_script,
PLUGINS_ROOT,
)
PLUGIN_DIR = get_plugin_dir(__file__)
MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*')
TEST_URL = 'https://example.com' TEST_URL = 'https://example.com'
def test_hook_script_exists(): def test_hook_script_exists():

View File

@@ -2,7 +2,6 @@
Integration tests for readability plugin Integration tests for readability plugin
Tests verify: Tests verify:
pass
1. Validate hook checks for readability-extractor binary 1. Validate hook checks for readability-extractor binary
2. Verify deps with abx-pkg 2. Verify deps with abx-pkg
3. Plugin reports missing dependency correctly 3. Plugin reports missing dependency correctly
@@ -18,10 +17,15 @@ from pathlib import Path
import pytest import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
PLUGINS_ROOT,
)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent PLUGIN_DIR = get_plugin_dir(__file__)
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*')) READABILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_readability.*')
TEST_URL = 'https://example.com' TEST_URL = 'https://example.com'

View File

@@ -77,27 +77,9 @@ def has_staticfile_output() -> bool:
return staticfile_dir.exists() and any(staticfile_dir.iterdir()) return staticfile_dir.exists() and any(staticfile_dir.iterdir())
# Chrome binary search paths # Chrome session directory (relative to extractor output dir)
CHROMIUM_BINARY_NAMES_LINUX = [ # Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for.
'chromium', 'chromium-browser', 'chromium-browser-beta', # The centralized Chrome binary search is in chrome_utils.js findChromium().
'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
]
CHROME_BINARY_NAMES_LINUX = [
'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
]
CHROME_BINARY_NAMES_MACOS = [
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
]
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
ALL_CHROME_BINARIES = (
CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
)
CHROME_SESSION_DIR = '../chrome' CHROME_SESSION_DIR = '../chrome'

View File

@@ -2,7 +2,6 @@
Integration tests for title plugin Integration tests for title plugin
Tests verify: Tests verify:
pass
1. Plugin script exists 1. Plugin script exists
2. Node.js is available 2. Node.js is available
3. Title extraction works for real example.com 3. Title extraction works for real example.com
@@ -20,9 +19,15 @@ from pathlib import Path
import pytest import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
)
PLUGIN_DIR = Path(__file__).parent.parent
TITLE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_title.*'), None) PLUGIN_DIR = get_plugin_dir(__file__)
TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*')
TEST_URL = 'https://example.com' TEST_URL = 'https://example.com'