centralize chrome pid and zombie logic in chrome_utils

This commit is contained in:
Nick Sweeting
2025-12-29 17:57:23 -08:00
parent 4ba3e8d120
commit b670612685
8 changed files with 75 additions and 213 deletions

View File

@@ -45,6 +45,17 @@ function getEnvBool(name, defaultValue = false) {
return defaultValue;
}
/**
 * Read an environment variable and interpret it as a base-10 integer.
 *
 * The value is fetched through getEnv(), with the stringified default as the
 * fallback text, and then parsed with radix 10.
 *
 * @param {string} name - Environment variable name
 * @param {number} [defaultValue=0] - Returned when the parsed value is NaN
 * @returns {number} - Parsed integer, or defaultValue if parsing fails
 */
function getEnvInt(name, defaultValue = 0) {
  const rawText = getEnv(name, String(defaultValue));
  const parsed = Number.parseInt(rawText, 10);
  if (Number.isNaN(parsed)) {
    return defaultValue;
  }
  return parsed;
}
/**
* Parse resolution string into width/height.
* @param {string} resolution - Resolution string like "1440,2000"
@@ -1004,6 +1015,7 @@ module.exports = {
// Environment helpers
getEnv,
getEnvBool,
getEnvInt,
parseResolution,
// PID file management
writePidWithMtime,

View File

@@ -25,12 +25,18 @@
const fs = require('fs');
const path = require('path');
const { spawn } = require('child_process');
const http = require('http');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const { findChromium } = require('./chrome_utils.js');
const {
findChromium,
getEnv,
getEnvBool,
parseResolution,
findFreePort,
waitForDebugPort,
} = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_tab';
@@ -50,18 +56,6 @@ function parseArgs() {
return args;
}
// Read an environment variable as a trimmed string.
// Falls back to defaultValue when the variable is unset or empty.
function getEnv(name, defaultValue = '') {
  const rawValue = process.env[name];
  const chosen = rawValue ? rawValue : defaultValue;
  return chosen.trim();
}
// Interpret an environment variable as a boolean flag (case-insensitive).
// Recognised truthy/falsy spellings map to true/false; anything else,
// including an unset variable, yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Cleanup handler for SIGTERM - close this snapshot's tab
async function cleanup() {
try {
@@ -91,63 +85,6 @@ async function cleanup() {
process.on('SIGTERM', cleanup);
process.on('SIGINT', cleanup);
// Parse a "width,height" resolution string into { width, height }.
// Each dimension falls back to its default (1440 x 2000) when it is missing,
// not a number, or not a positive value. Null/undefined/non-string input is
// tolerated and treated as an empty string instead of throwing.
function parseResolution(resolution) {
  const DEFAULT_WIDTH = 1440;
  const DEFAULT_HEIGHT = 2000;

  const text = resolution == null ? '' : String(resolution);
  const parts = text.split(',');

  // Parse one dimension; reject NaN, zero and negatives, which are never valid viewport sizes.
  const toDimension = (piece, fallback) => {
    const value = parseInt(piece === undefined ? '' : piece.trim(), 10);
    return value > 0 ? value : fallback;
  };

  return {
    width: toDimension(parts[0], DEFAULT_WIDTH),
    height: toDimension(parts[1], DEFAULT_HEIGHT),
  };
}
// Ask the OS for a currently unused TCP port.
// Binds a throwaway server to port 0, reads the port it was assigned,
// closes the server, and resolves with that port number.
// Rejects if the server emits an 'error' event.
function findFreePort() {
  return new Promise((resolve, reject) => {
    const probe = require('net').createServer();
    // unref() so this temporary server never keeps the process alive
    probe.unref();
    probe.on('error', reject);

    const onListening = () => {
      const { port } = probe.address();
      probe.close(() => resolve(port));
    };
    probe.listen(0, onListening);
  });
}
// Wait for Chrome's DevTools port to be ready.
//
// Polls http://127.0.0.1:<port>/json/version until it returns parseable JSON.
// Resolves with the parsed JSON object; rejects with an Error once `timeout`
// milliseconds have elapsed (checked at the start of each attempt).
//
// port    - DevTools debugging port to poll
// timeout - overall deadline in milliseconds (default 30000)
function waitForDebugPort(port, timeout = 30000) {
  const RETRY_DELAY_MS = 100;
  const REQUEST_TIMEOUT_MS = 1000;
  const startTime = Date.now();

  return new Promise((resolve, reject) => {
    // Once the promise has resolved/rejected no further polling may be scheduled.
    let settled = false;

    const tryConnect = () => {
      if (settled) return;

      if (Date.now() - startTime > timeout) {
        settled = true;
        reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
        return;
      }

      // A single attempt can fail through several events at once: destroying
      // the request from the timeout handler also emits 'error'. Guard so each
      // attempt schedules at most one retry, otherwise the polling chains
      // double on every timed-out attempt.
      let retryScheduled = false;
      const retry = () => {
        if (retryScheduled || settled) return;
        retryScheduled = true;
        setTimeout(tryConnect, RETRY_DELAY_MS);
      };

      const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
        let data = '';
        res.on('data', (chunk) => {
          data += chunk;
        });
        res.on('end', () => {
          if (retryScheduled || settled) return;
          let info;
          try {
            info = JSON.parse(data);
          } catch (e) {
            // Body was not valid JSON yet; poll again.
            retry();
            return;
          }
          settled = true;
          resolve(info);
        });
      });

      req.on('error', retry);

      req.setTimeout(REQUEST_TIMEOUT_MS, () => {
        req.destroy();
        retry();
      });
    };

    tryConnect();
  });
}
// Try to find the crawl's Chrome session
function findCrawlChromeSession(crawlId) {
if (!crawlId) return null;

View File

@@ -95,7 +95,7 @@ def find_chromium_binary():
@pytest.fixture(scope="session", autouse=True)
def ensure_chromium_and_puppeteer_installed():
"""Ensure Chromium and puppeteer are installed before running tests."""
from abx_pkg import Binary, NpmProvider
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
# Rebuild pydantic models
NpmProvider.model_rebuild()

View File

@@ -18,17 +18,18 @@
* DOM_ENABLED: Enable DOM extraction (default: true)
*/
// Read an environment variable as a trimmed string.
// Falls back to defaultValue when the variable is unset or empty.
function getEnv(name, defaultValue = '') {
  const rawValue = process.env[name];
  const chosen = rawValue ? rawValue : defaultValue;
  return chosen.trim();
}
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
// Interpret an environment variable as a boolean flag (case-insensitive).
// Recognised truthy/falsy spellings map to true/false; anything else,
// including an unset variable, yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
const {
findChromium,
getEnv,
getEnvBool,
getEnvInt,
parseResolution,
} = require('../chrome/chrome_utils.js');
// Check if DOM is enabled BEFORE requiring puppeteer
if (!getEnvBool('DOM_ENABLED', true)) {
@@ -38,13 +39,7 @@ if (!getEnvBool('DOM_ENABLED', true)) {
}
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const { findChromium } = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'dom';
@@ -64,11 +59,6 @@ function parseArgs() {
return args;
}
// Read an environment variable via getEnv() and parse it as a base-10 integer.
// Returns defaultValue when the text does not parse to a number.
function getEnvInt(name, defaultValue = 0) {
  const rawText = getEnv(name, String(defaultValue));
  const parsed = Number.parseInt(rawText, 10);
  if (Number.isNaN(parsed)) {
    return defaultValue;
  }
  return parsed;
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
@@ -100,12 +90,6 @@ function getCdpUrl() {
return null;
}
// Parse a "width,height" resolution string into { width, height }.
// Each dimension falls back to its default (1440 x 2000) when it is missing,
// not a number, or not a positive value. Null/undefined/non-string input is
// tolerated and treated as an empty string instead of throwing.
function parseResolution(resolution) {
  const DEFAULT_WIDTH = 1440;
  const DEFAULT_HEIGHT = 2000;

  const text = resolution == null ? '' : String(resolution);
  const parts = text.split(',');

  // Parse one dimension; reject NaN, zero and negatives, which are never valid viewport sizes.
  const toDimension = (piece, fallback) => {
    const value = parseInt(piece === undefined ? '' : piece.trim(), 10);
    return value > 0 ? value : fallback;
  };

  return {
    width: toDimension(parts[0], DEFAULT_WIDTH),
    height: toDimension(parts[1], DEFAULT_HEIGHT),
  };
}
async function dumpDom(url) {
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');

View File

@@ -21,21 +21,16 @@
* INFINISCROLL_EXPAND_DETAILS: Expand <details> and comments (default: true)
*/
// Read an environment variable as a trimmed string.
// Falls back to defaultValue when the variable is unset or empty.
function getEnv(name, defaultValue = '') {
  const rawValue = process.env[name];
  const chosen = rawValue ? rawValue : defaultValue;
  return chosen.trim();
}
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
// Interpret an environment variable as a boolean flag (case-insensitive).
// Recognised truthy/falsy spellings map to true/false; anything else,
// including an unset variable, yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Read an environment variable via getEnv() and parse it as a base-10 integer.
// Returns defaultValue when the text does not parse to a number.
function getEnvInt(name, defaultValue = 0) {
  const rawText = getEnv(name, String(defaultValue));
  const parsed = Number.parseInt(rawText, 10);
  if (Number.isNaN(parsed)) {
    return defaultValue;
  }
  return parsed;
}
const {
getEnv,
getEnvBool,
getEnvInt,
} = require('../chrome/chrome_utils.js');
// Check if infiniscroll is enabled BEFORE requiring puppeteer
if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
@@ -43,10 +38,6 @@ if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
process.exit(0);
}
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'infiniscroll';

View File

@@ -18,17 +18,18 @@
* PDF_ENABLED: Enable PDF generation (default: true)
*/
// Read an environment variable as a trimmed string.
// Falls back to defaultValue when the variable is unset or empty.
function getEnv(name, defaultValue = '') {
  const rawValue = process.env[name];
  const chosen = rawValue ? rawValue : defaultValue;
  return chosen.trim();
}
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
// Interpret an environment variable as a boolean flag (case-insensitive).
// Recognised truthy/falsy spellings map to true/false; anything else,
// including an unset variable, yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
const {
findChromium,
getEnv,
getEnvBool,
getEnvInt,
parseResolution,
} = require('../chrome/chrome_utils.js');
// Check if PDF is enabled BEFORE requiring puppeteer
if (!getEnvBool('PDF_ENABLED', true)) {
@@ -38,12 +39,7 @@ if (!getEnvBool('PDF_ENABLED', true)) {
}
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const { findChromium } = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'pdf';
@@ -63,11 +59,6 @@ function parseArgs() {
return args;
}
// Read an environment variable via getEnv() and parse it as a base-10 integer.
// Returns defaultValue when the text does not parse to a number.
function getEnvInt(name, defaultValue = 0) {
  const rawText = getEnv(name, String(defaultValue));
  const parsed = Number.parseInt(rawText, 10);
  if (Number.isNaN(parsed)) {
    return defaultValue;
  }
  return parsed;
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
@@ -99,12 +90,6 @@ function getCdpUrl() {
return null;
}
// Parse a "width,height" resolution string into { width, height }.
// Each dimension falls back to its default (1440 x 2000) when it is missing,
// not a number, or not a positive value. Null/undefined/non-string input is
// tolerated and treated as an empty string instead of throwing.
function parseResolution(resolution) {
  const DEFAULT_WIDTH = 1440;
  const DEFAULT_HEIGHT = 2000;

  const text = resolution == null ? '' : String(resolution);
  const parts = text.split(',');

  // Parse one dimension; reject NaN, zero and negatives, which are never valid viewport sizes.
  const toDimension = (piece, fallback) => {
    const value = parseInt(piece === undefined ? '' : piece.trim(), 10);
    return value > 0 ? value : fallback;
  };

  return {
    width: toDimension(parts[0], DEFAULT_WIDTH),
    height: toDimension(parts[1], DEFAULT_HEIGHT),
  };
}
async function printToPdf(url) {
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');

View File

@@ -18,17 +18,18 @@
* SCREENSHOT_ENABLED: Enable screenshot capture (default: true)
*/
// Read an environment variable as a trimmed string.
// Falls back to defaultValue when the variable is unset or empty.
function getEnv(name, defaultValue = '') {
  const rawValue = process.env[name];
  const chosen = rawValue ? rawValue : defaultValue;
  return chosen.trim();
}
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
// Interpret an environment variable as a boolean flag (case-insensitive).
// Recognised truthy/falsy spellings map to true/false; anything else,
// including an unset variable, yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
const {
findChromium,
getEnv,
getEnvBool,
getEnvInt,
parseResolution,
} = require('../chrome/chrome_utils.js');
// Check if screenshot is enabled BEFORE requiring puppeteer
if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
@@ -38,12 +39,7 @@ if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
}
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const { findChromium } = require('../chrome/chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'screenshot';
@@ -63,11 +59,6 @@ function parseArgs() {
return args;
}
// Read an environment variable via getEnv() and parse it as a base-10 integer.
// Returns defaultValue when the text does not parse to a number.
function getEnvInt(name, defaultValue = 0) {
  const rawText = getEnv(name, String(defaultValue));
  const parsed = Number.parseInt(rawText, 10);
  if (Number.isNaN(parsed)) {
    return defaultValue;
  }
  return parsed;
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
@@ -99,12 +90,6 @@ function getCdpUrl() {
return null;
}
// Parse a "width,height" resolution string into { width, height }.
// Each dimension falls back to its default (1440 x 2000) when it is missing,
// not a number, or not a positive value. Null/undefined/non-string input is
// tolerated and treated as an empty string instead of throwing.
function parseResolution(resolution) {
  const DEFAULT_WIDTH = 1440;
  const DEFAULT_HEIGHT = 2000;

  const text = resolution == null ? '' : String(resolution);
  const parts = text.split(',');

  // Parse one dimension; reject NaN, zero and negatives, which are never valid viewport sizes.
  const toDimension = (piece, fallback) => {
    const value = parseInt(piece === undefined ? '' : piece.trim(), 10);
    return value > 0 ? value : fallback;
  };

  return {
    width: toDimension(parts[0], DEFAULT_WIDTH),
    height: toDimension(parts[1], DEFAULT_HEIGHT),
  };
}
async function takeScreenshot(url) {
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');

View File

@@ -158,49 +158,19 @@ def test_large_extension_size():
def setup_test_lib_dirs(tmpdir: Path) -> dict:
"""Create isolated lib directories for tests and return env dict.
"""Get lib directories for tests, using project's existing node_modules.
Sets up:
LIB_DIR: tmpdir/lib/<arch>
NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin
Uses the project's node_modules to avoid slow npm install during tests.
"""
import platform
arch = platform.machine()
system = platform.system().lower()
arch_dir = f"{arch}-{system}"
# Use project's existing node_modules (puppeteer-core already installed)
project_root = Path(__file__).parent.parent.parent.parent.parent
node_modules_dir = project_root / 'node_modules'
lib_dir = tmpdir / 'lib' / arch_dir
npm_dir = lib_dir / 'npm'
node_modules_dir = npm_dir / 'node_modules'
npm_bin_dir = npm_dir / 'bin'
pip_venv_dir = lib_dir / 'pip' / 'venv'
pip_bin_dir = pip_venv_dir / 'bin'
# Create directories
node_modules_dir.mkdir(parents=True, exist_ok=True)
npm_bin_dir.mkdir(parents=True, exist_ok=True)
pip_bin_dir.mkdir(parents=True, exist_ok=True)
# Install puppeteer-core to the test node_modules if not present
if not (node_modules_dir / 'puppeteer-core').exists():
result = subprocess.run(
['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
capture_output=True,
text=True,
timeout=120
)
if result.returncode != 0:
pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")
pytest.skip("puppeteer-core not installed in project node_modules")
return {
'LIB_DIR': str(lib_dir),
'NODE_MODULES_DIR': str(node_modules_dir),
'NPM_BIN_DIR': str(npm_bin_dir),
'PIP_VENV_DIR': str(pip_venv_dir),
'PIP_BIN_DIR': str(pip_bin_dir),
}
@@ -268,11 +238,10 @@ def test_extension_loads_in_chromium():
# Step 1: Install the uBlock extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=120
timeout=15
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
@@ -298,7 +267,7 @@ def test_extension_loads_in_chromium():
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
for i in range(10):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
@@ -306,7 +275,7 @@ def test_extension_loads_in_chromium():
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
time.sleep(0.5)
assert cdp_url, "Chromium CDP URL not found after 20s"
print(f"Chromium launched with CDP URL: {cdp_url}")
@@ -409,7 +378,7 @@ const puppeteer = require('puppeteer-core');
capture_output=True,
text=True,
env=env,
timeout=90
timeout=10
)
print(f"stderr: {result.stderr}")
@@ -473,11 +442,10 @@ def test_blocks_ads_on_test_page():
# Step 1: Install the uBlock extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=120
timeout=15
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
@@ -582,7 +550,7 @@ const puppeteer = require('puppeteer-core');
capture_output=True,
text=True,
env=env,
timeout=90
timeout=10
)
print(f"stderr: {result.stderr}")