mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-05 18:35:50 +10:00
centralize chrome pid and zombie logic in chrome_utils
This commit is contained in:
@@ -45,6 +45,17 @@ function getEnvBool(name, defaultValue = false) {
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
/**
 * Read an environment variable and interpret it as a base-10 integer.
 *
 * The lookup is delegated to getEnv(), with the default passed through as a
 * string so that an unset variable parses back to the default itself.
 *
 * @param {string} name - Environment variable name
 * @param {number} [defaultValue=0] - Value returned when the variable is unset or not numeric
 * @returns {number} - Parsed integer, or defaultValue when parsing yields NaN
 */
function getEnvInt(name, defaultValue = 0) {
    const raw = getEnv(name, String(defaultValue));
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
        return defaultValue;
    }
    return parsed;
}
|
||||
|
||||
/**
|
||||
* Parse resolution string into width/height.
|
||||
* @param {string} resolution - Resolution string like "1440,2000"
|
||||
@@ -1004,6 +1015,7 @@ module.exports = {
|
||||
// Environment helpers
|
||||
getEnv,
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
parseResolution,
|
||||
// PID file management
|
||||
writePidWithMtime,
|
||||
|
||||
@@ -25,12 +25,18 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { spawn } = require('child_process');
|
||||
const http = require('http');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const { findChromium } = require('./chrome_utils.js');
|
||||
const {
|
||||
findChromium,
|
||||
getEnv,
|
||||
getEnvBool,
|
||||
parseResolution,
|
||||
findFreePort,
|
||||
waitForDebugPort,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'chrome_tab';
|
||||
@@ -50,18 +56,6 @@ function parseArgs() {
|
||||
return args;
|
||||
}
|
||||
|
||||
/**
 * Get a trimmed environment variable, falling back to a default.
 *
 * The variable counts as "not set" when it is missing, empty, or contains
 * only whitespace. The default is coerced to a string before trimming so
 * that numeric or null defaults do not throw.
 *
 * @param {string} name - Environment variable name
 * @param {*} [defaultValue=''] - Fallback used when the variable is not set
 * @returns {string} - Trimmed value, or the trimmed stringified default
 */
function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const value = typeof raw === 'string' ? raw.trim() : '';
    if (value !== '') return value;
    // null/undefined defaults become '' rather than the strings "null"/"undefined".
    return (defaultValue == null ? '' : String(defaultValue)).trim();
}
|
||||
|
||||
/**
 * Read an environment variable as a boolean flag.
 *
 * The value obtained from getEnv() is lower-cased and matched against the
 * accepted spellings; anything else (including unset) yields the default.
 *
 * @param {string} name - Environment variable name
 * @param {boolean} [defaultValue=false] - Returned when the value is not a recognised flag
 * @returns {boolean} - true for true/1/yes/on, false for false/0/no/off, otherwise defaultValue
 */
function getEnvBool(name, defaultValue = false) {
    const flag = getEnv(name, '').toLowerCase();
    switch (flag) {
        case 'true':
        case '1':
        case 'yes':
        case 'on':
            return true;
        case 'false':
        case '0':
        case 'no':
        case 'off':
            return false;
        default:
            return defaultValue;
    }
}
|
||||
|
||||
// Cleanup handler for SIGTERM - close this snapshot's tab
|
||||
async function cleanup() {
|
||||
try {
|
||||
@@ -91,63 +85,6 @@ async function cleanup() {
|
||||
process.on('SIGTERM', cleanup);
|
||||
process.on('SIGINT', cleanup);
|
||||
|
||||
/**
 * Parse a "width,height" resolution string into numeric dimensions.
 *
 * Each component falls back to its default (1440 wide, 2000 high) when it is
 * missing, not a number, or not strictly positive. Non-string input
 * (undefined, null, numbers) is coerced instead of throwing.
 *
 * @param {string} resolution - Resolution string like "1440,2000"
 * @returns {{width: number, height: number}} - Parsed dimensions with defaults applied
 */
function parseResolution(resolution) {
    const DEFAULT_WIDTH = 1440;
    const DEFAULT_HEIGHT = 2000;
    // Coerce first so a missing value yields the defaults rather than a TypeError on .split().
    const text = resolution == null ? '' : String(resolution);
    const [width, height] = text.split(',').map(part => parseInt(part.trim(), 10));
    // `> 0` is false for undefined, NaN, zero and negatives, so all of those use the default.
    return {
        width: width > 0 ? width : DEFAULT_WIDTH,
        height: height > 0 ? height : DEFAULT_HEIGHT,
    };
}
|
||||
|
||||
/**
 * Ask the OS for a currently unused TCP port.
 *
 * A throwaway server listens on port 0, the assigned port is read back, and
 * the server is closed before the promise resolves with that port number.
 *
 * @returns {Promise<number>} - Resolves with the port; rejects on a server 'error' event
 */
function findFreePort() {
    return new Promise((resolve, reject) => {
        const probe = require('net').createServer();
        // unref() so this temporary server never keeps the process alive.
        probe.unref();
        probe.on('error', reject);
        probe.listen(0, function onListening() {
            const { port } = probe.address();
            probe.close(function onClosed() {
                resolve(port);
            });
        });
    });
}
|
||||
|
||||
/**
 * Poll Chrome's DevTools HTTP endpoint until it answers with valid JSON.
 *
 * Requests http://127.0.0.1:<port>/json/version repeatedly, waiting 100ms
 * between attempts, until the body parses as JSON or the overall timeout
 * elapses.
 *
 * @param {number} port - DevTools debugging port to probe
 * @param {number} [timeout=30000] - Overall deadline in milliseconds
 * @returns {Promise<Object>} - Resolves with the parsed /json/version payload
 * @throws {Error} Rejects with "Timeout waiting for Chrome debug port <port>" once the deadline passes
 */
function waitForDebugPort(port, timeout = 30000) {
    const startTime = Date.now();
    const RETRY_DELAY_MS = 100;
    const REQUEST_TIMEOUT_MS = 1000;

    return new Promise((resolve, reject) => {
        const tryConnect = () => {
            if (Date.now() - startTime > timeout) {
                reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
                return;
            }

            // Each attempt may finish exactly once. Without this guard a request
            // timeout schedules a retry and the 'error' raised by req.destroy()
            // schedules a second one, doubling the polling chains every time.
            let attemptDone = false;
            const scheduleRetry = () => {
                if (attemptDone) return;
                attemptDone = true;
                setTimeout(tryConnect, RETRY_DELAY_MS);
            };

            const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
                let data = '';
                res.on('data', chunk => data += chunk);
                res.on('error', scheduleRetry);
                res.on('end', () => {
                    if (attemptDone) return;
                    let info;
                    try {
                        info = JSON.parse(data);
                    } catch (e) {
                        // Body was not JSON yet (e.g. Chrome still starting) - try again.
                        scheduleRetry();
                        return;
                    }
                    // Mark done so a late timeout/error cannot restart polling after success.
                    attemptDone = true;
                    resolve(info);
                });
            });

            req.on('error', scheduleRetry);

            req.setTimeout(REQUEST_TIMEOUT_MS, () => {
                req.destroy();
                scheduleRetry();
            });
        };

        tryConnect();
    });
}
|
||||
|
||||
// Try to find the crawl's Chrome session
|
||||
function findCrawlChromeSession(crawlId) {
|
||||
if (!crawlId) return null;
|
||||
|
||||
@@ -95,7 +95,7 @@ def find_chromium_binary():
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def ensure_chromium_and_puppeteer_installed():
|
||||
"""Ensure Chromium and puppeteer are installed before running tests."""
|
||||
from abx_pkg import Binary, NpmProvider
|
||||
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
|
||||
|
||||
# Rebuild pydantic models
|
||||
NpmProvider.model_rebuild()
|
||||
|
||||
@@ -18,17 +18,18 @@
|
||||
* DOM_ENABLED: Enable DOM extraction (default: true)
|
||||
*/
|
||||
|
||||
/**
 * Get a trimmed environment variable, falling back to a default.
 *
 * The variable counts as "not set" when it is missing, empty, or contains
 * only whitespace. The default is coerced to a string before trimming so
 * that numeric or null defaults do not throw.
 *
 * @param {string} name - Environment variable name
 * @param {*} [defaultValue=''] - Fallback used when the variable is not set
 * @returns {string} - Trimmed value, or the trimmed stringified default
 */
function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const value = typeof raw === 'string' ? raw.trim() : '';
    if (value !== '') return value;
    // null/undefined defaults become '' rather than the strings "null"/"undefined".
    return (defaultValue == null ? '' : String(defaultValue)).trim();
}
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
/**
 * Read an environment variable as a boolean flag.
 *
 * The value obtained from getEnv() is lower-cased and matched against the
 * accepted spellings; anything else (including unset) yields the default.
 *
 * @param {string} name - Environment variable name
 * @param {boolean} [defaultValue=false] - Returned when the value is not a recognised flag
 * @returns {boolean} - true for true/1/yes/on, false for false/0/no/off, otherwise defaultValue
 */
function getEnvBool(name, defaultValue = false) {
    const flag = getEnv(name, '').toLowerCase();
    switch (flag) {
        case 'true':
        case '1':
        case 'yes':
        case 'on':
            return true;
        case 'false':
        case '0':
        case 'no':
        case 'off':
            return false;
        default:
            return defaultValue;
    }
}
|
||||
const {
|
||||
findChromium,
|
||||
getEnv,
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
parseResolution,
|
||||
} = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Check if DOM is enabled BEFORE requiring puppeteer
|
||||
if (!getEnvBool('DOM_ENABLED', true)) {
|
||||
@@ -38,13 +39,7 @@ if (!getEnvBool('DOM_ENABLED', true)) {
|
||||
}
|
||||
|
||||
// Now safe to require puppeteer
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const { findChromium } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'dom';
|
||||
@@ -64,11 +59,6 @@ function parseArgs() {
|
||||
return args;
|
||||
}
|
||||
|
||||
/**
 * Read an environment variable and interpret it as a base-10 integer.
 *
 * The lookup is delegated to getEnv(), with the default passed through as a
 * string so that an unset variable parses back to the default itself.
 *
 * @param {string} name - Environment variable name
 * @param {number} [defaultValue=0] - Value returned when the variable is unset or not numeric
 * @returns {number} - Parsed integer, or defaultValue when parsing yields NaN
 */
function getEnvInt(name, defaultValue = 0) {
    const raw = getEnv(name, String(defaultValue));
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
        return defaultValue;
    }
    return parsed;
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
@@ -100,12 +90,6 @@ function getCdpUrl() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Parse a "width,height" resolution string into numeric dimensions.
 *
 * Each component falls back to its default (1440 wide, 2000 high) when it is
 * missing, not a number, or not strictly positive. Non-string input
 * (undefined, null, numbers) is coerced instead of throwing.
 *
 * @param {string} resolution - Resolution string like "1440,2000"
 * @returns {{width: number, height: number}} - Parsed dimensions with defaults applied
 */
function parseResolution(resolution) {
    const DEFAULT_WIDTH = 1440;
    const DEFAULT_HEIGHT = 2000;
    // Coerce first so a missing value yields the defaults rather than a TypeError on .split().
    const text = resolution == null ? '' : String(resolution);
    const [width, height] = text.split(',').map(part => parseInt(part.trim(), 10));
    // `> 0` is false for undefined, NaN, zero and negatives, so all of those use the default.
    return {
        width: width > 0 ? width : DEFAULT_WIDTH,
        height: height > 0 ? height : DEFAULT_HEIGHT,
    };
}
|
||||
|
||||
async function dumpDom(url) {
|
||||
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
|
||||
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
|
||||
|
||||
@@ -21,21 +21,16 @@
|
||||
* INFINISCROLL_EXPAND_DETAILS: Expand <details> and comments (default: true)
|
||||
*/
|
||||
|
||||
/**
 * Get a trimmed environment variable, falling back to a default.
 *
 * The variable counts as "not set" when it is missing, empty, or contains
 * only whitespace. The default is coerced to a string before trimming so
 * that numeric or null defaults do not throw.
 *
 * @param {string} name - Environment variable name
 * @param {*} [defaultValue=''] - Fallback used when the variable is not set
 * @returns {string} - Trimmed value, or the trimmed stringified default
 */
function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const value = typeof raw === 'string' ? raw.trim() : '';
    if (value !== '') return value;
    // null/undefined defaults become '' rather than the strings "null"/"undefined".
    return (defaultValue == null ? '' : String(defaultValue)).trim();
}
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
/**
 * Read an environment variable as a boolean flag.
 *
 * The value obtained from getEnv() is lower-cased and matched against the
 * accepted spellings; anything else (including unset) yields the default.
 *
 * @param {string} name - Environment variable name
 * @param {boolean} [defaultValue=false] - Returned when the value is not a recognised flag
 * @returns {boolean} - true for true/1/yes/on, false for false/0/no/off, otherwise defaultValue
 */
function getEnvBool(name, defaultValue = false) {
    const flag = getEnv(name, '').toLowerCase();
    switch (flag) {
        case 'true':
        case '1':
        case 'yes':
        case 'on':
            return true;
        case 'false':
        case '0':
        case 'no':
        case 'off':
            return false;
        default:
            return defaultValue;
    }
}
|
||||
|
||||
/**
 * Read an environment variable and interpret it as a base-10 integer.
 *
 * The lookup is delegated to getEnv(), with the default passed through as a
 * string so that an unset variable parses back to the default itself.
 *
 * @param {string} name - Environment variable name
 * @param {number} [defaultValue=0] - Value returned when the variable is unset or not numeric
 * @returns {number} - Parsed integer, or defaultValue when parsing yields NaN
 */
function getEnvInt(name, defaultValue = 0) {
    const raw = getEnv(name, String(defaultValue));
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
        return defaultValue;
    }
    return parsed;
}
|
||||
const {
|
||||
getEnv,
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
} = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Check if infiniscroll is enabled BEFORE requiring puppeteer
|
||||
if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
|
||||
@@ -43,10 +38,6 @@ if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'infiniscroll';
|
||||
|
||||
@@ -18,17 +18,18 @@
|
||||
* PDF_ENABLED: Enable PDF generation (default: true)
|
||||
*/
|
||||
|
||||
/**
 * Get a trimmed environment variable, falling back to a default.
 *
 * The variable counts as "not set" when it is missing, empty, or contains
 * only whitespace. The default is coerced to a string before trimming so
 * that numeric or null defaults do not throw.
 *
 * @param {string} name - Environment variable name
 * @param {*} [defaultValue=''] - Fallback used when the variable is not set
 * @returns {string} - Trimmed value, or the trimmed stringified default
 */
function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const value = typeof raw === 'string' ? raw.trim() : '';
    if (value !== '') return value;
    // null/undefined defaults become '' rather than the strings "null"/"undefined".
    return (defaultValue == null ? '' : String(defaultValue)).trim();
}
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
/**
 * Read an environment variable as a boolean flag.
 *
 * The value obtained from getEnv() is lower-cased and matched against the
 * accepted spellings; anything else (including unset) yields the default.
 *
 * @param {string} name - Environment variable name
 * @param {boolean} [defaultValue=false] - Returned when the value is not a recognised flag
 * @returns {boolean} - true for true/1/yes/on, false for false/0/no/off, otherwise defaultValue
 */
function getEnvBool(name, defaultValue = false) {
    const flag = getEnv(name, '').toLowerCase();
    switch (flag) {
        case 'true':
        case '1':
        case 'yes':
        case 'on':
            return true;
        case 'false':
        case '0':
        case 'no':
        case 'off':
            return false;
        default:
            return defaultValue;
    }
}
|
||||
const {
|
||||
findChromium,
|
||||
getEnv,
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
parseResolution,
|
||||
} = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Check if PDF is enabled BEFORE requiring puppeteer
|
||||
if (!getEnvBool('PDF_ENABLED', true)) {
|
||||
@@ -38,12 +39,7 @@ if (!getEnvBool('PDF_ENABLED', true)) {
|
||||
}
|
||||
|
||||
// Now safe to require puppeteer
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const { findChromium } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'pdf';
|
||||
@@ -63,11 +59,6 @@ function parseArgs() {
|
||||
return args;
|
||||
}
|
||||
|
||||
/**
 * Read an environment variable and interpret it as a base-10 integer.
 *
 * The lookup is delegated to getEnv(), with the default passed through as a
 * string so that an unset variable parses back to the default itself.
 *
 * @param {string} name - Environment variable name
 * @param {number} [defaultValue=0] - Value returned when the variable is unset or not numeric
 * @returns {number} - Parsed integer, or defaultValue when parsing yields NaN
 */
function getEnvInt(name, defaultValue = 0) {
    const raw = getEnv(name, String(defaultValue));
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
        return defaultValue;
    }
    return parsed;
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
@@ -99,12 +90,6 @@ function getCdpUrl() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Parse a "width,height" resolution string into numeric dimensions.
 *
 * Each component falls back to its default (1440 wide, 2000 high) when it is
 * missing, not a number, or not strictly positive. Non-string input
 * (undefined, null, numbers) is coerced instead of throwing.
 *
 * @param {string} resolution - Resolution string like "1440,2000"
 * @returns {{width: number, height: number}} - Parsed dimensions with defaults applied
 */
function parseResolution(resolution) {
    const DEFAULT_WIDTH = 1440;
    const DEFAULT_HEIGHT = 2000;
    // Coerce first so a missing value yields the defaults rather than a TypeError on .split().
    const text = resolution == null ? '' : String(resolution);
    const [width, height] = text.split(',').map(part => parseInt(part.trim(), 10));
    // `> 0` is false for undefined, NaN, zero and negatives, so all of those use the default.
    return {
        width: width > 0 ? width : DEFAULT_WIDTH,
        height: height > 0 ? height : DEFAULT_HEIGHT,
    };
}
|
||||
|
||||
async function printToPdf(url) {
|
||||
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
|
||||
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
|
||||
|
||||
@@ -18,17 +18,18 @@
|
||||
* SCREENSHOT_ENABLED: Enable screenshot capture (default: true)
|
||||
*/
|
||||
|
||||
/**
 * Get a trimmed environment variable, falling back to a default.
 *
 * The variable counts as "not set" when it is missing, empty, or contains
 * only whitespace. The default is coerced to a string before trimming so
 * that numeric or null defaults do not throw.
 *
 * @param {string} name - Environment variable name
 * @param {*} [defaultValue=''] - Fallback used when the variable is not set
 * @returns {string} - Trimmed value, or the trimmed stringified default
 */
function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const value = typeof raw === 'string' ? raw.trim() : '';
    if (value !== '') return value;
    // null/undefined defaults become '' rather than the strings "null"/"undefined".
    return (defaultValue == null ? '' : String(defaultValue)).trim();
}
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
/**
 * Read an environment variable as a boolean flag.
 *
 * The value obtained from getEnv() is lower-cased and matched against the
 * accepted spellings; anything else (including unset) yields the default.
 *
 * @param {string} name - Environment variable name
 * @param {boolean} [defaultValue=false] - Returned when the value is not a recognised flag
 * @returns {boolean} - true for true/1/yes/on, false for false/0/no/off, otherwise defaultValue
 */
function getEnvBool(name, defaultValue = false) {
    const flag = getEnv(name, '').toLowerCase();
    switch (flag) {
        case 'true':
        case '1':
        case 'yes':
        case 'on':
            return true;
        case 'false':
        case '0':
        case 'no':
        case 'off':
            return false;
        default:
            return defaultValue;
    }
}
|
||||
const {
|
||||
findChromium,
|
||||
getEnv,
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
parseResolution,
|
||||
} = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Check if screenshot is enabled BEFORE requiring puppeteer
|
||||
if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
|
||||
@@ -38,12 +39,7 @@ if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
|
||||
}
|
||||
|
||||
// Now safe to require puppeteer
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const { findChromium } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'screenshot';
|
||||
@@ -63,11 +59,6 @@ function parseArgs() {
|
||||
return args;
|
||||
}
|
||||
|
||||
/**
 * Read an environment variable and interpret it as a base-10 integer.
 *
 * The lookup is delegated to getEnv(), with the default passed through as a
 * string so that an unset variable parses back to the default itself.
 *
 * @param {string} name - Environment variable name
 * @param {number} [defaultValue=0] - Value returned when the variable is unset or not numeric
 * @returns {number} - Parsed integer, or defaultValue when parsing yields NaN
 */
function getEnvInt(name, defaultValue = 0) {
    const raw = getEnv(name, String(defaultValue));
    const parsed = Number.parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
        return defaultValue;
    }
    return parsed;
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
@@ -99,12 +90,6 @@ function getCdpUrl() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Parse a "width,height" resolution string into numeric dimensions.
 *
 * Each component falls back to its default (1440 wide, 2000 high) when it is
 * missing, not a number, or not strictly positive. Non-string input
 * (undefined, null, numbers) is coerced instead of throwing.
 *
 * @param {string} resolution - Resolution string like "1440,2000"
 * @returns {{width: number, height: number}} - Parsed dimensions with defaults applied
 */
function parseResolution(resolution) {
    const DEFAULT_WIDTH = 1440;
    const DEFAULT_HEIGHT = 2000;
    // Coerce first so a missing value yields the defaults rather than a TypeError on .split().
    const text = resolution == null ? '' : String(resolution);
    const [width, height] = text.split(',').map(part => parseInt(part.trim(), 10));
    // `> 0` is false for undefined, NaN, zero and negatives, so all of those use the default.
    return {
        width: width > 0 ? width : DEFAULT_WIDTH,
        height: height > 0 ? height : DEFAULT_HEIGHT,
    };
}
|
||||
|
||||
async function takeScreenshot(url) {
|
||||
const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
|
||||
const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
|
||||
|
||||
@@ -158,49 +158,19 @@ def test_large_extension_size():
|
||||
|
||||
|
||||
def setup_test_lib_dirs(tmpdir: Path) -> dict:
|
||||
"""Create isolated lib directories for tests and return env dict.
|
||||
"""Get lib directories for tests, using project's existing node_modules.
|
||||
|
||||
Sets up:
|
||||
LIB_DIR: tmpdir/lib/<arch>
|
||||
NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
|
||||
NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
|
||||
PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
|
||||
PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin
|
||||
Uses the project's node_modules to avoid slow npm install during tests.
|
||||
"""
|
||||
import platform
|
||||
arch = platform.machine()
|
||||
system = platform.system().lower()
|
||||
arch_dir = f"{arch}-{system}"
|
||||
# Use project's existing node_modules (puppeteer-core already installed)
|
||||
project_root = Path(__file__).parent.parent.parent.parent.parent
|
||||
node_modules_dir = project_root / 'node_modules'
|
||||
|
||||
lib_dir = tmpdir / 'lib' / arch_dir
|
||||
npm_dir = lib_dir / 'npm'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
npm_bin_dir = npm_dir / 'bin'
|
||||
pip_venv_dir = lib_dir / 'pip' / 'venv'
|
||||
pip_bin_dir = pip_venv_dir / 'bin'
|
||||
|
||||
# Create directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
pip_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Install puppeteer-core to the test node_modules if not present
|
||||
if not (node_modules_dir / 'puppeteer-core').exists():
|
||||
result = subprocess.run(
|
||||
['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")
|
||||
pytest.skip("puppeteer-core not installed in project node_modules")
|
||||
|
||||
return {
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'PIP_VENV_DIR': str(pip_venv_dir),
|
||||
'PIP_BIN_DIR': str(pip_bin_dir),
|
||||
}
|
||||
|
||||
|
||||
@@ -268,11 +238,10 @@ def test_extension_loads_in_chromium():
|
||||
# Step 1: Install the uBlock extension
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=120
|
||||
timeout=15
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
@@ -298,7 +267,7 @@ def test_extension_loads_in_chromium():
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
for i in range(10):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
@@ -306,7 +275,7 @@ def test_extension_loads_in_chromium():
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
time.sleep(0.5)
|
||||
|
||||
assert cdp_url, "Chromium CDP URL not found after 20s"
|
||||
print(f"Chromium launched with CDP URL: {cdp_url}")
|
||||
@@ -409,7 +378,7 @@ const puppeteer = require('puppeteer-core');
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
timeout=10
|
||||
)
|
||||
|
||||
print(f"stderr: {result.stderr}")
|
||||
@@ -473,11 +442,10 @@ def test_blocks_ads_on_test_page():
|
||||
# Step 1: Install the uBlock extension
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=120
|
||||
timeout=15
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
@@ -582,7 +550,7 @@ const puppeteer = require('puppeteer-core');
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
timeout=10
|
||||
)
|
||||
|
||||
print(f"stderr: {result.stderr}")
|
||||
|
||||
Reference in New Issue
Block a user