diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index 333cf418..fa331ee5 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -45,6 +45,17 @@ function getEnvBool(name, defaultValue = false) { return defaultValue; } +/** + * Get integer environment variable. + * @param {string} name - Environment variable name + * @param {number} [defaultValue=0] - Default value if not set + * @returns {number} - Integer value + */ +function getEnvInt(name, defaultValue = 0) { + const val = parseInt(getEnv(name, String(defaultValue)), 10); + return isNaN(val) ? defaultValue : val; +} + /** * Parse resolution string into width/height. * @param {string} resolution - Resolution string like "1440,2000" @@ -1004,6 +1015,7 @@ module.exports = { // Environment helpers getEnv, getEnvBool, + getEnvInt, parseResolution, // PID file management writePidWithMtime, diff --git a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js index f8b740f7..537ec5bf 100755 --- a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js +++ b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js @@ -25,12 +25,18 @@ const fs = require('fs'); const path = require('path'); const { spawn } = require('child_process'); -const http = require('http'); // Add NODE_MODULES_DIR to module resolution paths if set if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); -const { findChromium } = require('./chrome_utils.js'); +const { + findChromium, + getEnv, + getEnvBool, + parseResolution, + findFreePort, + waitForDebugPort, +} = require('./chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'chrome_tab'; @@ -50,18 +56,6 @@ function parseArgs() { return args; } -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} - -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} - // Cleanup handler for SIGTERM - close this snapshot's tab async function cleanup() { try { @@ -91,63 +85,6 @@ async function cleanup() { process.on('SIGTERM', cleanup); process.on('SIGINT', cleanup); -// Parse resolution string -function parseResolution(resolution) { - const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); - return { width: width || 1440, height: height || 2000 }; -} - -// Find a free port -function findFreePort() { - return new Promise((resolve, reject) => { - const server = require('net').createServer(); - server.unref(); - server.on('error', reject); - server.listen(0, () => { - const port = server.address().port; - server.close(() => resolve(port)); - }); - }); -} - -// Wait for Chrome's DevTools port to be ready -function waitForDebugPort(port, timeout = 30000) { - const startTime = Date.now(); - - return new Promise((resolve, reject) => { - const tryConnect = () => { - if (Date.now() - startTime > timeout) { - reject(new Error(`Timeout waiting for Chrome debug port ${port}`)); - return; - } - - const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => { - let data = ''; - res.on('data', chunk => data += chunk); - res.on('end', () => { - try { - const info = JSON.parse(data); - resolve(info); - } catch (e) { - setTimeout(tryConnect, 100); - } - }); - }); - - req.on('error', () => { - setTimeout(tryConnect, 100); - }); - - req.setTimeout(1000, () => { - req.destroy(); - setTimeout(tryConnect, 100); - }); - }; - - tryConnect(); - }); -} - // Try to find the crawl's Chrome session function findCrawlChromeSession(crawlId) { if (!crawlId) return null; diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index 380c16ae..0d580244 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -95,7 +95,7 @@ def find_chromium_binary(): @pytest.fixture(scope="session", autouse=True) def ensure_chromium_and_puppeteer_installed(): """Ensure Chromium and puppeteer are installed before running tests.""" - from abx_pkg import Binary, NpmProvider + from abx_pkg import Binary, NpmProvider, BinProviderOverrides # Rebuild pydantic models NpmProvider.model_rebuild() diff --git a/archivebox/plugins/dom/on_Snapshot__53_dom.js b/archivebox/plugins/dom/on_Snapshot__53_dom.js index 20e5fcea..11ed9c18 100644 --- a/archivebox/plugins/dom/on_Snapshot__53_dom.js +++ b/archivebox/plugins/dom/on_Snapshot__53_dom.js @@ -18,17 +18,18 @@ * DOM_ENABLED: Enable DOM extraction (default: true) */ -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} +const fs = require('fs'); +const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} +const { + findChromium, + getEnv, + getEnvBool, + getEnvInt, + parseResolution, +} = require('../chrome/chrome_utils.js'); // Check if DOM is enabled BEFORE requiring puppeteer if (!getEnvBool('DOM_ENABLED', true)) { @@ -38,13 +39,7 @@ if (!getEnvBool('DOM_ENABLED', true)) { } // Now safe to require puppeteer -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - const puppeteer = require('puppeteer-core'); -const { findChromium } = require('../chrome/chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'dom'; @@ -64,11 +59,6 @@ function parseArgs() { return args; } -function getEnvInt(name, defaultValue = 0) { - const val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? defaultValue : val; -} - // Check if staticfile extractor already downloaded this URL const STATICFILE_DIR = '../staticfile'; function hasStaticFileOutput() { @@ -100,12 +90,6 @@ function getCdpUrl() { return null; } -// Parse resolution string -function parseResolution(resolution) { - const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); - return { width: width || 1440, height: height || 2000 }; -} - async function dumpDom(url) { const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000; const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); diff --git a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js index 506e8371..3003d370 100755 --- a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js +++ b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js @@ -21,21 +21,16 @@ * INFINISCROLL_EXPAND_DETAILS: Expand
and comments (default: true) */ -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} +const fs = require('fs'); +const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} - -function getEnvInt(name, defaultValue = 0) { - const val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? defaultValue : val; -} +const { + getEnv, + getEnvBool, + getEnvInt, +} = require('../chrome/chrome_utils.js'); // Check if infiniscroll is enabled BEFORE requiring puppeteer if (!getEnvBool('INFINISCROLL_ENABLED', true)) { @@ -43,10 +38,6 @@ if (!getEnvBool('INFINISCROLL_ENABLED', true)) { process.exit(0); } -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'infiniscroll'; diff --git a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js index e42a8a6e..ccb30b01 100644 --- a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js +++ b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js @@ -18,17 +18,18 @@ * PDF_ENABLED: Enable PDF generation (default: true) */ -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} +const fs = require('fs'); +const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} +const { + findChromium, + getEnv, + getEnvBool, + getEnvInt, + parseResolution, +} = require('../chrome/chrome_utils.js'); // Check if PDF is enabled BEFORE requiring puppeteer if (!getEnvBool('PDF_ENABLED', true)) { @@ -38,12 +39,7 @@ if (!getEnvBool('PDF_ENABLED', true)) { } // Now safe to require puppeteer -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); -const { findChromium } = require('../chrome/chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'pdf'; @@ -63,11 +59,6 @@ function parseArgs() { return args; } -function getEnvInt(name, defaultValue = 0) { - const val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? defaultValue : val; -} - // Check if staticfile extractor already downloaded this URL const STATICFILE_DIR = '../staticfile'; function hasStaticFileOutput() { @@ -99,12 +90,6 @@ function getCdpUrl() { return null; } -// Parse resolution string -function parseResolution(resolution) { - const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); - return { width: width || 1440, height: height || 2000 }; -} - async function printToPdf(url) { const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000; const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); diff --git a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js index da25c459..ac968883 100644 --- a/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js +++ b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js @@ -18,17 +18,18 @@ * SCREENSHOT_ENABLED: Enable screenshot capture (default: true) */ -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} +const fs = require('fs'); +const path = require('path'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} +const { + findChromium, + getEnv, + getEnvBool, + getEnvInt, + parseResolution, +} = require('../chrome/chrome_utils.js'); // Check if screenshot is enabled BEFORE requiring puppeteer if (!getEnvBool('SCREENSHOT_ENABLED', true)) { @@ -38,12 +39,7 @@ if (!getEnvBool('SCREENSHOT_ENABLED', true)) { } // Now safe to require puppeteer -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); -const { findChromium } = require('../chrome/chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'screenshot'; @@ -63,11 +59,6 @@ function parseArgs() { return args; } -function getEnvInt(name, defaultValue = 0) { - const val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? defaultValue : val; -} - // Check if staticfile extractor already downloaded this URL const STATICFILE_DIR = '../staticfile'; function hasStaticFileOutput() { @@ -99,12 +90,6 @@ function getCdpUrl() { return null; } -// Parse resolution string -function parseResolution(resolution) { - const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); - return { width: width || 1440, height: height || 2000 }; -} - async function takeScreenshot(url) { const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000; const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'); diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py index 2ba718e0..5780e0b2 100644 --- a/archivebox/plugins/ublock/tests/test_ublock.py +++ b/archivebox/plugins/ublock/tests/test_ublock.py @@ -158,49 +158,19 @@ def test_large_extension_size(): def setup_test_lib_dirs(tmpdir: Path) -> dict: - """Create isolated lib directories for tests and return env dict. + """Get lib directories for tests, using project's existing node_modules. - Sets up: - LIB_DIR: tmpdir/lib/ - NODE_MODULES_DIR: tmpdir/lib//npm/node_modules - NPM_BIN_DIR: tmpdir/lib//npm/bin - PIP_VENV_DIR: tmpdir/lib//pip/venv - PIP_BIN_DIR: tmpdir/lib//pip/venv/bin + Uses the project's node_modules to avoid slow npm install during tests. """ - import platform - arch = platform.machine() - system = platform.system().lower() - arch_dir = f"{arch}-{system}" + # Use project's existing node_modules (puppeteer-core already installed) + project_root = Path(__file__).parent.parent.parent.parent.parent + node_modules_dir = project_root / 'node_modules' - lib_dir = tmpdir / 'lib' / arch_dir - npm_dir = lib_dir / 'npm' - node_modules_dir = npm_dir / 'node_modules' - npm_bin_dir = npm_dir / 'bin' - pip_venv_dir = lib_dir / 'pip' / 'venv' - pip_bin_dir = pip_venv_dir / 'bin' - - # Create directories - node_modules_dir.mkdir(parents=True, exist_ok=True) - npm_bin_dir.mkdir(parents=True, exist_ok=True) - pip_bin_dir.mkdir(parents=True, exist_ok=True) - - # Install puppeteer-core to the test node_modules if not present if not (node_modules_dir / 'puppeteer-core').exists(): - result = subprocess.run( - ['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'], - capture_output=True, - text=True, - timeout=120 - ) - if result.returncode != 0: - pytest.skip(f"Failed to install puppeteer-core: {result.stderr}") + pytest.skip("puppeteer-core not installed in project node_modules") return { - 'LIB_DIR': str(lib_dir), 'NODE_MODULES_DIR': str(node_modules_dir), - 'NPM_BIN_DIR': str(npm_bin_dir), - 'PIP_VENV_DIR': str(pip_venv_dir), - 'PIP_BIN_DIR': str(pip_bin_dir), } @@ -268,11 +238,10 @@ def test_extension_loads_in_chromium(): # Step 1: Install the uBlock extension result = subprocess.run( ['node', str(INSTALL_SCRIPT)], - cwd=str(tmpdir), capture_output=True, text=True, env=env, - timeout=120 + timeout=15 ) assert result.returncode == 0, f"Extension install failed: {result.stderr}" @@ -298,7 +267,7 @@ def test_extension_loads_in_chromium(): # Wait for Chromium to launch and CDP URL to be available cdp_url = None - for i in range(20): + for i in range(10): if chrome_launch_process.poll() is not None: stdout, stderr = chrome_launch_process.communicate() raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") @@ -306,7 +275,7 @@ def test_extension_loads_in_chromium(): if cdp_file.exists(): cdp_url = cdp_file.read_text().strip() break - time.sleep(1) + time.sleep(0.5) assert cdp_url, "Chromium CDP URL not found after 20s" print(f"Chromium launched with CDP URL: {cdp_url}") @@ -409,7 +378,7 @@ const puppeteer = require('puppeteer-core'); capture_output=True, text=True, env=env, - timeout=90 + timeout=10 ) print(f"stderr: {result.stderr}") @@ -473,11 +442,10 @@ def test_blocks_ads_on_test_page(): # Step 1: Install the uBlock extension result = subprocess.run( ['node', str(INSTALL_SCRIPT)], - cwd=str(tmpdir), capture_output=True, text=True, env=env, - timeout=120 + timeout=15 ) assert result.returncode == 0, f"Extension install failed: {result.stderr}" @@ -582,7 +550,7 @@ const puppeteer = require('puppeteer-core'); capture_output=True, text=True, env=env, - timeout=90 + timeout=10 ) print(f"stderr: {result.stderr}")