diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js
index 245e0ba9..b4370fde 100755
--- a/archivebox/plugins/chrome/chrome_utils.js
+++ b/archivebox/plugins/chrome/chrome_utils.js
@@ -1312,6 +1312,99 @@ function findChromium() {
   return null;
 }
 
+// ============================================================================
+// Shared Extension Installer Utilities
+// ============================================================================
+
+/**
+ * Get the extensions directory path.
+ * Centralized path calculation used by extension installers and chrome launch.
+ *
+ * Path is derived from environment variables in this priority:
+ * 1. CHROME_EXTENSIONS_DIR (explicit override)
+ * 2. DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions (default)
+ *
+ * @returns {string} - Absolute path to extensions directory
+ */
+function getExtensionsDir() {
+  const dataDir = getEnv('DATA_DIR', './data');
+  const persona = getEnv('ACTIVE_PERSONA', 'Default');
+  return getEnv('CHROME_EXTENSIONS_DIR') ||
+    path.join(dataDir, 'personas', persona, 'chrome_extensions');
+}
+
+/**
+ * Install a Chrome extension with caching support.
+ *
+ * This is the main entry point for extension installer hooks. It handles:
+ * - Checking for cached extension metadata
+ * - Installing the extension if not cached
+ * - Writing cache file for future runs
+ *
+ * @param {Object} extension - Extension metadata object
+ * @param {string} extension.webstore_id - Chrome Web Store extension ID
+ * @param {string} extension.name - Human-readable extension name (used for cache file)
+ * @param {Object} [options] - Options
+ * @param {string} [options.extensionsDir] - Override extensions directory
+ * @param {boolean} [options.quiet=false] - Suppress info logging
+ * @returns {Promise<Object|null>} - Installed extension metadata or null on failure
+ */
+async function installExtensionWithCache(extension, options = {}) {
+  const {
+    extensionsDir = getExtensionsDir(),
+    quiet = false,
+  } = options;
+
+  const cacheFile = path.join(extensionsDir, `${extension.name}.extension.json`);
+
+  // Check if extension is already cached and valid
+  if (fs.existsSync(cacheFile)) {
+    try {
+      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
+      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
+
+      if (fs.existsSync(manifestPath)) {
+        if (!quiet) {
+          console.log(`[*] ${extension.name} extension already installed (using cache)`);
+        }
+        return cached;
+      }
+    } catch (e) {
+      // Cache file corrupted, re-install
+      console.warn(`[⚠️] Extension cache corrupted for ${extension.name}, re-installing...`);
+    }
+  }
+
+  // Install extension
+  if (!quiet) {
+    console.log(`[*] Installing ${extension.name} extension...`);
+  }
+
+  const installedExt = await loadOrInstallExtension(extension, extensionsDir);
+
+  if (!installedExt) {
+    console.error(`[❌] Failed to install ${extension.name} extension`);
+    return null;
+  }
+
+  // Write cache file
+  try {
+    await fs.promises.mkdir(extensionsDir, { recursive: true });
+    await fs.promises.writeFile(cacheFile, JSON.stringify(installedExt, null, 2));
+    if (!quiet) {
+      console.log(`[+] Extension metadata written to ${cacheFile}`);
+    }
+  } catch (e) {
+    console.warn(`[⚠️] Failed to write cache file: ${e.message}`);
+  }
+
+  if (!quiet) {
+    console.log(`[+] ${extension.name} extension installed`);
+  }
+
+  return installedExt;
+}
+
 // Export all functions
 module.exports = {
   // Environment helpers
@@ -1349,6 +1442,9 @@ module.exports = {
   getExtensionPaths,
   waitForExtensionTarget,
   getExtensionTargets,
+  // Shared extension installer utilities
+  getExtensionsDir,
+  installExtensionWithCache,
   // Deprecated - use enableExtensions option instead
   getExtensionLaunchArgs,
 };
@@ -1371,6 +1467,8 @@ if (require.main === module) {
     console.log(' loadExtensionManifest ');
     console.log(' getExtensionLaunchArgs ');
     console.log(' loadOrInstallExtension [extensions_dir]');
+    console.log(' getExtensionsDir');
+    console.log(' installExtensionWithCache <webstore_id> <name>');
     process.exit(1);
   }
 
@@ -1483,6 +1581,26 @@ if (require.main === module) {
       break;
     }
 
+    case 'getExtensionsDir': {
+      console.log(getExtensionsDir());
+      break;
+    }
+
+    case 'installExtensionWithCache': {
+      const [webstore_id, name] = commandArgs;
+      if (!webstore_id || !name) {
+        console.error('Usage: installExtensionWithCache <webstore_id> <name>');
+        process.exit(1);
+      }
+      const ext = await installExtensionWithCache({ webstore_id, name });
+      if (ext) {
+        console.log(JSON.stringify(ext, null, 2));
+      } else {
+        process.exit(1);
+      }
+      break;
+    }
+
     default:
       console.error(`Unknown command: ${command}`);
       process.exit(1);
diff --git a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js
index 58cafca0..0799f3ad 100644
--- a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js
+++ b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js
@@ -38,6 +38,7 @@ const {
   killChrome,
   getEnv,
   writePidWithMtime,
+  getExtensionsDir,
 } = require('./chrome_utils.js');
 
 // Extractor metadata
@@ -115,8 +116,7 @@ async function main() {
   if (version) console.error(`[*] Version: ${version}`);
 
   // Load installed extensions
-  const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
-    path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
+  const extensionsDir = getExtensionsDir();
 
   const userDataDir = getEnv('CHROME_USER_DATA_DIR');
   if (userDataDir) {
diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py
new file mode 100644
index 00000000..97928323
--- /dev/null
+++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py
@@ -0,0 +1,276 @@
+"""
+Shared Chrome test helpers for plugin integration tests.
+
+This module provides common utilities for Chrome-based plugin tests, reducing
+duplication across test files. It uses the JavaScript utilities from chrome_utils.js
+where appropriate.
+
+Usage:
+    from archivebox.plugins.chrome.tests.chrome_test_helpers import (
+        get_test_env,
+        setup_chrome_session,
+        cleanup_chrome,
+        find_chromium_binary,
+        get_node_modules_dir,
+    )
+"""
+
+import os
+import signal
+import subprocess
+import time
+from pathlib import Path
+from typing import Tuple, Optional
+from contextlib import contextmanager
+
+
+# Plugin directory locations
+CHROME_PLUGIN_DIR = Path(__file__).parent.parent
+PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
+
+# Hook script locations
+CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
+CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
+CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
+CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
+
+
+def get_node_modules_dir() -> Path:
+    """Get NODE_MODULES_DIR for tests, checking env first.
+
+    Returns the path to the node_modules directory, checking:
+    1. NODE_MODULES_DIR environment variable
+    2. Computed from LIB_DIR via ArchiveBox config
+    """
+    if os.environ.get('NODE_MODULES_DIR'):
+        return Path(os.environ['NODE_MODULES_DIR'])
+    # Otherwise compute from LIB_DIR
+    from archivebox.config.common import STORAGE_CONFIG
+    lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
+    return lib_dir / 'npm' / 'node_modules'
+
+
+def get_test_env() -> dict:
+    """Get environment dict with NODE_MODULES_DIR set correctly for tests.
+
+    Returns a copy of os.environ with NODE_MODULES_DIR added/updated.
+    Use this for all subprocess calls in plugin tests.
+    """
+    env = os.environ.copy()
+    env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
+    return env
+
+
+def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]:
+    """Find the Chromium binary using chrome_utils.js findChromium().
+
+    This uses the centralized findChromium() function which checks:
+    - CHROME_BINARY env var
+    - @puppeteer/browsers install locations
+    - System Chromium locations
+    - Falls back to Chrome (with warning)
+
+    Args:
+        data_dir: Directory where chromium was installed (contains chromium/ subdir)
+
+    Returns:
+        Path to Chromium binary or None if not found
+    """
+    search_dir = data_dir or os.environ.get('DATA_DIR', '.')
+    result = subprocess.run(
+        ['node', str(CHROME_UTILS), 'findChromium', str(search_dir)],
+        capture_output=True,
+        text=True,
+        timeout=10
+    )
+    if result.returncode == 0 and result.stdout.strip():
+        return result.stdout.strip()
+    return None
+
+
+def get_extensions_dir() -> str:
+    """Get the Chrome extensions directory using chrome_utils.js getExtensionsDir().
+
+    This uses the centralized path calculation from chrome_utils.js which checks:
+    - CHROME_EXTENSIONS_DIR env var
+    - DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions
+
+    Returns:
+        Path to extensions directory
+    """
+    result = subprocess.run(
+        ['node', str(CHROME_UTILS), 'getExtensionsDir'],
+        capture_output=True,
+        text=True,
+        timeout=10,
+        env=get_test_env()
+    )
+    if result.returncode == 0 and result.stdout.strip():
+        return result.stdout.strip()
+    # Fallback to default computation if JS call fails
+    data_dir = os.environ.get('DATA_DIR', './data')
+    persona = os.environ.get('ACTIVE_PERSONA', 'Default')
+    return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
+
+
+def setup_chrome_session(
+    tmpdir: Path,
+    crawl_id: str = 'test-crawl',
+    snapshot_id: str = 'test-snapshot',
+    test_url: str = 'about:blank',
+    navigate: bool = True,
+    timeout: int = 15,
+) -> Tuple[subprocess.Popen, int, Path]:
+    """Set up a Chrome session with tab and optional navigation.
+
+    Creates the directory structure, launches Chrome, creates a tab,
+    and optionally navigates to the test URL.
+
+    Args:
+        tmpdir: Temporary directory for test files
+        crawl_id: ID to use for the crawl
+        snapshot_id: ID to use for the snapshot
+        test_url: URL to navigate to (if navigate=True)
+        navigate: Whether to navigate to the URL after creating tab
+        timeout: Seconds to wait for Chrome to start
+
+    Returns:
+        Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
+
+    Raises:
+        RuntimeError: If Chrome fails to start or tab creation fails
+    """
+    crawl_dir = Path(tmpdir) / 'crawl'
+    crawl_dir.mkdir(exist_ok=True)
+    chrome_dir = crawl_dir / 'chrome'
+    chrome_dir.mkdir(exist_ok=True)
+
+    env = get_test_env()
+    env['CHROME_HEADLESS'] = 'true'
+
+    # Launch Chrome at crawl level
+    chrome_launch_process = subprocess.Popen(
+        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
+        cwd=str(chrome_dir),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        env=env
+    )
+
+    # Wait for Chrome to launch
+    for i in range(timeout):
+        if chrome_launch_process.poll() is not None:
+            stdout, stderr = chrome_launch_process.communicate()
+            raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
+        if (chrome_dir / 'cdp_url.txt').exists():
+            break
+        time.sleep(1)
+
+    if not (chrome_dir / 'cdp_url.txt').exists():
+        raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")
+
+    chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
+
+    # Create snapshot directory structure
+    snapshot_dir = Path(tmpdir) / 'snapshot'
+    snapshot_dir.mkdir(exist_ok=True)
+    snapshot_chrome_dir = snapshot_dir / 'chrome'
+    snapshot_chrome_dir.mkdir(exist_ok=True)
+
+    # Create tab
+    tab_env = env.copy()
+    tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
+    result = subprocess.run(
+        ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
+        cwd=str(snapshot_chrome_dir),
+        capture_output=True,
+        text=True,
+        timeout=60,
+        env=tab_env
+    )
+    if result.returncode != 0:
+        cleanup_chrome(chrome_launch_process, chrome_pid)
+        raise RuntimeError(f"Tab creation failed: {result.stderr}")
+
+    # Navigate to URL if requested
+    if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
+        result = subprocess.run(
+            ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
+            cwd=str(snapshot_chrome_dir),
+            capture_output=True,
+            text=True,
+            timeout=120,
+            env=env
+        )
+        if result.returncode != 0:
+            cleanup_chrome(chrome_launch_process, chrome_pid)
+            raise RuntimeError(f"Navigation failed: {result.stderr}")
+
+    return chrome_launch_process, chrome_pid, snapshot_chrome_dir
+
+
+def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int) -> None:
+    """Clean up Chrome processes.
+
+    Sends SIGTERM to the chrome_launch_process and SIGKILL to the Chrome PID.
+    Ignores errors if processes are already dead.
+
+    Args:
+        chrome_launch_process: The Popen object for the chrome launch hook
+        chrome_pid: The PID of the Chrome process
+    """
+    try:
+        chrome_launch_process.send_signal(signal.SIGTERM)
+        chrome_launch_process.wait(timeout=5)
+    except Exception:
+        pass
+    try:
+        os.kill(chrome_pid, signal.SIGKILL)
+    except OSError:
+        pass
+
+
+@contextmanager
+def chrome_session(
+    tmpdir: Path,
+    crawl_id: str = 'test-crawl',
+    snapshot_id: str = 'test-snapshot',
+    test_url: str = 'about:blank',
+    navigate: bool = True,
+    timeout: int = 15,
+):
+    """Context manager for Chrome sessions with automatic cleanup.
+
+    Usage:
+        with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir):
+            # Run tests with chrome session
+            pass
+        # Chrome automatically cleaned up
+
+    Args:
+        tmpdir: Temporary directory for test files
+        crawl_id: ID to use for the crawl
+        snapshot_id: ID to use for the snapshot
+        test_url: URL to navigate to (if navigate=True)
+        navigate: Whether to navigate to the URL after creating tab
+        timeout: Seconds to wait for Chrome to start
+
+    Yields:
+        Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
+    """
+    chrome_launch_process = None
+    chrome_pid = None
+    try:
+        chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
+            tmpdir=tmpdir,
+            crawl_id=crawl_id,
+            snapshot_id=snapshot_id,
+            test_url=test_url,
+            navigate=navigate,
+            timeout=timeout,
+        )
+        yield chrome_launch_process, chrome_pid, snapshot_chrome_dir
+    finally:
+        if chrome_launch_process and chrome_pid:
+            cleanup_chrome(chrome_launch_process, chrome_pid)
diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
index 966f3071..eee44ce4 100644
--- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
+++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
@@ -14,7 +14,6 @@ Tests verify:
 import json
 import os
 import re
-import signal
 import subprocess
 import time
 import tempfile
@@ -22,37 +21,19 @@ from pathlib import Path
 
 import pytest
 
+# Import shared Chrome test helpers
+from archivebox.plugins.chrome.tests.chrome_test_helpers import (
+    get_test_env,
+    setup_chrome_session,
+    cleanup_chrome,
+)
+
 PLUGIN_DIR = Path(__file__).parent.parent
-PLUGINS_ROOT = PLUGIN_DIR.parent
 INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
-CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
-CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
-CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
 
 TEST_URL = 'https://www.singsing.movie/'
 
 
-def get_node_modules_dir():
-    """Get NODE_MODULES_DIR for tests, checking env first."""
-    # Check if NODE_MODULES_DIR is already set in environment
-    if os.environ.get('NODE_MODULES_DIR'):
-        return Path(os.environ['NODE_MODULES_DIR'])
-    # Otherwise compute from LIB_DIR
-    from archivebox.config.common import STORAGE_CONFIG
-    lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
-    return lib_dir / 'npm' / 'node_modules'
-
-
-NODE_MODULES_DIR = get_node_modules_dir()
-
-
-def get_test_env():
-    """Get environment with NODE_MODULES_DIR set correctly."""
-    env = os.environ.copy()
-    env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
-    return env
-
-
 def test_hook_script_exists():
     """Verify on_Snapshot hook exists."""
     assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found"
@@ -117,95 +98,18 @@ def test_fails_gracefully_without_chrome_session():
         f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
 
 
-def setup_chrome_session(tmpdir):
-    """Helper to set up Chrome session with tab and navigation."""
-    crawl_dir = Path(tmpdir) / 'crawl'
-    crawl_dir.mkdir()
-    chrome_dir = crawl_dir / 'chrome'
-    chrome_dir.mkdir()
-
-    env = get_test_env()
-    env['CHROME_HEADLESS'] = 'true'
-
-    # Launch Chrome at crawl level
-    chrome_launch_process = subprocess.Popen(
-        ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
-        cwd=str(chrome_dir),
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        text=True,
-        env=env
-    )
-
-    # Wait for Chrome to launch
-    for i in range(15):
-        if chrome_launch_process.poll() is not None:
-            stdout, stderr = chrome_launch_process.communicate()
-            raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
-        if (chrome_dir / 'cdp_url.txt').exists():
-            break
-        time.sleep(1)
-
-    if not (chrome_dir / 'cdp_url.txt').exists():
-        raise RuntimeError("Chrome CDP URL not found after 15s")
-
-    chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
-
-    # Create snapshot directory structure
-    snapshot_dir = Path(tmpdir) / 'snapshot'
-    snapshot_dir.mkdir()
-    snapshot_chrome_dir = snapshot_dir / 'chrome'
-    snapshot_chrome_dir.mkdir()
-
-    # Create tab
-    tab_env = env.copy()
-    tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
-    result = subprocess.run(
-        ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll', '--crawl-id=test-infiniscroll'],
-        cwd=str(snapshot_chrome_dir),
-        capture_output=True,
-        text=True,
-        timeout=60,
-        env=tab_env
-    )
-    if result.returncode != 0:
-        raise RuntimeError(f"Tab creation failed: {result.stderr}")
-
-    # Navigate to URL
-    result = subprocess.run(
-        ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
-        cwd=str(snapshot_chrome_dir),
-        capture_output=True,
-        text=True,
-        timeout=120,
-        env=env
-    )
-    if result.returncode != 0:
-        raise RuntimeError(f"Navigation failed: {result.stderr}")
-
-    return chrome_launch_process, chrome_pid, snapshot_chrome_dir
-
-
-def cleanup_chrome(chrome_launch_process, chrome_pid):
-    """Helper to clean up Chrome processes."""
-    try:
-        chrome_launch_process.send_signal(signal.SIGTERM)
-        chrome_launch_process.wait(timeout=5)
-    except:
-        pass
-    try:
-        os.kill(chrome_pid, signal.SIGKILL)
-    except OSError:
-        pass
-
-
 def test_scrolls_page_and_outputs_stats():
     """Integration test: scroll page and verify JSONL output format."""
     with tempfile.TemporaryDirectory() as tmpdir:
         chrome_launch_process = None
         chrome_pid = None
         try:
-            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
+            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
+                Path(tmpdir),
+                crawl_id='test-infiniscroll',
+                snapshot_id='snap-infiniscroll',
+                test_url=TEST_URL,
+            )
 
             # Create infiniscroll output directory (sibling to chrome)
             infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
@@ -265,7 +169,12 @@ def test_config_scroll_limit_honored():
         chrome_launch_process = None
         chrome_pid = None
         try:
-            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
+            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
+                Path(tmpdir),
+                crawl_id='test-scroll-limit',
+                snapshot_id='snap-limit',
+                test_url=TEST_URL,
+            )
 
             infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
             infiniscroll_dir.mkdir()
@@ -317,7 +226,12 @@ def test_config_timeout_honored():
         chrome_launch_process = None
         chrome_pid = None
         try:
-            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
+            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
+                Path(tmpdir),
+                crawl_id='test-timeout',
+                snapshot_id='snap-timeout',
+                test_url=TEST_URL,
+            )
 
             infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
             infiniscroll_dir.mkdir()
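
The three installer hooks below all collapse to the same shape once they delegate to installExtensionWithCache(). A minimal sketch of that shape, for reference while reading the diffs that follow — the webstore_id, the plugin name, and the require.main runner are illustrative placeholders, not values taken from any of these plugins:

    const { installExtensionWithCache } = require('../chrome/chrome_utils.js');

    // Hypothetical plugin metadata (placeholder values).
    const EXTENSION = {
      webstore_id: 'abcdefghijklmnopabcdefghijklmnop',
      name: 'exampleextension',
    };

    async function main() {
      // Installs (or reuses the cached unpacked copy) and writes
      // <name>.extension.json into the shared extensions dir.
      const extension = await installExtensionWithCache(EXTENSION);
      return extension;  // null on failure
    }

    module.exports = { EXTENSION };

    // Assumed runner, mirroring the existing hooks' "Run if executed directly" blocks.
    if (require.main === module) {
      main().then((ext) => process.exit(ext ? 0 : 1));
    }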
diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js
index f2df6629..2a8053cd 100755
--- a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js
+++ b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js
@@ -17,11 +17,8 @@
  * - Works on thousands of websites out of the box
  */
 
-const path = require('path');
-const fs = require('fs');
-
 // Import extension utilities
-const extensionUtils = require('../chrome/chrome_utils.js');
+const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
 
 // Extension metadata
 const EXTENSION = {
@@ -29,69 +26,17 @@ const EXTENSION = {
   name: 'istilldontcareaboutcookies',
 };
 
-// Get extensions directory from environment or use default
-const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
-    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
-
-/**
- * Install the I Still Don't Care About Cookies extension
- */
-async function installCookiesExtension() {
-  console.log('[*] Installing I Still Don\'t Care About Cookies extension...');
-
-  // Install the extension
-  const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
-
-  if (!extension) {
-    console.error('[❌] Failed to install I Still Don\'t Care About Cookies extension');
-    return null;
-  }
-
-  console.log('[+] I Still Don\'t Care About Cookies extension installed');
-  console.log('[+] Cookie banners will be automatically dismissed during archiving');
-
-  return extension;
-}
-
 /**
+ * Main entry point - install extension before archiving
+ *
  * Note: This extension works out of the box with no configuration needed.
  * It automatically detects and dismisses cookie banners on page load.
  */
-
-/**
- * Main entry point - install extension before archiving
- */
 async function main() {
-  // Check if extension is already cached
-  const cacheFile = path.join(EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
+  const extension = await installExtensionWithCache(EXTENSION);
 
-  if (fs.existsSync(cacheFile)) {
-    try {
-      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
-      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
-
-      if (fs.existsSync(manifestPath)) {
-        console.log('[*] I Still Don\'t Care About Cookies extension already installed (using cache)');
-        return cached;
-      }
-    } catch (e) {
-      // Cache file corrupted, re-install
-      console.warn('[⚠️] Extension cache corrupted, re-installing...');
-    }
-  }
-
-  // Install extension
-  const extension = await installCookiesExtension();
-
-  // Export extension metadata for chrome plugin to load
   if (extension) {
-    // Write extension info to a cache file that chrome plugin can read
-    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
-    await fs.promises.writeFile(
-      cacheFile,
-      JSON.stringify(extension, null, 2)
-    );
-    console.log(`[+] Extension metadata written to ${cacheFile}`);
+    console.log('[+] Cookie banners will be automatically dismissed during archiving');
   }
 
   return extension;
@@ -100,7 +45,6 @@ async function main() {
 
 // Export functions for use by other plugins
 module.exports = {
   EXTENSION,
-  installCookiesExtension,
 };
 
 // Run if executed directly
diff --git a/archivebox/plugins/modalcloser/tests/test_modalcloser.py b/archivebox/plugins/modalcloser/tests/test_modalcloser.py
index 970bee94..1039d99c 100644
--- a/archivebox/plugins/modalcloser/tests/test_modalcloser.py
+++ b/archivebox/plugins/modalcloser/tests/test_modalcloser.py
@@ -22,38 +22,20 @@ from pathlib import Path
 
 import pytest
 
+# Import shared Chrome test helpers
+from archivebox.plugins.chrome.tests.chrome_test_helpers import (
+    get_test_env,
+    setup_chrome_session,
+    cleanup_chrome,
+)
+
 PLUGIN_DIR = Path(__file__).parent.parent
-PLUGINS_ROOT = PLUGIN_DIR.parent
 MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
-CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
-CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
-CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
 
 TEST_URL = 'https://www.singsing.movie/'
 COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/'
 
 
-def get_node_modules_dir():
-    """Get NODE_MODULES_DIR for tests, checking env first."""
-    # Check if NODE_MODULES_DIR is already set in environment
-    if os.environ.get('NODE_MODULES_DIR'):
-        return Path(os.environ['NODE_MODULES_DIR'])
-    # Otherwise compute from LIB_DIR
-    from archivebox.config.common import STORAGE_CONFIG
-    lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
-    return lib_dir / 'npm' / 'node_modules'
-
-
-NODE_MODULES_DIR = get_node_modules_dir()
-
-
-def get_test_env():
-    """Get environment with NODE_MODULES_DIR set correctly."""
-    env = os.environ.copy()
-    env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
-    return env
-
-
 def test_hook_script_exists():
     """Verify on_Snapshot hook exists."""
     assert MODALCLOSER_HOOK is not None, "Modalcloser hook not found"
@@ -118,76 +100,6 @@ def test_fails_gracefully_without_chrome_session():
         f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
 
 
-def setup_chrome_session(tmpdir):
-    """Helper to set up Chrome session with tab."""
-    crawl_dir = Path(tmpdir) / 'crawl'
-    crawl_dir.mkdir()
-    chrome_dir = crawl_dir / 'chrome'
-    chrome_dir.mkdir()
-
-    env = get_test_env()
-    env['CHROME_HEADLESS'] = 'true'
-
-    # Launch Chrome at crawl level
-    chrome_launch_process = subprocess.Popen(
-        ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-modalcloser'],
-        cwd=str(chrome_dir),
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        text=True,
-        env=env
-    )
-
-    # Wait for Chrome to launch
-    for i in range(15):
-        if chrome_launch_process.poll() is not None:
-            stdout, stderr = chrome_launch_process.communicate()
-            raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
-        if (chrome_dir / 'cdp_url.txt').exists():
-            break
-        time.sleep(1)
-
-    if not (chrome_dir / 'cdp_url.txt').exists():
-        raise RuntimeError("Chrome CDP URL not found after 15s")
-
-    chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
-
-    # Create snapshot directory structure
-    snapshot_dir = Path(tmpdir) / 'snapshot'
-    snapshot_dir.mkdir()
-    snapshot_chrome_dir = snapshot_dir / 'chrome'
-    snapshot_chrome_dir.mkdir()
-
-    # Create tab
-    tab_env = env.copy()
-    tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
-    result = subprocess.run(
-        ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser', '--crawl-id=test-modalcloser'],
-        cwd=str(snapshot_chrome_dir),
-        capture_output=True,
-        text=True,
-        timeout=60,
-        env=tab_env
-    )
-    if result.returncode != 0:
-        raise RuntimeError(f"Tab creation failed: {result.stderr}")
-
-    return chrome_launch_process, chrome_pid, snapshot_chrome_dir
-
-
-def cleanup_chrome(chrome_launch_process, chrome_pid):
-    """Helper to clean up Chrome processes."""
-    try:
-        chrome_launch_process.send_signal(signal.SIGTERM)
-        chrome_launch_process.wait(timeout=5)
-    except:
-        pass
-    try:
-        os.kill(chrome_pid, signal.SIGKILL)
-    except OSError:
-        pass
-
-
 def test_background_script_handles_sigterm():
     """Test that background script runs and handles SIGTERM correctly."""
     with tempfile.TemporaryDirectory() as tmpdir:
@@ -195,7 +107,12 @@ def test_background_script_handles_sigterm():
         chrome_launch_process = None
         chrome_pid = None
         modalcloser_process = None
         try:
-            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
+            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
+                Path(tmpdir),
+                crawl_id='test-modalcloser',
+                snapshot_id='snap-modalcloser',
+                test_url=TEST_URL,
+            )
 
             # Create modalcloser output directory (sibling to chrome)
             modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
@@ -265,7 +182,12 @@ def test_dialog_handler_logs_dialogs():
         chrome_pid = None
         modalcloser_process = None
         try:
-            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
+            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
+                Path(tmpdir),
+                crawl_id='test-dialog',
+                snapshot_id='snap-dialog',
+                test_url=TEST_URL,
+            )
 
             modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
             modalcloser_dir.mkdir()
@@ -313,7 +235,12 @@ def test_config_poll_interval():
         chrome_pid = None
         modalcloser_process = None
         try:
-            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
+            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
+                Path(tmpdir),
+                crawl_id='test-poll',
+                snapshot_id='snap-poll',
+                test_url=TEST_URL,
+            )
 
             modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
             modalcloser_dir.mkdir()
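
For context on the cache round-trip these hooks rely on: the installer writes <name>.extension.json into the shared extensions dir, and the chrome launch hook later loads installed extensions from that same directory. A hedged sketch of the read side, reusing the same staleness check installExtensionWithCache() applies — only the unpacked_path field is relied on by the code in this diff; any other fields come from loadOrInstallExtension() and are not assumed here:

    const fs = require('fs');
    const path = require('path');
    const { getExtensionsDir } = require('../chrome/chrome_utils.js');

    // Illustrative helper (not part of the diff): read back one cache file.
    function readExtensionCache(name) {
      const cacheFile = path.join(getExtensionsDir(), `${name}.extension.json`);
      if (!fs.existsSync(cacheFile)) return null;
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      // The cache is only valid while the unpacked extension still has a manifest.json on disk.
      if (!fs.existsSync(path.join(cached.unpacked_path, 'manifest.json'))) return null;
      return cached;
    }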
diff --git a/archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js b/archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js
index 8335a0d9..04b15d73 100755
--- a/archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js
+++ b/archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js
@@ -16,11 +16,8 @@
  * - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
  */
 
-const path = require('path');
-const fs = require('fs');
-
 // Import extension utilities
-const extensionUtils = require('../chrome/chrome_utils.js');
+const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
 
 // Extension metadata
 const EXTENSION = {
@@ -28,76 +25,25 @@ const EXTENSION = {
   name: 'twocaptcha',
 };
 
-// Get extensions directory from environment or use default
-const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
-    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
-
 /**
- * Install and configure the 2captcha extension
- */
-async function installCaptchaExtension() {
-  console.log('[*] Installing 2captcha extension...');
-
-  // Install the extension
-  const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
-
-  if (!extension) {
-    console.error('[❌] Failed to install 2captcha extension');
-    return null;
-  }
-
-  // Check if API key is configured
-  const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
-  if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
-    console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
-    console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
-  } else {
-    console.log('[+] 2captcha extension installed and API key configured');
-  }
-
-  return extension;
-}
-
-/**
- * Note: 2captcha configuration is now handled by chrome plugin
+ * Main entry point - install extension before archiving
+ *
+ * Note: 2captcha configuration is handled by on_Crawl__25_configure_twocaptcha_extension_options.js
  * during first-time browser setup to avoid repeated configuration on every snapshot.
  * The API key is injected via chrome.storage API once per browser session.
  */
-
-/**
- * Main entry point - install extension before archiving
- */
 async function main() {
-  // Check if extension is already cached
-  const cacheFile = path.join(EXTENSIONS_DIR, 'twocaptcha.extension.json');
+  const extension = await installExtensionWithCache(EXTENSION);
 
-  if (fs.existsSync(cacheFile)) {
-    try {
-      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
-      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
-
-      if (fs.existsSync(manifestPath)) {
-        console.log('[*] 2captcha extension already installed (using cache)');
-        return cached;
-      }
-    } catch (e) {
-      // Cache file corrupted, re-install
-      console.warn('[⚠️] Extension cache corrupted, re-installing...');
-    }
-  }
-
-  // Install extension
-  const extension = await installCaptchaExtension();
-
-  // Export extension metadata for chrome plugin to load
   if (extension) {
-    // Write extension info to a cache file that chrome plugin can read
-    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
-    await fs.promises.writeFile(
-      cacheFile,
-      JSON.stringify(extension, null, 2)
-    );
-    console.log(`[+] Extension metadata written to ${cacheFile}`);
+    // Check if API key is configured
+    const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
+    if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
+      console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
+      console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
+    } else {
+      console.log('[+] 2captcha extension installed and API key configured');
+    }
   }
 
   return extension;
@@ -106,7 +52,6 @@ async function main() {
 
 // Export functions for use by other plugins
 module.exports = {
   EXTENSION,
-  installCaptchaExtension,
 };
 
 // Run if executed directly
diff --git a/archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js b/archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
index b8a0219c..deb1ada7 100755
--- a/archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
+++ b/archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
@@ -18,11 +18,8 @@
  * - Uses efficient blocking with filter lists
  */
 
-const path = require('path');
-const fs = require('fs');
-
 // Import extension utilities
-const extensionUtils = require('../chrome/chrome_utils.js');
+const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
 
 // Extension metadata
 const EXTENSION = {
@@ -30,69 +27,17 @@ const EXTENSION = {
   name: 'ublock',
 };
 
-// Get extensions directory from environment or use default
-const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
-    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
-
-/**
- * Install the uBlock Origin extension
- */
-async function installUblockExtension() {
-  console.log('[*] Installing uBlock Origin extension...');
-
-  // Install the extension
-  const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
-
-  if (!extension) {
-    console.error('[❌] Failed to install uBlock Origin extension');
-    return null;
-  }
-
-  console.log('[+] uBlock Origin extension installed');
-  console.log('[+] Ads and trackers will be blocked during archiving');
-
-  return extension;
-}
-
 /**
+ * Main entry point - install extension before archiving
+ *
  * Note: uBlock Origin works automatically with default filter lists.
  * No configuration needed - blocks ads, trackers, and malware domains out of the box.
  */
-
-/**
- * Main entry point - install extension before archiving
- */
 async function main() {
-  // Check if extension is already cached
-  const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');
+  const extension = await installExtensionWithCache(EXTENSION);
 
-  if (fs.existsSync(cacheFile)) {
-    try {
-      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
-      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
-
-      if (fs.existsSync(manifestPath)) {
-        console.log('[*] uBlock Origin extension already installed (using cache)');
-        return cached;
-      }
-    } catch (e) {
-      // Cache file corrupted, re-install
-      console.warn('[⚠️] Extension cache corrupted, re-installing...');
-    }
-  }
-
-  // Install extension
-  const extension = await installUblockExtension();
-
-  // Export extension metadata for chrome plugin to load
   if (extension) {
-    // Write extension info to a cache file that chrome plugin can read
-    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
-    await fs.promises.writeFile(
-      cacheFile,
-      JSON.stringify(extension, null, 2)
-    );
-    console.log(`[+] Extension metadata written to ${cacheFile}`);
+    console.log('[+] Ads and trackers will be blocked during archiving');
   }
 
   return extension;
@@ -101,7 +46,6 @@ async function main() {
 
 // Export functions for use by other plugins
 module.exports = {
   EXTENSION,
-  installUblockExtension,
 };
 
 // Run if executed directly
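
To round out the new API surface: installExtensionWithCache() accepts an optional second argument with extensionsDir and quiet, and chrome_utils.js now exposes both helpers on its CLI. A hedged usage sketch — the extension metadata below is a placeholder, and the options shown simply make the function's own defaults explicit:

    const { installExtensionWithCache, getExtensionsDir } = require('../chrome/chrome_utils.js');

    (async () => {
      const ext = await installExtensionWithCache(
        { webstore_id: 'abcdefghijklmnopabcdefghijklmnop', name: 'exampleextension' },  // placeholder metadata
        { extensionsDir: getExtensionsDir(), quiet: true },  // defaults made explicit; quiet suppresses info logs
      );
      if (ext) console.log(`[+] unpacked at ${ext.unpacked_path}`);
    })();

    // The same operations are available from the shell via the new CLI commands:
    //   node archivebox/plugins/chrome/chrome_utils.js getExtensionsDir
    //   node archivebox/plugins/chrome/chrome_utils.js installExtensionWithCache <webstore_id> <name>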