diff --git a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js index d025be81..f21666c1 100644 --- a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js @@ -8,7 +8,7 @@ * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for * --load-extension and --disable-extensions-except flags. * - * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id= --source-url= + * Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id= --source-url= * Output: Writes to current directory (executor creates chrome/ dir): * - cdp_url.txt: WebSocket URL for CDP connection * - chrome.pid: Chromium process ID (for cleanup) @@ -165,14 +165,6 @@ async function main() { chromePid = result.pid; const cdpUrl = result.cdpUrl; - // Write extensions metadata - if (installedExtensions.length > 0) { - fs.writeFileSync( - path.join(OUTPUT_DIR, 'extensions.json'), - JSON.stringify(installedExtensions, null, 2) - ); - } - // Connect puppeteer for extension verification console.error(`[*] Connecting puppeteer to CDP...`); const browser = await puppeteer.connect({ @@ -181,30 +173,84 @@ async function main() { }); browserInstance = browser; - // Verify extensions loaded + // Get actual extension IDs from chrome://extensions page if (extensionPaths.length > 0) { - await new Promise(r => setTimeout(r, 3000)); + await new Promise(r => setTimeout(r, 2000)); - const targets = browser.targets(); - console.error(`[*] All browser targets (${targets.length}):`); - for (const t of targets) { - console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`); + try { + const extPage = await browser.newPage(); + await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 }); + await new Promise(r => setTimeout(r, 2000)); + + // Parse extension info from the page + const extensionsFromPage = await extPage.evaluate(() => { + const extensions = []; + // Extensions manager uses shadow DOM + const manager = document.querySelector('extensions-manager'); + if (!manager || !manager.shadowRoot) return extensions; + + const itemList = manager.shadowRoot.querySelector('extensions-item-list'); + if (!itemList || !itemList.shadowRoot) return extensions; + + const items = itemList.shadowRoot.querySelectorAll('extensions-item'); + for (const item of items) { + const id = item.getAttribute('id'); + const nameEl = item.shadowRoot?.querySelector('#name'); + const name = nameEl?.textContent?.trim() || ''; + if (id && name) { + extensions.push({ id, name }); + } + } + return extensions; + }); + + console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`); + for (const e of extensionsFromPage) { + console.error(` - ${e.id}: "${e.name}"`); + } + + // Match extensions by name (strict matching) + for (const ext of installedExtensions) { + // Read the extension's manifest to get its display name + const manifestPath = path.join(ext.unpacked_path, 'manifest.json'); + if (fs.existsSync(manifestPath)) { + const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); + const manifestName = manifest.name || ''; + console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`); + + // Find matching extension from page by exact name match first + let match = extensionsFromPage.find(e => e.name === manifestName); + + // If no exact match, try case-insensitive exact match + if (!match) { + match = extensionsFromPage.find(e => + e.name.toLowerCase() === manifestName.toLowerCase() + ); + } + + if (match) { + ext.id = match.id; + console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`); + } else { + console.error(`[!] No match found for: ${ext.name} (${manifestName})`); + } + } + } + + await extPage.close(); + } catch (e) { + console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`); } - const extTargets = targets.filter(t => - t.url().startsWith('chrome-extension://') || - t.type() === 'service_worker' || - t.type() === 'background_page' - ); - - // Filter out built-in extensions + // Fallback: check browser targets + const targets = browser.targets(); const builtinIds = [ 'nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf', 'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai', ]; - const customExtTargets = extTargets.filter(t => { + const customExtTargets = targets.filter(t => { const url = t.url(); if (!url.startsWith('chrome-extension://')) return false; const extId = url.split('://')[1].split('/')[0]; @@ -216,7 +262,7 @@ async function main() { for (const target of customExtTargets) { const url = target.url(); const extId = url.split('://')[1].split('/')[0]; - console.error(`[+] Extension loaded: ${extId} (${target.type()})`); + console.error(`[+] Extension target: ${extId} (${target.type()})`); } if (customExtTargets.length === 0 && extensionPaths.length > 0) { @@ -225,6 +271,14 @@ async function main() { } } + // Write extensions metadata with actual IDs + if (installedExtensions.length > 0) { + fs.writeFileSync( + path.join(OUTPUT_DIR, 'extensions.json'), + JSON.stringify(installedExtensions, null, 2) + ); + } + console.error(`[+] Chromium session started for crawl ${crawlId}`); console.error(`[+] CDP URL: ${cdpUrl}`); console.error(`[+] PID: ${chromePid}`); diff --git a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js index 537ec5bf..300bed51 100755 --- a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js +++ b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js @@ -2,7 +2,7 @@ /** * Create a Chrome tab for this snapshot in the shared crawl Chrome session. * - * If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js), + * If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js), * this connects to it and creates a new tab. Otherwise, falls back to launching * its own Chrome instance. * @@ -215,7 +215,7 @@ async function launchNewChrome(url, binary) { console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`); // Write PID immediately for cleanup - fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid)); + fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid)); try { // Wait for Chrome to be ready diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index 3aa7f2be..ca8ad874 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -29,7 +29,7 @@ import shutil import platform PLUGIN_DIR = Path(__file__).parent.parent -CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js' +CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js' CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js' CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None) @@ -176,6 +176,7 @@ def test_chrome_launch_and_tab_creation(): crawl_dir = Path(tmpdir) / 'crawl' crawl_dir.mkdir() chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir() # Get test environment with NODE_MODULES_DIR set env = get_test_env() @@ -184,7 +185,7 @@ def test_chrome_launch_and_tab_creation(): # Launch Chrome at crawl level (background process) chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'], - cwd=str(crawl_dir), + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, @@ -292,7 +293,7 @@ def test_chrome_navigation(): # Launch Chrome (background process) chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'], - cwd=str(crawl_dir), + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, @@ -363,7 +364,7 @@ def test_tab_cleanup_on_sigterm(): # Launch Chrome (background process) chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'], - cwd=str(crawl_dir), + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, @@ -423,11 +424,12 @@ def test_multiple_snapshots_share_chrome(): crawl_dir = Path(tmpdir) / 'crawl' crawl_dir.mkdir() chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir() # Launch Chrome at crawl level chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'], - cwd=str(crawl_dir), + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, @@ -513,7 +515,7 @@ def test_chrome_cleanup_on_crawl_end(): # Launch Chrome in background chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'], - cwd=str(crawl_dir), + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, @@ -554,11 +556,12 @@ def test_zombie_prevention_hook_killed(): crawl_dir = Path(tmpdir) / 'crawl' crawl_dir.mkdir() chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir() # Launch Chrome chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'], - cwd=str(crawl_dir), + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py index ba0dca66..966f3071 100644 --- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py +++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py @@ -26,7 +26,7 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = PLUGIN_DIR.parent INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None) -CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' +CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js' CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js' CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None) TEST_URL = 'https://www.singsing.movie/' @@ -122,6 +122,7 @@ def setup_chrome_session(tmpdir): crawl_dir = Path(tmpdir) / 'crawl' crawl_dir.mkdir() chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir() env = get_test_env() env['CHROME_HEADLESS'] = 'true' @@ -129,7 +130,7 @@ def setup_chrome_session(tmpdir): # Launch Chrome at crawl level chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'], - cwd=str(crawl_dir), + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py index 63fa0f9a..b5b93288 100644 --- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py +++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py @@ -16,7 +16,7 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_istilldontcareaboutcookies.*'), None) +INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None) def test_install_script_exists(): @@ -124,78 +124,106 @@ def test_no_configuration_required(): assert "API" not in (result.stdout + result.stderr) or result.returncode == 0 -def setup_test_lib_dirs(tmpdir: Path) -> dict: - """Create isolated lib directories for tests and return env dict. +PLUGINS_ROOT = PLUGIN_DIR.parent +CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py' +CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js' - Sets up: - LIB_DIR: tmpdir/lib/ - NODE_MODULES_DIR: tmpdir/lib//npm/node_modules - NPM_BIN_DIR: tmpdir/lib//npm/bin - PIP_VENV_DIR: tmpdir/lib//pip/venv - PIP_BIN_DIR: tmpdir/lib//pip/venv/bin + +def setup_test_env(tmpdir: Path) -> dict: + """Set up isolated data/lib directory structure for tests. + + Creates structure matching real ArchiveBox data dir: + /data/ + lib/ + arm64-darwin/ (or x86_64-linux, etc.) + npm/ + .bin/ + node_modules/ + personas/ + Default/ + chrome_extensions/ + users/ + testuser/ + crawls/ + snapshots/ + + Calls chrome install hook which handles puppeteer-core and chromium installation. + Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. """ import platform - arch = platform.machine() + from datetime import datetime + + # Determine machine type (matches archivebox.config.paths.get_machine_type()) + machine = platform.machine().lower() system = platform.system().lower() - arch_dir = f"{arch}-{system}" + if machine in ('arm64', 'aarch64'): + machine = 'arm64' + elif machine in ('x86_64', 'amd64'): + machine = 'x86_64' + machine_type = f"{machine}-{system}" - lib_dir = tmpdir / 'lib' / arch_dir + # Create proper directory structure matching real ArchiveBox layout + data_dir = tmpdir / 'data' + lib_dir = data_dir / 'lib' / machine_type npm_dir = lib_dir / 'npm' + npm_bin_dir = npm_dir / '.bin' node_modules_dir = npm_dir / 'node_modules' - npm_bin_dir = npm_dir / 'bin' - pip_venv_dir = lib_dir / 'pip' / 'venv' - pip_bin_dir = pip_venv_dir / 'bin' - # Create directories + # Extensions go under personas/Default/ + chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' + + # User data goes under users/{username}/ + date_str = datetime.now().strftime('%Y%m%d') + users_dir = data_dir / 'users' / 'testuser' + crawls_dir = users_dir / 'crawls' / date_str + snapshots_dir = users_dir / 'snapshots' / date_str + + # Create all directories node_modules_dir.mkdir(parents=True, exist_ok=True) npm_bin_dir.mkdir(parents=True, exist_ok=True) - pip_bin_dir.mkdir(parents=True, exist_ok=True) + chrome_extensions_dir.mkdir(parents=True, exist_ok=True) + crawls_dir.mkdir(parents=True, exist_ok=True) + snapshots_dir.mkdir(parents=True, exist_ok=True) - # Install puppeteer-core to the test node_modules if not present - if not (node_modules_dir / 'puppeteer-core').exists(): - result = subprocess.run( - ['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'], - capture_output=True, - text=True, - timeout=120 - ) - if result.returncode != 0: - pytest.skip(f"Failed to install puppeteer-core: {result.stderr}") - - return { + # Build complete env dict + env = os.environ.copy() + env.update({ + 'DATA_DIR': str(data_dir), 'LIB_DIR': str(lib_dir), - 'NODE_MODULES_DIR': str(node_modules_dir), + 'MACHINE_TYPE': machine_type, 'NPM_BIN_DIR': str(npm_bin_dir), - 'PIP_VENV_DIR': str(pip_venv_dir), - 'PIP_BIN_DIR': str(pip_bin_dir), - } + 'NODE_MODULES_DIR': str(node_modules_dir), + 'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir), + 'CRAWLS_DIR': str(crawls_dir), + 'SNAPSHOTS_DIR': str(snapshots_dir), + }) - -PLUGINS_ROOT = PLUGIN_DIR.parent - - -def find_chromium_binary(): - """Find the Chromium binary using chrome_utils.js findChromium(). - - This uses the centralized findChromium() function which checks: - - CHROME_BINARY env var - - @puppeteer/browsers install locations - - System Chromium locations - - Falls back to Chrome (with warning) - """ - chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js' + # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) result = subprocess.run( - ['node', str(chrome_utils), 'findChromium'], - capture_output=True, - text=True, - timeout=10 + ['python', str(CHROME_INSTALL_HOOK)], + capture_output=True, text=True, timeout=120, env=env ) - if result.returncode == 0 and result.stdout.strip(): - return result.stdout.strip() - return None + if result.returncode != 0: + pytest.skip(f"Chrome install hook failed: {result.stderr}") + # Parse JSONL output to get CHROME_BINARY + chrome_binary = None + for line in result.stdout.strip().split('\n'): + if not line.strip(): + continue + try: + data = json.loads(line) + if data.get('type') == 'Binary' and data.get('abspath'): + chrome_binary = data['abspath'] + break + except json.JSONDecodeError: + continue -CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' + if not chrome_binary or not Path(chrome_binary).exists(): + pytest.skip(f"Chromium binary not found: {chrome_binary}") + + env['CHROME_BINARY'] = chrome_binary + return env TEST_URL = 'https://www.filmin.es/' @@ -210,22 +238,11 @@ def test_extension_loads_in_chromium(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Set up isolated lib directories for this test - lib_env = setup_test_lib_dirs(tmpdir) + # Set up isolated env with proper directory structure + env = setup_test_env(tmpdir) + env.setdefault('CHROME_HEADLESS', 'true') - # Set up extensions directory - ext_dir = tmpdir / 'chrome_extensions' - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env.update(lib_env) - env['CHROME_EXTENSIONS_DIR'] = str(ext_dir) - env['CHROME_HEADLESS'] = 'true' - - # Ensure CHROME_BINARY points to Chromium - chromium = find_chromium_binary() - if chromium: - env['CHROME_BINARY'] = chromium + ext_dir = Path(env['CHROME_EXTENSIONS_DIR']) # Step 1: Install the extension result = subprocess.run( @@ -245,13 +262,16 @@ def test_extension_loads_in_chromium(): print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) - crawl_dir = tmpdir / 'crawl' - crawl_dir.mkdir() + crawl_id = 'test-cookies' + crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id + crawl_dir.mkdir(parents=True, exist_ok=True) chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir(parents=True, exist_ok=True) + env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'], - cwd=str(crawl_dir), + ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, @@ -400,156 +420,362 @@ const puppeteer = require('puppeteer-core'); pass -def test_hides_cookie_consent_on_filmin(): - """Live test: verify extension hides cookie consent popup on filmin.es. +def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str): + """Launch Chromium and return (process, cdp_url) or raise on failure.""" + chrome_dir.mkdir(parents=True, exist_ok=True) - Uses Chromium with extensions loaded automatically via chrome hook. - """ - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], + cwd=str(chrome_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) - # Set up isolated lib directories for this test - lib_env = setup_test_lib_dirs(tmpdir) + # Wait for Chromium to launch and CDP URL to be available + cdp_url = None + for i in range(20): + if chrome_launch_process.poll() is not None: + stdout, stderr = chrome_launch_process.communicate() + raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") + cdp_file = chrome_dir / 'cdp_url.txt' + if cdp_file.exists(): + cdp_url = cdp_file.read_text().strip() + break + time.sleep(1) - # Set up extensions directory - ext_dir = tmpdir / 'chrome_extensions' - ext_dir.mkdir(parents=True) + if not cdp_url: + chrome_launch_process.kill() + raise RuntimeError("Chromium CDP URL not found after 20s") - env = os.environ.copy() - env.update(lib_env) - env['CHROME_EXTENSIONS_DIR'] = str(ext_dir) - env['CHROME_HEADLESS'] = 'true' + return chrome_launch_process, cdp_url - # Ensure CHROME_BINARY points to Chromium - chromium = find_chromium_binary() - if chromium: - env['CHROME_BINARY'] = chromium - - # Step 1: Install the extension - result = subprocess.run( - ['node', str(INSTALL_SCRIPT)], - cwd=str(tmpdir), - capture_output=True, - text=True, - env=env, - timeout=60 - ) - assert result.returncode == 0, f"Extension install failed: {result.stderr}" - - # Verify extension cache was created - cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json' - assert cache_file.exists(), "Extension cache not created" - ext_data = json.loads(cache_file.read_text()) - print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") - - # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) - crawl_dir = tmpdir / 'crawl' - crawl_dir.mkdir() - chrome_dir = crawl_dir / 'chrome' - - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'], - cwd=str(crawl_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chromium to launch and CDP URL to be available - cdp_url = None - for i in range(20): - if chrome_launch_process.poll() is not None: - stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") - cdp_file = chrome_dir / 'cdp_url.txt' - if cdp_file.exists(): - cdp_url = cdp_file.read_text().strip() - break - time.sleep(1) - - assert cdp_url, "Chromium CDP URL not found after 20s" - print(f"Chromium launched with CDP URL: {cdp_url}") +def kill_chromium_session(chrome_launch_process, chrome_dir: Path): + """Clean up Chromium process.""" + try: + chrome_launch_process.send_signal(signal.SIGTERM) + chrome_launch_process.wait(timeout=5) + except: + pass + chrome_pid_file = chrome_dir / 'chrome.pid' + if chrome_pid_file.exists(): try: - # Step 3: Connect to Chromium and test cookie consent hiding - test_script = f''' + chrome_pid = int(chrome_pid_file.read_text().strip()) + os.kill(chrome_pid, signal.SIGKILL) + except (OSError, ValueError): + pass + + +def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict: + """Check if cookie consent elements are visible on a page. + + Returns dict with: + - visible: bool - whether any cookie consent element is visible + - selector: str - which selector matched (if visible) + - elements_found: list - all cookie-related elements found in DOM + - html_snippet: str - snippet of the page HTML for debugging + """ + test_script = f''' if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); const puppeteer = require('puppeteer-core'); (async () => {{ const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); - // Wait for extension to initialize - await new Promise(r => setTimeout(r, 2000)); - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'); + await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); await page.setViewport({{ width: 1440, height: 900 }}); - console.error('Navigating to {TEST_URL}...'); - await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }}); + console.error('Navigating to {test_url}...'); + await page.goto('{test_url}', {{ waitUntil: 'networkidle2', timeout: 30000 }}); - // Wait for extension content script to process page - await new Promise(r => setTimeout(r, 5000)); + // Wait for page to fully render and any cookie scripts to run + await new Promise(r => setTimeout(r, 3000)); - // Check cookie consent visibility + // Check cookie consent visibility using multiple common selectors const result = await page.evaluate(() => {{ - const selectors = ['.cky-consent-container', '.cky-popup-center', '.cky-overlay']; + // Common cookie consent selectors used by various consent management platforms + const selectors = [ + // CookieYes + '.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal', + // OneTrust + '#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter', + // Cookiebot + '#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay', + // Generic cookie banners + '[class*="cookie-consent"]', '[class*="cookie-banner"]', '[class*="cookie-notice"]', + '[class*="cookie-popup"]', '[class*="cookie-modal"]', '[class*="cookie-dialog"]', + '[id*="cookie-consent"]', '[id*="cookie-banner"]', '[id*="cookie-notice"]', + '[id*="cookieconsent"]', '[id*="cookie-law"]', + // GDPR banners + '[class*="gdpr"]', '[id*="gdpr"]', + // Consent banners + '[class*="consent-banner"]', '[class*="consent-modal"]', '[class*="consent-popup"]', + // Privacy banners + '[class*="privacy-banner"]', '[class*="privacy-notice"]', + // Common frameworks + '.cc-window', '.cc-banner', '#cc-main', // Cookie Consent by Insites + '.qc-cmp2-container', // Quantcast + '.sp-message-container', // SourcePoint + ]; + + const elementsFound = []; + let visibleElement = null; + for (const sel of selectors) {{ - const el = document.querySelector(sel); - if (el) {{ - const style = window.getComputedStyle(el); - const rect = el.getBoundingClientRect(); - const visible = style.display !== 'none' && - style.visibility !== 'hidden' && - rect.width > 0 && rect.height > 0; - if (visible) return {{ visible: true, selector: sel }}; + try {{ + const elements = document.querySelectorAll(sel); + for (const el of elements) {{ + const style = window.getComputedStyle(el); + const rect = el.getBoundingClientRect(); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0' && + rect.width > 0 && rect.height > 0; + + elementsFound.push({{ + selector: sel, + visible: isVisible, + display: style.display, + visibility: style.visibility, + opacity: style.opacity, + width: rect.width, + height: rect.height + }}); + + if (isVisible && !visibleElement) {{ + visibleElement = {{ selector: sel, width: rect.width, height: rect.height }}; + }} + }} + }} catch (e) {{ + // Invalid selector, skip }} }} - return {{ visible: false }}; + + // Also grab a snippet of the HTML to help debug + const bodyHtml = document.body.innerHTML.slice(0, 2000); + const hasCookieKeyword = bodyHtml.toLowerCase().includes('cookie') || + bodyHtml.toLowerCase().includes('consent') || + bodyHtml.toLowerCase().includes('gdpr'); + + return {{ + visible: visibleElement !== null, + selector: visibleElement ? visibleElement.selector : null, + elements_found: elementsFound, + has_cookie_keyword_in_html: hasCookieKeyword, + html_snippet: bodyHtml.slice(0, 500) + }}; }}); - console.error('Cookie consent:', JSON.stringify(result)); + console.error('Cookie consent check result:', JSON.stringify({{ + visible: result.visible, + selector: result.selector, + elements_found_count: result.elements_found.length + }})); + browser.disconnect(); console.log(JSON.stringify(result)); }})(); ''' - script_path = tmpdir / 'test_extension.js' - script_path.write_text(test_script) + script_path = script_dir / 'check_cookies.js' + script_path.write_text(test_script) - result = subprocess.run( - ['node', str(script_path)], - cwd=str(tmpdir), - capture_output=True, - text=True, - env=env, - timeout=90 + result = subprocess.run( + ['node', str(script_path)], + cwd=str(script_dir), + capture_output=True, + text=True, + env=env, + timeout=90 + ) + + if result.returncode != 0: + raise RuntimeError(f"Cookie check script failed: {result.stderr}") + + output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] + if not output_lines: + raise RuntimeError(f"No JSON output from cookie check: {result.stdout}\nstderr: {result.stderr}") + + return json.loads(output_lines[-1]) + + +def test_hides_cookie_consent_on_filmin(): + """Live test: verify extension hides cookie consent popup on filmin.es. + + This test runs TWO browser sessions: + 1. WITHOUT extension - verifies cookie consent IS visible (baseline) + 2. WITH extension - verifies cookie consent is HIDDEN + + This ensures we're actually testing the extension's effect, not just + that a page happens to not have cookie consent. + """ + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Set up isolated env with proper directory structure + env_base = setup_test_env(tmpdir) + env_base['CHROME_HEADLESS'] = 'true' + + ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR']) + + # ============================================================ + # STEP 1: BASELINE - Run WITHOUT extension, verify cookie consent IS visible + # ============================================================ + print("\n" + "="*60) + print("STEP 1: BASELINE TEST (no extension)") + print("="*60) + + data_dir = Path(env_base['DATA_DIR']) + + env_no_ext = env_base.copy() + env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions') + (data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True) + + # Launch baseline Chromium in crawls directory + baseline_crawl_id = 'baseline-no-ext' + baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id + baseline_crawl_dir.mkdir(parents=True, exist_ok=True) + baseline_chrome_dir = baseline_crawl_dir / 'chrome' + env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir) + baseline_process = None + + try: + baseline_process, baseline_cdp_url = launch_chromium_session( + env_no_ext, baseline_chrome_dir, baseline_crawl_id + ) + print(f"Baseline Chromium launched: {baseline_cdp_url}") + + # Wait a moment for browser to be ready + time.sleep(2) + + baseline_result = check_cookie_consent_visibility( + baseline_cdp_url, TEST_URL, env_no_ext, tmpdir ) - print(f"stderr: {result.stderr}") - print(f"stdout: {result.stdout}") + print(f"Baseline result: visible={baseline_result['visible']}, " + f"elements_found={len(baseline_result['elements_found'])}") - assert result.returncode == 0, f"Test failed: {result.stderr}" - - output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] - assert output_lines, f"No JSON output: {result.stdout}" - - test_result = json.loads(output_lines[-1]) - assert not test_result['visible'], \ - f"Cookie consent should be hidden by extension. Result: {test_result}" + if baseline_result['elements_found']: + print("Elements found in baseline:") + for el in baseline_result['elements_found'][:5]: # Show first 5 + print(f" - {el['selector']}: visible={el['visible']}, " + f"display={el['display']}, size={el['width']}x{el['height']}") finally: - # Clean up Chromium - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - chrome_pid_file = chrome_dir / 'chrome.pid' - if chrome_pid_file.exists(): - try: - chrome_pid = int(chrome_pid_file.read_text().strip()) - os.kill(chrome_pid, signal.SIGKILL) - except (OSError, ValueError): - pass + if baseline_process: + kill_chromium_session(baseline_process, baseline_chrome_dir) + + # Verify baseline shows cookie consent + if not baseline_result['visible']: + # If no cookie consent visible in baseline, we can't test the extension + # This could happen if: + # - The site changed and no longer shows cookie consent + # - Cookie consent is region-specific + # - Our selectors don't match this site + print("\nWARNING: No cookie consent visible in baseline!") + print(f"HTML has cookie keywords: {baseline_result.get('has_cookie_keyword_in_html')}") + print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}") + + pytest.skip( + f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. " + f"Elements found: {len(baseline_result['elements_found'])}. " + f"The site may have changed or cookie consent may be region-specific." + ) + + print(f"\n✓ Baseline confirmed: Cookie consent IS visible (selector: {baseline_result['selector']})") + + # ============================================================ + # STEP 2: Install the extension + # ============================================================ + print("\n" + "="*60) + print("STEP 2: INSTALLING EXTENSION") + print("="*60) + + env_with_ext = env_base.copy() + env_with_ext['CHROME_EXTENSIONS_DIR'] = str(ext_dir) + + result = subprocess.run( + ['node', str(INSTALL_SCRIPT)], + cwd=str(tmpdir), + capture_output=True, + text=True, + env=env_with_ext, + timeout=60 + ) + assert result.returncode == 0, f"Extension install failed: {result.stderr}" + + cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json' + assert cache_file.exists(), "Extension cache not created" + ext_data = json.loads(cache_file.read_text()) + print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") + + # ============================================================ + # STEP 3: Run WITH extension, verify cookie consent is HIDDEN + # ============================================================ + print("\n" + "="*60) + print("STEP 3: TEST WITH EXTENSION") + print("="*60) + + # Launch extension test Chromium in crawls directory + ext_crawl_id = 'test-with-ext' + ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id + ext_crawl_dir.mkdir(parents=True, exist_ok=True) + ext_chrome_dir = ext_crawl_dir / 'chrome' + env_with_ext['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir) + ext_process = None + + try: + ext_process, ext_cdp_url = launch_chromium_session( + env_with_ext, ext_chrome_dir, ext_crawl_id + ) + print(f"Extension Chromium launched: {ext_cdp_url}") + + # Check that extension was loaded + extensions_file = ext_chrome_dir / 'extensions.json' + if extensions_file.exists(): + loaded_exts = json.loads(extensions_file.read_text()) + print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}") + + # Wait for extension to initialize + time.sleep(3) + + ext_result = check_cookie_consent_visibility( + ext_cdp_url, TEST_URL, env_with_ext, tmpdir + ) + + print(f"Extension result: visible={ext_result['visible']}, " + f"elements_found={len(ext_result['elements_found'])}") + + if ext_result['elements_found']: + print("Elements found with extension:") + for el in ext_result['elements_found'][:5]: + print(f" - {el['selector']}: visible={el['visible']}, " + f"display={el['display']}, size={el['width']}x{el['height']}") + + finally: + if ext_process: + kill_chromium_session(ext_process, ext_chrome_dir) + + # ============================================================ + # STEP 4: Compare results + # ============================================================ + print("\n" + "="*60) + print("STEP 4: COMPARISON") + print("="*60) + print(f"Baseline (no extension): cookie consent visible = {baseline_result['visible']}") + print(f"With extension: cookie consent visible = {ext_result['visible']}") + + assert baseline_result['visible'], \ + "Baseline should show cookie consent (this shouldn't happen, we checked above)" + + assert not ext_result['visible'], \ + f"Cookie consent should be HIDDEN by extension.\n" \ + f"Baseline showed consent at: {baseline_result['selector']}\n" \ + f"But with extension, consent is still visible.\n" \ + f"Elements still visible: {[e for e in ext_result['elements_found'] if e['visible']]}" + + print("\n✓ SUCCESS: Extension correctly hides cookie consent!") + print(f" - Baseline showed consent at: {baseline_result['selector']}") + print(f" - Extension successfully hid it") diff --git a/archivebox/plugins/modalcloser/tests/test_modalcloser.py b/archivebox/plugins/modalcloser/tests/test_modalcloser.py index b0b185f8..970bee94 100644 --- a/archivebox/plugins/modalcloser/tests/test_modalcloser.py +++ b/archivebox/plugins/modalcloser/tests/test_modalcloser.py @@ -26,7 +26,7 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = PLUGIN_DIR.parent MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None) -CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' +CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js' CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js' CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None) TEST_URL = 'https://www.singsing.movie/' @@ -123,6 +123,7 @@ def setup_chrome_session(tmpdir): crawl_dir = Path(tmpdir) / 'crawl' crawl_dir.mkdir() chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir() env = get_test_env() env['CHROME_HEADLESS'] = 'true' @@ -130,7 +131,7 @@ def setup_chrome_session(tmpdir): # Launch Chrome at crawl level chrome_launch_process = subprocess.Popen( ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-modalcloser'], - cwd=str(crawl_dir), + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, diff --git a/archivebox/plugins/twocaptcha/config.json b/archivebox/plugins/twocaptcha/config.json index ba1a1383..d6c08ecf 100644 --- a/archivebox/plugins/twocaptcha/config.json +++ b/archivebox/plugins/twocaptcha/config.json @@ -4,18 +4,47 @@ "additionalProperties": false, "required_plugins": ["chrome"], "properties": { - "CAPTCHA2_ENABLED": { + "TWOCAPTCHA_ENABLED": { "type": "boolean", "default": true, - "x-aliases": ["USE_CAPTCHA2"], - "description": "Enable Captcha2 browser extension for CAPTCHA solving" + "x-aliases": ["CAPTCHA2_ENABLED", "USE_CAPTCHA2", "USE_TWOCAPTCHA"], + "description": "Enable 2captcha browser extension for automatic CAPTCHA solving" }, - "CAPTCHA2_TIMEOUT": { + "TWOCAPTCHA_API_KEY": { + "type": "string", + "default": "", + "x-aliases": ["API_KEY_2CAPTCHA", "CAPTCHA2_API_KEY"], + "x-sensitive": true, + "description": "2captcha API key for CAPTCHA solving service (get from https://2captcha.com)" + }, + "TWOCAPTCHA_RETRY_COUNT": { + "type": "integer", + "default": 3, + "minimum": 0, + "maximum": 10, + "x-aliases": ["CAPTCHA2_RETRY_COUNT"], + "description": "Number of times to retry CAPTCHA solving on error" + }, + "TWOCAPTCHA_RETRY_DELAY": { + "type": "integer", + "default": 5, + "minimum": 0, + "maximum": 60, + "x-aliases": ["CAPTCHA2_RETRY_DELAY"], + "description": "Delay in seconds between CAPTCHA solving retries" + }, + "TWOCAPTCHA_TIMEOUT": { "type": "integer", "default": 60, "minimum": 5, "x-fallback": "TIMEOUT", + "x-aliases": ["CAPTCHA2_TIMEOUT"], "description": "Timeout for CAPTCHA solving in seconds" + }, + "TWOCAPTCHA_AUTO_SUBMIT": { + "type": "boolean", + "default": false, + "description": "Automatically submit forms after CAPTCHA is solved" } } } diff --git a/archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js b/archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js index 5465e0cd..8335a0d9 100755 --- a/archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js +++ b/archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js @@ -12,7 +12,7 @@ * Hook: on_Crawl (runs once per crawl, not per snapshot) * * Requirements: - * - API_KEY_2CAPTCHA environment variable must be set + * - TWOCAPTCHA_API_KEY environment variable must be set * - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc. */ @@ -47,10 +47,10 @@ async function installCaptchaExtension() { } // Check if API key is configured - const apiKey = process.env.API_KEY_2CAPTCHA; + const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA; if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') { - console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured'); - console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving'); + console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured'); + console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving'); } else { console.log('[+] 2captcha extension installed and API key configured'); } diff --git a/archivebox/plugins/twocaptcha/on_Crawl__25_configure_twocaptcha_extension_options.js b/archivebox/plugins/twocaptcha/on_Crawl__25_configure_twocaptcha_extension_options.js index 8a1dc440..a3e1235a 100755 --- a/archivebox/plugins/twocaptcha/on_Crawl__25_configure_twocaptcha_extension_options.js +++ b/archivebox/plugins/twocaptcha/on_Crawl__25_configure_twocaptcha_extension_options.js @@ -2,14 +2,21 @@ /** * 2Captcha Extension Configuration * - * Configures the 2captcha extension with API key after Crawl-level Chrome session starts. - * Runs once per crawl to inject API key into extension storage. + * Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts. + * Runs once per crawl to inject configuration into extension storage. * - * Priority: 11 (after chrome_launch at 20) + * Priority: 25 (after chrome_launch at 30, before snapshots start) * Hook: on_Crawl (runs once per crawl, not per snapshot) * + * Config Options (from config.json / environment): + * - TWOCAPTCHA_API_KEY: API key for 2captcha service + * - TWOCAPTCHA_ENABLED: Enable/disable the extension + * - TWOCAPTCHA_RETRY_COUNT: Number of retries on error + * - TWOCAPTCHA_RETRY_DELAY: Delay between retries (seconds) + * - TWOCAPTCHA_AUTO_SUBMIT: Auto-submit forms after solving + * * Requirements: - * - API_KEY_2CAPTCHA environment variable must be set + * - TWOCAPTCHA_API_KEY environment variable must be set * - chrome plugin must have loaded extensions (extensions.json must exist) */ @@ -36,6 +43,20 @@ function getEnv(name, defaultValue = '') { return (process.env[name] || defaultValue).trim(); } +// Get boolean environment variable +function getEnvBool(name, defaultValue = false) { + const val = getEnv(name, '').toLowerCase(); + if (['true', '1', 'yes', 'on'].includes(val)) return true; + if (['false', '0', 'no', 'off'].includes(val)) return false; + return defaultValue; +} + +// Get integer environment variable +function getEnvInt(name, defaultValue = 0) { + const val = parseInt(getEnv(name, String(defaultValue)), 10); + return isNaN(val) ? defaultValue : val; +} + // Parse command line arguments function parseArgs() { const args = {}; @@ -48,6 +69,82 @@ function parseArgs() { return args; } +/** + * Get 2captcha configuration from environment variables. + * Supports both TWOCAPTCHA_* and legacy API_KEY_2CAPTCHA naming. + */ +function getTwoCaptchaConfig() { + const apiKey = getEnv('TWOCAPTCHA_API_KEY') || getEnv('API_KEY_2CAPTCHA') || getEnv('CAPTCHA2_API_KEY'); + const isEnabled = getEnvBool('TWOCAPTCHA_ENABLED', true); + const retryCount = getEnvInt('TWOCAPTCHA_RETRY_COUNT', 3); + const retryDelay = getEnvInt('TWOCAPTCHA_RETRY_DELAY', 5); + const autoSubmit = getEnvBool('TWOCAPTCHA_AUTO_SUBMIT', false); + + // Build the full config object matching the extension's storage structure + // Structure: chrome.storage.local.set({config: {...}}) + return { + // API key - both variants for compatibility + apiKey: apiKey, + api_key: apiKey, + + // Plugin enabled state + isPluginEnabled: isEnabled, + + // Retry settings + repeatOnErrorTimes: retryCount, + repeatOnErrorDelay: retryDelay, + + // Auto-submit setting + autoSubmitForms: autoSubmit, + submitFormsDelay: 0, + + // Enable all CAPTCHA types + enabledForNormal: true, + enabledForRecaptchaV2: true, + enabledForInvisibleRecaptchaV2: true, + enabledForRecaptchaV3: true, + enabledForRecaptchaAudio: false, + enabledForGeetest: true, + enabledForGeetest_v4: true, + enabledForKeycaptcha: true, + enabledForArkoselabs: true, + enabledForLemin: true, + enabledForYandex: true, + enabledForCapyPuzzle: true, + enabledForTurnstile: true, + enabledForAmazonWaf: true, + enabledForMTCaptcha: true, + + // Auto-solve all CAPTCHA types + autoSolveNormal: true, + autoSolveRecaptchaV2: true, + autoSolveInvisibleRecaptchaV2: true, + autoSolveRecaptchaV3: true, + autoSolveRecaptchaAudio: false, + autoSolveGeetest: true, + autoSolveGeetest_v4: true, + autoSolveKeycaptcha: true, + autoSolveArkoselabs: true, + autoSolveLemin: true, + autoSolveYandex: true, + autoSolveCapyPuzzle: true, + autoSolveTurnstile: true, + autoSolveAmazonWaf: true, + autoSolveMTCaptcha: true, + + // Other settings with sensible defaults + recaptchaV2Type: 'token', + recaptchaV3MinScore: 0.3, + buttonPosition: 'inner', + useProxy: false, + proxy: '', + proxytype: 'HTTP', + blackListDomain: '', + autoSubmitRules: [], + normalSources: [], + }; +} + async function configure2Captcha() { // Check if already configured in this session if (fs.existsSync(CONFIG_MARKER)) { @@ -55,29 +152,23 @@ async function configure2Captcha() { return { success: true, skipped: true }; } + // Get configuration + const config = getTwoCaptchaConfig(); + // Check if API key is set - const apiKey = getEnv('API_KEY_2CAPTCHA'); - if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') { - console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured'); - console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving'); - return { success: false, error: 'API_KEY_2CAPTCHA not configured' }; + if (!config.apiKey || config.apiKey === 'YOUR_API_KEY_HERE') { + console.warn('[!] 2captcha extension loaded but TWOCAPTCHA_API_KEY not configured'); + console.warn('[!] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving'); + return { success: false, error: 'TWOCAPTCHA_API_KEY not configured' }; } - // Load extensions metadata - const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json'); - if (!fs.existsSync(extensionsFile)) { - return { success: false, error: 'extensions.json not found - chrome plugin must run first' }; - } - - const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8')); - const captchaExt = extensions.find(ext => ext.name === 'twocaptcha'); - - if (!captchaExt) { - console.error('[*] 2captcha extension not installed, skipping configuration'); - return { success: true, skipped: true }; - } - - console.error('[*] Configuring 2captcha extension with API key...'); + console.error('[*] Configuring 2captcha extension...'); + console.error(`[*] API Key: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`); + console.error(`[*] Enabled: ${config.isPluginEnabled}`); + console.error(`[*] Retry Count: ${config.repeatOnErrorTimes}`); + console.error(`[*] Retry Delay: ${config.repeatOnErrorDelay}s`); + console.error(`[*] Auto Submit: ${config.autoSubmitForms}`); + console.error(`[*] Auto Solve: all CAPTCHA types enabled`); try { // Connect to the existing Chrome session via CDP @@ -90,138 +181,116 @@ async function configure2Captcha() { const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); try { - // Method 1: Try to inject via extension background page - if (captchaExt.target && captchaExt.target_ctx) { - console.error('[*] Attempting to configure via extension background page...'); + // First, navigate to a page to trigger extension content scripts and wake up service worker + console.error('[*] Waking up extension by visiting a page...'); + const triggerPage = await browser.newPage(); + try { + await triggerPage.goto('https://www.google.com', { waitUntil: 'domcontentloaded', timeout: 10000 }); + await new Promise(r => setTimeout(r, 3000)); // Give extension time to initialize + } catch (e) { + console.warn(`[!] Trigger page failed: ${e.message}`); + } + try { await triggerPage.close(); } catch (e) {} - // Reconnect to the browser to get fresh target context - const targets = await browser.targets(); - const extTarget = targets.find(t => - t.url().startsWith(`chrome-extension://${captchaExt.id}`) - ); - - if (extTarget) { - const extContext = await extTarget.worker() || await extTarget.page(); - - if (extContext) { - await extContext.evaluate((key) => { - // Try all common storage patterns - if (typeof chrome !== 'undefined' && chrome.storage) { - chrome.storage.local.set({ - apiKey: key, - api_key: key, - '2captcha_apikey': key, - apikey: key, - 'solver-api-key': key, - }); - chrome.storage.sync.set({ - apiKey: key, - api_key: key, - '2captcha_apikey': key, - apikey: key, - 'solver-api-key': key, - }); - } - - // Also try localStorage as fallback - if (typeof localStorage !== 'undefined') { - localStorage.setItem('apiKey', key); - localStorage.setItem('2captcha_apikey', key); - localStorage.setItem('solver-api-key', key); - } - }, apiKey); - - console.error('[+] 2captcha API key configured successfully via background page'); - - // Mark as configured - fs.writeFileSync(CONFIG_MARKER, new Date().toISOString()); - - return { success: true, method: 'background_page' }; - } - } + // Get 2captcha extension info from extensions.json + const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json'); + if (!fs.existsSync(extensionsFile)) { + return { success: false, error: 'extensions.json not found - chrome plugin must run first' }; } - // Method 2: Try to configure via options page - console.error('[*] Attempting to configure via options page...'); - const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`; - const configPage = await browser.newPage(); + const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8')); + const captchaExt = extensions.find(ext => ext.name === 'twocaptcha'); + + if (!captchaExt) { + console.error('[*] 2captcha extension not installed, skipping configuration'); + return { success: true, skipped: true }; + } + + if (!captchaExt.id) { + return { success: false, error: '2captcha extension ID not found in extensions.json' }; + } + + const extensionId = captchaExt.id; + console.error(`[*] 2captcha Extension ID: ${extensionId}`); + + // Configure via options page + console.error('[*] Configuring via options page...'); + const optionsUrl = `chrome-extension://${extensionId}/options/options.html`; + + let configPage = await browser.newPage(); try { - await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 }); - - const configured = await configPage.evaluate((key) => { - // Try to find API key input field - const selectors = [ - 'input[name*="apikey" i]', - 'input[id*="apikey" i]', - 'input[name*="api-key" i]', - 'input[id*="api-key" i]', - 'input[name*="key" i]', - 'input[placeholder*="api" i]', - 'input[type="text"]', - ]; - - for (const selector of selectors) { - const input = document.querySelector(selector); - if (input) { - input.value = key; - input.dispatchEvent(new Event('input', { bubbles: true })); - input.dispatchEvent(new Event('change', { bubbles: true })); - - // Try to find and click save button - const saveSelectors = [ - 'button[type="submit"]', - 'input[type="submit"]', - 'button:contains("Save")', - 'button:contains("Apply")', - ]; - - for (const btnSel of saveSelectors) { - const btn = document.querySelector(btnSel); - if (btn) { - btn.click(); - break; - } - } - - // Also save to storage - if (typeof chrome !== 'undefined' && chrome.storage) { - chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); - chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); - } - - return true; - } - } - - // Fallback: Just save to storage - if (typeof chrome !== 'undefined' && chrome.storage) { - chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); - chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); - return true; - } - - return false; - }, apiKey); - - await configPage.close(); - - if (configured) { - console.error('[+] 2captcha API key configured successfully via options page'); - - // Mark as configured - fs.writeFileSync(CONFIG_MARKER, new Date().toISOString()); - - return { success: true, method: 'options_page' }; - } - } catch (e) { - console.warn(`[⚠️] Failed to configure via options page: ${e.message}`); + // Navigate to options page - catch error but continue since page may still load try { - await configPage.close(); - } catch (e2) {} - } + await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 }); + } catch (navError) { + // Navigation may throw ERR_BLOCKED_BY_CLIENT but page still loads + console.error(`[*] Navigation threw error (may still work): ${navError.message}`); + } - return { success: false, error: 'Could not configure via any method' }; + // Wait a moment for page to settle + await new Promise(r => setTimeout(r, 3000)); + + // Check all pages for the extension page (Chrome may open it in a different tab) + const pages = await browser.pages(); + for (const page of pages) { + const url = page.url(); + if (url.startsWith(`chrome-extension://${extensionId}`)) { + configPage = page; + break; + } + } + + const currentUrl = configPage.url(); + console.error(`[*] Current URL: ${currentUrl}`); + + if (!currentUrl.startsWith(`chrome-extension://${extensionId}`)) { + return { success: false, error: `Failed to navigate to options page, got: ${currentUrl}` }; + } + + // Wait for Config object to be available + console.error('[*] Waiting for Config object...'); + await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 }); + + // Use chrome.storage.local.set with the config wrapper + const result = await configPage.evaluate((cfg) => { + return new Promise((resolve) => { + if (typeof chrome !== 'undefined' && chrome.storage) { + chrome.storage.local.set({ config: cfg }, () => { + if (chrome.runtime.lastError) { + resolve({ success: false, error: chrome.runtime.lastError.message }); + } else { + resolve({ success: true, method: 'options_page' }); + } + }); + } else { + resolve({ success: false, error: 'chrome.storage not available' }); + } + }); + }, config); + + if (result.success) { + console.error(`[+] 2captcha configured via ${result.method}`); + fs.writeFileSync(CONFIG_MARKER, JSON.stringify({ + timestamp: new Date().toISOString(), + method: result.method, + extensionId: extensionId, + config: { + apiKeySet: !!config.apiKey, + isPluginEnabled: config.isPluginEnabled, + repeatOnErrorTimes: config.repeatOnErrorTimes, + repeatOnErrorDelay: config.repeatOnErrorDelay, + autoSubmitForms: config.autoSubmitForms, + autoSolveEnabled: true, + } + }, null, 2)); + return { success: true, method: result.method }; + } + + return { success: false, error: result.error || 'Config failed' }; + } finally { + try { await configPage.close(); } catch (e) {} + } } finally { browser.disconnect(); } @@ -236,7 +305,7 @@ async function main() { const snapshotId = args.snapshot_id; if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__21_twocaptcha_config.js --url= --snapshot-id='); + console.error('Usage: on_Crawl__25_configure_twocaptcha_extension_options.js --url= --snapshot-id='); process.exit(1); } diff --git a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py index ab4f4a4b..2e3e6d9d 100644 --- a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py +++ b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py @@ -1,184 +1,398 @@ """ -Unit tests for twocaptcha plugin +Integration tests for twocaptcha plugin -Tests invoke the plugin hooks as external processes and verify outputs/side effects. +Run with: TWOCAPTCHA_API_KEY=your_key pytest archivebox/plugins/twocaptcha/tests/ -xvs + +NOTE: Chrome 137+ removed --load-extension support, so these tests MUST use Chromium. """ import json import os +import signal import subprocess import tempfile +import time from pathlib import Path import pytest PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_twocaptcha_extension.*'), None) -CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_configure_twocaptcha_extension_options.*'), None) +PLUGINS_ROOT = PLUGIN_DIR.parent +INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js' +CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js' +CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py' +CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js' + +TEST_URL = 'https://2captcha.com/demo/recaptcha-v2' -def test_install_script_exists(): - """Verify install script exists""" - assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}" +def setup_test_env(tmpdir: Path) -> dict: + """Set up isolated data/lib directory structure for tests. + + Creates structure matching real ArchiveBox data dir: + /data/ + lib/ + arm64-darwin/ (or x86_64-linux, etc.) + npm/ + .bin/ + node_modules/ + personas/ + default/ + chrome_extensions/ + users/ + testuser/ + crawls/ + snapshots/ + + Calls chrome install hook which handles puppeteer-core and chromium installation. + Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. + """ + import platform + from datetime import datetime + + # Determine machine type (matches archivebox.config.paths.get_machine_type()) + machine = platform.machine().lower() + system = platform.system().lower() + if machine in ('arm64', 'aarch64'): + machine = 'arm64' + elif machine in ('x86_64', 'amd64'): + machine = 'x86_64' + machine_type = f"{machine}-{system}" + + # Create proper directory structure matching real ArchiveBox layout + data_dir = tmpdir / 'data' + lib_dir = data_dir / 'lib' / machine_type + npm_dir = lib_dir / 'npm' + npm_bin_dir = npm_dir / '.bin' + node_modules_dir = npm_dir / 'node_modules' + + # Extensions go under personas/Default/ + chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' + + # User data goes under users/{username}/ + date_str = datetime.now().strftime('%Y%m%d') + users_dir = data_dir / 'users' / 'testuser' + crawls_dir = users_dir / 'crawls' / date_str + snapshots_dir = users_dir / 'snapshots' / date_str + + # Create all directories + node_modules_dir.mkdir(parents=True, exist_ok=True) + npm_bin_dir.mkdir(parents=True, exist_ok=True) + chrome_extensions_dir.mkdir(parents=True, exist_ok=True) + crawls_dir.mkdir(parents=True, exist_ok=True) + snapshots_dir.mkdir(parents=True, exist_ok=True) + + # Build complete env dict + env = os.environ.copy() + env.update({ + 'DATA_DIR': str(data_dir), + 'LIB_DIR': str(lib_dir), + 'MACHINE_TYPE': machine_type, + 'NPM_BIN_DIR': str(npm_bin_dir), + 'NODE_MODULES_DIR': str(node_modules_dir), + 'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir), + 'CRAWLS_DIR': str(crawls_dir), + 'SNAPSHOTS_DIR': str(snapshots_dir), + }) + + # Only set headless if not already in environment (allow override for debugging) + if 'CHROME_HEADLESS' not in os.environ: + env['CHROME_HEADLESS'] = 'true' + + # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) + result = subprocess.run( + ['python', str(CHROME_INSTALL_HOOK)], + capture_output=True, text=True, timeout=120, env=env + ) + if result.returncode != 0: + pytest.skip(f"Chrome install hook failed: {result.stderr}") + + # Parse JSONL output to get CHROME_BINARY + chrome_binary = None + for line in result.stdout.strip().split('\n'): + if not line.strip(): + continue + try: + data = json.loads(line) + if data.get('type') == 'Binary' and data.get('abspath'): + chrome_binary = data['abspath'] + break + except json.JSONDecodeError: + continue + + if not chrome_binary or not Path(chrome_binary).exists(): + pytest.skip(f"Chromium binary not found: {chrome_binary}") + + env['CHROME_BINARY'] = chrome_binary + return env -def test_config_script_exists(): - """Verify config script exists""" - assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}" +def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str): + """Launch Chromium and return (process, cdp_url).""" + chrome_dir.mkdir(parents=True, exist_ok=True) + + process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], + cwd=str(chrome_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + cdp_url = None + for _ in range(30): + if process.poll() is not None: + stdout, stderr = process.communicate() + raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}") + cdp_file = chrome_dir / 'cdp_url.txt' + if cdp_file.exists(): + cdp_url = cdp_file.read_text().strip() + break + time.sleep(1) + + if not cdp_url: + process.kill() + stdout, stderr = process.communicate() + raise RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}") + + # Wait for extensions.json to be written (chrome launch hook parses chrome://extensions) + extensions_file = chrome_dir / 'extensions.json' + for _ in range(15): + if extensions_file.exists(): + break + time.sleep(1) + + # Print chrome launch hook output for debugging + import select + if hasattr(select, 'poll'): + # Read any available stderr without blocking + import fcntl + import os as os_module + fd = process.stderr.fileno() + fl = fcntl.fcntl(fd, fcntl.F_GETFL) + fcntl.fcntl(fd, fcntl.F_SETFL, fl | os_module.O_NONBLOCK) + try: + stderr_output = process.stderr.read() + if stderr_output: + print(f"[Chrome Launch Hook Output]\n{stderr_output}") + except: + pass + + return process, cdp_url -def test_extension_metadata(): - """Test that twocaptcha extension has correct metadata""" - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions") - - # Just check the script can be loaded - result = subprocess.run( - ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"], - capture_output=True, - text=True, - env=env - ) - - assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}" - - metadata = json.loads(result.stdout) - assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo" - assert metadata["name"] == "twocaptcha" +def kill_chrome(process, chrome_dir: Path): + """Kill Chromium process.""" + try: + process.send_signal(signal.SIGTERM) + process.wait(timeout=5) + except: + pass + pid_file = chrome_dir / 'chrome.pid' + if pid_file.exists(): + try: + os.kill(int(pid_file.read_text().strip()), signal.SIGKILL) + except: + pass -def test_install_creates_cache(): - """Test that install creates extension cache""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) +class TestTwoCaptcha: + """Integration tests requiring TWOCAPTCHA_API_KEY.""" - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - env["API_KEY_2CAPTCHA"] = "test_api_key" + @pytest.fixture(autouse=True) + def setup(self): + self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA') + if not self.api_key: + pytest.skip("TWOCAPTCHA_API_KEY required") - # Run install script - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) + def test_install_and_load(self): + """Extension installs and loads in Chromium.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + env = setup_test_env(tmpdir) + env['TWOCAPTCHA_API_KEY'] = self.api_key - # Check output mentions installation - assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout + # Install + result = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True) + assert result.returncode == 0, f"Install failed: {result.stderr}" - # Check cache file was created - cache_file = ext_dir / "twocaptcha.extension.json" - assert cache_file.exists(), "Cache file should be created" + cache = Path(env['CHROME_EXTENSIONS_DIR']) / 'twocaptcha.extension.json' + assert cache.exists() + data = json.loads(cache.read_text()) + assert data['webstore_id'] == 'ifibfemgeogfhoebkmokieepdoobkbpo' - # Verify cache content - cache_data = json.loads(cache_file.read_text()) - assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo" - assert cache_data["name"] == "twocaptcha" - assert "unpacked_path" in cache_data - assert "version" in cache_data + # Launch Chromium in crawls directory + crawl_id = 'test' + crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id + chrome_dir = crawl_dir / 'chrome' + env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) + process, cdp_url = launch_chrome(env, chrome_dir, crawl_id) + + try: + exts = json.loads((chrome_dir / 'extensions.json').read_text()) + assert any(e['name'] == 'twocaptcha' for e in exts), f"Not loaded: {exts}" + print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}") + finally: + kill_chrome(process, chrome_dir) + + def test_config_applied(self): + """Configuration is applied to extension and verified via Config.getAll().""" + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + env = setup_test_env(tmpdir) + env['TWOCAPTCHA_API_KEY'] = self.api_key + env['TWOCAPTCHA_RETRY_COUNT'] = '5' + env['TWOCAPTCHA_RETRY_DELAY'] = '10' + + subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True) + + # Launch Chromium in crawls directory + crawl_id = 'cfg' + crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id + chrome_dir = crawl_dir / 'chrome' + env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) + process, cdp_url = launch_chrome(env, chrome_dir, crawl_id) + + try: + result = subprocess.run( + ['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'], + env=env, timeout=30, capture_output=True, text=True + ) + assert result.returncode == 0, f"Config failed: {result.stderr}" + assert (chrome_dir / '.twocaptcha_configured').exists() + + # Verify config via options.html and Config.getAll() + # Get the actual extension ID from the config marker (Chrome computes IDs differently) + config_marker = json.loads((chrome_dir / '.twocaptcha_configured').read_text()) + ext_id = config_marker['extensionId'] + script = f''' +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); +const puppeteer = require('puppeteer-core'); +(async () => {{ + const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); + + // Load options.html and use Config.getAll() to verify + const optionsUrl = 'chrome-extension://{ext_id}/options/options.html'; + const page = await browser.newPage(); + console.error('[*] Loading options page:', optionsUrl); + + // Navigate - catch error but continue since page may still load + try {{ + await page.goto(optionsUrl, {{ waitUntil: 'networkidle0', timeout: 10000 }}); + }} catch (e) {{ + console.error('[*] Navigation threw error (may still work):', e.message); + }} + + // Wait for page to settle + await new Promise(r => setTimeout(r, 2000)); + console.error('[*] Current URL:', page.url()); + + // Wait for Config object to be available + await page.waitForFunction(() => typeof Config !== 'undefined', {{ timeout: 5000 }}); + + // Call Config.getAll() - the extension's own API (returns a Promise) + const cfg = await page.evaluate(async () => await Config.getAll()); + console.error('[*] Config.getAll() returned:', JSON.stringify(cfg)); + + await page.close(); + browser.disconnect(); + console.log(JSON.stringify(cfg)); +}})(); +''' + (tmpdir / 'v.js').write_text(script) + r = subprocess.run(['node', str(tmpdir / 'v.js')], env=env, timeout=30, capture_output=True, text=True) + print(r.stderr) + assert r.returncode == 0, f"Verify failed: {r.stderr}" + + cfg = json.loads(r.stdout.strip().split('\n')[-1]) + print(f"[*] Config from extension: {json.dumps(cfg, indent=2)}") + + # Verify all the fields we care about + assert cfg.get('apiKey') == self.api_key or cfg.get('api_key') == self.api_key, f"API key not set: {cfg}" + assert cfg.get('isPluginEnabled') == True, f"Plugin not enabled: {cfg}" + assert cfg.get('repeatOnErrorTimes') == 5, f"Retry count wrong: {cfg}" + assert cfg.get('repeatOnErrorDelay') == 10, f"Retry delay wrong: {cfg}" + assert cfg.get('autoSolveRecaptchaV2') == True, f"autoSolveRecaptchaV2 not enabled: {cfg}" + assert cfg.get('autoSolveRecaptchaV3') == True, f"autoSolveRecaptchaV3 not enabled: {cfg}" + assert cfg.get('autoSolveTurnstile') == True, f"autoSolveTurnstile not enabled: {cfg}" + assert cfg.get('enabledForRecaptchaV2') == True, f"enabledForRecaptchaV2 not enabled: {cfg}" + + print(f"[+] Config verified via Config.getAll()!") + finally: + kill_chrome(process, chrome_dir) + + def test_solves_recaptcha(self): + """Extension solves reCAPTCHA on demo page.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + env = setup_test_env(tmpdir) + env['TWOCAPTCHA_API_KEY'] = self.api_key + + subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True) + + # Launch Chromium in crawls directory + crawl_id = 'solve' + crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id + chrome_dir = crawl_dir / 'chrome' + env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) + process, cdp_url = launch_chrome(env, chrome_dir, crawl_id) + + try: + subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True) + + script = f''' +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); +const puppeteer = require('puppeteer-core'); +(async () => {{ + const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); + const page = await browser.newPage(); + await page.setViewport({{ width: 1440, height: 900 }}); + console.error('[*] Loading {TEST_URL}...'); + await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }}); + await new Promise(r => setTimeout(r, 3000)); + + const start = Date.now(); + const maxWait = 90000; + + while (Date.now() - start < maxWait) {{ + const state = await page.evaluate(() => {{ + const resp = document.querySelector('textarea[name="g-recaptcha-response"]'); + const solver = document.querySelector('.captcha-solver'); + return {{ + solved: resp ? resp.value.length > 0 : false, + state: solver?.getAttribute('data-state'), + text: solver?.textContent?.trim() || '' + }}; + }}); + const sec = Math.round((Date.now() - start) / 1000); + console.error('[*] ' + sec + 's state=' + state.state + ' solved=' + state.solved + ' text=' + state.text.slice(0,30)); + if (state.solved) {{ console.error('[+] SOLVED!'); break; }} + if (state.state === 'error') {{ console.error('[!] ERROR'); break; }} + await new Promise(r => setTimeout(r, 2000)); + }} + + const final = await page.evaluate(() => {{ + const resp = document.querySelector('textarea[name="g-recaptcha-response"]'); + return {{ solved: resp ? resp.value.length > 0 : false, preview: resp?.value?.slice(0,50) || '' }}; + }}); + browser.disconnect(); + console.log(JSON.stringify(final)); +}})(); +''' + (tmpdir / 's.js').write_text(script) + print("\n[*] Solving CAPTCHA (10-60s)...") + r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=120, capture_output=True, text=True) + print(r.stderr) + assert r.returncode == 0, f"Failed: {r.stderr}" + + final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1]) + assert final.get('solved'), f"Not solved: {final}" + print(f"[+] SOLVED! {final.get('preview','')[:30]}...") + finally: + kill_chrome(process, chrome_dir) -def test_install_twice_uses_cache(): - """Test that running install twice uses existing cache on second run""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - env["API_KEY_2CAPTCHA"] = "test_api_key" - - # First install - downloads the extension - result1 = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - assert result1.returncode == 0, f"First install failed: {result1.stderr}" - - # Verify cache was created - cache_file = ext_dir / "twocaptcha.extension.json" - assert cache_file.exists(), "Cache file should exist after first install" - - # Second install - should use cache - result2 = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=30 - ) - assert result2.returncode == 0, f"Second install failed: {result2.stderr}" - - # Second run should mention cache reuse - assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0 - - -def test_install_warns_without_api_key(): - """Test that install warns when API key not configured""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - # Don't set API_KEY_2CAPTCHA - - # Run install script - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Should warn about missing API key - combined_output = result.stdout + result.stderr - assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output - - -def test_install_success_with_api_key(): - """Test that install succeeds when API key is configured""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123" - - # Run install script - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Should mention API key configured - combined_output = result.stdout + result.stderr - assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output - - -def test_config_script_structure(): - """Test that config script has proper structure""" - # Verify the script exists and contains expected markers - script_content = CONFIG_SCRIPT.read_text() - - # Should mention configuration marker file - assert "CONFIG_MARKER" in script_content or "twocaptcha_configured" in script_content - - # Should mention API key - assert "API_KEY_2CAPTCHA" in script_content - - # Should have main function or be executable - assert "async function" in script_content or "main" in script_content +if __name__ == '__main__': + pytest.main([__file__, '-xvs']) diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py index 99d7fcaf..f5acaa52 100644 --- a/archivebox/plugins/ublock/tests/test_ublock.py +++ b/archivebox/plugins/ublock/tests/test_ublock.py @@ -14,7 +14,7 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_ublock.*'), None) +INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None) def test_install_script_exists(): @@ -158,26 +158,221 @@ def test_large_extension_size(): PLUGINS_ROOT = PLUGIN_DIR.parent -CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py' -CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' +CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py' +CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js' + + +def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str): + """Launch Chromium and return (process, cdp_url) or raise on failure.""" + import signal + import time + + chrome_dir.mkdir(parents=True, exist_ok=True) + + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], + cwd=str(chrome_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + # Wait for Chromium to launch and CDP URL to be available + cdp_url = None + for i in range(20): + if chrome_launch_process.poll() is not None: + stdout, stderr = chrome_launch_process.communicate() + raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") + cdp_file = chrome_dir / 'cdp_url.txt' + if cdp_file.exists(): + cdp_url = cdp_file.read_text().strip() + break + time.sleep(1) + + if not cdp_url: + chrome_launch_process.kill() + raise RuntimeError("Chromium CDP URL not found after 20s") + + return chrome_launch_process, cdp_url + + +def kill_chromium_session(chrome_launch_process, chrome_dir: Path): + """Clean up Chromium process.""" + import signal + + try: + chrome_launch_process.send_signal(signal.SIGTERM) + chrome_launch_process.wait(timeout=5) + except: + pass + chrome_pid_file = chrome_dir / 'chrome.pid' + if chrome_pid_file.exists(): + try: + chrome_pid = int(chrome_pid_file.read_text().strip()) + os.kill(chrome_pid, signal.SIGKILL) + except (OSError, ValueError): + pass + + +def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict: + """Check ad blocking effectiveness by counting ad elements on page. + + Returns dict with: + - adElementsFound: int - number of ad-related elements found + - adElementsVisible: int - number of visible ad elements + - blockedRequests: int - number of blocked network requests (ads/trackers) + - totalRequests: int - total network requests made + - percentBlocked: int - percentage of ad elements hidden (0-100) + """ + test_script = f''' +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); +const puppeteer = require('puppeteer-core'); + +(async () => {{ + const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); + + const page = await browser.newPage(); + await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); + await page.setViewport({{ width: 1440, height: 900 }}); + + // Track network requests + let blockedRequests = 0; + let totalRequests = 0; + const adDomains = ['doubleclick', 'googlesyndication', 'googleadservices', 'facebook.com/tr', + 'analytics', 'adservice', 'advertising', 'taboola', 'outbrain', 'criteo', + 'amazon-adsystem', 'ads.yahoo', 'gemini.yahoo', 'yimg.com/cv/', 'beap.gemini']; + + page.on('request', request => {{ + totalRequests++; + const url = request.url().toLowerCase(); + if (adDomains.some(d => url.includes(d))) {{ + // This is an ad request + }} + }}); + + page.on('requestfailed', request => {{ + const url = request.url().toLowerCase(); + if (adDomains.some(d => url.includes(d))) {{ + blockedRequests++; + }} + }}); + + console.error('Navigating to {test_url}...'); + await page.goto('{test_url}', {{ waitUntil: 'domcontentloaded', timeout: 60000 }}); + + // Wait for page to fully render and ads to load + await new Promise(r => setTimeout(r, 5000)); + + // Check for ad elements in the DOM + const result = await page.evaluate(() => {{ + // Common ad-related selectors + const adSelectors = [ + // Generic ad containers + '[class*="ad-"]', '[class*="ad_"]', '[class*="-ad"]', '[class*="_ad"]', + '[id*="ad-"]', '[id*="ad_"]', '[id*="-ad"]', '[id*="_ad"]', + '[class*="advertisement"]', '[id*="advertisement"]', + '[class*="sponsored"]', '[id*="sponsored"]', + // Google ads + 'ins.adsbygoogle', '[data-ad-client]', '[data-ad-slot]', + // Yahoo specific + '[class*="gemini"]', '[data-beacon]', '[class*="native-ad"]', + '[class*="stream-ad"]', '[class*="LDRB"]', '[class*="ntv-ad"]', + // iframes (often ads) + 'iframe[src*="ad"]', 'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]', + // Common ad sizes + '[style*="300px"][style*="250px"]', '[style*="728px"][style*="90px"]', + '[style*="160px"][style*="600px"]', '[style*="320px"][style*="50px"]', + ]; + + let adElementsFound = 0; + let adElementsVisible = 0; + + for (const selector of adSelectors) {{ + try {{ + const elements = document.querySelectorAll(selector); + for (const el of elements) {{ + adElementsFound++; + const style = window.getComputedStyle(el); + const rect = el.getBoundingClientRect(); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0' && + rect.width > 0 && rect.height > 0; + if (isVisible) {{ + adElementsVisible++; + }} + }} + }} catch (e) {{ + // Invalid selector, skip + }} + }} + + return {{ + adElementsFound, + adElementsVisible, + pageTitle: document.title + }}; + }}); + + result.blockedRequests = blockedRequests; + result.totalRequests = totalRequests; + // Calculate how many ad elements were hidden (found but not visible) + const hiddenAds = result.adElementsFound - result.adElementsVisible; + result.percentBlocked = result.adElementsFound > 0 + ? Math.round((hiddenAds / result.adElementsFound) * 100) + : 0; + + console.error('Ad blocking result:', JSON.stringify(result)); + browser.disconnect(); + console.log(JSON.stringify(result)); +}})(); +''' + script_path = script_dir / 'check_ads.js' + script_path.write_text(test_script) + + result = subprocess.run( + ['node', str(script_path)], + cwd=str(script_dir), + capture_output=True, + text=True, + env=env, + timeout=90 + ) + + if result.returncode != 0: + raise RuntimeError(f"Ad check script failed: {result.stderr}") + + output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] + if not output_lines: + raise RuntimeError(f"No JSON output from ad check: {result.stdout}\nstderr: {result.stderr}") + + return json.loads(output_lines[-1]) def setup_test_env(tmpdir: Path) -> dict: """Set up isolated data/lib directory structure for tests. - Creates structure like: + Creates structure matching real ArchiveBox data dir: /data/ lib/ arm64-darwin/ (or x86_64-linux, etc.) npm/ - bin/ + .bin/ node_modules/ - chrome_extensions/ + personas/ + default/ + chrome_extensions/ + users/ + testuser/ + crawls/ + snapshots/ Calls chrome install hook which handles puppeteer-core and chromium installation. Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. """ import platform + from datetime import datetime # Determine machine type (matches archivebox.config.paths.get_machine_type()) machine = platform.machine().lower() @@ -188,18 +383,28 @@ def setup_test_env(tmpdir: Path) -> dict: machine = 'x86_64' machine_type = f"{machine}-{system}" - # Create proper directory structure + # Create proper directory structure matching real ArchiveBox layout data_dir = tmpdir / 'data' lib_dir = data_dir / 'lib' / machine_type npm_dir = lib_dir / 'npm' - npm_bin_dir = npm_dir / 'bin' + npm_bin_dir = npm_dir / '.bin' node_modules_dir = npm_dir / 'node_modules' - chrome_extensions_dir = data_dir / 'chrome_extensions' + + # Extensions go under personas/Default/ + chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' + + # User data goes under users/{username}/ + date_str = datetime.now().strftime('%Y%m%d') + users_dir = data_dir / 'users' / 'testuser' + crawls_dir = users_dir / 'crawls' / date_str + snapshots_dir = users_dir / 'snapshots' / date_str # Create all directories node_modules_dir.mkdir(parents=True, exist_ok=True) npm_bin_dir.mkdir(parents=True, exist_ok=True) chrome_extensions_dir.mkdir(parents=True, exist_ok=True) + crawls_dir.mkdir(parents=True, exist_ok=True) + snapshots_dir.mkdir(parents=True, exist_ok=True) # Build complete env dict env = os.environ.copy() @@ -210,12 +415,14 @@ def setup_test_env(tmpdir: Path) -> dict: 'NPM_BIN_DIR': str(npm_bin_dir), 'NODE_MODULES_DIR': str(node_modules_dir), 'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir), + 'CRAWLS_DIR': str(crawls_dir), + 'SNAPSHOTS_DIR': str(snapshots_dir), }) # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL) result = subprocess.run( ['python', str(CHROME_INSTALL_HOOK)], - capture_output=True, text=True, timeout=10, env=env + capture_output=True, text=True, timeout=120, env=env ) if result.returncode != 0: pytest.skip(f"Chrome install hook failed: {result.stderr}") @@ -240,8 +447,8 @@ def setup_test_env(tmpdir: Path) -> dict: return env -# Test URL: ad blocker test page that shows if ads are blocked -TEST_URL = 'https://d3ward.github.io/toolz/adblock.html' +# Test URL: Yahoo has many ads that uBlock should block +TEST_URL = 'https://www.yahoo.com/' @pytest.mark.timeout(15) @@ -290,14 +497,18 @@ def test_extension_loads_in_chromium(): print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True) print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True) print("[test] Launching Chromium...", flush=True) - data_dir = Path(env['DATA_DIR']) - crawl_dir = data_dir / 'crawl' - crawl_dir.mkdir() + + # Launch Chromium in crawls directory + crawl_id = 'test-ublock' + crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id + crawl_dir.mkdir(parents=True, exist_ok=True) chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir(parents=True, exist_ok=True) + env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'], - cwd=str(crawl_dir), + ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], + cwd=str(chrome_dir), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, @@ -457,161 +668,177 @@ const puppeteer = require('puppeteer-core'); def test_blocks_ads_on_test_page(): """Live test: verify uBlock Origin blocks ads on a test page. - Uses Chromium with extensions loaded automatically via chrome hook. - Tests against d3ward's ad blocker test page which checks ad domains. + This test runs TWO browser sessions: + 1. WITHOUT extension - verifies ads are NOT blocked (baseline) + 2. WITH extension - verifies ads ARE blocked + + This ensures we're actually testing the extension's effect, not just + that a test page happens to show ads as blocked. """ - import signal import time with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) # Set up isolated env with proper directory structure - env = setup_test_env(tmpdir) - env['CHROME_HEADLESS'] = 'true' + env_base = setup_test_env(tmpdir) + env_base['CHROME_HEADLESS'] = 'true' - ext_dir = Path(env['CHROME_EXTENSIONS_DIR']) + # ============================================================ + # STEP 1: BASELINE - Run WITHOUT extension, verify ads are NOT blocked + # ============================================================ + print("\n" + "="*60) + print("STEP 1: BASELINE TEST (no extension)") + print("="*60) + + data_dir = Path(env_base['DATA_DIR']) + + env_no_ext = env_base.copy() + env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions') + (data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True) + + # Launch baseline Chromium in crawls directory + baseline_crawl_id = 'baseline-no-ext' + baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id + baseline_crawl_dir.mkdir(parents=True, exist_ok=True) + baseline_chrome_dir = baseline_crawl_dir / 'chrome' + env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir) + baseline_process = None + + try: + baseline_process, baseline_cdp_url = launch_chromium_session( + env_no_ext, baseline_chrome_dir, baseline_crawl_id + ) + print(f"Baseline Chromium launched: {baseline_cdp_url}") + + # Wait a moment for browser to be ready + time.sleep(2) + + baseline_result = check_ad_blocking( + baseline_cdp_url, TEST_URL, env_no_ext, tmpdir + ) + + print(f"Baseline result: {baseline_result['adElementsVisible']} visible ads " + f"(found {baseline_result['adElementsFound']} ad elements)") + + finally: + if baseline_process: + kill_chromium_session(baseline_process, baseline_chrome_dir) + + # Verify baseline shows ads ARE visible (not blocked) + if baseline_result['adElementsFound'] == 0: + pytest.skip( + f"Cannot test extension: no ad elements found on {TEST_URL}. " + f"The page may have changed or loaded differently." + ) + + if baseline_result['adElementsVisible'] == 0: + print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!") + print("This suggests either:") + print(" - There's another ad blocker interfering") + print(" - Network-level ad blocking is in effect") + + pytest.skip( + f"Cannot test extension: baseline shows no visible ads " + f"despite finding {baseline_result['adElementsFound']} ad elements." + ) + + print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension") + + # ============================================================ + # STEP 2: Install the uBlock extension + # ============================================================ + print("\n" + "="*60) + print("STEP 2: INSTALLING EXTENSION") + print("="*60) + + ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR']) - # Step 1: Install the uBlock extension result = subprocess.run( ['node', str(INSTALL_SCRIPT)], capture_output=True, text=True, - env=env, - timeout=15 + env=env_base, + timeout=60 ) assert result.returncode == 0, f"Extension install failed: {result.stderr}" - # Verify extension cache was created cache_file = ext_dir / 'ublock.extension.json' assert cache_file.exists(), "Extension cache not created" ext_data = json.loads(cache_file.read_text()) print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") - # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) - data_dir = Path(env['DATA_DIR']) - crawl_dir = data_dir / 'crawl' - crawl_dir.mkdir() - chrome_dir = crawl_dir / 'chrome' + # ============================================================ + # STEP 3: Run WITH extension, verify ads ARE blocked + # ============================================================ + print("\n" + "="*60) + print("STEP 3: TEST WITH EXTENSION") + print("="*60) - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'], - cwd=str(crawl_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chrome to launch and CDP URL to be available - cdp_url = None - for i in range(20): - if chrome_launch_process.poll() is not None: - stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}") - cdp_file = chrome_dir / 'cdp_url.txt' - if cdp_file.exists(): - cdp_url = cdp_file.read_text().strip() - break - time.sleep(1) - - assert cdp_url, "Chrome CDP URL not found after 20s" - print(f"Chrome launched with CDP URL: {cdp_url}") - - # Check that extensions were loaded - extensions_file = chrome_dir / 'extensions.json' - if extensions_file.exists(): - loaded_exts = json.loads(extensions_file.read_text()) - print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}") + # Launch extension test Chromium in crawls directory + ext_crawl_id = 'test-with-ext' + ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id + ext_crawl_dir.mkdir(parents=True, exist_ok=True) + ext_chrome_dir = ext_crawl_dir / 'chrome' + env_base['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir) + ext_process = None try: - # Step 3: Connect to Chrome and test ad blocking - test_script = f''' -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); + ext_process, ext_cdp_url = launch_chromium_session( + env_base, ext_chrome_dir, ext_crawl_id + ) + print(f"Extension Chromium launched: {ext_cdp_url}") -(async () => {{ - const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); + # Check that extension was loaded + extensions_file = ext_chrome_dir / 'extensions.json' + if extensions_file.exists(): + loaded_exts = json.loads(extensions_file.read_text()) + print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}") - // Wait for extension to initialize - await new Promise(r => setTimeout(r, 500)); + # Wait for extension to initialize + time.sleep(3) - // Check extension loaded by looking at targets - const targets = browser.targets(); - const extTargets = targets.filter(t => - t.url().startsWith('chrome-extension://') || - t.type() === 'service_worker' || - t.type() === 'background_page' - ); - console.error('Extension targets found:', extTargets.length); - extTargets.forEach(t => console.error(' -', t.type(), t.url().substring(0, 60))); - - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'); - await page.setViewport({{ width: 1440, height: 900 }}); - - console.error('Navigating to {TEST_URL}...'); - await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }}); - - // Wait for the test page to run its checks - await new Promise(r => setTimeout(r, 5000)); - - // The d3ward test page shows blocked percentage - const result = await page.evaluate(() => {{ - const scoreEl = document.querySelector('#score'); - const score = scoreEl ? scoreEl.textContent : null; - const blockedItems = document.querySelectorAll('.blocked').length; - const totalItems = document.querySelectorAll('.testlist li').length; - return {{ - score, - blockedItems, - totalItems, - percentBlocked: totalItems > 0 ? Math.round((blockedItems / totalItems) * 100) : 0 - }}; - }}); - - console.error('Ad blocking result:', JSON.stringify(result)); - browser.disconnect(); - console.log(JSON.stringify(result)); -}})(); -''' - script_path = tmpdir / 'test_ublock.js' - script_path.write_text(test_script) - - result = subprocess.run( - ['node', str(script_path)], - cwd=str(tmpdir), - capture_output=True, - text=True, - env=env, - timeout=10 + ext_result = check_ad_blocking( + ext_cdp_url, TEST_URL, env_base, tmpdir ) - print(f"stderr: {result.stderr}") - print(f"stdout: {result.stdout}") - - assert result.returncode == 0, f"Test failed: {result.stderr}" - - output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] - assert output_lines, f"No JSON output: {result.stdout}" - - test_result = json.loads(output_lines[-1]) - - # uBlock should block most ad domains on the test page - assert test_result['percentBlocked'] >= 50, \ - f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}" + print(f"Extension result: {ext_result['adElementsVisible']} visible ads " + f"(found {ext_result['adElementsFound']} ad elements)") finally: - # Clean up Chrome - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - chrome_pid_file = chrome_dir / 'chrome.pid' - if chrome_pid_file.exists(): - try: - chrome_pid = int(chrome_pid_file.read_text().strip()) - os.kill(chrome_pid, signal.SIGKILL) - except (OSError, ValueError): - pass + if ext_process: + kill_chromium_session(ext_process, ext_chrome_dir) + + # ============================================================ + # STEP 4: Compare results + # ============================================================ + print("\n" + "="*60) + print("STEP 4: COMPARISON") + print("="*60) + print(f"Baseline (no extension): {baseline_result['adElementsVisible']} visible ads") + print(f"With extension: {ext_result['adElementsVisible']} visible ads") + + # Calculate reduction in visible ads + ads_blocked = baseline_result['adElementsVisible'] - ext_result['adElementsVisible'] + reduction_percent = (ads_blocked / baseline_result['adElementsVisible'] * 100) if baseline_result['adElementsVisible'] > 0 else 0 + + print(f"Reduction: {ads_blocked} fewer visible ads ({reduction_percent:.0f}% reduction)") + + # Extension should significantly reduce visible ads + assert ext_result['adElementsVisible'] < baseline_result['adElementsVisible'], \ + f"uBlock should reduce visible ads.\n" \ + f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \ + f"With extension: {ext_result['adElementsVisible']} visible ads\n" \ + f"Expected fewer ads with extension." + + # Extension should block at least 30% of ads + assert reduction_percent >= 30, \ + f"uBlock should block at least 30% of ads.\n" \ + f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \ + f"With extension: {ext_result['adElementsVisible']} visible ads\n" \ + f"Reduction: only {reduction_percent:.0f}% (expected at least 30%)" + + print(f"\n✓ SUCCESS: uBlock correctly blocks ads!") + print(f" - Baseline: {baseline_result['adElementsVisible']} visible ads") + print(f" - With extension: {ext_result['adElementsVisible']} visible ads") + print(f" - Blocked: {ads_blocked} ads ({reduction_percent:.0f}% reduction)") diff --git a/old/TODO_chrome_plugin_cleanup.md b/old/TODO_chrome_plugin_cleanup.md index 3db673e6..90b7716f 100644 --- a/old/TODO_chrome_plugin_cleanup.md +++ b/old/TODO_chrome_plugin_cleanup.md @@ -133,7 +133,7 @@ This plugin provides shared Chrome infrastructure for other plugins. It manages chrome/ ├── on_Crawl__00_chrome_install_config.py # Configure Chrome settings ├── on_Crawl__00_chrome_install.py # Install Chrome binary -├── on_Crawl__20_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg) +├── on_Crawl__30_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg) ├── on_Snapshot__20_chrome_tab.bg.js # Open tab (Snapshot-level, bg) ├── on_Snapshot__30_chrome_navigate.js # Navigate to URL (foreground) ├── on_Snapshot__45_chrome_tab_cleanup.py # Close tab, kill bg hooks