extension test fixes

This commit is contained in:
Nick Sweeting
2025-12-30 18:28:14 -08:00
parent dd2302ad92
commit 42d3fb7025
12 changed files with 1512 additions and 688 deletions

View File

@@ -8,7 +8,7 @@
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
@@ -165,14 +165,6 @@ async function main() {
chromePid = result.pid;
const cdpUrl = result.cdpUrl;
// Write extensions metadata
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}
// Connect puppeteer for extension verification
console.error(`[*] Connecting puppeteer to CDP...`);
const browser = await puppeteer.connect({
@@ -181,30 +173,84 @@ async function main() {
});
browserInstance = browser;
// Verify extensions loaded
// Get actual extension IDs from chrome://extensions page
if (extensionPaths.length > 0) {
await new Promise(r => setTimeout(r, 3000));
await new Promise(r => setTimeout(r, 2000));
const targets = browser.targets();
console.error(`[*] All browser targets (${targets.length}):`);
for (const t of targets) {
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
try {
const extPage = await browser.newPage();
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
await new Promise(r => setTimeout(r, 2000));
// Parse extension info from the page
const extensionsFromPage = await extPage.evaluate(() => {
const extensions = [];
// Extensions manager uses shadow DOM
const manager = document.querySelector('extensions-manager');
if (!manager || !manager.shadowRoot) return extensions;
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
if (!itemList || !itemList.shadowRoot) return extensions;
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
for (const item of items) {
const id = item.getAttribute('id');
const nameEl = item.shadowRoot?.querySelector('#name');
const name = nameEl?.textContent?.trim() || '';
if (id && name) {
extensions.push({ id, name });
}
}
return extensions;
});
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
for (const e of extensionsFromPage) {
console.error(` - ${e.id}: "${e.name}"`);
}
// Match extensions by name (strict matching)
for (const ext of installedExtensions) {
// Read the extension's manifest to get its display name
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
const manifestName = manifest.name || '';
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
// Find matching extension from page by exact name match first
let match = extensionsFromPage.find(e => e.name === manifestName);
// If no exact match, try case-insensitive exact match
if (!match) {
match = extensionsFromPage.find(e =>
e.name.toLowerCase() === manifestName.toLowerCase()
);
}
if (match) {
ext.id = match.id;
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
} else {
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
}
}
}
await extPage.close();
} catch (e) {
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
}
const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
// Filter out built-in extensions
// Fallback: check browser targets
const targets = browser.targets();
const builtinIds = [
'nkeimhogjdpnpccoofpliimaahmaaome',
'fignfifoniblkonapihmkfakmlgkbkcf',
'ahfgeienlihckogmohjhadlkjgocpleb',
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
];
const customExtTargets = extTargets.filter(t => {
const customExtTargets = targets.filter(t => {
const url = t.url();
if (!url.startsWith('chrome-extension://')) return false;
const extId = url.split('://')[1].split('/')[0];
@@ -216,7 +262,7 @@ async function main() {
for (const target of customExtTargets) {
const url = target.url();
const extId = url.split('://')[1].split('/')[0];
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
console.error(`[+] Extension target: ${extId} (${target.type()})`);
}
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
@@ -225,6 +271,14 @@ async function main() {
}
}
// Write extensions metadata with actual IDs
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}
console.error(`[+] Chromium session started for crawl ${crawlId}`);
console.error(`[+] CDP URL: ${cdpUrl}`);
console.error(`[+] PID: ${chromePid}`);

View File

@@ -2,7 +2,7 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
* If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
*
@@ -215,7 +215,7 @@ async function launchNewChrome(url, binary) {
console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
// Write PID immediately for cleanup
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
try {
// Wait for Chrome to be ready

View File

@@ -29,7 +29,7 @@ import shutil
import platform
PLUGIN_DIR = Path(__file__).parent.parent
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
@@ -176,6 +176,7 @@ def test_chrome_launch_and_tab_creation():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
# Get test environment with NODE_MODULES_DIR set
env = get_test_env()
@@ -184,7 +185,7 @@ def test_chrome_launch_and_tab_creation():
# Launch Chrome at crawl level (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -292,7 +293,7 @@ def test_chrome_navigation():
# Launch Chrome (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -363,7 +364,7 @@ def test_tab_cleanup_on_sigterm():
# Launch Chrome (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -423,11 +424,12 @@ def test_multiple_snapshots_share_chrome():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -513,7 +515,7 @@ def test_chrome_cleanup_on_crawl_end():
# Launch Chrome in background
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -554,11 +556,12 @@ def test_zombie_prevention_hook_killed():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
# Launch Chrome
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -26,7 +26,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
@@ -122,6 +122,7 @@ def setup_chrome_session(tmpdir):
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
@@ -129,7 +130,7 @@ def setup_chrome_session(tmpdir):
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -16,7 +16,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_istilldontcareaboutcookies.*'), None)
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
def test_install_script_exists():
@@ -124,78 +124,106 @@ def test_no_configuration_required():
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
def setup_test_lib_dirs(tmpdir: Path) -> dict:
"""Create isolated lib directories for tests and return env dict.
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
Sets up:
LIB_DIR: tmpdir/lib/<arch>
NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin
def setup_test_env(tmpdir: Path) -> dict:
"""Set up isolated data/lib directory structure for tests.
Creates structure matching real ArchiveBox data dir:
<tmpdir>/data/
lib/
arm64-darwin/ (or x86_64-linux, etc.)
npm/
.bin/
node_modules/
personas/
Default/
chrome_extensions/
users/
testuser/
crawls/
snapshots/
Calls chrome install hook which handles puppeteer-core and chromium installation.
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
"""
import platform
arch = platform.machine()
from datetime import datetime
# Determine machine type (matches archivebox.config.paths.get_machine_type())
machine = platform.machine().lower()
system = platform.system().lower()
arch_dir = f"{arch}-{system}"
if machine in ('arm64', 'aarch64'):
machine = 'arm64'
elif machine in ('x86_64', 'amd64'):
machine = 'x86_64'
machine_type = f"{machine}-{system}"
lib_dir = tmpdir / 'lib' / arch_dir
# Create proper directory structure matching real ArchiveBox layout
data_dir = tmpdir / 'data'
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
npm_bin_dir = npm_dir / '.bin'
node_modules_dir = npm_dir / 'node_modules'
npm_bin_dir = npm_dir / 'bin'
pip_venv_dir = lib_dir / 'pip' / 'venv'
pip_bin_dir = pip_venv_dir / 'bin'
# Create directories
# Extensions go under personas/Default/
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
# User data goes under users/{username}/
date_str = datetime.now().strftime('%Y%m%d')
users_dir = data_dir / 'users' / 'testuser'
crawls_dir = users_dir / 'crawls' / date_str
snapshots_dir = users_dir / 'snapshots' / date_str
# Create all directories
node_modules_dir.mkdir(parents=True, exist_ok=True)
npm_bin_dir.mkdir(parents=True, exist_ok=True)
pip_bin_dir.mkdir(parents=True, exist_ok=True)
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
crawls_dir.mkdir(parents=True, exist_ok=True)
snapshots_dir.mkdir(parents=True, exist_ok=True)
# Install puppeteer-core to the test node_modules if not present
if not (node_modules_dir / 'puppeteer-core').exists():
result = subprocess.run(
['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
capture_output=True,
text=True,
timeout=120
)
if result.returncode != 0:
pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")
return {
# Build complete env dict
env = os.environ.copy()
env.update({
'DATA_DIR': str(data_dir),
'LIB_DIR': str(lib_dir),
'NODE_MODULES_DIR': str(node_modules_dir),
'MACHINE_TYPE': machine_type,
'NPM_BIN_DIR': str(npm_bin_dir),
'PIP_VENV_DIR': str(pip_venv_dir),
'PIP_BIN_DIR': str(pip_bin_dir),
}
'NODE_MODULES_DIR': str(node_modules_dir),
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
'CRAWLS_DIR': str(crawls_dir),
'SNAPSHOTS_DIR': str(snapshots_dir),
})
PLUGINS_ROOT = PLUGIN_DIR.parent
def find_chromium_binary():
"""Find the Chromium binary using chrome_utils.js findChromium().
This uses the centralized findChromium() function which checks:
- CHROME_BINARY env var
- @puppeteer/browsers install locations
- System Chromium locations
- Falls back to Chrome (with warning)
"""
chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js'
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
result = subprocess.run(
['node', str(chrome_utils), 'findChromium'],
capture_output=True,
text=True,
timeout=10
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True, text=True, timeout=120, env=env
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return None
if result.returncode != 0:
pytest.skip(f"Chrome install hook failed: {result.stderr}")
# Parse JSONL output to get CHROME_BINARY
chrome_binary = None
for line in result.stdout.strip().split('\n'):
if not line.strip():
continue
try:
data = json.loads(line)
if data.get('type') == 'Binary' and data.get('abspath'):
chrome_binary = data['abspath']
break
except json.JSONDecodeError:
continue
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
if not chrome_binary or not Path(chrome_binary).exists():
pytest.skip(f"Chromium binary not found: {chrome_binary}")
env['CHROME_BINARY'] = chrome_binary
return env
TEST_URL = 'https://www.filmin.es/'
@@ -210,22 +238,11 @@ def test_extension_loads_in_chromium():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Set up isolated env with proper directory structure
env = setup_test_env(tmpdir)
env.setdefault('CHROME_HEADLESS', 'true')
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
# Step 1: Install the extension
result = subprocess.run(
@@ -245,13 +262,16 @@ def test_extension_loads_in_chromium():
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
crawl_dir.mkdir()
crawl_id = 'test-cookies'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
crawl_dir.mkdir(parents=True, exist_ok=True)
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True, exist_ok=True)
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
cwd=str(crawl_dir),
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -400,156 +420,362 @@ const puppeteer = require('puppeteer-core');
pass
def test_hides_cookie_consent_on_filmin():
"""Live test: verify extension hides cookie consent popup on filmin.es.
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
chrome_dir.mkdir(parents=True, exist_ok=True)
Uses Chromium with extensions loaded automatically via chrome hook.
"""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
if not cdp_url:
chrome_launch_process.kill()
raise RuntimeError("Chromium CDP URL not found after 20s")
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
return chrome_launch_process, cdp_url
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
# Step 1: Install the extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
cwd=str(crawl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
assert cdp_url, "Chromium CDP URL not found after 20s"
print(f"Chromium launched with CDP URL: {cdp_url}")
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
"""Clean up Chromium process."""
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
# Step 3: Connect to Chromium and test cookie consent hiding
test_script = f'''
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
"""Check if cookie consent elements are visible on a page.
Returns dict with:
- visible: bool - whether any cookie consent element is visible
- selector: str - which selector matched (if visible)
- elements_found: list - all cookie-related elements found in DOM
- html_snippet: str - snippet of the page HTML for debugging
"""
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
// Wait for extension to initialize
await new Promise(r => setTimeout(r, 2000));
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
console.error('Navigating to {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
console.error('Navigating to {test_url}...');
await page.goto('{test_url}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
// Wait for extension content script to process page
await new Promise(r => setTimeout(r, 5000));
// Wait for page to fully render and any cookie scripts to run
await new Promise(r => setTimeout(r, 3000));
// Check cookie consent visibility
// Check cookie consent visibility using multiple common selectors
const result = await page.evaluate(() => {{
const selectors = ['.cky-consent-container', '.cky-popup-center', '.cky-overlay'];
// Common cookie consent selectors used by various consent management platforms
const selectors = [
// CookieYes
'.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal',
// OneTrust
'#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter',
// Cookiebot
'#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay',
// Generic cookie banners
'[class*="cookie-consent"]', '[class*="cookie-banner"]', '[class*="cookie-notice"]',
'[class*="cookie-popup"]', '[class*="cookie-modal"]', '[class*="cookie-dialog"]',
'[id*="cookie-consent"]', '[id*="cookie-banner"]', '[id*="cookie-notice"]',
'[id*="cookieconsent"]', '[id*="cookie-law"]',
// GDPR banners
'[class*="gdpr"]', '[id*="gdpr"]',
// Consent banners
'[class*="consent-banner"]', '[class*="consent-modal"]', '[class*="consent-popup"]',
// Privacy banners
'[class*="privacy-banner"]', '[class*="privacy-notice"]',
// Common frameworks
'.cc-window', '.cc-banner', '#cc-main', // Cookie Consent by Insites
'.qc-cmp2-container', // Quantcast
'.sp-message-container', // SourcePoint
];
const elementsFound = [];
let visibleElement = null;
for (const sel of selectors) {{
const el = document.querySelector(sel);
if (el) {{
const style = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const visible = style.display !== 'none' &&
style.visibility !== 'hidden' &&
rect.width > 0 && rect.height > 0;
if (visible) return {{ visible: true, selector: sel }};
try {{
const elements = document.querySelectorAll(sel);
for (const el of elements) {{
const style = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const isVisible = style.display !== 'none' &&
style.visibility !== 'hidden' &&
style.opacity !== '0' &&
rect.width > 0 && rect.height > 0;
elementsFound.push({{
selector: sel,
visible: isVisible,
display: style.display,
visibility: style.visibility,
opacity: style.opacity,
width: rect.width,
height: rect.height
}});
if (isVisible && !visibleElement) {{
visibleElement = {{ selector: sel, width: rect.width, height: rect.height }};
}}
}}
}} catch (e) {{
// Invalid selector, skip
}}
}}
return {{ visible: false }};
// Also grab a snippet of the HTML to help debug
const bodyHtml = document.body.innerHTML.slice(0, 2000);
const hasCookieKeyword = bodyHtml.toLowerCase().includes('cookie') ||
bodyHtml.toLowerCase().includes('consent') ||
bodyHtml.toLowerCase().includes('gdpr');
return {{
visible: visibleElement !== null,
selector: visibleElement ? visibleElement.selector : null,
elements_found: elementsFound,
has_cookie_keyword_in_html: hasCookieKeyword,
html_snippet: bodyHtml.slice(0, 500)
}};
}});
console.error('Cookie consent:', JSON.stringify(result));
console.error('Cookie consent check result:', JSON.stringify({{
visible: result.visible,
selector: result.selector,
elements_found_count: result.elements_found.length
}}));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
script_path = tmpdir / 'test_extension.js'
script_path.write_text(test_script)
script_path = script_dir / 'check_cookies.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=90
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir),
capture_output=True,
text=True,
env=env,
timeout=90
)
if result.returncode != 0:
raise RuntimeError(f"Cookie check script failed: {result.stderr}")
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
if not output_lines:
raise RuntimeError(f"No JSON output from cookie check: {result.stdout}\nstderr: {result.stderr}")
return json.loads(output_lines[-1])
def test_hides_cookie_consent_on_filmin():
"""Live test: verify extension hides cookie consent popup on filmin.es.
This test runs TWO browser sessions:
1. WITHOUT extension - verifies cookie consent IS visible (baseline)
2. WITH extension - verifies cookie consent is HIDDEN
This ensures we're actually testing the extension's effect, not just
that a page happens to not have cookie consent.
"""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated env with proper directory structure
env_base = setup_test_env(tmpdir)
env_base['CHROME_HEADLESS'] = 'true'
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
# ============================================================
# STEP 1: BASELINE - Run WITHOUT extension, verify cookie consent IS visible
# ============================================================
print("\n" + "="*60)
print("STEP 1: BASELINE TEST (no extension)")
print("="*60)
data_dir = Path(env_base['DATA_DIR'])
env_no_ext = env_base.copy()
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
# Launch baseline Chromium in crawls directory
baseline_crawl_id = 'baseline-no-ext'
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
baseline_process = None
try:
baseline_process, baseline_cdp_url = launch_chromium_session(
env_no_ext, baseline_chrome_dir, baseline_crawl_id
)
print(f"Baseline Chromium launched: {baseline_cdp_url}")
# Wait a moment for browser to be ready
time.sleep(2)
baseline_result = check_cookie_consent_visibility(
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
)
print(f"stderr: {result.stderr}")
print(f"stdout: {result.stdout}")
print(f"Baseline result: visible={baseline_result['visible']}, "
f"elements_found={len(baseline_result['elements_found'])}")
assert result.returncode == 0, f"Test failed: {result.stderr}"
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
assert output_lines, f"No JSON output: {result.stdout}"
test_result = json.loads(output_lines[-1])
assert not test_result['visible'], \
f"Cookie consent should be hidden by extension. Result: {test_result}"
if baseline_result['elements_found']:
print("Elements found in baseline:")
for el in baseline_result['elements_found'][:5]: # Show first 5
print(f" - {el['selector']}: visible={el['visible']}, "
f"display={el['display']}, size={el['width']}x{el['height']}")
finally:
# Clean up Chromium
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
if baseline_process:
kill_chromium_session(baseline_process, baseline_chrome_dir)
# Verify baseline shows cookie consent
if not baseline_result['visible']:
# If no cookie consent visible in baseline, we can't test the extension
# This could happen if:
# - The site changed and no longer shows cookie consent
# - Cookie consent is region-specific
# - Our selectors don't match this site
print("\nWARNING: No cookie consent visible in baseline!")
print(f"HTML has cookie keywords: {baseline_result.get('has_cookie_keyword_in_html')}")
print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}")
pytest.skip(
f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. "
f"Elements found: {len(baseline_result['elements_found'])}. "
f"The site may have changed or cookie consent may be region-specific."
)
print(f"\n✓ Baseline confirmed: Cookie consent IS visible (selector: {baseline_result['selector']})")
# ============================================================
# STEP 2: Install the extension
# ============================================================
print("\n" + "="*60)
print("STEP 2: INSTALLING EXTENSION")
print("="*60)
env_with_ext = env_base.copy()
env_with_ext['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env_with_ext,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# ============================================================
# STEP 3: Run WITH extension, verify cookie consent is HIDDEN
# ============================================================
print("\n" + "="*60)
print("STEP 3: TEST WITH EXTENSION")
print("="*60)
# Launch extension test Chromium in crawls directory
ext_crawl_id = 'test-with-ext'
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
ext_chrome_dir = ext_crawl_dir / 'chrome'
env_with_ext['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
ext_process = None
try:
ext_process, ext_cdp_url = launch_chromium_session(
env_with_ext, ext_chrome_dir, ext_crawl_id
)
print(f"Extension Chromium launched: {ext_cdp_url}")
# Check that extension was loaded
extensions_file = ext_chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
# Wait for extension to initialize
time.sleep(3)
ext_result = check_cookie_consent_visibility(
ext_cdp_url, TEST_URL, env_with_ext, tmpdir
)
print(f"Extension result: visible={ext_result['visible']}, "
f"elements_found={len(ext_result['elements_found'])}")
if ext_result['elements_found']:
print("Elements found with extension:")
for el in ext_result['elements_found'][:5]:
print(f" - {el['selector']}: visible={el['visible']}, "
f"display={el['display']}, size={el['width']}x{el['height']}")
finally:
if ext_process:
kill_chromium_session(ext_process, ext_chrome_dir)
# ============================================================
# STEP 4: Compare results
# ============================================================
print("\n" + "="*60)
print("STEP 4: COMPARISON")
print("="*60)
print(f"Baseline (no extension): cookie consent visible = {baseline_result['visible']}")
print(f"With extension: cookie consent visible = {ext_result['visible']}")
assert baseline_result['visible'], \
"Baseline should show cookie consent (this shouldn't happen, we checked above)"
assert not ext_result['visible'], \
f"Cookie consent should be HIDDEN by extension.\n" \
f"Baseline showed consent at: {baseline_result['selector']}\n" \
f"But with extension, consent is still visible.\n" \
f"Elements still visible: {[e for e in ext_result['elements_found'] if e['visible']]}"
print("\n✓ SUCCESS: Extension correctly hides cookie consent!")
print(f" - Baseline showed consent at: {baseline_result['selector']}")
print(f" - Extension successfully hid it")

View File

@@ -26,7 +26,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
@@ -123,6 +123,7 @@ def setup_chrome_session(tmpdir):
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
@@ -130,7 +131,7 @@ def setup_chrome_session(tmpdir):
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-modalcloser'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -4,18 +4,47 @@
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"CAPTCHA2_ENABLED": {
"TWOCAPTCHA_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_CAPTCHA2"],
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
"x-aliases": ["CAPTCHA2_ENABLED", "USE_CAPTCHA2", "USE_TWOCAPTCHA"],
"description": "Enable 2captcha browser extension for automatic CAPTCHA solving"
},
"CAPTCHA2_TIMEOUT": {
"TWOCAPTCHA_API_KEY": {
"type": "string",
"default": "",
"x-aliases": ["API_KEY_2CAPTCHA", "CAPTCHA2_API_KEY"],
"x-sensitive": true,
"description": "2captcha API key for CAPTCHA solving service (get from https://2captcha.com)"
},
"TWOCAPTCHA_RETRY_COUNT": {
"type": "integer",
"default": 3,
"minimum": 0,
"maximum": 10,
"x-aliases": ["CAPTCHA2_RETRY_COUNT"],
"description": "Number of times to retry CAPTCHA solving on error"
},
"TWOCAPTCHA_RETRY_DELAY": {
"type": "integer",
"default": 5,
"minimum": 0,
"maximum": 60,
"x-aliases": ["CAPTCHA2_RETRY_DELAY"],
"description": "Delay in seconds between CAPTCHA solving retries"
},
"TWOCAPTCHA_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"x-aliases": ["CAPTCHA2_TIMEOUT"],
"description": "Timeout for CAPTCHA solving in seconds"
},
"TWOCAPTCHA_AUTO_SUBMIT": {
"type": "boolean",
"default": false,
"description": "Automatically submit forms after CAPTCHA is solved"
}
}
}

View File

@@ -12,7 +12,7 @@
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - TWOCAPTCHA_API_KEY environment variable must be set
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
*/
@@ -47,10 +47,10 @@ async function installCaptchaExtension() {
}
// Check if API key is configured
const apiKey = process.env.API_KEY_2CAPTCHA;
const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
} else {
console.log('[+] 2captcha extension installed and API key configured');
}

View File

@@ -2,14 +2,21 @@
/**
* 2Captcha Extension Configuration
*
* Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
* Runs once per crawl to inject API key into extension storage.
* Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts.
* Runs once per crawl to inject configuration into extension storage.
*
* Priority: 11 (after chrome_launch at 20)
* Priority: 25 (after chrome_launch at 30, before snapshots start)
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Config Options (from config.json / environment):
* - TWOCAPTCHA_API_KEY: API key for 2captcha service
* - TWOCAPTCHA_ENABLED: Enable/disable the extension
* - TWOCAPTCHA_RETRY_COUNT: Number of retries on error
* - TWOCAPTCHA_RETRY_DELAY: Delay between retries (seconds)
* - TWOCAPTCHA_AUTO_SUBMIT: Auto-submit forms after solving
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - TWOCAPTCHA_API_KEY environment variable must be set
* - chrome plugin must have loaded extensions (extensions.json must exist)
*/
@@ -36,6 +43,20 @@ function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
/**
 * Read a boolean flag from the environment.
 *
 * Recognizes the usual truthy/falsy spellings (case-insensitive);
 * any other value — including an unset variable — yields defaultValue.
 */
function getEnvBool(name, defaultValue = false) {
  const TRUTHY = new Set(['true', '1', 'yes', 'on']);
  const FALSY = new Set(['false', '0', 'no', 'off']);
  const raw = getEnv(name, '').toLowerCase();
  if (TRUTHY.has(raw)) {
    return true;
  }
  if (FALSY.has(raw)) {
    return false;
  }
  return defaultValue;
}
/**
 * Read an integer from the environment.
 *
 * @param {string} name - Environment variable name.
 * @param {number} [defaultValue=0] - Returned when the variable is unset or unparsable.
 * @returns {number} The parsed base-10 integer, or defaultValue.
 */
function getEnvInt(name, defaultValue = 0) {
  // Number.parseInt with explicit radix + non-coercing Number.isNaN
  // (the global isNaN coerces its argument and can mask bad inputs).
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Parse command line arguments
function parseArgs() {
const args = {};
@@ -48,6 +69,82 @@ function parseArgs() {
return args;
}
/**
 * Build the 2captcha extension configuration from environment variables.
 * Supports both TWOCAPTCHA_* and the legacy API_KEY_2CAPTCHA / CAPTCHA2_API_KEY names.
 *
 * The returned object matches the extension's storage structure and is
 * persisted as chrome.storage.local.set({config: {...}}).
 */
function getTwoCaptchaConfig() {
  const apiKey =
    getEnv('TWOCAPTCHA_API_KEY') ||
    getEnv('API_KEY_2CAPTCHA') ||
    getEnv('CAPTCHA2_API_KEY');

  // Core plugin settings pulled from the environment.
  const core = {
    // API key - both variants for compatibility
    apiKey: apiKey,
    api_key: apiKey,
    // Plugin enabled state
    isPluginEnabled: getEnvBool('TWOCAPTCHA_ENABLED', true),
    // Retry settings
    repeatOnErrorTimes: getEnvInt('TWOCAPTCHA_RETRY_COUNT', 3),
    repeatOnErrorDelay: getEnvInt('TWOCAPTCHA_RETRY_DELAY', 5),
    // Auto-submit setting
    autoSubmitForms: getEnvBool('TWOCAPTCHA_AUTO_SUBMIT', false),
    submitFormsDelay: 0,
  };

  // Enable all CAPTCHA types (audio solving stays off).
  const enabledFor = {
    enabledForNormal: true,
    enabledForRecaptchaV2: true,
    enabledForInvisibleRecaptchaV2: true,
    enabledForRecaptchaV3: true,
    enabledForRecaptchaAudio: false,
    enabledForGeetest: true,
    enabledForGeetest_v4: true,
    enabledForKeycaptcha: true,
    enabledForArkoselabs: true,
    enabledForLemin: true,
    enabledForYandex: true,
    enabledForCapyPuzzle: true,
    enabledForTurnstile: true,
    enabledForAmazonWaf: true,
    enabledForMTCaptcha: true,
  };

  // Auto-solve all CAPTCHA types (audio solving stays off).
  const autoSolve = {
    autoSolveNormal: true,
    autoSolveRecaptchaV2: true,
    autoSolveInvisibleRecaptchaV2: true,
    autoSolveRecaptchaV3: true,
    autoSolveRecaptchaAudio: false,
    autoSolveGeetest: true,
    autoSolveGeetest_v4: true,
    autoSolveKeycaptcha: true,
    autoSolveArkoselabs: true,
    autoSolveLemin: true,
    autoSolveYandex: true,
    autoSolveCapyPuzzle: true,
    autoSolveTurnstile: true,
    autoSolveAmazonWaf: true,
    autoSolveMTCaptcha: true,
  };

  // Other settings with sensible defaults.
  const misc = {
    recaptchaV2Type: 'token',
    recaptchaV3MinScore: 0.3,
    buttonPosition: 'inner',
    useProxy: false,
    proxy: '',
    proxytype: 'HTTP',
    blackListDomain: '',
    autoSubmitRules: [],
    normalSources: [],
  };

  // Spread order preserves the original key insertion order.
  return { ...core, ...enabledFor, ...autoSolve, ...misc };
}
async function configure2Captcha() {
// Check if already configured in this session
if (fs.existsSync(CONFIG_MARKER)) {
@@ -55,29 +152,23 @@ async function configure2Captcha() {
return { success: true, skipped: true };
}
// Get configuration
const config = getTwoCaptchaConfig();
// Check if API key is set
const apiKey = getEnv('API_KEY_2CAPTCHA');
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
if (!config.apiKey || config.apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[!] 2captcha extension loaded but TWOCAPTCHA_API_KEY not configured');
console.warn('[!] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
return { success: false, error: 'TWOCAPTCHA_API_KEY not configured' };
}
// Load extensions metadata
const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
if (!fs.existsSync(extensionsFile)) {
return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
}
const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
if (!captchaExt) {
console.error('[*] 2captcha extension not installed, skipping configuration');
return { success: true, skipped: true };
}
console.error('[*] Configuring 2captcha extension with API key...');
console.error('[*] Configuring 2captcha extension...');
console.error(`[*] API Key: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`);
console.error(`[*] Enabled: ${config.isPluginEnabled}`);
console.error(`[*] Retry Count: ${config.repeatOnErrorTimes}`);
console.error(`[*] Retry Delay: ${config.repeatOnErrorDelay}s`);
console.error(`[*] Auto Submit: ${config.autoSubmitForms}`);
console.error(`[*] Auto Solve: all CAPTCHA types enabled`);
try {
// Connect to the existing Chrome session via CDP
@@ -90,138 +181,116 @@ async function configure2Captcha() {
const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
try {
// Method 1: Try to inject via extension background page
if (captchaExt.target && captchaExt.target_ctx) {
console.error('[*] Attempting to configure via extension background page...');
// First, navigate to a page to trigger extension content scripts and wake up service worker
console.error('[*] Waking up extension by visiting a page...');
const triggerPage = await browser.newPage();
try {
await triggerPage.goto('https://www.google.com', { waitUntil: 'domcontentloaded', timeout: 10000 });
await new Promise(r => setTimeout(r, 3000)); // Give extension time to initialize
} catch (e) {
console.warn(`[!] Trigger page failed: ${e.message}`);
}
try { await triggerPage.close(); } catch (e) {}
// Reconnect to the browser to get fresh target context
const targets = await browser.targets();
const extTarget = targets.find(t =>
t.url().startsWith(`chrome-extension://${captchaExt.id}`)
);
if (extTarget) {
const extContext = await extTarget.worker() || await extTarget.page();
if (extContext) {
await extContext.evaluate((key) => {
// Try all common storage patterns
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({
apiKey: key,
api_key: key,
'2captcha_apikey': key,
apikey: key,
'solver-api-key': key,
});
chrome.storage.sync.set({
apiKey: key,
api_key: key,
'2captcha_apikey': key,
apikey: key,
'solver-api-key': key,
});
}
// Also try localStorage as fallback
if (typeof localStorage !== 'undefined') {
localStorage.setItem('apiKey', key);
localStorage.setItem('2captcha_apikey', key);
localStorage.setItem('solver-api-key', key);
}
}, apiKey);
console.error('[+] 2captcha API key configured successfully via background page');
// Mark as configured
fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
return { success: true, method: 'background_page' };
}
}
// Get 2captcha extension info from extensions.json
const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
if (!fs.existsSync(extensionsFile)) {
return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
}
// Method 2: Try to configure via options page
console.error('[*] Attempting to configure via options page...');
const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
const configPage = await browser.newPage();
const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
if (!captchaExt) {
console.error('[*] 2captcha extension not installed, skipping configuration');
return { success: true, skipped: true };
}
if (!captchaExt.id) {
return { success: false, error: '2captcha extension ID not found in extensions.json' };
}
const extensionId = captchaExt.id;
console.error(`[*] 2captcha Extension ID: ${extensionId}`);
// Configure via options page
console.error('[*] Configuring via options page...');
const optionsUrl = `chrome-extension://${extensionId}/options/options.html`;
let configPage = await browser.newPage();
try {
await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
const configured = await configPage.evaluate((key) => {
// Try to find API key input field
const selectors = [
'input[name*="apikey" i]',
'input[id*="apikey" i]',
'input[name*="api-key" i]',
'input[id*="api-key" i]',
'input[name*="key" i]',
'input[placeholder*="api" i]',
'input[type="text"]',
];
for (const selector of selectors) {
const input = document.querySelector(selector);
if (input) {
input.value = key;
input.dispatchEvent(new Event('input', { bubbles: true }));
input.dispatchEvent(new Event('change', { bubbles: true }));
// Try to find and click save button
const saveSelectors = [
'button[type="submit"]',
'input[type="submit"]',
'button:contains("Save")',
'button:contains("Apply")',
];
for (const btnSel of saveSelectors) {
const btn = document.querySelector(btnSel);
if (btn) {
btn.click();
break;
}
}
// Also save to storage
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
}
return true;
}
}
// Fallback: Just save to storage
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
return true;
}
return false;
}, apiKey);
await configPage.close();
if (configured) {
console.error('[+] 2captcha API key configured successfully via options page');
// Mark as configured
fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
return { success: true, method: 'options_page' };
}
} catch (e) {
console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
// Navigate to options page - catch error but continue since page may still load
try {
await configPage.close();
} catch (e2) {}
}
await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
} catch (navError) {
// Navigation may throw ERR_BLOCKED_BY_CLIENT but page still loads
console.error(`[*] Navigation threw error (may still work): ${navError.message}`);
}
return { success: false, error: 'Could not configure via any method' };
// Wait a moment for page to settle
await new Promise(r => setTimeout(r, 3000));
// Check all pages for the extension page (Chrome may open it in a different tab)
const pages = await browser.pages();
for (const page of pages) {
const url = page.url();
if (url.startsWith(`chrome-extension://${extensionId}`)) {
configPage = page;
break;
}
}
const currentUrl = configPage.url();
console.error(`[*] Current URL: ${currentUrl}`);
if (!currentUrl.startsWith(`chrome-extension://${extensionId}`)) {
return { success: false, error: `Failed to navigate to options page, got: ${currentUrl}` };
}
// Wait for Config object to be available
console.error('[*] Waiting for Config object...');
await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 });
// Use chrome.storage.local.set with the config wrapper
const result = await configPage.evaluate((cfg) => {
return new Promise((resolve) => {
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({ config: cfg }, () => {
if (chrome.runtime.lastError) {
resolve({ success: false, error: chrome.runtime.lastError.message });
} else {
resolve({ success: true, method: 'options_page' });
}
});
} else {
resolve({ success: false, error: 'chrome.storage not available' });
}
});
}, config);
if (result.success) {
console.error(`[+] 2captcha configured via ${result.method}`);
fs.writeFileSync(CONFIG_MARKER, JSON.stringify({
timestamp: new Date().toISOString(),
method: result.method,
extensionId: extensionId,
config: {
apiKeySet: !!config.apiKey,
isPluginEnabled: config.isPluginEnabled,
repeatOnErrorTimes: config.repeatOnErrorTimes,
repeatOnErrorDelay: config.repeatOnErrorDelay,
autoSubmitForms: config.autoSubmitForms,
autoSolveEnabled: true,
}
}, null, 2));
return { success: true, method: result.method };
}
return { success: false, error: result.error || 'Config failed' };
} finally {
try { await configPage.close(); } catch (e) {}
}
} finally {
browser.disconnect();
}
@@ -236,7 +305,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__21_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Crawl__25_configure_twocaptcha_extension_options.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -1,184 +1,398 @@
"""
Unit tests for twocaptcha plugin
Integration tests for twocaptcha plugin
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
Run with: TWOCAPTCHA_API_KEY=your_key pytest archivebox/plugins/twocaptcha/tests/ -xvs
NOTE: Chrome 137+ removed --load-extension support, so these tests MUST use Chromium.
"""
import json
import os
import signal
import subprocess
import tempfile
import time
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_twocaptcha_extension.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_configure_twocaptcha_extension_options.*'), None)
PLUGINS_ROOT = PLUGIN_DIR.parent
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
def test_install_script_exists():
"""Verify install script exists"""
assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure matching real ArchiveBox data dir:
        <tmpdir>/data/
            lib/
                arm64-darwin/ (or x86_64-linux, etc.)
                    npm/
                        .bin/
                        node_modules/
            personas/
                default/
                    chrome_extensions/
            users/
                testuser/
                    crawls/
                    snapshots/

    Calls chrome install hook which handles puppeteer-core and chromium installation.

    Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
    """
    import platform
    from datetime import datetime
    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"
    # Create proper directory structure matching real ArchiveBox layout
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'
    # Extensions go under personas/Default/
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
    # User data goes under users/{username}/
    date_str = datetime.now().strftime('%Y%m%d')
    users_dir = data_dir / 'users' / 'testuser'
    crawls_dir = users_dir / 'crawls' / date_str
    snapshots_dir = users_dir / 'snapshots' / date_str
    # Create all directories (mkdir(parents=True) covers the intermediate dirs)
    node_modules_dir.mkdir(parents=True, exist_ok=True)
    npm_bin_dir.mkdir(parents=True, exist_ok=True)
    chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
    crawls_dir.mkdir(parents=True, exist_ok=True)
    snapshots_dir.mkdir(parents=True, exist_ok=True)
    # Build complete env dict (inherits the current process environment)
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
        'CRAWLS_DIR': str(crawls_dir),
        'SNAPSHOTS_DIR': str(snapshots_dir),
    })
    # Only set headless if not already in environment (allow override for debugging)
    if 'CHROME_HEADLESS' not in os.environ:
        env['CHROME_HEADLESS'] = 'true'
    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
    result = subprocess.run(
        ['python', str(CHROME_INSTALL_HOOK)],
        capture_output=True, text=True, timeout=120, env=env
    )
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")
    # Parse JSONL output to get CHROME_BINARY (one JSON object per stdout line;
    # the first 'Binary' record with an abspath wins)
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
            if data.get('type') == 'Binary' and data.get('abspath'):
                chrome_binary = data['abspath']
                break
        except json.JSONDecodeError:
            # Hook may interleave non-JSON log lines with JSONL records
            continue
    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")
    env['CHROME_BINARY'] = chrome_binary
    return env
def test_config_script_exists():
"""Verify config script exists"""
assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium via the chrome_launch hook and return (process, cdp_url).

    Runs the launch hook with cwd=chrome_dir so its output files (cdp_url.txt,
    chrome.pid, extensions.json) land there, then polls up to 30s for the CDP
    URL and up to 15s more for extensions.json.

    Args:
        env: Environment dict for the hook subprocess (from setup_test_env).
        chrome_dir: Directory the hook writes its output files into.
        crawl_id: Passed through as --crawl-id to the hook.

    Returns:
        (process, cdp_url): The running hook Popen and the CDP WebSocket URL.

    Raises:
        RuntimeError: If the hook exits early or never writes cdp_url.txt.
    """
    chrome_dir.mkdir(parents=True, exist_ok=True)
    process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )
    # Poll for the CDP URL file; bail out immediately if the hook dies.
    cdp_url = None
    for _ in range(30):
        if process.poll() is not None:
            stdout, stderr = process.communicate()
            raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}")
        cdp_file = chrome_dir / 'cdp_url.txt'
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            break
        time.sleep(1)
    if not cdp_url:
        process.kill()
        stdout, stderr = process.communicate()
        raise RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}")
    # Wait for extensions.json to be written (chrome launch hook parses chrome://extensions)
    extensions_file = chrome_dir / 'extensions.json'
    for _ in range(15):
        if extensions_file.exists():
            break
        time.sleep(1)
    # Print chrome launch hook output for debugging
    import select
    if hasattr(select, 'poll'):
        # Read any available stderr without blocking (POSIX only: fcntl/O_NONBLOCK)
        import fcntl
        import os as os_module
        fd = process.stderr.fileno()
        fl = fcntl.fcntl(fd, fcntl.F_GETFL)
        fcntl.fcntl(fd, fcntl.F_SETFL, fl | os_module.O_NONBLOCK)
        try:
            stderr_output = process.stderr.read()
            if stderr_output:
                print(f"[Chrome Launch Hook Output]\n{stderr_output}")
        except OSError:
            # Nothing buffered yet (BlockingIOError) or pipe already closed —
            # this read is best-effort debug output only. A bare `except:` here
            # would also swallow KeyboardInterrupt, so we catch OSError only.
            pass
    return process, cdp_url
def test_extension_metadata():
"""Test that twocaptcha extension has correct metadata"""
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
# Just check the script can be loaded
result = subprocess.run(
["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
capture_output=True,
text=True,
env=env
)
assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
metadata = json.loads(result.stdout)
assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
assert metadata["name"] == "twocaptcha"
def kill_chrome(process, chrome_dir: Path):
    """Terminate the Chromium launch-hook process and the browser it spawned.

    Best-effort cleanup: SIGTERM the hook process and wait briefly, then
    SIGKILL the browser PID recorded in chrome.pid (if present). Never raises.

    Args:
        process: The Popen returned by launch_chrome.
        chrome_dir: Directory containing the hook's chrome.pid file.
    """
    try:
        process.send_signal(signal.SIGTERM)
        process.wait(timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # Process already gone, or slow to exit — fall through to the PID-file
        # kill below. (Bare `except:` previously swallowed KeyboardInterrupt too.)
        pass
    pid_file = chrome_dir / 'chrome.pid'
    if pid_file.exists():
        try:
            os.kill(int(pid_file.read_text().strip()), signal.SIGKILL)
        except (OSError, ValueError):
            # Stale/garbled PID file or process already dead — nothing to do.
            pass
def test_install_creates_cache():
"""Test that install creates extension cache"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
class TestTwoCaptcha:
"""Integration tests requiring TWOCAPTCHA_API_KEY."""
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
env["API_KEY_2CAPTCHA"] = "test_api_key"
@pytest.fixture(autouse=True)
def setup(self):
self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA')
if not self.api_key:
pytest.skip("TWOCAPTCHA_API_KEY required")
# Run install script
result = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
def test_install_and_load(self):
"""Extension installs and loads in Chromium."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = setup_test_env(tmpdir)
env['TWOCAPTCHA_API_KEY'] = self.api_key
# Check output mentions installation
assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout
# Install
result = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
assert result.returncode == 0, f"Install failed: {result.stderr}"
# Check cache file was created
cache_file = ext_dir / "twocaptcha.extension.json"
assert cache_file.exists(), "Cache file should be created"
cache = Path(env['CHROME_EXTENSIONS_DIR']) / 'twocaptcha.extension.json'
assert cache.exists()
data = json.loads(cache.read_text())
assert data['webstore_id'] == 'ifibfemgeogfhoebkmokieepdoobkbpo'
# Verify cache content
cache_data = json.loads(cache_file.read_text())
assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
assert cache_data["name"] == "twocaptcha"
assert "unpacked_path" in cache_data
assert "version" in cache_data
# Launch Chromium in crawls directory
crawl_id = 'test'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
chrome_dir = crawl_dir / 'chrome'
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
try:
exts = json.loads((chrome_dir / 'extensions.json').read_text())
assert any(e['name'] == 'twocaptcha' for e in exts), f"Not loaded: {exts}"
print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}")
finally:
kill_chrome(process, chrome_dir)
def test_config_applied(self):
"""Configuration is applied to extension and verified via Config.getAll()."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = setup_test_env(tmpdir)
env['TWOCAPTCHA_API_KEY'] = self.api_key
env['TWOCAPTCHA_RETRY_COUNT'] = '5'
env['TWOCAPTCHA_RETRY_DELAY'] = '10'
subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True)
# Launch Chromium in crawls directory
crawl_id = 'cfg'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
chrome_dir = crawl_dir / 'chrome'
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
try:
result = subprocess.run(
['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'],
env=env, timeout=30, capture_output=True, text=True
)
assert result.returncode == 0, f"Config failed: {result.stderr}"
assert (chrome_dir / '.twocaptcha_configured').exists()
# Verify config via options.html and Config.getAll()
# Get the actual extension ID from the config marker (Chrome computes IDs differently)
config_marker = json.loads((chrome_dir / '.twocaptcha_configured').read_text())
ext_id = config_marker['extensionId']
script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
// Load options.html and use Config.getAll() to verify
const optionsUrl = 'chrome-extension://{ext_id}/options/options.html';
const page = await browser.newPage();
console.error('[*] Loading options page:', optionsUrl);
// Navigate - catch error but continue since page may still load
try {{
await page.goto(optionsUrl, {{ waitUntil: 'networkidle0', timeout: 10000 }});
}} catch (e) {{
console.error('[*] Navigation threw error (may still work):', e.message);
}}
// Wait for page to settle
await new Promise(r => setTimeout(r, 2000));
console.error('[*] Current URL:', page.url());
// Wait for Config object to be available
await page.waitForFunction(() => typeof Config !== 'undefined', {{ timeout: 5000 }});
// Call Config.getAll() - the extension's own API (returns a Promise)
const cfg = await page.evaluate(async () => await Config.getAll());
console.error('[*] Config.getAll() returned:', JSON.stringify(cfg));
await page.close();
browser.disconnect();
console.log(JSON.stringify(cfg));
}})();
'''
(tmpdir / 'v.js').write_text(script)
r = subprocess.run(['node', str(tmpdir / 'v.js')], env=env, timeout=30, capture_output=True, text=True)
print(r.stderr)
assert r.returncode == 0, f"Verify failed: {r.stderr}"
cfg = json.loads(r.stdout.strip().split('\n')[-1])
print(f"[*] Config from extension: {json.dumps(cfg, indent=2)}")
# Verify all the fields we care about
assert cfg.get('apiKey') == self.api_key or cfg.get('api_key') == self.api_key, f"API key not set: {cfg}"
assert cfg.get('isPluginEnabled') == True, f"Plugin not enabled: {cfg}"
assert cfg.get('repeatOnErrorTimes') == 5, f"Retry count wrong: {cfg}"
assert cfg.get('repeatOnErrorDelay') == 10, f"Retry delay wrong: {cfg}"
assert cfg.get('autoSolveRecaptchaV2') == True, f"autoSolveRecaptchaV2 not enabled: {cfg}"
assert cfg.get('autoSolveRecaptchaV3') == True, f"autoSolveRecaptchaV3 not enabled: {cfg}"
assert cfg.get('autoSolveTurnstile') == True, f"autoSolveTurnstile not enabled: {cfg}"
assert cfg.get('enabledForRecaptchaV2') == True, f"enabledForRecaptchaV2 not enabled: {cfg}"
print(f"[+] Config verified via Config.getAll()!")
finally:
kill_chrome(process, chrome_dir)
def test_solves_recaptcha(self):
    """Extension solves reCAPTCHA on demo page.

    Installs the 2captcha extension, launches Chromium via the chrome hook,
    loads the demo page, then polls the solver widget for up to 90s until
    the g-recaptcha-response textarea is populated (or an error state hits).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = setup_test_env(tmpdir)
        env['TWOCAPTCHA_API_KEY'] = self.api_key

        # Verify the install step succeeded; a silent failure here would
        # otherwise surface later as a confusing solver timeout.
        install = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
        assert install.returncode == 0, f"Extension install failed: {install.stderr}"

        # Launch Chromium in crawls directory
        crawl_id = 'solve'
        crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
        chrome_dir = crawl_dir / 'chrome'
        env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
        process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
        try:
            # Configure the extension (writes API key into its settings);
            # again, fail fast if configuration did not apply.
            config = subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True, text=True)
            assert config.returncode == 0, f"Config script failed: {config.stderr}"

            # Node script: connect over CDP, load the demo page, and poll the
            # solver widget state until solved / error / 90s timeout.
            script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');

(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
    const page = await browser.newPage();
    await page.setViewport({{ width: 1440, height: 900 }});
    console.error('[*] Loading {TEST_URL}...');
    await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
    await new Promise(r => setTimeout(r, 3000));

    const start = Date.now();
    const maxWait = 90000;
    while (Date.now() - start < maxWait) {{
        const state = await page.evaluate(() => {{
            const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
            const solver = document.querySelector('.captcha-solver');
            return {{
                solved: resp ? resp.value.length > 0 : false,
                state: solver?.getAttribute('data-state'),
                text: solver?.textContent?.trim() || ''
            }};
        }});
        const sec = Math.round((Date.now() - start) / 1000);
        console.error('[*] ' + sec + 's state=' + state.state + ' solved=' + state.solved + ' text=' + state.text.slice(0,30));
        if (state.solved) {{ console.error('[+] SOLVED!'); break; }}
        if (state.state === 'error') {{ console.error('[!] ERROR'); break; }}
        await new Promise(r => setTimeout(r, 2000));
    }}

    const final = await page.evaluate(() => {{
        const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
        return {{ solved: resp ? resp.value.length > 0 : false, preview: resp?.value?.slice(0,50) || '' }};
    }});
    browser.disconnect();
    console.log(JSON.stringify(final));
}})();
'''
            (tmpdir / 's.js').write_text(script)
            print("\n[*] Solving CAPTCHA (10-60s)...")
            r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=120, capture_output=True, text=True)
            print(r.stderr)
            assert r.returncode == 0, f"Failed: {r.stderr}"
            # Last JSON line on stdout is the final solver state.
            final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1])
            assert final.get('solved'), f"Not solved: {final}"
            print(f"[+] SOLVED! {final.get('preview','')[:30]}...")
        finally:
            kill_chrome(process, chrome_dir)
def test_install_twice_uses_cache():
    """Test that running install twice uses existing cache on second run."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_api_key"

        # First install - downloads the extension
        result1 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        assert result1.returncode == 0, f"First install failed: {result1.stderr}"

        # Verify cache was created
        cache_file = ext_dir / "twocaptcha.extension.json"
        assert cache_file.exists(), "Cache file should exist after first install"

        # Second install - should use cache (shorter timeout: no download expected)
        result2 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"

        # BUG FIX: the original assertion ended with `or result2.returncode == 0`,
        # which is always true at this point (asserted just above), so the cache
        # check could never fail. Require an actual cache-reuse indicator.
        combined_output = result2.stdout + result2.stderr
        assert "already installed" in combined_output or "cache" in combined_output.lower(), \
            f"Second install should report cache reuse, got: {combined_output}"
def test_install_warns_without_api_key():
    """Test that install warns when API key not configured."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # BUG FIX: os.environ.copy() may carry a real API_KEY_2CAPTCHA from the
        # developer's shell, which would silently invalidate this test's
        # "no key configured" precondition. Remove it explicitly.
        env.pop("API_KEY_2CAPTCHA", None)

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should warn about missing API key (warning may go to stdout or stderr)
        combined_output = result.stdout + result.stderr
        assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
def test_install_success_with_api_key():
    """Test that install succeeds when API key is configured."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123"

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # BUG FIX: a "success" test must actually assert success, not just
        # grep the output.
        assert result.returncode == 0, f"Install failed: {result.stderr}"

        # Should mention API key configured
        combined_output = result.stdout + result.stderr
        assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
def test_config_script_structure():
    """Test that config script has proper structure"""
    # Static sanity checks on the config hook source (no execution needed).
    source = CONFIG_SCRIPT.read_text()

    # Should reference the configuration marker file
    mentions_marker = ("CONFIG_MARKER" in source) or ("twocaptcha_configured" in source)
    assert mentions_marker

    # Should reference the API key env var
    assert "API_KEY_2CAPTCHA" in source

    # Should define an entry point (async fn or a main)
    has_entry_point = ("async function" in source) or ("main" in source)
    assert has_entry_point
if __name__ == '__main__':
    # BUG FIX: propagate pytest's exit code so direct invocation (and CI)
    # reports failures instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, '-xvs']))

View File

@@ -14,7 +14,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_ublock.*'), None)
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
def test_install_script_exists():
@@ -158,26 +158,221 @@ def test_large_extension_size():
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium via the chrome launch hook and return (process, cdp_url).

    Starts the background launch hook with cwd=chrome_dir, then polls for
    chrome_dir/cdp_url.txt for up to 20 seconds.

    Raises:
        RuntimeError: if the hook exits before producing a CDP URL, or the
            URL file never appears within the timeout.
    """
    import time  # local import keeps the module header unchanged

    chrome_dir.mkdir(parents=True, exist_ok=True)
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Wait for Chromium to launch and CDP URL to be available
    cdp_url = None
    cdp_file = chrome_dir / 'cdp_url.txt'
    for _ in range(20):
        if chrome_launch_process.poll() is not None:
            # Hook died early: surface its output in the error.
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            break
        time.sleep(1)

    if not cdp_url:
        chrome_launch_process.kill()
        # BUG FIX: include the hook's captured output in the timeout error so
        # hangs are diagnosable (previously it was discarded).
        stdout, stderr = chrome_launch_process.communicate()
        raise RuntimeError(f"Chromium CDP URL not found after 20s\nStdout: {stdout}\nStderr: {stderr}")
    return chrome_launch_process, cdp_url
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
    """Clean up Chromium: SIGTERM the launch hook, then SIGKILL the browser PID.

    Best-effort cleanup: both steps tolerate already-dead processes and a
    missing/stale chrome.pid file.
    """
    import signal

    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    # BUG FIX: was a bare `except:` which also swallows KeyboardInterrupt and
    # SystemExit; catch only the expected failure modes.
    except (OSError, subprocess.TimeoutExpired):
        # Already exited, or refused to exit in 5s; SIGKILL below handles it.
        pass

    chrome_pid_file = chrome_dir / 'chrome.pid'
    if chrome_pid_file.exists():
        try:
            chrome_pid = int(chrome_pid_file.read_text().strip())
            os.kill(chrome_pid, signal.SIGKILL)
        except (OSError, ValueError):
            # Process already gone, or pid file held garbage.
            pass
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
    """Check ad blocking effectiveness by counting ad elements on page.

    Writes a throwaway Node script into script_dir and runs it against the
    already-running Chromium at cdp_url.

    Returns dict with:
    - adElementsFound: int - number of ad-related elements found
    - adElementsVisible: int - number of visible ad elements
    - adRequestsSeen: int - number of network requests to known ad domains
    - blockedRequests: int - number of blocked network requests (ads/trackers)
    - totalRequests: int - total network requests made
    - percentBlocked: int - percentage of ad elements hidden (0-100)

    Raises:
        RuntimeError: if the Node script fails or emits no JSON result.
    """
    test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');

(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    await page.setViewport({{ width: 1440, height: 900 }});

    // Track network requests
    let blockedRequests = 0;
    let totalRequests = 0;
    let adRequestsSeen = 0;
    const adDomains = ['doubleclick', 'googlesyndication', 'googleadservices', 'facebook.com/tr',
                       'analytics', 'adservice', 'advertising', 'taboola', 'outbrain', 'criteo',
                       'amazon-adsystem', 'ads.yahoo', 'gemini.yahoo', 'yimg.com/cv/', 'beap.gemini'];

    page.on('request', request => {{
        totalRequests++;
        const url = request.url().toLowerCase();
        if (adDomains.some(d => url.includes(d))) {{
            // BUG FIX: this branch was empty, so attempted ad requests were
            // never counted anywhere. Track them for the result payload.
            adRequestsSeen++;
        }}
    }});

    page.on('requestfailed', request => {{
        const url = request.url().toLowerCase();
        if (adDomains.some(d => url.includes(d))) {{
            blockedRequests++;
        }}
    }});

    console.error('Navigating to {test_url}...');
    await page.goto('{test_url}', {{ waitUntil: 'domcontentloaded', timeout: 60000 }});

    // Wait for page to fully render and ads to load
    await new Promise(r => setTimeout(r, 5000));

    // Check for ad elements in the DOM
    const result = await page.evaluate(() => {{
        // Common ad-related selectors
        const adSelectors = [
            // Generic ad containers
            '[class*="ad-"]', '[class*="ad_"]', '[class*="-ad"]', '[class*="_ad"]',
            '[id*="ad-"]', '[id*="ad_"]', '[id*="-ad"]', '[id*="_ad"]',
            '[class*="advertisement"]', '[id*="advertisement"]',
            '[class*="sponsored"]', '[id*="sponsored"]',
            // Google ads
            'ins.adsbygoogle', '[data-ad-client]', '[data-ad-slot]',
            // Yahoo specific
            '[class*="gemini"]', '[data-beacon]', '[class*="native-ad"]',
            '[class*="stream-ad"]', '[class*="LDRB"]', '[class*="ntv-ad"]',
            // iframes (often ads)
            'iframe[src*="ad"]', 'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]',
            // Common ad sizes
            '[style*="300px"][style*="250px"]', '[style*="728px"][style*="90px"]',
            '[style*="160px"][style*="600px"]', '[style*="320px"][style*="50px"]',
        ];

        let adElementsFound = 0;
        let adElementsVisible = 0;

        for (const selector of adSelectors) {{
            try {{
                const elements = document.querySelectorAll(selector);
                for (const el of elements) {{
                    adElementsFound++;
                    const style = window.getComputedStyle(el);
                    const rect = el.getBoundingClientRect();
                    const isVisible = style.display !== 'none' &&
                                      style.visibility !== 'hidden' &&
                                      style.opacity !== '0' &&
                                      rect.width > 0 && rect.height > 0;
                    if (isVisible) {{
                        adElementsVisible++;
                    }}
                }}
            }} catch (e) {{
                // Invalid selector, skip
            }}
        }}

        return {{
            adElementsFound,
            adElementsVisible,
            pageTitle: document.title
        }};
    }});

    result.adRequestsSeen = adRequestsSeen;
    result.blockedRequests = blockedRequests;
    result.totalRequests = totalRequests;

    // Calculate how many ad elements were hidden (found but not visible)
    const hiddenAds = result.adElementsFound - result.adElementsVisible;
    result.percentBlocked = result.adElementsFound > 0
        ? Math.round((hiddenAds / result.adElementsFound) * 100)
        : 0;

    console.error('Ad blocking result:', JSON.stringify(result));
    browser.disconnect();
    console.log(JSON.stringify(result));
}})();
'''
    script_path = script_dir / 'check_ads.js'
    script_path.write_text(test_script)
    result = subprocess.run(
        ['node', str(script_path)],
        cwd=str(script_dir),
        capture_output=True,
        text=True,
        env=env,
        timeout=90
    )
    if result.returncode != 0:
        raise RuntimeError(f"Ad check script failed: {result.stderr}")
    # Last JSON line on stdout carries the measurement payload.
    output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
    if not output_lines:
        raise RuntimeError(f"No JSON output from ad check: {result.stdout}\nstderr: {result.stderr}")
    return json.loads(output_lines[-1])
def setup_test_env(tmpdir: Path) -> dict:
"""Set up isolated data/lib directory structure for tests.
Creates structure like:
Creates structure matching real ArchiveBox data dir:
<tmpdir>/data/
lib/
arm64-darwin/ (or x86_64-linux, etc.)
npm/
bin/
.bin/
node_modules/
chrome_extensions/
personas/
default/
chrome_extensions/
users/
testuser/
crawls/
snapshots/
Calls chrome install hook which handles puppeteer-core and chromium installation.
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
"""
import platform
from datetime import datetime
# Determine machine type (matches archivebox.config.paths.get_machine_type())
machine = platform.machine().lower()
@@ -188,18 +383,28 @@ def setup_test_env(tmpdir: Path) -> dict:
machine = 'x86_64'
machine_type = f"{machine}-{system}"
# Create proper directory structure
# Create proper directory structure matching real ArchiveBox layout
data_dir = tmpdir / 'data'
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
npm_bin_dir = npm_dir / 'bin'
npm_bin_dir = npm_dir / '.bin'
node_modules_dir = npm_dir / 'node_modules'
chrome_extensions_dir = data_dir / 'chrome_extensions'
# Extensions go under personas/Default/
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
# User data goes under users/{username}/
date_str = datetime.now().strftime('%Y%m%d')
users_dir = data_dir / 'users' / 'testuser'
crawls_dir = users_dir / 'crawls' / date_str
snapshots_dir = users_dir / 'snapshots' / date_str
# Create all directories
node_modules_dir.mkdir(parents=True, exist_ok=True)
npm_bin_dir.mkdir(parents=True, exist_ok=True)
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
crawls_dir.mkdir(parents=True, exist_ok=True)
snapshots_dir.mkdir(parents=True, exist_ok=True)
# Build complete env dict
env = os.environ.copy()
@@ -210,12 +415,14 @@ def setup_test_env(tmpdir: Path) -> dict:
'NPM_BIN_DIR': str(npm_bin_dir),
'NODE_MODULES_DIR': str(node_modules_dir),
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
'CRAWLS_DIR': str(crawls_dir),
'SNAPSHOTS_DIR': str(snapshots_dir),
})
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
result = subprocess.run(
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True, text=True, timeout=10, env=env
capture_output=True, text=True, timeout=120, env=env
)
if result.returncode != 0:
pytest.skip(f"Chrome install hook failed: {result.stderr}")
@@ -240,8 +447,8 @@ def setup_test_env(tmpdir: Path) -> dict:
return env
# Test URL: ad blocker test page that shows if ads are blocked
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
# Test URL: Yahoo has many ads that uBlock should block
TEST_URL = 'https://www.yahoo.com/'
@pytest.mark.timeout(15)
@@ -290,14 +497,18 @@ def test_extension_loads_in_chromium():
print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True)
print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True)
print("[test] Launching Chromium...", flush=True)
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
# Launch Chromium in crawls directory
crawl_id = 'test-ublock'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
crawl_dir.mkdir(parents=True, exist_ok=True)
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True, exist_ok=True)
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
cwd=str(crawl_dir),
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -457,161 +668,177 @@ const puppeteer = require('puppeteer-core');
def test_blocks_ads_on_test_page():
"""Live test: verify uBlock Origin blocks ads on a test page.
Uses Chromium with extensions loaded automatically via chrome hook.
Tests against d3ward's ad blocker test page which checks ad domains.
This test runs TWO browser sessions:
1. WITHOUT extension - verifies ads are NOT blocked (baseline)
2. WITH extension - verifies ads ARE blocked
This ensures we're actually testing the extension's effect, not just
that a test page happens to show ads as blocked.
"""
import signal
import time
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated env with proper directory structure
env = setup_test_env(tmpdir)
env['CHROME_HEADLESS'] = 'true'
env_base = setup_test_env(tmpdir)
env_base['CHROME_HEADLESS'] = 'true'
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
# ============================================================
# STEP 1: BASELINE - Run WITHOUT extension, verify ads are NOT blocked
# ============================================================
print("\n" + "="*60)
print("STEP 1: BASELINE TEST (no extension)")
print("="*60)
data_dir = Path(env_base['DATA_DIR'])
env_no_ext = env_base.copy()
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
# Launch baseline Chromium in crawls directory
baseline_crawl_id = 'baseline-no-ext'
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
baseline_process = None
try:
baseline_process, baseline_cdp_url = launch_chromium_session(
env_no_ext, baseline_chrome_dir, baseline_crawl_id
)
print(f"Baseline Chromium launched: {baseline_cdp_url}")
# Wait a moment for browser to be ready
time.sleep(2)
baseline_result = check_ad_blocking(
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
)
print(f"Baseline result: {baseline_result['adElementsVisible']} visible ads "
f"(found {baseline_result['adElementsFound']} ad elements)")
finally:
if baseline_process:
kill_chromium_session(baseline_process, baseline_chrome_dir)
# Verify baseline shows ads ARE visible (not blocked)
if baseline_result['adElementsFound'] == 0:
pytest.skip(
f"Cannot test extension: no ad elements found on {TEST_URL}. "
f"The page may have changed or loaded differently."
)
if baseline_result['adElementsVisible'] == 0:
print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!")
print("This suggests either:")
print(" - There's another ad blocker interfering")
print(" - Network-level ad blocking is in effect")
pytest.skip(
f"Cannot test extension: baseline shows no visible ads "
f"despite finding {baseline_result['adElementsFound']} ad elements."
)
print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension")
# ============================================================
# STEP 2: Install the uBlock extension
# ============================================================
print("\n" + "="*60)
print("STEP 2: INSTALLING EXTENSION")
print("="*60)
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
# Step 1: Install the uBlock extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=15
env=env_base,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
cache_file = ext_dir / 'ublock.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
# ============================================================
# STEP 3: Run WITH extension, verify ads ARE blocked
# ============================================================
print("\n" + "="*60)
print("STEP 3: TEST WITH EXTENSION")
print("="*60)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
cwd=str(crawl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
assert cdp_url, "Chrome CDP URL not found after 20s"
print(f"Chrome launched with CDP URL: {cdp_url}")
# Check that extensions were loaded
extensions_file = chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
# Launch extension test Chromium in crawls directory
ext_crawl_id = 'test-with-ext'
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
ext_chrome_dir = ext_crawl_dir / 'chrome'
env_base['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
ext_process = None
try:
# Step 3: Connect to Chrome and test ad blocking
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
ext_process, ext_cdp_url = launch_chromium_session(
env_base, ext_chrome_dir, ext_crawl_id
)
print(f"Extension Chromium launched: {ext_cdp_url}")
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
# Check that extension was loaded
extensions_file = ext_chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
// Wait for extension to initialize
await new Promise(r => setTimeout(r, 500));
# Wait for extension to initialize
time.sleep(3)
// Check extension loaded by looking at targets
const targets = browser.targets();
const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
console.error('Extension targets found:', extTargets.length);
extTargets.forEach(t => console.error(' -', t.type(), t.url().substring(0, 60)));
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
console.error('Navigating to {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }});
// Wait for the test page to run its checks
await new Promise(r => setTimeout(r, 5000));
// The d3ward test page shows blocked percentage
const result = await page.evaluate(() => {{
const scoreEl = document.querySelector('#score');
const score = scoreEl ? scoreEl.textContent : null;
const blockedItems = document.querySelectorAll('.blocked').length;
const totalItems = document.querySelectorAll('.testlist li').length;
return {{
score,
blockedItems,
totalItems,
percentBlocked: totalItems > 0 ? Math.round((blockedItems / totalItems) * 100) : 0
}};
}});
console.error('Ad blocking result:', JSON.stringify(result));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
script_path = tmpdir / 'test_ublock.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=10
ext_result = check_ad_blocking(
ext_cdp_url, TEST_URL, env_base, tmpdir
)
print(f"stderr: {result.stderr}")
print(f"stdout: {result.stdout}")
assert result.returncode == 0, f"Test failed: {result.stderr}"
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
assert output_lines, f"No JSON output: {result.stdout}"
test_result = json.loads(output_lines[-1])
# uBlock should block most ad domains on the test page
assert test_result['percentBlocked'] >= 50, \
f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}"
print(f"Extension result: {ext_result['adElementsVisible']} visible ads "
f"(found {ext_result['adElementsFound']} ad elements)")
finally:
# Clean up Chrome
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
if ext_process:
kill_chromium_session(ext_process, ext_chrome_dir)
# ============================================================
# STEP 4: Compare results
# ============================================================
print("\n" + "="*60)
print("STEP 4: COMPARISON")
print("="*60)
print(f"Baseline (no extension): {baseline_result['adElementsVisible']} visible ads")
print(f"With extension: {ext_result['adElementsVisible']} visible ads")
# Calculate reduction in visible ads
ads_blocked = baseline_result['adElementsVisible'] - ext_result['adElementsVisible']
reduction_percent = (ads_blocked / baseline_result['adElementsVisible'] * 100) if baseline_result['adElementsVisible'] > 0 else 0
print(f"Reduction: {ads_blocked} fewer visible ads ({reduction_percent:.0f}% reduction)")
# Extension should significantly reduce visible ads
assert ext_result['adElementsVisible'] < baseline_result['adElementsVisible'], \
f"uBlock should reduce visible ads.\n" \
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Expected fewer ads with extension."
# Extension should block at least 30% of ads
assert reduction_percent >= 30, \
f"uBlock should block at least 30% of ads.\n" \
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Reduction: only {reduction_percent:.0f}% (expected at least 30%)"
print(f"\n✓ SUCCESS: uBlock correctly blocks ads!")
print(f" - Baseline: {baseline_result['adElementsVisible']} visible ads")
print(f" - With extension: {ext_result['adElementsVisible']} visible ads")
print(f" - Blocked: {ads_blocked} ads ({reduction_percent:.0f}% reduction)")

View File

@@ -133,7 +133,7 @@ This plugin provides shared Chrome infrastructure for other plugins. It manages
chrome/
├── on_Crawl__00_chrome_install_config.py # Configure Chrome settings
├── on_Crawl__00_chrome_install.py # Install Chrome binary
├── on_Crawl__20_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg)
├── on_Crawl__30_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg)
├── on_Snapshot__20_chrome_tab.bg.js # Open tab (Snapshot-level, bg)
├── on_Snapshot__30_chrome_navigate.js # Navigate to URL (foreground)
├── on_Snapshot__45_chrome_tab_cleanup.py # Close tab, kill bg hooks