extension test fixes

This commit is contained in:
Nick Sweeting
2025-12-30 18:28:14 -08:00
parent dd2302ad92
commit 42d3fb7025
12 changed files with 1512 additions and 688 deletions

View File

@@ -8,7 +8,7 @@
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
@@ -165,14 +165,6 @@ async function main() {
chromePid = result.pid;
const cdpUrl = result.cdpUrl;
// Write extensions metadata
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}
// Connect puppeteer for extension verification
console.error(`[*] Connecting puppeteer to CDP...`);
const browser = await puppeteer.connect({
@@ -181,30 +173,84 @@ async function main() {
});
browserInstance = browser;
// Verify extensions loaded
// Get actual extension IDs from chrome://extensions page
if (extensionPaths.length > 0) {
await new Promise(r => setTimeout(r, 3000));
await new Promise(r => setTimeout(r, 2000));
const targets = browser.targets();
console.error(`[*] All browser targets (${targets.length}):`);
for (const t of targets) {
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
try {
const extPage = await browser.newPage();
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
await new Promise(r => setTimeout(r, 2000));
// Parse extension info from the page
const extensionsFromPage = await extPage.evaluate(() => {
const extensions = [];
// Extensions manager uses shadow DOM
const manager = document.querySelector('extensions-manager');
if (!manager || !manager.shadowRoot) return extensions;
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
if (!itemList || !itemList.shadowRoot) return extensions;
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
for (const item of items) {
const id = item.getAttribute('id');
const nameEl = item.shadowRoot?.querySelector('#name');
const name = nameEl?.textContent?.trim() || '';
if (id && name) {
extensions.push({ id, name });
}
}
return extensions;
});
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
for (const e of extensionsFromPage) {
console.error(` - ${e.id}: "${e.name}"`);
}
// Match extensions by name (strict matching)
for (const ext of installedExtensions) {
// Read the extension's manifest to get its display name
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
const manifestName = manifest.name || '';
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
// Find matching extension from page by exact name match first
let match = extensionsFromPage.find(e => e.name === manifestName);
// If no exact match, try case-insensitive exact match
if (!match) {
match = extensionsFromPage.find(e =>
e.name.toLowerCase() === manifestName.toLowerCase()
);
}
if (match) {
ext.id = match.id;
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
} else {
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
}
}
}
await extPage.close();
} catch (e) {
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
}
const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
// Filter out built-in extensions
// Fallback: check browser targets
const targets = browser.targets();
const builtinIds = [
'nkeimhogjdpnpccoofpliimaahmaaome',
'fignfifoniblkonapihmkfakmlgkbkcf',
'ahfgeienlihckogmohjhadlkjgocpleb',
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
];
const customExtTargets = extTargets.filter(t => {
const customExtTargets = targets.filter(t => {
const url = t.url();
if (!url.startsWith('chrome-extension://')) return false;
const extId = url.split('://')[1].split('/')[0];
@@ -216,7 +262,7 @@ async function main() {
for (const target of customExtTargets) {
const url = target.url();
const extId = url.split('://')[1].split('/')[0];
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
console.error(`[+] Extension target: ${extId} (${target.type()})`);
}
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
@@ -225,6 +271,14 @@ async function main() {
}
}
// Write extensions metadata with actual IDs
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}
console.error(`[+] Chromium session started for crawl ${crawlId}`);
console.error(`[+] CDP URL: ${cdpUrl}`);
console.error(`[+] PID: ${chromePid}`);

View File

@@ -2,7 +2,7 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
* If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
*
@@ -215,7 +215,7 @@ async function launchNewChrome(url, binary) {
console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
// Write PID immediately for cleanup
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
try {
// Wait for Chrome to be ready

View File

@@ -29,7 +29,7 @@ import shutil
import platform
PLUGIN_DIR = Path(__file__).parent.parent
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
@@ -176,6 +176,7 @@ def test_chrome_launch_and_tab_creation():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
# Get test environment with NODE_MODULES_DIR set
env = get_test_env()
@@ -184,7 +185,7 @@ def test_chrome_launch_and_tab_creation():
# Launch Chrome at crawl level (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -292,7 +293,7 @@ def test_chrome_navigation():
# Launch Chrome (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -363,7 +364,7 @@ def test_tab_cleanup_on_sigterm():
# Launch Chrome (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -423,11 +424,12 @@ def test_multiple_snapshots_share_chrome():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -513,7 +515,7 @@ def test_chrome_cleanup_on_crawl_end():
# Launch Chrome in background
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -554,11 +556,12 @@ def test_zombie_prevention_hook_killed():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
# Launch Chrome
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -26,7 +26,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
@@ -122,6 +122,7 @@ def setup_chrome_session(tmpdir):
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
@@ -129,7 +130,7 @@ def setup_chrome_session(tmpdir):
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -16,7 +16,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_istilldontcareaboutcookies.*'), None)
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
def test_install_script_exists():
@@ -124,78 +124,106 @@ def test_no_configuration_required():
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
def setup_test_lib_dirs(tmpdir: Path) -> dict:
"""Create isolated lib directories for tests and return env dict.
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
Sets up:
LIB_DIR: tmpdir/lib/<arch>
NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin
def setup_test_env(tmpdir: Path) -> dict:
"""Set up isolated data/lib directory structure for tests.
Creates structure matching real ArchiveBox data dir:
<tmpdir>/data/
lib/
arm64-darwin/ (or x86_64-linux, etc.)
npm/
.bin/
node_modules/
personas/
Default/
chrome_extensions/
users/
testuser/
crawls/
snapshots/
Calls chrome install hook which handles puppeteer-core and chromium installation.
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
"""
import platform
arch = platform.machine()
from datetime import datetime
# Determine machine type (matches archivebox.config.paths.get_machine_type())
machine = platform.machine().lower()
system = platform.system().lower()
arch_dir = f"{arch}-{system}"
if machine in ('arm64', 'aarch64'):
machine = 'arm64'
elif machine in ('x86_64', 'amd64'):
machine = 'x86_64'
machine_type = f"{machine}-{system}"
lib_dir = tmpdir / 'lib' / arch_dir
# Create proper directory structure matching real ArchiveBox layout
data_dir = tmpdir / 'data'
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
npm_bin_dir = npm_dir / '.bin'
node_modules_dir = npm_dir / 'node_modules'
npm_bin_dir = npm_dir / 'bin'
pip_venv_dir = lib_dir / 'pip' / 'venv'
pip_bin_dir = pip_venv_dir / 'bin'
# Create directories
# Extensions go under personas/Default/
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
# User data goes under users/{username}/
date_str = datetime.now().strftime('%Y%m%d')
users_dir = data_dir / 'users' / 'testuser'
crawls_dir = users_dir / 'crawls' / date_str
snapshots_dir = users_dir / 'snapshots' / date_str
# Create all directories
node_modules_dir.mkdir(parents=True, exist_ok=True)
npm_bin_dir.mkdir(parents=True, exist_ok=True)
pip_bin_dir.mkdir(parents=True, exist_ok=True)
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
crawls_dir.mkdir(parents=True, exist_ok=True)
snapshots_dir.mkdir(parents=True, exist_ok=True)
# Install puppeteer-core to the test node_modules if not present
if not (node_modules_dir / 'puppeteer-core').exists():
result = subprocess.run(
['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
capture_output=True,
text=True,
timeout=120
)
if result.returncode != 0:
pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")
return {
# Build complete env dict
env = os.environ.copy()
env.update({
'DATA_DIR': str(data_dir),
'LIB_DIR': str(lib_dir),
'NODE_MODULES_DIR': str(node_modules_dir),
'MACHINE_TYPE': machine_type,
'NPM_BIN_DIR': str(npm_bin_dir),
'PIP_VENV_DIR': str(pip_venv_dir),
'PIP_BIN_DIR': str(pip_bin_dir),
}
'NODE_MODULES_DIR': str(node_modules_dir),
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
'CRAWLS_DIR': str(crawls_dir),
'SNAPSHOTS_DIR': str(snapshots_dir),
})
PLUGINS_ROOT = PLUGIN_DIR.parent
def find_chromium_binary():
"""Find the Chromium binary using chrome_utils.js findChromium().
This uses the centralized findChromium() function which checks:
- CHROME_BINARY env var
- @puppeteer/browsers install locations
- System Chromium locations
- Falls back to Chrome (with warning)
"""
chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js'
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
result = subprocess.run(
['node', str(chrome_utils), 'findChromium'],
capture_output=True,
text=True,
timeout=10
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True, text=True, timeout=120, env=env
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip()
return None
if result.returncode != 0:
pytest.skip(f"Chrome install hook failed: {result.stderr}")
# Parse JSONL output to get CHROME_BINARY
chrome_binary = None
for line in result.stdout.strip().split('\n'):
if not line.strip():
continue
try:
data = json.loads(line)
if data.get('type') == 'Binary' and data.get('abspath'):
chrome_binary = data['abspath']
break
except json.JSONDecodeError:
continue
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
if not chrome_binary or not Path(chrome_binary).exists():
pytest.skip(f"Chromium binary not found: {chrome_binary}")
env['CHROME_BINARY'] = chrome_binary
return env
TEST_URL = 'https://www.filmin.es/'
@@ -210,22 +238,11 @@ def test_extension_loads_in_chromium():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Set up isolated env with proper directory structure
env = setup_test_env(tmpdir)
env.setdefault('CHROME_HEADLESS', 'true')
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
# Step 1: Install the extension
result = subprocess.run(
@@ -245,13 +262,16 @@ def test_extension_loads_in_chromium():
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
crawl_dir.mkdir()
crawl_id = 'test-cookies'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
crawl_dir.mkdir(parents=True, exist_ok=True)
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True, exist_ok=True)
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
cwd=str(crawl_dir),
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -400,156 +420,362 @@ const puppeteer = require('puppeteer-core');
pass
def test_hides_cookie_consent_on_filmin():
"""Live test: verify extension hides cookie consent popup on filmin.es.
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
chrome_dir.mkdir(parents=True, exist_ok=True)
Uses Chromium with extensions loaded automatically via chrome hook.
"""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Set up isolated lib directories for this test
lib_env = setup_test_lib_dirs(tmpdir)
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
# Set up extensions directory
ext_dir = tmpdir / 'chrome_extensions'
ext_dir.mkdir(parents=True)
if not cdp_url:
chrome_launch_process.kill()
raise RuntimeError("Chromium CDP URL not found after 20s")
env = os.environ.copy()
env.update(lib_env)
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
env['CHROME_HEADLESS'] = 'true'
return chrome_launch_process, cdp_url
# Ensure CHROME_BINARY points to Chromium
chromium = find_chromium_binary()
if chromium:
env['CHROME_BINARY'] = chromium
# Step 1: Install the extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
crawl_dir = tmpdir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
cwd=str(crawl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
assert cdp_url, "Chromium CDP URL not found after 20s"
print(f"Chromium launched with CDP URL: {cdp_url}")
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
"""Clean up Chromium process."""
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
# Step 3: Connect to Chromium and test cookie consent hiding
test_script = f'''
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
"""Check if cookie consent elements are visible on a page.
Returns dict with:
- visible: bool - whether any cookie consent element is visible
- selector: str - which selector matched (if visible)
- elements_found: list - all cookie-related elements found in DOM
- html_snippet: str - snippet of the page HTML for debugging
"""
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
// Wait for extension to initialize
await new Promise(r => setTimeout(r, 2000));
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
console.error('Navigating to {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
console.error('Navigating to {test_url}...');
await page.goto('{test_url}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
// Wait for extension content script to process page
await new Promise(r => setTimeout(r, 5000));
// Wait for page to fully render and any cookie scripts to run
await new Promise(r => setTimeout(r, 3000));
// Check cookie consent visibility
// Check cookie consent visibility using multiple common selectors
const result = await page.evaluate(() => {{
const selectors = ['.cky-consent-container', '.cky-popup-center', '.cky-overlay'];
// Common cookie consent selectors used by various consent management platforms
const selectors = [
// CookieYes
'.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal',
// OneTrust
'#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter',
// Cookiebot
'#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay',
// Generic cookie banners
'[class*="cookie-consent"]', '[class*="cookie-banner"]', '[class*="cookie-notice"]',
'[class*="cookie-popup"]', '[class*="cookie-modal"]', '[class*="cookie-dialog"]',
'[id*="cookie-consent"]', '[id*="cookie-banner"]', '[id*="cookie-notice"]',
'[id*="cookieconsent"]', '[id*="cookie-law"]',
// GDPR banners
'[class*="gdpr"]', '[id*="gdpr"]',
// Consent banners
'[class*="consent-banner"]', '[class*="consent-modal"]', '[class*="consent-popup"]',
// Privacy banners
'[class*="privacy-banner"]', '[class*="privacy-notice"]',
// Common frameworks
'.cc-window', '.cc-banner', '#cc-main', // Cookie Consent by Insites
'.qc-cmp2-container', // Quantcast
'.sp-message-container', // SourcePoint
];
const elementsFound = [];
let visibleElement = null;
for (const sel of selectors) {{
const el = document.querySelector(sel);
if (el) {{
const style = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const visible = style.display !== 'none' &&
style.visibility !== 'hidden' &&
rect.width > 0 && rect.height > 0;
if (visible) return {{ visible: true, selector: sel }};
try {{
const elements = document.querySelectorAll(sel);
for (const el of elements) {{
const style = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const isVisible = style.display !== 'none' &&
style.visibility !== 'hidden' &&
style.opacity !== '0' &&
rect.width > 0 && rect.height > 0;
elementsFound.push({{
selector: sel,
visible: isVisible,
display: style.display,
visibility: style.visibility,
opacity: style.opacity,
width: rect.width,
height: rect.height
}});
if (isVisible && !visibleElement) {{
visibleElement = {{ selector: sel, width: rect.width, height: rect.height }};
}}
}}
}} catch (e) {{
// Invalid selector, skip
}}
}}
return {{ visible: false }};
// Also grab a snippet of the HTML to help debug
const bodyHtml = document.body.innerHTML.slice(0, 2000);
const hasCookieKeyword = bodyHtml.toLowerCase().includes('cookie') ||
bodyHtml.toLowerCase().includes('consent') ||
bodyHtml.toLowerCase().includes('gdpr');
return {{
visible: visibleElement !== null,
selector: visibleElement ? visibleElement.selector : null,
elements_found: elementsFound,
has_cookie_keyword_in_html: hasCookieKeyword,
html_snippet: bodyHtml.slice(0, 500)
}};
}});
console.error('Cookie consent:', JSON.stringify(result));
console.error('Cookie consent check result:', JSON.stringify({{
visible: result.visible,
selector: result.selector,
elements_found_count: result.elements_found.length
}}));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
script_path = tmpdir / 'test_extension.js'
script_path.write_text(test_script)
script_path = script_dir / 'check_cookies.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=90
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir),
capture_output=True,
text=True,
env=env,
timeout=90
)
if result.returncode != 0:
raise RuntimeError(f"Cookie check script failed: {result.stderr}")
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
if not output_lines:
raise RuntimeError(f"No JSON output from cookie check: {result.stdout}\nstderr: {result.stderr}")
return json.loads(output_lines[-1])
def test_hides_cookie_consent_on_filmin():
"""Live test: verify extension hides cookie consent popup on filmin.es.
This test runs TWO browser sessions:
1. WITHOUT extension - verifies cookie consent IS visible (baseline)
2. WITH extension - verifies cookie consent is HIDDEN
This ensures we're actually testing the extension's effect, not just
that a page happens to not have cookie consent.
"""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated env with proper directory structure
env_base = setup_test_env(tmpdir)
env_base['CHROME_HEADLESS'] = 'true'
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
# ============================================================
# STEP 1: BASELINE - Run WITHOUT extension, verify cookie consent IS visible
# ============================================================
print("\n" + "="*60)
print("STEP 1: BASELINE TEST (no extension)")
print("="*60)
data_dir = Path(env_base['DATA_DIR'])
env_no_ext = env_base.copy()
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
# Launch baseline Chromium in crawls directory
baseline_crawl_id = 'baseline-no-ext'
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
baseline_process = None
try:
baseline_process, baseline_cdp_url = launch_chromium_session(
env_no_ext, baseline_chrome_dir, baseline_crawl_id
)
print(f"Baseline Chromium launched: {baseline_cdp_url}")
# Wait a moment for browser to be ready
time.sleep(2)
baseline_result = check_cookie_consent_visibility(
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
)
print(f"stderr: {result.stderr}")
print(f"stdout: {result.stdout}")
print(f"Baseline result: visible={baseline_result['visible']}, "
f"elements_found={len(baseline_result['elements_found'])}")
assert result.returncode == 0, f"Test failed: {result.stderr}"
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
assert output_lines, f"No JSON output: {result.stdout}"
test_result = json.loads(output_lines[-1])
assert not test_result['visible'], \
f"Cookie consent should be hidden by extension. Result: {test_result}"
if baseline_result['elements_found']:
print("Elements found in baseline:")
for el in baseline_result['elements_found'][:5]: # Show first 5
print(f" - {el['selector']}: visible={el['visible']}, "
f"display={el['display']}, size={el['width']}x{el['height']}")
finally:
# Clean up Chromium
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
if baseline_process:
kill_chromium_session(baseline_process, baseline_chrome_dir)
# Verify baseline shows cookie consent
if not baseline_result['visible']:
# If no cookie consent visible in baseline, we can't test the extension
# This could happen if:
# - The site changed and no longer shows cookie consent
# - Cookie consent is region-specific
# - Our selectors don't match this site
print("\nWARNING: No cookie consent visible in baseline!")
print(f"HTML has cookie keywords: {baseline_result.get('has_cookie_keyword_in_html')}")
print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}")
pytest.skip(
f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. "
f"Elements found: {len(baseline_result['elements_found'])}. "
f"The site may have changed or cookie consent may be region-specific."
)
print(f"\n✓ Baseline confirmed: Cookie consent IS visible (selector: {baseline_result['selector']})")
# ============================================================
# STEP 2: Install the extension
# ============================================================
print("\n" + "="*60)
print("STEP 2: INSTALLING EXTENSION")
print("="*60)
env_with_ext = env_base.copy()
env_with_ext['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env_with_ext,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# ============================================================
# STEP 3: Run WITH extension, verify cookie consent is HIDDEN
# ============================================================
print("\n" + "="*60)
print("STEP 3: TEST WITH EXTENSION")
print("="*60)
# Launch extension test Chromium in crawls directory
ext_crawl_id = 'test-with-ext'
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
ext_chrome_dir = ext_crawl_dir / 'chrome'
env_with_ext['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
ext_process = None
try:
ext_process, ext_cdp_url = launch_chromium_session(
env_with_ext, ext_chrome_dir, ext_crawl_id
)
print(f"Extension Chromium launched: {ext_cdp_url}")
# Check that extension was loaded
extensions_file = ext_chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
# Wait for extension to initialize
time.sleep(3)
ext_result = check_cookie_consent_visibility(
ext_cdp_url, TEST_URL, env_with_ext, tmpdir
)
print(f"Extension result: visible={ext_result['visible']}, "
f"elements_found={len(ext_result['elements_found'])}")
if ext_result['elements_found']:
print("Elements found with extension:")
for el in ext_result['elements_found'][:5]:
print(f" - {el['selector']}: visible={el['visible']}, "
f"display={el['display']}, size={el['width']}x{el['height']}")
finally:
if ext_process:
kill_chromium_session(ext_process, ext_chrome_dir)
# ============================================================
# STEP 4: Compare results
# ============================================================
print("\n" + "="*60)
print("STEP 4: COMPARISON")
print("="*60)
print(f"Baseline (no extension): cookie consent visible = {baseline_result['visible']}")
print(f"With extension: cookie consent visible = {ext_result['visible']}")
assert baseline_result['visible'], \
"Baseline should show cookie consent (this shouldn't happen, we checked above)"
assert not ext_result['visible'], \
f"Cookie consent should be HIDDEN by extension.\n" \
f"Baseline showed consent at: {baseline_result['selector']}\n" \
f"But with extension, consent is still visible.\n" \
f"Elements still visible: {[e for e in ext_result['elements_found'] if e['visible']]}"
print("\n✓ SUCCESS: Extension correctly hides cookie consent!")
print(f" - Baseline showed consent at: {baseline_result['selector']}")
print(f" - Extension successfully hid it")

View File

@@ -26,7 +26,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
@@ -123,6 +123,7 @@ def setup_chrome_session(tmpdir):
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
@@ -130,7 +131,7 @@ def setup_chrome_session(tmpdir):
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-modalcloser'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,

View File

@@ -4,18 +4,47 @@
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"CAPTCHA2_ENABLED": {
"TWOCAPTCHA_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_CAPTCHA2"],
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
"x-aliases": ["CAPTCHA2_ENABLED", "USE_CAPTCHA2", "USE_TWOCAPTCHA"],
"description": "Enable 2captcha browser extension for automatic CAPTCHA solving"
},
"CAPTCHA2_TIMEOUT": {
"TWOCAPTCHA_API_KEY": {
"type": "string",
"default": "",
"x-aliases": ["API_KEY_2CAPTCHA", "CAPTCHA2_API_KEY"],
"x-sensitive": true,
"description": "2captcha API key for CAPTCHA solving service (get from https://2captcha.com)"
},
"TWOCAPTCHA_RETRY_COUNT": {
"type": "integer",
"default": 3,
"minimum": 0,
"maximum": 10,
"x-aliases": ["CAPTCHA2_RETRY_COUNT"],
"description": "Number of times to retry CAPTCHA solving on error"
},
"TWOCAPTCHA_RETRY_DELAY": {
"type": "integer",
"default": 5,
"minimum": 0,
"maximum": 60,
"x-aliases": ["CAPTCHA2_RETRY_DELAY"],
"description": "Delay in seconds between CAPTCHA solving retries"
},
"TWOCAPTCHA_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"x-aliases": ["CAPTCHA2_TIMEOUT"],
"description": "Timeout for CAPTCHA solving in seconds"
},
"TWOCAPTCHA_AUTO_SUBMIT": {
"type": "boolean",
"default": false,
"description": "Automatically submit forms after CAPTCHA is solved"
}
}
}

View File

@@ -12,7 +12,7 @@
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - TWOCAPTCHA_API_KEY environment variable must be set
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
*/
@@ -47,10 +47,10 @@ async function installCaptchaExtension() {
}
// Check if API key is configured
const apiKey = process.env.API_KEY_2CAPTCHA;
const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
} else {
console.log('[+] 2captcha extension installed and API key configured');
}

View File

@@ -2,14 +2,21 @@
/**
* 2Captcha Extension Configuration
*
* Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
* Runs once per crawl to inject API key into extension storage.
* Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts.
* Runs once per crawl to inject configuration into extension storage.
*
* Priority: 11 (after chrome_launch at 20)
* Priority: 25 (after chrome_launch at 30, before snapshots start)
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Config Options (from config.json / environment):
* - TWOCAPTCHA_API_KEY: API key for 2captcha service
* - TWOCAPTCHA_ENABLED: Enable/disable the extension
* - TWOCAPTCHA_RETRY_COUNT: Number of retries on error
* - TWOCAPTCHA_RETRY_DELAY: Delay between retries (seconds)
* - TWOCAPTCHA_AUTO_SUBMIT: Auto-submit forms after solving
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - TWOCAPTCHA_API_KEY environment variable must be set
* - chrome plugin must have loaded extensions (extensions.json must exist)
*/
@@ -36,6 +43,20 @@ function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
/**
 * Read a boolean flag from the environment.
 *
 * Recognizes the usual truthy/falsy spellings (case-insensitive);
 * any other value — including an unset variable — yields defaultValue.
 */
function getEnvBool(name, defaultValue = false) {
  const TRUTHY = new Set(['true', '1', 'yes', 'on']);
  const FALSY = new Set(['false', '0', 'no', 'off']);
  const raw = getEnv(name, '').toLowerCase();
  if (TRUTHY.has(raw)) {
    return true;
  }
  if (FALSY.has(raw)) {
    return false;
  }
  return defaultValue;
}
/**
 * Read an integer from the environment.
 *
 * @param {string} name - Environment variable name.
 * @param {number} [defaultValue=0] - Returned when the variable is unset or unparsable.
 * @returns {number} The parsed base-10 integer, or defaultValue.
 */
function getEnvInt(name, defaultValue = 0) {
  // Number.parseInt with explicit radix + non-coercing Number.isNaN
  // (the global isNaN coerces its argument and can mask bad inputs).
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Parse command line arguments
function parseArgs() {
const args = {};
@@ -48,6 +69,82 @@ function parseArgs() {
return args;
}
/**
 * Build the 2captcha extension configuration from environment variables.
 * Supports both TWOCAPTCHA_* and the legacy API_KEY_2CAPTCHA / CAPTCHA2_API_KEY names.
 *
 * The returned object matches the extension's storage structure and is
 * persisted as chrome.storage.local.set({config: {...}}).
 */
function getTwoCaptchaConfig() {
  const apiKey =
    getEnv('TWOCAPTCHA_API_KEY') ||
    getEnv('API_KEY_2CAPTCHA') ||
    getEnv('CAPTCHA2_API_KEY');

  // Core plugin settings pulled from the environment.
  const core = {
    // API key - both variants for compatibility
    apiKey: apiKey,
    api_key: apiKey,
    // Plugin enabled state
    isPluginEnabled: getEnvBool('TWOCAPTCHA_ENABLED', true),
    // Retry settings
    repeatOnErrorTimes: getEnvInt('TWOCAPTCHA_RETRY_COUNT', 3),
    repeatOnErrorDelay: getEnvInt('TWOCAPTCHA_RETRY_DELAY', 5),
    // Auto-submit setting
    autoSubmitForms: getEnvBool('TWOCAPTCHA_AUTO_SUBMIT', false),
    submitFormsDelay: 0,
  };

  // Enable all CAPTCHA types (audio solving stays off).
  const enabledFor = {
    enabledForNormal: true,
    enabledForRecaptchaV2: true,
    enabledForInvisibleRecaptchaV2: true,
    enabledForRecaptchaV3: true,
    enabledForRecaptchaAudio: false,
    enabledForGeetest: true,
    enabledForGeetest_v4: true,
    enabledForKeycaptcha: true,
    enabledForArkoselabs: true,
    enabledForLemin: true,
    enabledForYandex: true,
    enabledForCapyPuzzle: true,
    enabledForTurnstile: true,
    enabledForAmazonWaf: true,
    enabledForMTCaptcha: true,
  };

  // Auto-solve all CAPTCHA types (audio solving stays off).
  const autoSolve = {
    autoSolveNormal: true,
    autoSolveRecaptchaV2: true,
    autoSolveInvisibleRecaptchaV2: true,
    autoSolveRecaptchaV3: true,
    autoSolveRecaptchaAudio: false,
    autoSolveGeetest: true,
    autoSolveGeetest_v4: true,
    autoSolveKeycaptcha: true,
    autoSolveArkoselabs: true,
    autoSolveLemin: true,
    autoSolveYandex: true,
    autoSolveCapyPuzzle: true,
    autoSolveTurnstile: true,
    autoSolveAmazonWaf: true,
    autoSolveMTCaptcha: true,
  };

  // Other settings with sensible defaults.
  const misc = {
    recaptchaV2Type: 'token',
    recaptchaV3MinScore: 0.3,
    buttonPosition: 'inner',
    useProxy: false,
    proxy: '',
    proxytype: 'HTTP',
    blackListDomain: '',
    autoSubmitRules: [],
    normalSources: [],
  };

  // Spread order preserves the original key insertion order.
  return { ...core, ...enabledFor, ...autoSolve, ...misc };
}
async function configure2Captcha() {
// Check if already configured in this session
if (fs.existsSync(CONFIG_MARKER)) {
@@ -55,29 +152,23 @@ async function configure2Captcha() {
return { success: true, skipped: true };
}
// Get configuration
const config = getTwoCaptchaConfig();
// Check if API key is set
const apiKey = getEnv('API_KEY_2CAPTCHA');
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
if (!config.apiKey || config.apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[!] 2captcha extension loaded but TWOCAPTCHA_API_KEY not configured');
console.warn('[!] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
return { success: false, error: 'TWOCAPTCHA_API_KEY not configured' };
}
// Load extensions metadata
const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
if (!fs.existsSync(extensionsFile)) {
return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
}
const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
if (!captchaExt) {
console.error('[*] 2captcha extension not installed, skipping configuration');
return { success: true, skipped: true };
}
console.error('[*] Configuring 2captcha extension with API key...');
console.error('[*] Configuring 2captcha extension...');
console.error(`[*] API Key: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`);
console.error(`[*] Enabled: ${config.isPluginEnabled}`);
console.error(`[*] Retry Count: ${config.repeatOnErrorTimes}`);
console.error(`[*] Retry Delay: ${config.repeatOnErrorDelay}s`);
console.error(`[*] Auto Submit: ${config.autoSubmitForms}`);
console.error(`[*] Auto Solve: all CAPTCHA types enabled`);
try {
// Connect to the existing Chrome session via CDP
@@ -90,138 +181,116 @@ async function configure2Captcha() {
const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
try {
// Method 1: Try to inject via extension background page
if (captchaExt.target && captchaExt.target_ctx) {
console.error('[*] Attempting to configure via extension background page...');
// First, navigate to a page to trigger extension content scripts and wake up service worker
console.error('[*] Waking up extension by visiting a page...');
const triggerPage = await browser.newPage();
try {
await triggerPage.goto('https://www.google.com', { waitUntil: 'domcontentloaded', timeout: 10000 });
await new Promise(r => setTimeout(r, 3000)); // Give extension time to initialize
} catch (e) {
console.warn(`[!] Trigger page failed: ${e.message}`);
}
try { await triggerPage.close(); } catch (e) {}
// Reconnect to the browser to get fresh target context
const targets = await browser.targets();
const extTarget = targets.find(t =>
t.url().startsWith(`chrome-extension://${captchaExt.id}`)
);
if (extTarget) {
const extContext = await extTarget.worker() || await extTarget.page();
if (extContext) {
await extContext.evaluate((key) => {
// Try all common storage patterns
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({
apiKey: key,
api_key: key,
'2captcha_apikey': key,
apikey: key,
'solver-api-key': key,
});
chrome.storage.sync.set({
apiKey: key,
api_key: key,
'2captcha_apikey': key,
apikey: key,
'solver-api-key': key,
});
}
// Also try localStorage as fallback
if (typeof localStorage !== 'undefined') {
localStorage.setItem('apiKey', key);
localStorage.setItem('2captcha_apikey', key);
localStorage.setItem('solver-api-key', key);
}
}, apiKey);
console.error('[+] 2captcha API key configured successfully via background page');
// Mark as configured
fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
return { success: true, method: 'background_page' };
}
}
// Get 2captcha extension info from extensions.json
const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
if (!fs.existsSync(extensionsFile)) {
return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
}
// Method 2: Try to configure via options page
console.error('[*] Attempting to configure via options page...');
const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
const configPage = await browser.newPage();
const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
if (!captchaExt) {
console.error('[*] 2captcha extension not installed, skipping configuration');
return { success: true, skipped: true };
}
if (!captchaExt.id) {
return { success: false, error: '2captcha extension ID not found in extensions.json' };
}
const extensionId = captchaExt.id;
console.error(`[*] 2captcha Extension ID: ${extensionId}`);
// Configure via options page
console.error('[*] Configuring via options page...');
const optionsUrl = `chrome-extension://${extensionId}/options/options.html`;
let configPage = await browser.newPage();
try {
await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
const configured = await configPage.evaluate((key) => {
// Try to find API key input field
const selectors = [
'input[name*="apikey" i]',
'input[id*="apikey" i]',
'input[name*="api-key" i]',
'input[id*="api-key" i]',
'input[name*="key" i]',
'input[placeholder*="api" i]',
'input[type="text"]',
];
for (const selector of selectors) {
const input = document.querySelector(selector);
if (input) {
input.value = key;
input.dispatchEvent(new Event('input', { bubbles: true }));
input.dispatchEvent(new Event('change', { bubbles: true }));
// Try to find and click save button
const saveSelectors = [
'button[type="submit"]',
'input[type="submit"]',
'button:contains("Save")',
'button:contains("Apply")',
];
for (const btnSel of saveSelectors) {
const btn = document.querySelector(btnSel);
if (btn) {
btn.click();
break;
}
}
// Also save to storage
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
}
return true;
}
}
// Fallback: Just save to storage
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
return true;
}
return false;
}, apiKey);
await configPage.close();
if (configured) {
console.error('[+] 2captcha API key configured successfully via options page');
// Mark as configured
fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
return { success: true, method: 'options_page' };
}
} catch (e) {
console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
// Navigate to options page - catch error but continue since page may still load
try {
await configPage.close();
} catch (e2) {}
}
await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
} catch (navError) {
// Navigation may throw ERR_BLOCKED_BY_CLIENT but page still loads
console.error(`[*] Navigation threw error (may still work): ${navError.message}`);
}
return { success: false, error: 'Could not configure via any method' };
// Wait a moment for page to settle
await new Promise(r => setTimeout(r, 3000));
// Check all pages for the extension page (Chrome may open it in a different tab)
const pages = await browser.pages();
for (const page of pages) {
const url = page.url();
if (url.startsWith(`chrome-extension://${extensionId}`)) {
configPage = page;
break;
}
}
const currentUrl = configPage.url();
console.error(`[*] Current URL: ${currentUrl}`);
if (!currentUrl.startsWith(`chrome-extension://${extensionId}`)) {
return { success: false, error: `Failed to navigate to options page, got: ${currentUrl}` };
}
// Wait for Config object to be available
console.error('[*] Waiting for Config object...');
await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 });
// Use chrome.storage.local.set with the config wrapper
const result = await configPage.evaluate((cfg) => {
return new Promise((resolve) => {
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({ config: cfg }, () => {
if (chrome.runtime.lastError) {
resolve({ success: false, error: chrome.runtime.lastError.message });
} else {
resolve({ success: true, method: 'options_page' });
}
});
} else {
resolve({ success: false, error: 'chrome.storage not available' });
}
});
}, config);
if (result.success) {
console.error(`[+] 2captcha configured via ${result.method}`);
fs.writeFileSync(CONFIG_MARKER, JSON.stringify({
timestamp: new Date().toISOString(),
method: result.method,
extensionId: extensionId,
config: {
apiKeySet: !!config.apiKey,
isPluginEnabled: config.isPluginEnabled,
repeatOnErrorTimes: config.repeatOnErrorTimes,
repeatOnErrorDelay: config.repeatOnErrorDelay,
autoSubmitForms: config.autoSubmitForms,
autoSolveEnabled: true,
}
}, null, 2));
return { success: true, method: result.method };
}
return { success: false, error: result.error || 'Config failed' };
} finally {
try { await configPage.close(); } catch (e) {}
}
} finally {
browser.disconnect();
}
@@ -236,7 +305,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__21_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Crawl__25_configure_twocaptcha_extension_options.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -1,184 +1,398 @@
"""
Unit tests for twocaptcha plugin
Integration tests for twocaptcha plugin
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
Run with: TWOCAPTCHA_API_KEY=your_key pytest archivebox/plugins/twocaptcha/tests/ -xvs
NOTE: Chrome 137+ removed --load-extension support, so these tests MUST use Chromium.
"""
import json
import os
import signal
import subprocess
import tempfile
import time
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_twocaptcha_extension.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_configure_twocaptcha_extension_options.*'), None)
PLUGINS_ROOT = PLUGIN_DIR.parent
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
def test_install_script_exists():
"""Verify install script exists"""
assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure matching real ArchiveBox data dir:
        <tmpdir>/data/
            lib/
                arm64-darwin/ (or x86_64-linux, etc.)
                    npm/
                        .bin/
                        node_modules/
            personas/
                default/
                    chrome_extensions/
            users/
                testuser/
                    crawls/
                    snapshots/

    Calls chrome install hook which handles puppeteer-core and chromium installation.

    Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
    """
    import platform
    from datetime import datetime
    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"
    # Create proper directory structure matching real ArchiveBox layout
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'
    # Extensions go under personas/Default/
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
    # User data goes under users/{username}/
    date_str = datetime.now().strftime('%Y%m%d')
    users_dir = data_dir / 'users' / 'testuser'
    crawls_dir = users_dir / 'crawls' / date_str
    snapshots_dir = users_dir / 'snapshots' / date_str
    # Create all directories (mkdir(parents=True) covers the intermediate dirs)
    node_modules_dir.mkdir(parents=True, exist_ok=True)
    npm_bin_dir.mkdir(parents=True, exist_ok=True)
    chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
    crawls_dir.mkdir(parents=True, exist_ok=True)
    snapshots_dir.mkdir(parents=True, exist_ok=True)
    # Build complete env dict (inherits the current process environment)
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
        'CRAWLS_DIR': str(crawls_dir),
        'SNAPSHOTS_DIR': str(snapshots_dir),
    })
    # Only set headless if not already in environment (allow override for debugging)
    if 'CHROME_HEADLESS' not in os.environ:
        env['CHROME_HEADLESS'] = 'true'
    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
    result = subprocess.run(
        ['python', str(CHROME_INSTALL_HOOK)],
        capture_output=True, text=True, timeout=120, env=env
    )
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")
    # Parse JSONL output to get CHROME_BINARY (one JSON object per stdout line;
    # the first 'Binary' record with an abspath wins)
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
            if data.get('type') == 'Binary' and data.get('abspath'):
                chrome_binary = data['abspath']
                break
        except json.JSONDecodeError:
            # Hook may interleave non-JSON log lines with JSONL records
            continue
    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")
    env['CHROME_BINARY'] = chrome_binary
    return env
def test_config_script_exists():
"""Verify config script exists"""
assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium via the chrome_launch hook and return (process, cdp_url).

    Runs the launch hook with cwd=chrome_dir so its output files (cdp_url.txt,
    chrome.pid, extensions.json) land there, then polls up to 30s for the CDP
    URL and up to 15s more for extensions.json.

    Args:
        env: Environment dict for the hook subprocess (from setup_test_env).
        chrome_dir: Directory the hook writes its output files into.
        crawl_id: Passed through as --crawl-id to the hook.

    Returns:
        (process, cdp_url): The running hook Popen and the CDP WebSocket URL.

    Raises:
        RuntimeError: If the hook exits early or never writes cdp_url.txt.
    """
    chrome_dir.mkdir(parents=True, exist_ok=True)
    process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )
    # Poll for the CDP URL file; bail out immediately if the hook dies.
    cdp_url = None
    for _ in range(30):
        if process.poll() is not None:
            stdout, stderr = process.communicate()
            raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}")
        cdp_file = chrome_dir / 'cdp_url.txt'
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            break
        time.sleep(1)
    if not cdp_url:
        process.kill()
        stdout, stderr = process.communicate()
        raise RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}")
    # Wait for extensions.json to be written (chrome launch hook parses chrome://extensions)
    extensions_file = chrome_dir / 'extensions.json'
    for _ in range(15):
        if extensions_file.exists():
            break
        time.sleep(1)
    # Print chrome launch hook output for debugging
    import select
    if hasattr(select, 'poll'):
        # Read any available stderr without blocking (POSIX only: fcntl/O_NONBLOCK)
        import fcntl
        import os as os_module
        fd = process.stderr.fileno()
        fl = fcntl.fcntl(fd, fcntl.F_GETFL)
        fcntl.fcntl(fd, fcntl.F_SETFL, fl | os_module.O_NONBLOCK)
        try:
            stderr_output = process.stderr.read()
            if stderr_output:
                print(f"[Chrome Launch Hook Output]\n{stderr_output}")
        except OSError:
            # Nothing buffered yet (BlockingIOError) or pipe already closed —
            # this read is best-effort debug output only. A bare `except:` here
            # would also swallow KeyboardInterrupt, so we catch OSError only.
            pass
    return process, cdp_url
def test_extension_metadata():
"""Test that twocaptcha extension has correct metadata"""
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
# Just check the script can be loaded
result = subprocess.run(
["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
capture_output=True,
text=True,
env=env
)
assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
metadata = json.loads(result.stdout)
assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
assert metadata["name"] == "twocaptcha"
def kill_chrome(process, chrome_dir: Path):
    """Terminate the Chromium launch-hook process and the browser it spawned.

    Best-effort cleanup: SIGTERM the hook process and wait briefly, then
    SIGKILL the browser PID recorded in chrome.pid (if present). Never raises.

    Args:
        process: The Popen returned by launch_chrome.
        chrome_dir: Directory containing the hook's chrome.pid file.
    """
    try:
        process.send_signal(signal.SIGTERM)
        process.wait(timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # Process already gone, or slow to exit — fall through to the PID-file
        # kill below. (Bare `except:` previously swallowed KeyboardInterrupt too.)
        pass
    pid_file = chrome_dir / 'chrome.pid'
    if pid_file.exists():
        try:
            os.kill(int(pid_file.read_text().strip()), signal.SIGKILL)
        except (OSError, ValueError):
            # Stale/garbled PID file or process already dead — nothing to do.
            pass
def test_install_creates_cache():
"""Test that install creates extension cache"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
class TestTwoCaptcha:
"""Integration tests requiring TWOCAPTCHA_API_KEY."""
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
env["API_KEY_2CAPTCHA"] = "test_api_key"
@pytest.fixture(autouse=True)
def setup(self):
self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA')
if not self.api_key:
pytest.skip("TWOCAPTCHA_API_KEY required")
# Run install script
result = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
def test_install_and_load(self):
"""Extension installs and loads in Chromium."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = setup_test_env(tmpdir)
env['TWOCAPTCHA_API_KEY'] = self.api_key
# Check output mentions installation
assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout
# Install
result = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
assert result.returncode == 0, f"Install failed: {result.stderr}"
# Check cache file was created
cache_file = ext_dir / "twocaptcha.extension.json"
assert cache_file.exists(), "Cache file should be created"
cache = Path(env['CHROME_EXTENSIONS_DIR']) / 'twocaptcha.extension.json'
assert cache.exists()
data = json.loads(cache.read_text())
assert data['webstore_id'] == 'ifibfemgeogfhoebkmokieepdoobkbpo'
# Verify cache content
cache_data = json.loads(cache_file.read_text())
assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
assert cache_data["name"] == "twocaptcha"
assert "unpacked_path" in cache_data
assert "version" in cache_data
# Launch Chromium in crawls directory
crawl_id = 'test'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
chrome_dir = crawl_dir / 'chrome'
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
try:
exts = json.loads((chrome_dir / 'extensions.json').read_text())
assert any(e['name'] == 'twocaptcha' for e in exts), f"Not loaded: {exts}"
print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}")
finally:
kill_chrome(process, chrome_dir)
def test_config_applied(self):
"""Configuration is applied to extension and verified via Config.getAll()."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = setup_test_env(tmpdir)
env['TWOCAPTCHA_API_KEY'] = self.api_key
env['TWOCAPTCHA_RETRY_COUNT'] = '5'
env['TWOCAPTCHA_RETRY_DELAY'] = '10'
subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True)
# Launch Chromium in crawls directory
crawl_id = 'cfg'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
chrome_dir = crawl_dir / 'chrome'
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
try:
result = subprocess.run(
['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'],
env=env, timeout=30, capture_output=True, text=True
)
assert result.returncode == 0, f"Config failed: {result.stderr}"
assert (chrome_dir / '.twocaptcha_configured').exists()
# Verify config via options.html and Config.getAll()
# Get the actual extension ID from the config marker (Chrome computes IDs differently)
config_marker = json.loads((chrome_dir / '.twocaptcha_configured').read_text())
ext_id = config_marker['extensionId']
script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
// Load options.html and use Config.getAll() to verify
const optionsUrl = 'chrome-extension://{ext_id}/options/options.html';
const page = await browser.newPage();
console.error('[*] Loading options page:', optionsUrl);
// Navigate - catch error but continue since page may still load
try {{
await page.goto(optionsUrl, {{ waitUntil: 'networkidle0', timeout: 10000 }});
}} catch (e) {{
console.error('[*] Navigation threw error (may still work):', e.message);
}}
// Wait for page to settle
await new Promise(r => setTimeout(r, 2000));
console.error('[*] Current URL:', page.url());
// Wait for Config object to be available
await page.waitForFunction(() => typeof Config !== 'undefined', {{ timeout: 5000 }});
// Call Config.getAll() - the extension's own API (returns a Promise)
const cfg = await page.evaluate(async () => await Config.getAll());
console.error('[*] Config.getAll() returned:', JSON.stringify(cfg));
await page.close();
browser.disconnect();
console.log(JSON.stringify(cfg));
}})();
'''
(tmpdir / 'v.js').write_text(script)
r = subprocess.run(['node', str(tmpdir / 'v.js')], env=env, timeout=30, capture_output=True, text=True)
print(r.stderr)
assert r.returncode == 0, f"Verify failed: {r.stderr}"
cfg = json.loads(r.stdout.strip().split('\n')[-1])
print(f"[*] Config from extension: {json.dumps(cfg, indent=2)}")
# Verify all the fields we care about
assert cfg.get('apiKey') == self.api_key or cfg.get('api_key') == self.api_key, f"API key not set: {cfg}"
assert cfg.get('isPluginEnabled') == True, f"Plugin not enabled: {cfg}"
assert cfg.get('repeatOnErrorTimes') == 5, f"Retry count wrong: {cfg}"
assert cfg.get('repeatOnErrorDelay') == 10, f"Retry delay wrong: {cfg}"
assert cfg.get('autoSolveRecaptchaV2') == True, f"autoSolveRecaptchaV2 not enabled: {cfg}"
assert cfg.get('autoSolveRecaptchaV3') == True, f"autoSolveRecaptchaV3 not enabled: {cfg}"
assert cfg.get('autoSolveTurnstile') == True, f"autoSolveTurnstile not enabled: {cfg}"
assert cfg.get('enabledForRecaptchaV2') == True, f"enabledForRecaptchaV2 not enabled: {cfg}"
print(f"[+] Config verified via Config.getAll()!")
finally:
kill_chrome(process, chrome_dir)
def test_solves_recaptcha(self):
    """Extension solves reCAPTCHA on demo page.

    Installs the 2captcha extension, launches Chromium via the chrome hook,
    loads the demo page, then polls the solver widget for up to 90s until
    the g-recaptcha-response textarea is populated (or an error state hits).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = setup_test_env(tmpdir)
        env['TWOCAPTCHA_API_KEY'] = self.api_key

        # Verify the install step succeeded; a silent failure here would
        # otherwise surface later as a confusing solver timeout.
        install = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
        assert install.returncode == 0, f"Extension install failed: {install.stderr}"

        # Launch Chromium in crawls directory
        crawl_id = 'solve'
        crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
        chrome_dir = crawl_dir / 'chrome'
        env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
        process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)
        try:
            # Configure the extension (writes API key into its settings);
            # again, fail fast if configuration did not apply.
            config = subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True, text=True)
            assert config.returncode == 0, f"Config script failed: {config.stderr}"

            # Node script: connect over CDP, load the demo page, and poll the
            # solver widget state until solved / error / 90s timeout.
            script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');

(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
    const page = await browser.newPage();
    await page.setViewport({{ width: 1440, height: 900 }});
    console.error('[*] Loading {TEST_URL}...');
    await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
    await new Promise(r => setTimeout(r, 3000));

    const start = Date.now();
    const maxWait = 90000;
    while (Date.now() - start < maxWait) {{
        const state = await page.evaluate(() => {{
            const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
            const solver = document.querySelector('.captcha-solver');
            return {{
                solved: resp ? resp.value.length > 0 : false,
                state: solver?.getAttribute('data-state'),
                text: solver?.textContent?.trim() || ''
            }};
        }});
        const sec = Math.round((Date.now() - start) / 1000);
        console.error('[*] ' + sec + 's state=' + state.state + ' solved=' + state.solved + ' text=' + state.text.slice(0,30));
        if (state.solved) {{ console.error('[+] SOLVED!'); break; }}
        if (state.state === 'error') {{ console.error('[!] ERROR'); break; }}
        await new Promise(r => setTimeout(r, 2000));
    }}

    const final = await page.evaluate(() => {{
        const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
        return {{ solved: resp ? resp.value.length > 0 : false, preview: resp?.value?.slice(0,50) || '' }};
    }});
    browser.disconnect();
    console.log(JSON.stringify(final));
}})();
'''
            (tmpdir / 's.js').write_text(script)
            print("\n[*] Solving CAPTCHA (10-60s)...")
            r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=120, capture_output=True, text=True)
            print(r.stderr)
            assert r.returncode == 0, f"Failed: {r.stderr}"
            # Last JSON line on stdout is the final solver state.
            final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1])
            assert final.get('solved'), f"Not solved: {final}"
            print(f"[+] SOLVED! {final.get('preview','')[:30]}...")
        finally:
            kill_chrome(process, chrome_dir)
def test_install_twice_uses_cache():
    """Test that running install twice uses existing cache on second run."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_api_key"

        # First install - downloads the extension
        result1 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        assert result1.returncode == 0, f"First install failed: {result1.stderr}"

        # Verify cache was created
        cache_file = ext_dir / "twocaptcha.extension.json"
        assert cache_file.exists(), "Cache file should exist after first install"

        # Second install - should use cache (shorter timeout: no download expected)
        result2 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"

        # BUG FIX: the original assertion ended with `or result2.returncode == 0`,
        # which is always true at this point (asserted just above), so the cache
        # check could never fail. Require an actual cache-reuse indicator.
        combined_output = result2.stdout + result2.stderr
        assert "already installed" in combined_output or "cache" in combined_output.lower(), \
            f"Second install should report cache reuse, got: {combined_output}"
def test_install_warns_without_api_key():
    """Test that install warns when API key not configured."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # BUG FIX: os.environ.copy() may carry a real API_KEY_2CAPTCHA from the
        # developer's shell, which would silently invalidate this test's
        # "no key configured" precondition. Remove it explicitly.
        env.pop("API_KEY_2CAPTCHA", None)

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should warn about missing API key (warning may go to stdout or stderr)
        combined_output = result.stdout + result.stderr
        assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
def test_install_success_with_api_key():
    """Test that install succeeds when API key is configured."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123"

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # BUG FIX: a "success" test must actually assert success, not just
        # grep the output.
        assert result.returncode == 0, f"Install failed: {result.stderr}"

        # Should mention API key configured
        combined_output = result.stdout + result.stderr
        assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
def test_config_script_structure():
    """Test that config script has proper structure"""
    # Static sanity checks on the config hook source (no execution needed).
    source = CONFIG_SCRIPT.read_text()

    # Should reference the configuration marker file
    mentions_marker = ("CONFIG_MARKER" in source) or ("twocaptcha_configured" in source)
    assert mentions_marker

    # Should reference the API key env var
    assert "API_KEY_2CAPTCHA" in source

    # Should define an entry point (async fn or a main)
    has_entry_point = ("async function" in source) or ("main" in source)
    assert has_entry_point
if __name__ == '__main__':
    # BUG FIX: propagate pytest's exit code so direct invocation (and CI)
    # reports failures instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, '-xvs']))

View File

@@ -14,7 +14,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_ublock.*'), None)
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
def test_install_script_exists():
@@ -158,26 +158,221 @@ def test_large_extension_size():
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium via the chrome launch hook and return (process, cdp_url).

    Starts the background launch hook with cwd=chrome_dir, then polls for
    chrome_dir/cdp_url.txt for up to 20 seconds.

    Raises:
        RuntimeError: if the hook exits before producing a CDP URL, or the
            URL file never appears within the timeout.
    """
    import time  # local import keeps the module header unchanged

    chrome_dir.mkdir(parents=True, exist_ok=True)
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Wait for Chromium to launch and CDP URL to be available
    cdp_url = None
    cdp_file = chrome_dir / 'cdp_url.txt'
    for _ in range(20):
        if chrome_launch_process.poll() is not None:
            # Hook died early: surface its output in the error.
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            break
        time.sleep(1)

    if not cdp_url:
        chrome_launch_process.kill()
        # BUG FIX: include the hook's captured output in the timeout error so
        # hangs are diagnosable (previously it was discarded).
        stdout, stderr = chrome_launch_process.communicate()
        raise RuntimeError(f"Chromium CDP URL not found after 20s\nStdout: {stdout}\nStderr: {stderr}")
    return chrome_launch_process, cdp_url
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
    """Clean up Chromium: SIGTERM the launch hook, then SIGKILL the browser PID.

    Best-effort cleanup: both steps tolerate already-dead processes and a
    missing/stale chrome.pid file.
    """
    import signal

    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    # BUG FIX: was a bare `except:` which also swallows KeyboardInterrupt and
    # SystemExit; catch only the expected failure modes.
    except (OSError, subprocess.TimeoutExpired):
        # Already exited, or refused to exit in 5s; SIGKILL below handles it.
        pass

    chrome_pid_file = chrome_dir / 'chrome.pid'
    if chrome_pid_file.exists():
        try:
            chrome_pid = int(chrome_pid_file.read_text().strip())
            os.kill(chrome_pid, signal.SIGKILL)
        except (OSError, ValueError):
            # Process already gone, or pid file held garbage.
            pass
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
    """Check ad blocking effectiveness by counting ad elements on page.

    Writes a throwaway Node script into script_dir and runs it against the
    already-running Chromium at cdp_url.

    Returns dict with:
    - adElementsFound: int - number of ad-related elements found
    - adElementsVisible: int - number of visible ad elements
    - adRequestsSeen: int - number of network requests to known ad domains
    - blockedRequests: int - number of blocked network requests (ads/trackers)
    - totalRequests: int - total network requests made
    - percentBlocked: int - percentage of ad elements hidden (0-100)

    Raises:
        RuntimeError: if the Node script fails or emits no JSON result.
    """
    test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');

(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    await page.setViewport({{ width: 1440, height: 900 }});

    // Track network requests
    let blockedRequests = 0;
    let totalRequests = 0;
    let adRequestsSeen = 0;
    const adDomains = ['doubleclick', 'googlesyndication', 'googleadservices', 'facebook.com/tr',
                       'analytics', 'adservice', 'advertising', 'taboola', 'outbrain', 'criteo',
                       'amazon-adsystem', 'ads.yahoo', 'gemini.yahoo', 'yimg.com/cv/', 'beap.gemini'];

    page.on('request', request => {{
        totalRequests++;
        const url = request.url().toLowerCase();
        if (adDomains.some(d => url.includes(d))) {{
            // BUG FIX: this branch was empty, so attempted ad requests were
            // never counted anywhere. Track them for the result payload.
            adRequestsSeen++;
        }}
    }});

    page.on('requestfailed', request => {{
        const url = request.url().toLowerCase();
        if (adDomains.some(d => url.includes(d))) {{
            blockedRequests++;
        }}
    }});

    console.error('Navigating to {test_url}...');
    await page.goto('{test_url}', {{ waitUntil: 'domcontentloaded', timeout: 60000 }});

    // Wait for page to fully render and ads to load
    await new Promise(r => setTimeout(r, 5000));

    // Check for ad elements in the DOM
    const result = await page.evaluate(() => {{
        // Common ad-related selectors
        const adSelectors = [
            // Generic ad containers
            '[class*="ad-"]', '[class*="ad_"]', '[class*="-ad"]', '[class*="_ad"]',
            '[id*="ad-"]', '[id*="ad_"]', '[id*="-ad"]', '[id*="_ad"]',
            '[class*="advertisement"]', '[id*="advertisement"]',
            '[class*="sponsored"]', '[id*="sponsored"]',
            // Google ads
            'ins.adsbygoogle', '[data-ad-client]', '[data-ad-slot]',
            // Yahoo specific
            '[class*="gemini"]', '[data-beacon]', '[class*="native-ad"]',
            '[class*="stream-ad"]', '[class*="LDRB"]', '[class*="ntv-ad"]',
            // iframes (often ads)
            'iframe[src*="ad"]', 'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]',
            // Common ad sizes
            '[style*="300px"][style*="250px"]', '[style*="728px"][style*="90px"]',
            '[style*="160px"][style*="600px"]', '[style*="320px"][style*="50px"]',
        ];

        let adElementsFound = 0;
        let adElementsVisible = 0;

        for (const selector of adSelectors) {{
            try {{
                const elements = document.querySelectorAll(selector);
                for (const el of elements) {{
                    adElementsFound++;
                    const style = window.getComputedStyle(el);
                    const rect = el.getBoundingClientRect();
                    const isVisible = style.display !== 'none' &&
                                      style.visibility !== 'hidden' &&
                                      style.opacity !== '0' &&
                                      rect.width > 0 && rect.height > 0;
                    if (isVisible) {{
                        adElementsVisible++;
                    }}
                }}
            }} catch (e) {{
                // Invalid selector, skip
            }}
        }}

        return {{
            adElementsFound,
            adElementsVisible,
            pageTitle: document.title
        }};
    }});

    result.adRequestsSeen = adRequestsSeen;
    result.blockedRequests = blockedRequests;
    result.totalRequests = totalRequests;

    // Calculate how many ad elements were hidden (found but not visible)
    const hiddenAds = result.adElementsFound - result.adElementsVisible;
    result.percentBlocked = result.adElementsFound > 0
        ? Math.round((hiddenAds / result.adElementsFound) * 100)
        : 0;

    console.error('Ad blocking result:', JSON.stringify(result));
    browser.disconnect();
    console.log(JSON.stringify(result));
}})();
'''
    script_path = script_dir / 'check_ads.js'
    script_path.write_text(test_script)
    result = subprocess.run(
        ['node', str(script_path)],
        cwd=str(script_dir),
        capture_output=True,
        text=True,
        env=env,
        timeout=90
    )
    if result.returncode != 0:
        raise RuntimeError(f"Ad check script failed: {result.stderr}")
    # Last JSON line on stdout carries the measurement payload.
    output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
    if not output_lines:
        raise RuntimeError(f"No JSON output from ad check: {result.stdout}\nstderr: {result.stderr}")
    return json.loads(output_lines[-1])
def setup_test_env(tmpdir: Path) -> dict:
"""Set up isolated data/lib directory structure for tests.
Creates structure like:
Creates structure matching real ArchiveBox data dir:
<tmpdir>/data/
lib/
arm64-darwin/ (or x86_64-linux, etc.)
npm/
bin/
.bin/
node_modules/
chrome_extensions/
personas/
default/
chrome_extensions/
users/
testuser/
crawls/
snapshots/
Calls chrome install hook which handles puppeteer-core and chromium installation.
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
"""
import platform
from datetime import datetime
# Determine machine type (matches archivebox.config.paths.get_machine_type())
machine = platform.machine().lower()
@@ -188,18 +383,28 @@ def setup_test_env(tmpdir: Path) -> dict:
machine = 'x86_64'
machine_type = f"{machine}-{system}"
# Create proper directory structure
# Create proper directory structure matching real ArchiveBox layout
data_dir = tmpdir / 'data'
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
npm_bin_dir = npm_dir / 'bin'
npm_bin_dir = npm_dir / '.bin'
node_modules_dir = npm_dir / 'node_modules'
chrome_extensions_dir = data_dir / 'chrome_extensions'
# Extensions go under personas/Default/
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
# User data goes under users/{username}/
date_str = datetime.now().strftime('%Y%m%d')
users_dir = data_dir / 'users' / 'testuser'
crawls_dir = users_dir / 'crawls' / date_str
snapshots_dir = users_dir / 'snapshots' / date_str
# Create all directories
node_modules_dir.mkdir(parents=True, exist_ok=True)
npm_bin_dir.mkdir(parents=True, exist_ok=True)
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
crawls_dir.mkdir(parents=True, exist_ok=True)
snapshots_dir.mkdir(parents=True, exist_ok=True)
# Build complete env dict
env = os.environ.copy()
@@ -210,12 +415,14 @@ def setup_test_env(tmpdir: Path) -> dict:
'NPM_BIN_DIR': str(npm_bin_dir),
'NODE_MODULES_DIR': str(node_modules_dir),
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
'CRAWLS_DIR': str(crawls_dir),
'SNAPSHOTS_DIR': str(snapshots_dir),
})
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
result = subprocess.run(
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True, text=True, timeout=10, env=env
capture_output=True, text=True, timeout=120, env=env
)
if result.returncode != 0:
pytest.skip(f"Chrome install hook failed: {result.stderr}")
@@ -240,8 +447,8 @@ def setup_test_env(tmpdir: Path) -> dict:
return env
# Test URL: ad blocker test page that shows if ads are blocked
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
# Test URL: Yahoo has many ads that uBlock should block
TEST_URL = 'https://www.yahoo.com/'
@pytest.mark.timeout(15)
@@ -290,14 +497,18 @@ def test_extension_loads_in_chromium():
print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True)
print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True)
print("[test] Launching Chromium...", flush=True)
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
# Launch Chromium in crawls directory
crawl_id = 'test-ublock'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
crawl_dir.mkdir(parents=True, exist_ok=True)
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True, exist_ok=True)
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
cwd=str(crawl_dir),
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -457,161 +668,177 @@ const puppeteer = require('puppeteer-core');
def test_blocks_ads_on_test_page():
"""Live test: verify uBlock Origin blocks ads on a test page.
Uses Chromium with extensions loaded automatically via chrome hook.
Tests against d3ward's ad blocker test page which checks ad domains.
This test runs TWO browser sessions:
1. WITHOUT extension - verifies ads are NOT blocked (baseline)
2. WITH extension - verifies ads ARE blocked
This ensures we're actually testing the extension's effect, not just
that a test page happens to show ads as blocked.
"""
import signal
import time
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated env with proper directory structure
env = setup_test_env(tmpdir)
env['CHROME_HEADLESS'] = 'true'
env_base = setup_test_env(tmpdir)
env_base['CHROME_HEADLESS'] = 'true'
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
# ============================================================
# STEP 1: BASELINE - Run WITHOUT extension, verify ads are NOT blocked
# ============================================================
print("\n" + "="*60)
print("STEP 1: BASELINE TEST (no extension)")
print("="*60)
data_dir = Path(env_base['DATA_DIR'])
env_no_ext = env_base.copy()
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
# Launch baseline Chromium in crawls directory
baseline_crawl_id = 'baseline-no-ext'
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
baseline_process = None
try:
baseline_process, baseline_cdp_url = launch_chromium_session(
env_no_ext, baseline_chrome_dir, baseline_crawl_id
)
print(f"Baseline Chromium launched: {baseline_cdp_url}")
# Wait a moment for browser to be ready
time.sleep(2)
baseline_result = check_ad_blocking(
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
)
print(f"Baseline result: {baseline_result['adElementsVisible']} visible ads "
f"(found {baseline_result['adElementsFound']} ad elements)")
finally:
if baseline_process:
kill_chromium_session(baseline_process, baseline_chrome_dir)
# Verify baseline shows ads ARE visible (not blocked)
if baseline_result['adElementsFound'] == 0:
pytest.skip(
f"Cannot test extension: no ad elements found on {TEST_URL}. "
f"The page may have changed or loaded differently."
)
if baseline_result['adElementsVisible'] == 0:
print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!")
print("This suggests either:")
print(" - There's another ad blocker interfering")
print(" - Network-level ad blocking is in effect")
pytest.skip(
f"Cannot test extension: baseline shows no visible ads "
f"despite finding {baseline_result['adElementsFound']} ad elements."
)
print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension")
# ============================================================
# STEP 2: Install the uBlock extension
# ============================================================
print("\n" + "="*60)
print("STEP 2: INSTALLING EXTENSION")
print("="*60)
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
# Step 1: Install the uBlock extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=15
env=env_base,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
cache_file = ext_dir / 'ublock.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
# ============================================================
# STEP 3: Run WITH extension, verify ads ARE blocked
# ============================================================
print("\n" + "="*60)
print("STEP 3: TEST WITH EXTENSION")
print("="*60)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
cwd=str(crawl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
assert cdp_url, "Chrome CDP URL not found after 20s"
print(f"Chrome launched with CDP URL: {cdp_url}")
# Check that extensions were loaded
extensions_file = chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
# Launch extension test Chromium in crawls directory
ext_crawl_id = 'test-with-ext'
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
ext_chrome_dir = ext_crawl_dir / 'chrome'
env_base['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
ext_process = None
try:
# Step 3: Connect to Chrome and test ad blocking
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
ext_process, ext_cdp_url = launch_chromium_session(
env_base, ext_chrome_dir, ext_crawl_id
)
print(f"Extension Chromium launched: {ext_cdp_url}")
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
# Check that extension was loaded
extensions_file = ext_chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
// Wait for extension to initialize
await new Promise(r => setTimeout(r, 500));
# Wait for extension to initialize
time.sleep(3)
// Check extension loaded by looking at targets
const targets = browser.targets();
const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
console.error('Extension targets found:', extTargets.length);
extTargets.forEach(t => console.error(' -', t.type(), t.url().substring(0, 60)));
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
console.error('Navigating to {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }});
// Wait for the test page to run its checks
await new Promise(r => setTimeout(r, 5000));
// The d3ward test page shows blocked percentage
const result = await page.evaluate(() => {{
const scoreEl = document.querySelector('#score');
const score = scoreEl ? scoreEl.textContent : null;
const blockedItems = document.querySelectorAll('.blocked').length;
const totalItems = document.querySelectorAll('.testlist li').length;
return {{
score,
blockedItems,
totalItems,
percentBlocked: totalItems > 0 ? Math.round((blockedItems / totalItems) * 100) : 0
}};
}});
console.error('Ad blocking result:', JSON.stringify(result));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
script_path = tmpdir / 'test_ublock.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=10
ext_result = check_ad_blocking(
ext_cdp_url, TEST_URL, env_base, tmpdir
)
print(f"stderr: {result.stderr}")
print(f"stdout: {result.stdout}")
assert result.returncode == 0, f"Test failed: {result.stderr}"
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
assert output_lines, f"No JSON output: {result.stdout}"
test_result = json.loads(output_lines[-1])
# uBlock should block most ad domains on the test page
assert test_result['percentBlocked'] >= 50, \
f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}"
print(f"Extension result: {ext_result['adElementsVisible']} visible ads "
f"(found {ext_result['adElementsFound']} ad elements)")
finally:
# Clean up Chrome
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
if ext_process:
kill_chromium_session(ext_process, ext_chrome_dir)
# ============================================================
# STEP 4: Compare results
# ============================================================
print("\n" + "="*60)
print("STEP 4: COMPARISON")
print("="*60)
print(f"Baseline (no extension): {baseline_result['adElementsVisible']} visible ads")
print(f"With extension: {ext_result['adElementsVisible']} visible ads")
# Calculate reduction in visible ads
ads_blocked = baseline_result['adElementsVisible'] - ext_result['adElementsVisible']
reduction_percent = (ads_blocked / baseline_result['adElementsVisible'] * 100) if baseline_result['adElementsVisible'] > 0 else 0
print(f"Reduction: {ads_blocked} fewer visible ads ({reduction_percent:.0f}% reduction)")
# Extension should significantly reduce visible ads
assert ext_result['adElementsVisible'] < baseline_result['adElementsVisible'], \
f"uBlock should reduce visible ads.\n" \
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Expected fewer ads with extension."
# Extension should block at least 30% of ads
assert reduction_percent >= 30, \
f"uBlock should block at least 30% of ads.\n" \
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Reduction: only {reduction_percent:.0f}% (expected at least 30%)"
print(f"\n✓ SUCCESS: uBlock correctly blocks ads!")
print(f" - Baseline: {baseline_result['adElementsVisible']} visible ads")
print(f" - With extension: {ext_result['adElementsVisible']} visible ads")
print(f" - Blocked: {ads_blocked} ads ({reduction_percent:.0f}% reduction)")

View File

@@ -133,7 +133,7 @@ This plugin provides shared Chrome infrastructure for other plugins. It manages
chrome/
├── on_Crawl__00_chrome_install_config.py # Configure Chrome settings
├── on_Crawl__00_chrome_install.py # Install Chrome binary
├── on_Crawl__20_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg)
├── on_Crawl__30_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg)
├── on_Snapshot__20_chrome_tab.bg.js # Open tab (Snapshot-level, bg)
├── on_Snapshot__30_chrome_navigate.js # Navigate to URL (foreground)
├── on_Snapshot__45_chrome_tab_cleanup.py # Close tab, kill bg hooks