mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-02 17:05:38 +10:00
extension test fixes
This commit is contained in:
@@ -8,7 +8,7 @@
|
||||
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
* --load-extension and --disable-extensions-except flags.
|
||||
*
|
||||
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Output: Writes to current directory (executor creates chrome/ dir):
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - chrome.pid: Chromium process ID (for cleanup)
|
||||
@@ -165,14 +165,6 @@ async function main() {
|
||||
chromePid = result.pid;
|
||||
const cdpUrl = result.cdpUrl;
|
||||
|
||||
// Write extensions metadata
|
||||
if (installedExtensions.length > 0) {
|
||||
fs.writeFileSync(
|
||||
path.join(OUTPUT_DIR, 'extensions.json'),
|
||||
JSON.stringify(installedExtensions, null, 2)
|
||||
);
|
||||
}
|
||||
|
||||
// Connect puppeteer for extension verification
|
||||
console.error(`[*] Connecting puppeteer to CDP...`);
|
||||
const browser = await puppeteer.connect({
|
||||
@@ -181,30 +173,84 @@ async function main() {
|
||||
});
|
||||
browserInstance = browser;
|
||||
|
||||
// Verify extensions loaded
|
||||
// Get actual extension IDs from chrome://extensions page
|
||||
if (extensionPaths.length > 0) {
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
const targets = browser.targets();
|
||||
console.error(`[*] All browser targets (${targets.length}):`);
|
||||
for (const t of targets) {
|
||||
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
|
||||
try {
|
||||
const extPage = await browser.newPage();
|
||||
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
// Parse extension info from the page
|
||||
const extensionsFromPage = await extPage.evaluate(() => {
|
||||
const extensions = [];
|
||||
// Extensions manager uses shadow DOM
|
||||
const manager = document.querySelector('extensions-manager');
|
||||
if (!manager || !manager.shadowRoot) return extensions;
|
||||
|
||||
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
|
||||
if (!itemList || !itemList.shadowRoot) return extensions;
|
||||
|
||||
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
|
||||
for (const item of items) {
|
||||
const id = item.getAttribute('id');
|
||||
const nameEl = item.shadowRoot?.querySelector('#name');
|
||||
const name = nameEl?.textContent?.trim() || '';
|
||||
if (id && name) {
|
||||
extensions.push({ id, name });
|
||||
}
|
||||
}
|
||||
return extensions;
|
||||
});
|
||||
|
||||
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
|
||||
for (const e of extensionsFromPage) {
|
||||
console.error(` - ${e.id}: "${e.name}"`);
|
||||
}
|
||||
|
||||
// Match extensions by name (strict matching)
|
||||
for (const ext of installedExtensions) {
|
||||
// Read the extension's manifest to get its display name
|
||||
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
|
||||
if (fs.existsSync(manifestPath)) {
|
||||
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
|
||||
const manifestName = manifest.name || '';
|
||||
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
|
||||
|
||||
// Find matching extension from page by exact name match first
|
||||
let match = extensionsFromPage.find(e => e.name === manifestName);
|
||||
|
||||
// If no exact match, try case-insensitive exact match
|
||||
if (!match) {
|
||||
match = extensionsFromPage.find(e =>
|
||||
e.name.toLowerCase() === manifestName.toLowerCase()
|
||||
);
|
||||
}
|
||||
|
||||
if (match) {
|
||||
ext.id = match.id;
|
||||
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
|
||||
} else {
|
||||
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await extPage.close();
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
|
||||
}
|
||||
|
||||
const extTargets = targets.filter(t =>
|
||||
t.url().startsWith('chrome-extension://') ||
|
||||
t.type() === 'service_worker' ||
|
||||
t.type() === 'background_page'
|
||||
);
|
||||
|
||||
// Filter out built-in extensions
|
||||
// Fallback: check browser targets
|
||||
const targets = browser.targets();
|
||||
const builtinIds = [
|
||||
'nkeimhogjdpnpccoofpliimaahmaaome',
|
||||
'fignfifoniblkonapihmkfakmlgkbkcf',
|
||||
'ahfgeienlihckogmohjhadlkjgocpleb',
|
||||
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
|
||||
];
|
||||
const customExtTargets = extTargets.filter(t => {
|
||||
const customExtTargets = targets.filter(t => {
|
||||
const url = t.url();
|
||||
if (!url.startsWith('chrome-extension://')) return false;
|
||||
const extId = url.split('://')[1].split('/')[0];
|
||||
@@ -216,7 +262,7 @@ async function main() {
|
||||
for (const target of customExtTargets) {
|
||||
const url = target.url();
|
||||
const extId = url.split('://')[1].split('/')[0];
|
||||
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
|
||||
console.error(`[+] Extension target: ${extId} (${target.type()})`);
|
||||
}
|
||||
|
||||
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
|
||||
@@ -225,6 +271,14 @@ async function main() {
|
||||
}
|
||||
}
|
||||
|
||||
// Write extensions metadata with actual IDs
|
||||
if (installedExtensions.length > 0) {
|
||||
fs.writeFileSync(
|
||||
path.join(OUTPUT_DIR, 'extensions.json'),
|
||||
JSON.stringify(installedExtensions, null, 2)
|
||||
);
|
||||
}
|
||||
|
||||
console.error(`[+] Chromium session started for crawl ${crawlId}`);
|
||||
console.error(`[+] CDP URL: ${cdpUrl}`);
|
||||
console.error(`[+] PID: ${chromePid}`);
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
/**
|
||||
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
|
||||
*
|
||||
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
|
||||
* If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
|
||||
* this connects to it and creates a new tab. Otherwise, falls back to launching
|
||||
* its own Chrome instance.
|
||||
*
|
||||
@@ -215,7 +215,7 @@ async function launchNewChrome(url, binary) {
|
||||
console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
|
||||
|
||||
// Write PID immediately for cleanup
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
|
||||
|
||||
try {
|
||||
// Wait for Chrome to be ready
|
||||
|
||||
@@ -29,7 +29,7 @@ import shutil
|
||||
import platform
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
|
||||
@@ -176,6 +176,7 @@ def test_chrome_launch_and_tab_creation():
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
|
||||
# Get test environment with NODE_MODULES_DIR set
|
||||
env = get_test_env()
|
||||
@@ -184,7 +185,7 @@ def test_chrome_launch_and_tab_creation():
|
||||
# Launch Chrome at crawl level (background process)
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -292,7 +293,7 @@ def test_chrome_navigation():
|
||||
# Launch Chrome (background process)
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -363,7 +364,7 @@ def test_tab_cleanup_on_sigterm():
|
||||
# Launch Chrome (background process)
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -423,11 +424,12 @@ def test_multiple_snapshots_share_chrome():
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
|
||||
# Launch Chrome at crawl level
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -513,7 +515,7 @@ def test_chrome_cleanup_on_crawl_end():
|
||||
# Launch Chrome in background
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -554,11 +556,12 @@ def test_zombie_prevention_hook_killed():
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
|
||||
# Launch Chrome
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
|
||||
@@ -26,7 +26,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
TEST_URL = 'https://www.singsing.movie/'
|
||||
@@ -122,6 +122,7 @@ def setup_chrome_session(tmpdir):
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
@@ -129,7 +130,7 @@ def setup_chrome_session(tmpdir):
|
||||
# Launch Chrome at crawl level
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
|
||||
@@ -16,7 +16,7 @@ import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_istilldontcareaboutcookies.*'), None)
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
|
||||
|
||||
|
||||
def test_install_script_exists():
|
||||
@@ -124,78 +124,106 @@ def test_no_configuration_required():
|
||||
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
|
||||
|
||||
|
||||
def setup_test_lib_dirs(tmpdir: Path) -> dict:
|
||||
"""Create isolated lib directories for tests and return env dict.
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
|
||||
Sets up:
|
||||
LIB_DIR: tmpdir/lib/<arch>
|
||||
NODE_MODULES_DIR: tmpdir/lib/<arch>/npm/node_modules
|
||||
NPM_BIN_DIR: tmpdir/lib/<arch>/npm/bin
|
||||
PIP_VENV_DIR: tmpdir/lib/<arch>/pip/venv
|
||||
PIP_BIN_DIR: tmpdir/lib/<arch>/pip/venv/bin
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for tests.
|
||||
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
.bin/
|
||||
node_modules/
|
||||
personas/
|
||||
Default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
"""
|
||||
import platform
|
||||
arch = platform.machine()
|
||||
from datetime import datetime
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
arch_dir = f"{arch}-{system}"
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
lib_dir = tmpdir / 'lib' / arch_dir
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
npm_bin_dir = npm_dir / 'bin'
|
||||
pip_venv_dir = lib_dir / 'pip' / 'venv'
|
||||
pip_bin_dir = pip_venv_dir / 'bin'
|
||||
|
||||
# Create directories
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
pip_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Install puppeteer-core to the test node_modules if not present
|
||||
if not (node_modules_dir / 'puppeteer-core').exists():
|
||||
result = subprocess.run(
|
||||
['npm', 'install', '--prefix', str(npm_dir), 'puppeteer-core'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Failed to install puppeteer-core: {result.stderr}")
|
||||
|
||||
return {
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'PIP_VENV_DIR': str(pip_venv_dir),
|
||||
'PIP_BIN_DIR': str(pip_bin_dir),
|
||||
}
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
|
||||
|
||||
def find_chromium_binary():
|
||||
"""Find the Chromium binary using chrome_utils.js findChromium().
|
||||
|
||||
This uses the centralized findChromium() function which checks:
|
||||
- CHROME_BINARY env var
|
||||
- @puppeteer/browsers install locations
|
||||
- System Chromium locations
|
||||
- Falls back to Chrome (with warning)
|
||||
"""
|
||||
chrome_utils = PLUGINS_ROOT / 'chrome' / 'chrome_utils.js'
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['node', str(chrome_utils), 'findChromium'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode == 0 and result.stdout.strip():
|
||||
return result.stdout.strip()
|
||||
return None
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
|
||||
TEST_URL = 'https://www.filmin.es/'
|
||||
|
||||
@@ -210,22 +238,11 @@ def test_extension_loads_in_chromium():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated lib directories for this test
|
||||
lib_env = setup_test_lib_dirs(tmpdir)
|
||||
# Set up isolated env with proper directory structure
|
||||
env = setup_test_env(tmpdir)
|
||||
env.setdefault('CHROME_HEADLESS', 'true')
|
||||
|
||||
# Set up extensions directory
|
||||
ext_dir = tmpdir / 'chrome_extensions'
|
||||
ext_dir.mkdir(parents=True)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update(lib_env)
|
||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Ensure CHROME_BINARY points to Chromium
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
|
||||
|
||||
# Step 1: Install the extension
|
||||
result = subprocess.run(
|
||||
@@ -245,13 +262,16 @@ def test_extension_loads_in_chromium():
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
crawl_dir = tmpdir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
crawl_id = 'test-cookies'
|
||||
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
|
||||
crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
|
||||
cwd=str(crawl_dir),
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -400,156 +420,362 @@ const puppeteer = require('puppeteer-core');
|
||||
pass
|
||||
|
||||
|
||||
def test_hides_cookie_consent_on_filmin():
|
||||
"""Live test: verify extension hides cookie consent popup on filmin.es.
|
||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
||||
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
Uses Chromium with extensions loaded automatically via chrome hook.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Set up isolated lib directories for this test
|
||||
lib_env = setup_test_lib_dirs(tmpdir)
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
# Set up extensions directory
|
||||
ext_dir = tmpdir / 'chrome_extensions'
|
||||
ext_dir.mkdir(parents=True)
|
||||
if not cdp_url:
|
||||
chrome_launch_process.kill()
|
||||
raise RuntimeError("Chromium CDP URL not found after 20s")
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update(lib_env)
|
||||
env['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
return chrome_launch_process, cdp_url
|
||||
|
||||
# Ensure CHROME_BINARY points to Chromium
|
||||
chromium = find_chromium_binary()
|
||||
if chromium:
|
||||
env['CHROME_BINARY'] = chromium
|
||||
|
||||
# Step 1: Install the extension
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
# Verify extension cache was created
|
||||
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
|
||||
assert cache_file.exists(), "Extension cache not created"
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
crawl_dir = tmpdir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cookies'],
|
||||
cwd=str(crawl_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
assert cdp_url, "Chromium CDP URL not found after 20s"
|
||||
print(f"Chromium launched with CDP URL: {cdp_url}")
|
||||
|
||||
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
|
||||
"""Clean up Chromium process."""
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
# Step 3: Connect to Chromium and test cookie consent hiding
|
||||
test_script = f'''
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
||||
"""Check if cookie consent elements are visible on a page.
|
||||
|
||||
Returns dict with:
|
||||
- visible: bool - whether any cookie consent element is visible
|
||||
- selector: str - which selector matched (if visible)
|
||||
- elements_found: list - all cookie-related elements found in DOM
|
||||
- html_snippet: str - snippet of the page HTML for debugging
|
||||
"""
|
||||
test_script = f'''
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
|
||||
// Wait for extension to initialize
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
|
||||
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
|
||||
await page.setViewport({{ width: 1440, height: 900 }});
|
||||
|
||||
console.error('Navigating to {TEST_URL}...');
|
||||
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
|
||||
console.error('Navigating to {test_url}...');
|
||||
await page.goto('{test_url}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
|
||||
|
||||
// Wait for extension content script to process page
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
// Wait for page to fully render and any cookie scripts to run
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
// Check cookie consent visibility
|
||||
// Check cookie consent visibility using multiple common selectors
|
||||
const result = await page.evaluate(() => {{
|
||||
const selectors = ['.cky-consent-container', '.cky-popup-center', '.cky-overlay'];
|
||||
// Common cookie consent selectors used by various consent management platforms
|
||||
const selectors = [
|
||||
// CookieYes
|
||||
'.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal',
|
||||
// OneTrust
|
||||
'#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter',
|
||||
// Cookiebot
|
||||
'#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay',
|
||||
// Generic cookie banners
|
||||
'[class*="cookie-consent"]', '[class*="cookie-banner"]', '[class*="cookie-notice"]',
|
||||
'[class*="cookie-popup"]', '[class*="cookie-modal"]', '[class*="cookie-dialog"]',
|
||||
'[id*="cookie-consent"]', '[id*="cookie-banner"]', '[id*="cookie-notice"]',
|
||||
'[id*="cookieconsent"]', '[id*="cookie-law"]',
|
||||
// GDPR banners
|
||||
'[class*="gdpr"]', '[id*="gdpr"]',
|
||||
// Consent banners
|
||||
'[class*="consent-banner"]', '[class*="consent-modal"]', '[class*="consent-popup"]',
|
||||
// Privacy banners
|
||||
'[class*="privacy-banner"]', '[class*="privacy-notice"]',
|
||||
// Common frameworks
|
||||
'.cc-window', '.cc-banner', '#cc-main', // Cookie Consent by Insites
|
||||
'.qc-cmp2-container', // Quantcast
|
||||
'.sp-message-container', // SourcePoint
|
||||
];
|
||||
|
||||
const elementsFound = [];
|
||||
let visibleElement = null;
|
||||
|
||||
for (const sel of selectors) {{
|
||||
const el = document.querySelector(sel);
|
||||
if (el) {{
|
||||
const style = window.getComputedStyle(el);
|
||||
const rect = el.getBoundingClientRect();
|
||||
const visible = style.display !== 'none' &&
|
||||
style.visibility !== 'hidden' &&
|
||||
rect.width > 0 && rect.height > 0;
|
||||
if (visible) return {{ visible: true, selector: sel }};
|
||||
try {{
|
||||
const elements = document.querySelectorAll(sel);
|
||||
for (const el of elements) {{
|
||||
const style = window.getComputedStyle(el);
|
||||
const rect = el.getBoundingClientRect();
|
||||
const isVisible = style.display !== 'none' &&
|
||||
style.visibility !== 'hidden' &&
|
||||
style.opacity !== '0' &&
|
||||
rect.width > 0 && rect.height > 0;
|
||||
|
||||
elementsFound.push({{
|
||||
selector: sel,
|
||||
visible: isVisible,
|
||||
display: style.display,
|
||||
visibility: style.visibility,
|
||||
opacity: style.opacity,
|
||||
width: rect.width,
|
||||
height: rect.height
|
||||
}});
|
||||
|
||||
if (isVisible && !visibleElement) {{
|
||||
visibleElement = {{ selector: sel, width: rect.width, height: rect.height }};
|
||||
}}
|
||||
}}
|
||||
}} catch (e) {{
|
||||
// Invalid selector, skip
|
||||
}}
|
||||
}}
|
||||
return {{ visible: false }};
|
||||
|
||||
// Also grab a snippet of the HTML to help debug
|
||||
const bodyHtml = document.body.innerHTML.slice(0, 2000);
|
||||
const hasCookieKeyword = bodyHtml.toLowerCase().includes('cookie') ||
|
||||
bodyHtml.toLowerCase().includes('consent') ||
|
||||
bodyHtml.toLowerCase().includes('gdpr');
|
||||
|
||||
return {{
|
||||
visible: visibleElement !== null,
|
||||
selector: visibleElement ? visibleElement.selector : null,
|
||||
elements_found: elementsFound,
|
||||
has_cookie_keyword_in_html: hasCookieKeyword,
|
||||
html_snippet: bodyHtml.slice(0, 500)
|
||||
}};
|
||||
}});
|
||||
|
||||
console.error('Cookie consent:', JSON.stringify(result));
|
||||
console.error('Cookie consent check result:', JSON.stringify({{
|
||||
visible: result.visible,
|
||||
selector: result.selector,
|
||||
elements_found_count: result.elements_found.length
|
||||
}}));
|
||||
|
||||
browser.disconnect();
|
||||
console.log(JSON.stringify(result));
|
||||
}})();
|
||||
'''
|
||||
script_path = tmpdir / 'test_extension.js'
|
||||
script_path.write_text(test_script)
|
||||
script_path = script_dir / 'check_cookies.js'
|
||||
script_path.write_text(test_script)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(script_path)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
result = subprocess.run(
|
||||
['node', str(script_path)],
|
||||
cwd=str(script_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"Cookie check script failed: {result.stderr}")
|
||||
|
||||
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
|
||||
if not output_lines:
|
||||
raise RuntimeError(f"No JSON output from cookie check: {result.stdout}\nstderr: {result.stderr}")
|
||||
|
||||
return json.loads(output_lines[-1])
|
||||
|
||||
|
||||
def test_hides_cookie_consent_on_filmin():
|
||||
"""Live test: verify extension hides cookie consent popup on filmin.es.
|
||||
|
||||
This test runs TWO browser sessions:
|
||||
1. WITHOUT extension - verifies cookie consent IS visible (baseline)
|
||||
2. WITH extension - verifies cookie consent is HIDDEN
|
||||
|
||||
This ensures we're actually testing the extension's effect, not just
|
||||
that a page happens to not have cookie consent.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated env with proper directory structure
|
||||
env_base = setup_test_env(tmpdir)
|
||||
env_base['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
|
||||
|
||||
# ============================================================
|
||||
# STEP 1: BASELINE - Run WITHOUT extension, verify cookie consent IS visible
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 1: BASELINE TEST (no extension)")
|
||||
print("="*60)
|
||||
|
||||
data_dir = Path(env_base['DATA_DIR'])
|
||||
|
||||
env_no_ext = env_base.copy()
|
||||
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
|
||||
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Launch baseline Chromium in crawls directory
|
||||
baseline_crawl_id = 'baseline-no-ext'
|
||||
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
|
||||
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
|
||||
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
|
||||
baseline_process = None
|
||||
|
||||
try:
|
||||
baseline_process, baseline_cdp_url = launch_chromium_session(
|
||||
env_no_ext, baseline_chrome_dir, baseline_crawl_id
|
||||
)
|
||||
print(f"Baseline Chromium launched: {baseline_cdp_url}")
|
||||
|
||||
# Wait a moment for browser to be ready
|
||||
time.sleep(2)
|
||||
|
||||
baseline_result = check_cookie_consent_visibility(
|
||||
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
|
||||
)
|
||||
|
||||
print(f"stderr: {result.stderr}")
|
||||
print(f"stdout: {result.stdout}")
|
||||
print(f"Baseline result: visible={baseline_result['visible']}, "
|
||||
f"elements_found={len(baseline_result['elements_found'])}")
|
||||
|
||||
assert result.returncode == 0, f"Test failed: {result.stderr}"
|
||||
|
||||
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
|
||||
assert output_lines, f"No JSON output: {result.stdout}"
|
||||
|
||||
test_result = json.loads(output_lines[-1])
|
||||
assert not test_result['visible'], \
|
||||
f"Cookie consent should be hidden by extension. Result: {test_result}"
|
||||
if baseline_result['elements_found']:
|
||||
print("Elements found in baseline:")
|
||||
for el in baseline_result['elements_found'][:5]: # Show first 5
|
||||
print(f" - {el['selector']}: visible={el['visible']}, "
|
||||
f"display={el['display']}, size={el['width']}x{el['height']}")
|
||||
|
||||
finally:
|
||||
# Clean up Chromium
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
if baseline_process:
|
||||
kill_chromium_session(baseline_process, baseline_chrome_dir)
|
||||
|
||||
# Verify baseline shows cookie consent
|
||||
if not baseline_result['visible']:
|
||||
# If no cookie consent visible in baseline, we can't test the extension
|
||||
# This could happen if:
|
||||
# - The site changed and no longer shows cookie consent
|
||||
# - Cookie consent is region-specific
|
||||
# - Our selectors don't match this site
|
||||
print("\nWARNING: No cookie consent visible in baseline!")
|
||||
print(f"HTML has cookie keywords: {baseline_result.get('has_cookie_keyword_in_html')}")
|
||||
print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}")
|
||||
|
||||
pytest.skip(
|
||||
f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. "
|
||||
f"Elements found: {len(baseline_result['elements_found'])}. "
|
||||
f"The site may have changed or cookie consent may be region-specific."
|
||||
)
|
||||
|
||||
print(f"\n✓ Baseline confirmed: Cookie consent IS visible (selector: {baseline_result['selector']})")
|
||||
|
||||
# ============================================================
|
||||
# STEP 2: Install the extension
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 2: INSTALLING EXTENSION")
|
||||
print("="*60)
|
||||
|
||||
env_with_ext = env_base.copy()
|
||||
env_with_ext['CHROME_EXTENSIONS_DIR'] = str(ext_dir)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env_with_ext,
|
||||
timeout=60
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json'
|
||||
assert cache_file.exists(), "Extension cache not created"
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# ============================================================
|
||||
# STEP 3: Run WITH extension, verify cookie consent is HIDDEN
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 3: TEST WITH EXTENSION")
|
||||
print("="*60)
|
||||
|
||||
# Launch extension test Chromium in crawls directory
|
||||
ext_crawl_id = 'test-with-ext'
|
||||
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
|
||||
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
ext_chrome_dir = ext_crawl_dir / 'chrome'
|
||||
env_with_ext['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
|
||||
ext_process = None
|
||||
|
||||
try:
|
||||
ext_process, ext_cdp_url = launch_chromium_session(
|
||||
env_with_ext, ext_chrome_dir, ext_crawl_id
|
||||
)
|
||||
print(f"Extension Chromium launched: {ext_cdp_url}")
|
||||
|
||||
# Check that extension was loaded
|
||||
extensions_file = ext_chrome_dir / 'extensions.json'
|
||||
if extensions_file.exists():
|
||||
loaded_exts = json.loads(extensions_file.read_text())
|
||||
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
|
||||
|
||||
# Wait for extension to initialize
|
||||
time.sleep(3)
|
||||
|
||||
ext_result = check_cookie_consent_visibility(
|
||||
ext_cdp_url, TEST_URL, env_with_ext, tmpdir
|
||||
)
|
||||
|
||||
print(f"Extension result: visible={ext_result['visible']}, "
|
||||
f"elements_found={len(ext_result['elements_found'])}")
|
||||
|
||||
if ext_result['elements_found']:
|
||||
print("Elements found with extension:")
|
||||
for el in ext_result['elements_found'][:5]:
|
||||
print(f" - {el['selector']}: visible={el['visible']}, "
|
||||
f"display={el['display']}, size={el['width']}x{el['height']}")
|
||||
|
||||
finally:
|
||||
if ext_process:
|
||||
kill_chromium_session(ext_process, ext_chrome_dir)
|
||||
|
||||
# ============================================================
|
||||
# STEP 4: Compare results
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 4: COMPARISON")
|
||||
print("="*60)
|
||||
print(f"Baseline (no extension): cookie consent visible = {baseline_result['visible']}")
|
||||
print(f"With extension: cookie consent visible = {ext_result['visible']}")
|
||||
|
||||
assert baseline_result['visible'], \
|
||||
"Baseline should show cookie consent (this shouldn't happen, we checked above)"
|
||||
|
||||
assert not ext_result['visible'], \
|
||||
f"Cookie consent should be HIDDEN by extension.\n" \
|
||||
f"Baseline showed consent at: {baseline_result['selector']}\n" \
|
||||
f"But with extension, consent is still visible.\n" \
|
||||
f"Elements still visible: {[e for e in ext_result['elements_found'] if e['visible']]}"
|
||||
|
||||
print("\n✓ SUCCESS: Extension correctly hides cookie consent!")
|
||||
print(f" - Baseline showed consent at: {baseline_result['selector']}")
|
||||
print(f" - Extension successfully hid it")
|
||||
|
||||
@@ -26,7 +26,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
TEST_URL = 'https://www.singsing.movie/'
|
||||
@@ -123,6 +123,7 @@ def setup_chrome_session(tmpdir):
|
||||
crawl_dir = Path(tmpdir) / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
|
||||
env = get_test_env()
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
@@ -130,7 +131,7 @@ def setup_chrome_session(tmpdir):
|
||||
# Launch Chrome at crawl level
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-modalcloser'],
|
||||
cwd=str(crawl_dir),
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
|
||||
@@ -4,18 +4,47 @@
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"CAPTCHA2_ENABLED": {
|
||||
"TWOCAPTCHA_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["USE_CAPTCHA2"],
|
||||
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
|
||||
"x-aliases": ["CAPTCHA2_ENABLED", "USE_CAPTCHA2", "USE_TWOCAPTCHA"],
|
||||
"description": "Enable 2captcha browser extension for automatic CAPTCHA solving"
|
||||
},
|
||||
"CAPTCHA2_TIMEOUT": {
|
||||
"TWOCAPTCHA_API_KEY": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-aliases": ["API_KEY_2CAPTCHA", "CAPTCHA2_API_KEY"],
|
||||
"x-sensitive": true,
|
||||
"description": "2captcha API key for CAPTCHA solving service (get from https://2captcha.com)"
|
||||
},
|
||||
"TWOCAPTCHA_RETRY_COUNT": {
|
||||
"type": "integer",
|
||||
"default": 3,
|
||||
"minimum": 0,
|
||||
"maximum": 10,
|
||||
"x-aliases": ["CAPTCHA2_RETRY_COUNT"],
|
||||
"description": "Number of times to retry CAPTCHA solving on error"
|
||||
},
|
||||
"TWOCAPTCHA_RETRY_DELAY": {
|
||||
"type": "integer",
|
||||
"default": 5,
|
||||
"minimum": 0,
|
||||
"maximum": 60,
|
||||
"x-aliases": ["CAPTCHA2_RETRY_DELAY"],
|
||||
"description": "Delay in seconds between CAPTCHA solving retries"
|
||||
},
|
||||
"TWOCAPTCHA_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"x-aliases": ["CAPTCHA2_TIMEOUT"],
|
||||
"description": "Timeout for CAPTCHA solving in seconds"
|
||||
},
|
||||
"TWOCAPTCHA_AUTO_SUBMIT": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "Automatically submit forms after CAPTCHA is solved"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* Requirements:
|
||||
* - API_KEY_2CAPTCHA environment variable must be set
|
||||
* - TWOCAPTCHA_API_KEY environment variable must be set
|
||||
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
|
||||
*/
|
||||
|
||||
@@ -47,10 +47,10 @@ async function installCaptchaExtension() {
|
||||
}
|
||||
|
||||
// Check if API key is configured
|
||||
const apiKey = process.env.API_KEY_2CAPTCHA;
|
||||
const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
|
||||
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
|
||||
console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
|
||||
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
|
||||
console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
|
||||
console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
|
||||
} else {
|
||||
console.log('[+] 2captcha extension installed and API key configured');
|
||||
}
|
||||
|
||||
@@ -2,14 +2,21 @@
|
||||
/**
|
||||
* 2Captcha Extension Configuration
|
||||
*
|
||||
* Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
|
||||
* Runs once per crawl to inject API key into extension storage.
|
||||
* Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts.
|
||||
* Runs once per crawl to inject configuration into extension storage.
|
||||
*
|
||||
* Priority: 11 (after chrome_launch at 20)
|
||||
* Priority: 25 (after chrome_launch at 30, before snapshots start)
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* Config Options (from config.json / environment):
|
||||
* - TWOCAPTCHA_API_KEY: API key for 2captcha service
|
||||
* - TWOCAPTCHA_ENABLED: Enable/disable the extension
|
||||
* - TWOCAPTCHA_RETRY_COUNT: Number of retries on error
|
||||
* - TWOCAPTCHA_RETRY_DELAY: Delay between retries (seconds)
|
||||
* - TWOCAPTCHA_AUTO_SUBMIT: Auto-submit forms after solving
|
||||
*
|
||||
* Requirements:
|
||||
* - API_KEY_2CAPTCHA environment variable must be set
|
||||
* - TWOCAPTCHA_API_KEY environment variable must be set
|
||||
* - chrome plugin must have loaded extensions (extensions.json must exist)
|
||||
*/
|
||||
|
||||
@@ -36,6 +43,20 @@ function getEnv(name, defaultValue = '') {
|
||||
return (process.env[name] || defaultValue).trim();
|
||||
}
|
||||
|
||||
// Get boolean environment variable
|
||||
function getEnvBool(name, defaultValue = false) {
|
||||
const val = getEnv(name, '').toLowerCase();
|
||||
if (['true', '1', 'yes', 'on'].includes(val)) return true;
|
||||
if (['false', '0', 'no', 'off'].includes(val)) return false;
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
// Get integer environment variable
|
||||
function getEnvInt(name, defaultValue = 0) {
|
||||
const val = parseInt(getEnv(name, String(defaultValue)), 10);
|
||||
return isNaN(val) ? defaultValue : val;
|
||||
}
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
@@ -48,6 +69,82 @@ function parseArgs() {
|
||||
return args;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get 2captcha configuration from environment variables.
|
||||
* Supports both TWOCAPTCHA_* and legacy API_KEY_2CAPTCHA naming.
|
||||
*/
|
||||
function getTwoCaptchaConfig() {
|
||||
const apiKey = getEnv('TWOCAPTCHA_API_KEY') || getEnv('API_KEY_2CAPTCHA') || getEnv('CAPTCHA2_API_KEY');
|
||||
const isEnabled = getEnvBool('TWOCAPTCHA_ENABLED', true);
|
||||
const retryCount = getEnvInt('TWOCAPTCHA_RETRY_COUNT', 3);
|
||||
const retryDelay = getEnvInt('TWOCAPTCHA_RETRY_DELAY', 5);
|
||||
const autoSubmit = getEnvBool('TWOCAPTCHA_AUTO_SUBMIT', false);
|
||||
|
||||
// Build the full config object matching the extension's storage structure
|
||||
// Structure: chrome.storage.local.set({config: {...}})
|
||||
return {
|
||||
// API key - both variants for compatibility
|
||||
apiKey: apiKey,
|
||||
api_key: apiKey,
|
||||
|
||||
// Plugin enabled state
|
||||
isPluginEnabled: isEnabled,
|
||||
|
||||
// Retry settings
|
||||
repeatOnErrorTimes: retryCount,
|
||||
repeatOnErrorDelay: retryDelay,
|
||||
|
||||
// Auto-submit setting
|
||||
autoSubmitForms: autoSubmit,
|
||||
submitFormsDelay: 0,
|
||||
|
||||
// Enable all CAPTCHA types
|
||||
enabledForNormal: true,
|
||||
enabledForRecaptchaV2: true,
|
||||
enabledForInvisibleRecaptchaV2: true,
|
||||
enabledForRecaptchaV3: true,
|
||||
enabledForRecaptchaAudio: false,
|
||||
enabledForGeetest: true,
|
||||
enabledForGeetest_v4: true,
|
||||
enabledForKeycaptcha: true,
|
||||
enabledForArkoselabs: true,
|
||||
enabledForLemin: true,
|
||||
enabledForYandex: true,
|
||||
enabledForCapyPuzzle: true,
|
||||
enabledForTurnstile: true,
|
||||
enabledForAmazonWaf: true,
|
||||
enabledForMTCaptcha: true,
|
||||
|
||||
// Auto-solve all CAPTCHA types
|
||||
autoSolveNormal: true,
|
||||
autoSolveRecaptchaV2: true,
|
||||
autoSolveInvisibleRecaptchaV2: true,
|
||||
autoSolveRecaptchaV3: true,
|
||||
autoSolveRecaptchaAudio: false,
|
||||
autoSolveGeetest: true,
|
||||
autoSolveGeetest_v4: true,
|
||||
autoSolveKeycaptcha: true,
|
||||
autoSolveArkoselabs: true,
|
||||
autoSolveLemin: true,
|
||||
autoSolveYandex: true,
|
||||
autoSolveCapyPuzzle: true,
|
||||
autoSolveTurnstile: true,
|
||||
autoSolveAmazonWaf: true,
|
||||
autoSolveMTCaptcha: true,
|
||||
|
||||
// Other settings with sensible defaults
|
||||
recaptchaV2Type: 'token',
|
||||
recaptchaV3MinScore: 0.3,
|
||||
buttonPosition: 'inner',
|
||||
useProxy: false,
|
||||
proxy: '',
|
||||
proxytype: 'HTTP',
|
||||
blackListDomain: '',
|
||||
autoSubmitRules: [],
|
||||
normalSources: [],
|
||||
};
|
||||
}
|
||||
|
||||
async function configure2Captcha() {
|
||||
// Check if already configured in this session
|
||||
if (fs.existsSync(CONFIG_MARKER)) {
|
||||
@@ -55,29 +152,23 @@ async function configure2Captcha() {
|
||||
return { success: true, skipped: true };
|
||||
}
|
||||
|
||||
// Get configuration
|
||||
const config = getTwoCaptchaConfig();
|
||||
|
||||
// Check if API key is set
|
||||
const apiKey = getEnv('API_KEY_2CAPTCHA');
|
||||
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
|
||||
console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
|
||||
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
|
||||
return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
|
||||
if (!config.apiKey || config.apiKey === 'YOUR_API_KEY_HERE') {
|
||||
console.warn('[!] 2captcha extension loaded but TWOCAPTCHA_API_KEY not configured');
|
||||
console.warn('[!] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
|
||||
return { success: false, error: 'TWOCAPTCHA_API_KEY not configured' };
|
||||
}
|
||||
|
||||
// Load extensions metadata
|
||||
const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
|
||||
if (!fs.existsSync(extensionsFile)) {
|
||||
return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
|
||||
}
|
||||
|
||||
const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
|
||||
const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
|
||||
|
||||
if (!captchaExt) {
|
||||
console.error('[*] 2captcha extension not installed, skipping configuration');
|
||||
return { success: true, skipped: true };
|
||||
}
|
||||
|
||||
console.error('[*] Configuring 2captcha extension with API key...');
|
||||
console.error('[*] Configuring 2captcha extension...');
|
||||
console.error(`[*] API Key: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`);
|
||||
console.error(`[*] Enabled: ${config.isPluginEnabled}`);
|
||||
console.error(`[*] Retry Count: ${config.repeatOnErrorTimes}`);
|
||||
console.error(`[*] Retry Delay: ${config.repeatOnErrorDelay}s`);
|
||||
console.error(`[*] Auto Submit: ${config.autoSubmitForms}`);
|
||||
console.error(`[*] Auto Solve: all CAPTCHA types enabled`);
|
||||
|
||||
try {
|
||||
// Connect to the existing Chrome session via CDP
|
||||
@@ -90,138 +181,116 @@ async function configure2Captcha() {
|
||||
const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
|
||||
|
||||
try {
|
||||
// Method 1: Try to inject via extension background page
|
||||
if (captchaExt.target && captchaExt.target_ctx) {
|
||||
console.error('[*] Attempting to configure via extension background page...');
|
||||
// First, navigate to a page to trigger extension content scripts and wake up service worker
|
||||
console.error('[*] Waking up extension by visiting a page...');
|
||||
const triggerPage = await browser.newPage();
|
||||
try {
|
||||
await triggerPage.goto('https://www.google.com', { waitUntil: 'domcontentloaded', timeout: 10000 });
|
||||
await new Promise(r => setTimeout(r, 3000)); // Give extension time to initialize
|
||||
} catch (e) {
|
||||
console.warn(`[!] Trigger page failed: ${e.message}`);
|
||||
}
|
||||
try { await triggerPage.close(); } catch (e) {}
|
||||
|
||||
// Reconnect to the browser to get fresh target context
|
||||
const targets = await browser.targets();
|
||||
const extTarget = targets.find(t =>
|
||||
t.url().startsWith(`chrome-extension://${captchaExt.id}`)
|
||||
);
|
||||
|
||||
if (extTarget) {
|
||||
const extContext = await extTarget.worker() || await extTarget.page();
|
||||
|
||||
if (extContext) {
|
||||
await extContext.evaluate((key) => {
|
||||
// Try all common storage patterns
|
||||
if (typeof chrome !== 'undefined' && chrome.storage) {
|
||||
chrome.storage.local.set({
|
||||
apiKey: key,
|
||||
api_key: key,
|
||||
'2captcha_apikey': key,
|
||||
apikey: key,
|
||||
'solver-api-key': key,
|
||||
});
|
||||
chrome.storage.sync.set({
|
||||
apiKey: key,
|
||||
api_key: key,
|
||||
'2captcha_apikey': key,
|
||||
apikey: key,
|
||||
'solver-api-key': key,
|
||||
});
|
||||
}
|
||||
|
||||
// Also try localStorage as fallback
|
||||
if (typeof localStorage !== 'undefined') {
|
||||
localStorage.setItem('apiKey', key);
|
||||
localStorage.setItem('2captcha_apikey', key);
|
||||
localStorage.setItem('solver-api-key', key);
|
||||
}
|
||||
}, apiKey);
|
||||
|
||||
console.error('[+] 2captcha API key configured successfully via background page');
|
||||
|
||||
// Mark as configured
|
||||
fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
|
||||
|
||||
return { success: true, method: 'background_page' };
|
||||
}
|
||||
}
|
||||
// Get 2captcha extension info from extensions.json
|
||||
const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
|
||||
if (!fs.existsSync(extensionsFile)) {
|
||||
return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
|
||||
}
|
||||
|
||||
// Method 2: Try to configure via options page
|
||||
console.error('[*] Attempting to configure via options page...');
|
||||
const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
|
||||
const configPage = await browser.newPage();
|
||||
const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
|
||||
const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
|
||||
|
||||
if (!captchaExt) {
|
||||
console.error('[*] 2captcha extension not installed, skipping configuration');
|
||||
return { success: true, skipped: true };
|
||||
}
|
||||
|
||||
if (!captchaExt.id) {
|
||||
return { success: false, error: '2captcha extension ID not found in extensions.json' };
|
||||
}
|
||||
|
||||
const extensionId = captchaExt.id;
|
||||
console.error(`[*] 2captcha Extension ID: ${extensionId}`);
|
||||
|
||||
// Configure via options page
|
||||
console.error('[*] Configuring via options page...');
|
||||
const optionsUrl = `chrome-extension://${extensionId}/options/options.html`;
|
||||
|
||||
let configPage = await browser.newPage();
|
||||
|
||||
try {
|
||||
await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
|
||||
|
||||
const configured = await configPage.evaluate((key) => {
|
||||
// Try to find API key input field
|
||||
const selectors = [
|
||||
'input[name*="apikey" i]',
|
||||
'input[id*="apikey" i]',
|
||||
'input[name*="api-key" i]',
|
||||
'input[id*="api-key" i]',
|
||||
'input[name*="key" i]',
|
||||
'input[placeholder*="api" i]',
|
||||
'input[type="text"]',
|
||||
];
|
||||
|
||||
for (const selector of selectors) {
|
||||
const input = document.querySelector(selector);
|
||||
if (input) {
|
||||
input.value = key;
|
||||
input.dispatchEvent(new Event('input', { bubbles: true }));
|
||||
input.dispatchEvent(new Event('change', { bubbles: true }));
|
||||
|
||||
// Try to find and click save button
|
||||
const saveSelectors = [
|
||||
'button[type="submit"]',
|
||||
'input[type="submit"]',
|
||||
'button:contains("Save")',
|
||||
'button:contains("Apply")',
|
||||
];
|
||||
|
||||
for (const btnSel of saveSelectors) {
|
||||
const btn = document.querySelector(btnSel);
|
||||
if (btn) {
|
||||
btn.click();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Also save to storage
|
||||
if (typeof chrome !== 'undefined' && chrome.storage) {
|
||||
chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
|
||||
chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: Just save to storage
|
||||
if (typeof chrome !== 'undefined' && chrome.storage) {
|
||||
chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
|
||||
chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}, apiKey);
|
||||
|
||||
await configPage.close();
|
||||
|
||||
if (configured) {
|
||||
console.error('[+] 2captcha API key configured successfully via options page');
|
||||
|
||||
// Mark as configured
|
||||
fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
|
||||
|
||||
return { success: true, method: 'options_page' };
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
|
||||
// Navigate to options page - catch error but continue since page may still load
|
||||
try {
|
||||
await configPage.close();
|
||||
} catch (e2) {}
|
||||
}
|
||||
await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
|
||||
} catch (navError) {
|
||||
// Navigation may throw ERR_BLOCKED_BY_CLIENT but page still loads
|
||||
console.error(`[*] Navigation threw error (may still work): ${navError.message}`);
|
||||
}
|
||||
|
||||
return { success: false, error: 'Could not configure via any method' };
|
||||
// Wait a moment for page to settle
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
|
||||
// Check all pages for the extension page (Chrome may open it in a different tab)
|
||||
const pages = await browser.pages();
|
||||
for (const page of pages) {
|
||||
const url = page.url();
|
||||
if (url.startsWith(`chrome-extension://${extensionId}`)) {
|
||||
configPage = page;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const currentUrl = configPage.url();
|
||||
console.error(`[*] Current URL: ${currentUrl}`);
|
||||
|
||||
if (!currentUrl.startsWith(`chrome-extension://${extensionId}`)) {
|
||||
return { success: false, error: `Failed to navigate to options page, got: ${currentUrl}` };
|
||||
}
|
||||
|
||||
// Wait for Config object to be available
|
||||
console.error('[*] Waiting for Config object...');
|
||||
await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 });
|
||||
|
||||
// Use chrome.storage.local.set with the config wrapper
|
||||
const result = await configPage.evaluate((cfg) => {
|
||||
return new Promise((resolve) => {
|
||||
if (typeof chrome !== 'undefined' && chrome.storage) {
|
||||
chrome.storage.local.set({ config: cfg }, () => {
|
||||
if (chrome.runtime.lastError) {
|
||||
resolve({ success: false, error: chrome.runtime.lastError.message });
|
||||
} else {
|
||||
resolve({ success: true, method: 'options_page' });
|
||||
}
|
||||
});
|
||||
} else {
|
||||
resolve({ success: false, error: 'chrome.storage not available' });
|
||||
}
|
||||
});
|
||||
}, config);
|
||||
|
||||
if (result.success) {
|
||||
console.error(`[+] 2captcha configured via ${result.method}`);
|
||||
fs.writeFileSync(CONFIG_MARKER, JSON.stringify({
|
||||
timestamp: new Date().toISOString(),
|
||||
method: result.method,
|
||||
extensionId: extensionId,
|
||||
config: {
|
||||
apiKeySet: !!config.apiKey,
|
||||
isPluginEnabled: config.isPluginEnabled,
|
||||
repeatOnErrorTimes: config.repeatOnErrorTimes,
|
||||
repeatOnErrorDelay: config.repeatOnErrorDelay,
|
||||
autoSubmitForms: config.autoSubmitForms,
|
||||
autoSolveEnabled: true,
|
||||
}
|
||||
}, null, 2));
|
||||
return { success: true, method: result.method };
|
||||
}
|
||||
|
||||
return { success: false, error: result.error || 'Config failed' };
|
||||
} finally {
|
||||
try { await configPage.close(); } catch (e) {}
|
||||
}
|
||||
} finally {
|
||||
browser.disconnect();
|
||||
}
|
||||
@@ -236,7 +305,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__21_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Crawl__25_configure_twocaptcha_extension_options.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,184 +1,398 @@
|
||||
"""
|
||||
Unit tests for twocaptcha plugin
|
||||
Integration tests for twocaptcha plugin
|
||||
|
||||
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
|
||||
Run with: TWOCAPTCHA_API_KEY=your_key pytest archivebox/plugins/twocaptcha/tests/ -xvs
|
||||
|
||||
NOTE: Chrome 137+ removed --load-extension support, so these tests MUST use Chromium.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_twocaptcha_extension.*'), None)
|
||||
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_configure_twocaptcha_extension_options.*'), None)
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
|
||||
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
|
||||
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
|
||||
|
||||
|
||||
def test_install_script_exists():
|
||||
"""Verify install script exists"""
|
||||
assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
|
||||
def setup_test_env(tmpdir: Path) -> dict:
    """Set up isolated data/lib directory structure for tests.

    Creates structure matching real ArchiveBox data dir:
    <tmpdir>/data/
        lib/
            arm64-darwin/ (or x86_64-linux, etc.)
                npm/
                    .bin/
                    node_modules/
        personas/
            default/
                chrome_extensions/
        users/
            testuser/
                crawls/
                snapshots/

    Calls chrome install hook which handles puppeteer-core and chromium installation.
    Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
    Skips the calling test (pytest.skip) if the install hook fails or no
    Chromium binary can be located in its JSONL output.
    """
    import platform
    import sys
    from datetime import datetime

    # Determine machine type (matches archivebox.config.paths.get_machine_type())
    machine = platform.machine().lower()
    system = platform.system().lower()
    if machine in ('arm64', 'aarch64'):
        machine = 'arm64'
    elif machine in ('x86_64', 'amd64'):
        machine = 'x86_64'
    machine_type = f"{machine}-{system}"

    # Create proper directory structure matching real ArchiveBox layout
    data_dir = tmpdir / 'data'
    lib_dir = data_dir / 'lib' / machine_type
    npm_dir = lib_dir / 'npm'
    npm_bin_dir = npm_dir / '.bin'
    node_modules_dir = npm_dir / 'node_modules'

    # Extensions go under personas/Default/
    chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'

    # User data goes under users/{username}/
    date_str = datetime.now().strftime('%Y%m%d')
    users_dir = data_dir / 'users' / 'testuser'
    crawls_dir = users_dir / 'crawls' / date_str
    snapshots_dir = users_dir / 'snapshots' / date_str

    # Create all directories
    node_modules_dir.mkdir(parents=True, exist_ok=True)
    npm_bin_dir.mkdir(parents=True, exist_ok=True)
    chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
    crawls_dir.mkdir(parents=True, exist_ok=True)
    snapshots_dir.mkdir(parents=True, exist_ok=True)

    # Build complete env dict
    env = os.environ.copy()
    env.update({
        'DATA_DIR': str(data_dir),
        'LIB_DIR': str(lib_dir),
        'MACHINE_TYPE': machine_type,
        'NPM_BIN_DIR': str(npm_bin_dir),
        'NODE_MODULES_DIR': str(node_modules_dir),
        'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
        'CRAWLS_DIR': str(crawls_dir),
        'SNAPSHOTS_DIR': str(snapshots_dir),
    })

    # Only set headless if not already in environment (allow override for debugging)
    if 'CHROME_HEADLESS' not in os.environ:
        env['CHROME_HEADLESS'] = 'true'

    # Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL).
    # Use sys.executable instead of a bare 'python' so the hook runs under the
    # same interpreter as the test run ('python' may be absent or a different
    # version on the host).
    result = subprocess.run(
        [sys.executable, str(CHROME_INSTALL_HOOK)],
        capture_output=True, text=True, timeout=120, env=env
    )
    if result.returncode != 0:
        pytest.skip(f"Chrome install hook failed: {result.stderr}")

    # Parse JSONL output to get CHROME_BINARY
    chrome_binary = None
    for line in result.stdout.strip().split('\n'):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
            if data.get('type') == 'Binary' and data.get('abspath'):
                chrome_binary = data['abspath']
                break
        except json.JSONDecodeError:
            continue

    if not chrome_binary or not Path(chrome_binary).exists():
        pytest.skip(f"Chromium binary not found: {chrome_binary}")

    env['CHROME_BINARY'] = chrome_binary
    return env
|
||||
|
||||
|
||||
def test_config_script_exists():
    """Verify config script exists"""
    # Fail loudly with the resolved path if the hook script is missing.
    missing_msg = f"Config script not found: {CONFIG_SCRIPT}"
    assert CONFIG_SCRIPT.exists(), missing_msg
|
||||
def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium via the chrome launch hook and return (process, cdp_url).

    Runs the launch hook as a background `node` subprocess with cwd=chrome_dir,
    polls up to 30s for chrome_dir/cdp_url.txt, then waits up to 15s for
    extensions.json (written by the hook after it parses chrome://extensions).

    Raises:
        RuntimeError: if the hook exits before writing the CDP URL, or no
            CDP URL appears within 30s.
    """
    chrome_dir.mkdir(parents=True, exist_ok=True)

    process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Poll for the CDP websocket URL file written by the hook.
    cdp_url = None
    for _ in range(30):
        if process.poll() is not None:
            # Hook died before writing the CDP URL -- surface its output.
            stdout, stderr = process.communicate()
            raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}")
        cdp_file = chrome_dir / 'cdp_url.txt'
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            break
        time.sleep(1)

    if not cdp_url:
        process.kill()
        stdout, stderr = process.communicate()
        raise RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}")

    # Wait for extensions.json to be written (chrome launch hook parses chrome://extensions)
    extensions_file = chrome_dir / 'extensions.json'
    for _ in range(15):
        if extensions_file.exists():
            break
        time.sleep(1)

    # Print chrome launch hook output for debugging
    import select
    if hasattr(select, 'poll'):
        # Read any available stderr without blocking (POSIX only).
        import fcntl
        import os as os_module
        fd = process.stderr.fileno()
        fl = fcntl.fcntl(fd, fcntl.F_GETFL)
        fcntl.fcntl(fd, fcntl.F_SETFL, fl | os_module.O_NONBLOCK)
        try:
            stderr_output = process.stderr.read()
            if stderr_output:
                print(f"[Chrome Launch Hook Output]\n{stderr_output}")
        except Exception:
            # Best-effort debug output only; was a bare `except:` which would
            # also have swallowed KeyboardInterrupt/SystemExit.
            pass

    return process, cdp_url
|
||||
|
||||
|
||||
def test_extension_metadata():
    """Test that twocaptcha extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        # Just check the script can be loaded
        loader = f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"
        result = subprocess.run(
            ["node", "-e", loader],
            capture_output=True,
            text=True,
            env=child_env,
        )

        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"

        # Node printed ext.EXTENSION as a single JSON document on stdout.
        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert metadata["name"] == "twocaptcha"
|
||||
def kill_chrome(process, chrome_dir: Path):
    """Kill Chromium process.

    Best-effort cleanup: SIGTERM the launch-hook process and wait up to 5s,
    then SIGKILL the Chromium PID recorded in chrome_dir/chrome.pid (if any).
    Never raises -- but the previous bare `except:` clauses are narrowed so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    try:
        process.send_signal(signal.SIGTERM)
        process.wait(timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # Already dead, or refused to exit in time -- fall through to SIGKILL.
        pass
    pid_file = chrome_dir / 'chrome.pid'
    if pid_file.exists():
        try:
            os.kill(int(pid_file.read_text().strip()), signal.SIGKILL)
        except (OSError, ValueError):
            # Process already gone, or the pid file held garbage.
            pass
|
||||
|
||||
|
||||
# NOTE(review): this region of the file was a corrupted line-by-line interleave
# of the old test_install_creates_cache body and the new TestTwoCaptcha class
# (unified-diff residue). The two definitions are untangled below; no content
# lines were dropped.
def test_install_creates_cache():
    """Test that install creates extension cache"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_api_key"

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Check output mentions installation
        assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout

        # Check cache file was created
        cache_file = ext_dir / "twocaptcha.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        # Verify cache content
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert cache_data["name"] == "twocaptcha"
        assert "unpacked_path" in cache_data
        assert "version" in cache_data


class TestTwoCaptcha:
    """Integration tests requiring TWOCAPTCHA_API_KEY."""

    @pytest.fixture(autouse=True)
    def setup(self):
        # Accept either env-var spelling; skip every test in the class when
        # no API key is available.
        self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA')
        if not self.api_key:
            pytest.skip("TWOCAPTCHA_API_KEY required")

    def test_install_and_load(self):
        """Extension installs and loads in Chromium."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            env = setup_test_env(tmpdir)
            env['TWOCAPTCHA_API_KEY'] = self.api_key

            # Install
            result = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True)
            assert result.returncode == 0, f"Install failed: {result.stderr}"

            cache = Path(env['CHROME_EXTENSIONS_DIR']) / 'twocaptcha.extension.json'
            assert cache.exists()
            data = json.loads(cache.read_text())
            assert data['webstore_id'] == 'ifibfemgeogfhoebkmokieepdoobkbpo'

            # Launch Chromium in crawls directory
            crawl_id = 'test'
            crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
            chrome_dir = crawl_dir / 'chrome'
            env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
            process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)

            try:
                exts = json.loads((chrome_dir / 'extensions.json').read_text())
                assert any(e['name'] == 'twocaptcha' for e in exts), f"Not loaded: {exts}"
                print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}")
            finally:
                kill_chrome(process, chrome_dir)
|
||||
|
||||
    def test_config_applied(self):
        """Configuration is applied to extension and verified via Config.getAll().

        End-to-end: installs the extension, launches Chromium, runs the config
        hook script, then loads the extension's options page over CDP and reads
        the applied settings back with the extension's own Config.getAll() API.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            env = setup_test_env(tmpdir)
            env['TWOCAPTCHA_API_KEY'] = self.api_key
            # Non-default retry settings so we can tell the config was applied.
            env['TWOCAPTCHA_RETRY_COUNT'] = '5'
            env['TWOCAPTCHA_RETRY_DELAY'] = '10'

            subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True)

            # Launch Chromium in crawls directory
            crawl_id = 'cfg'
            crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
            chrome_dir = crawl_dir / 'chrome'
            env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
            process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)

            try:
                result = subprocess.run(
                    ['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'],
                    env=env, timeout=30, capture_output=True, text=True
                )
                assert result.returncode == 0, f"Config failed: {result.stderr}"
                assert (chrome_dir / '.twocaptcha_configured').exists()

                # Verify config via options.html and Config.getAll()
                # Get the actual extension ID from the config marker (Chrome computes IDs differently)
                config_marker = json.loads((chrome_dir / '.twocaptcha_configured').read_text())
                ext_id = config_marker['extensionId']
                # NOTE: literal JS braces are doubled ({{ }}) because this is an f-string.
                script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});

    // Load options.html and use Config.getAll() to verify
    const optionsUrl = 'chrome-extension://{ext_id}/options/options.html';
    const page = await browser.newPage();
    console.error('[*] Loading options page:', optionsUrl);

    // Navigate - catch error but continue since page may still load
    try {{
        await page.goto(optionsUrl, {{ waitUntil: 'networkidle0', timeout: 10000 }});
    }} catch (e) {{
        console.error('[*] Navigation threw error (may still work):', e.message);
    }}

    // Wait for page to settle
    await new Promise(r => setTimeout(r, 2000));
    console.error('[*] Current URL:', page.url());

    // Wait for Config object to be available
    await page.waitForFunction(() => typeof Config !== 'undefined', {{ timeout: 5000 }});

    // Call Config.getAll() - the extension's own API (returns a Promise)
    const cfg = await page.evaluate(async () => await Config.getAll());
    console.error('[*] Config.getAll() returned:', JSON.stringify(cfg));

    await page.close();
    browser.disconnect();
    console.log(JSON.stringify(cfg));
}})();
'''
                (tmpdir / 'v.js').write_text(script)
                r = subprocess.run(['node', str(tmpdir / 'v.js')], env=env, timeout=30, capture_output=True, text=True)
                print(r.stderr)
                assert r.returncode == 0, f"Verify failed: {r.stderr}"

                # The JS driver prints the config as the last stdout line.
                cfg = json.loads(r.stdout.strip().split('\n')[-1])
                print(f"[*] Config from extension: {json.dumps(cfg, indent=2)}")

                # Verify all the fields we care about
                assert cfg.get('apiKey') == self.api_key or cfg.get('api_key') == self.api_key, f"API key not set: {cfg}"
                assert cfg.get('isPluginEnabled') == True, f"Plugin not enabled: {cfg}"
                assert cfg.get('repeatOnErrorTimes') == 5, f"Retry count wrong: {cfg}"
                assert cfg.get('repeatOnErrorDelay') == 10, f"Retry delay wrong: {cfg}"
                assert cfg.get('autoSolveRecaptchaV2') == True, f"autoSolveRecaptchaV2 not enabled: {cfg}"
                assert cfg.get('autoSolveRecaptchaV3') == True, f"autoSolveRecaptchaV3 not enabled: {cfg}"
                assert cfg.get('autoSolveTurnstile') == True, f"autoSolveTurnstile not enabled: {cfg}"
                assert cfg.get('enabledForRecaptchaV2') == True, f"enabledForRecaptchaV2 not enabled: {cfg}"

                print(f"[+] Config verified via Config.getAll()!")
            finally:
                kill_chrome(process, chrome_dir)
|
||||
|
||||
    def test_solves_recaptcha(self):
        """Extension solves reCAPTCHA on demo page.

        Drives the demo page over CDP and polls the g-recaptcha-response
        textarea (up to 90s) until the extension fills in a token, then
        asserts one was produced.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            env = setup_test_env(tmpdir)
            env['TWOCAPTCHA_API_KEY'] = self.api_key

            subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True)

            # Launch Chromium in crawls directory
            crawl_id = 'solve'
            crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
            chrome_dir = crawl_dir / 'chrome'
            env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
            process, cdp_url = launch_chrome(env, chrome_dir, crawl_id)

            try:
                subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True)

                # NOTE: literal JS braces are doubled ({{ }}) because this is an f-string.
                script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
    const page = await browser.newPage();
    await page.setViewport({{ width: 1440, height: 900 }});
    console.error('[*] Loading {TEST_URL}...');
    await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }});
    await new Promise(r => setTimeout(r, 3000));

    const start = Date.now();
    const maxWait = 90000;

    while (Date.now() - start < maxWait) {{
        const state = await page.evaluate(() => {{
            const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
            const solver = document.querySelector('.captcha-solver');
            return {{
                solved: resp ? resp.value.length > 0 : false,
                state: solver?.getAttribute('data-state'),
                text: solver?.textContent?.trim() || ''
            }};
        }});
        const sec = Math.round((Date.now() - start) / 1000);
        console.error('[*] ' + sec + 's state=' + state.state + ' solved=' + state.solved + ' text=' + state.text.slice(0,30));
        if (state.solved) {{ console.error('[+] SOLVED!'); break; }}
        if (state.state === 'error') {{ console.error('[!] ERROR'); break; }}
        await new Promise(r => setTimeout(r, 2000));
    }}

    const final = await page.evaluate(() => {{
        const resp = document.querySelector('textarea[name="g-recaptcha-response"]');
        return {{ solved: resp ? resp.value.length > 0 : false, preview: resp?.value?.slice(0,50) || '' }};
    }});
    browser.disconnect();
    console.log(JSON.stringify(final));
}})();
'''
                (tmpdir / 's.js').write_text(script)
                print("\n[*] Solving CAPTCHA (10-60s)...")
                r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=120, capture_output=True, text=True)
                print(r.stderr)
                assert r.returncode == 0, f"Failed: {r.stderr}"

                # Last JSON line on stdout is the final state emitted by the JS driver.
                final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1])
                assert final.get('solved'), f"Not solved: {final}"
                print(f"[+] SOLVED! {final.get('preview','')[:30]}...")
            finally:
                kill_chrome(process, chrome_dir)
|
||||
|
||||
|
||||
def test_install_twice_uses_cache():
    """Test that running install twice uses existing cache on second run"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_api_key"

        # First install - downloads the extension
        result1 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        assert result1.returncode == 0, f"First install failed: {result1.stderr}"

        # Verify cache was created
        cache_file = ext_dir / "twocaptcha.extension.json"
        assert cache_file.exists(), "Cache file should exist after first install"

        # Second install - should use cache
        result2 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"

        # Second run should mention cache reuse
        # NOTE(review): the trailing `or result2.returncode == 0` makes this
        # assertion vacuous (returncode == 0 was already asserted above) --
        # confirm whether it should be tightened to the stdout checks only.
        assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0
|
||||
|
||||
|
||||
def test_install_warns_without_api_key():
    """Test that install warns when API key not configured"""
    with tempfile.TemporaryDirectory() as tmpdir:
        extensions_dir = Path(tmpdir) / "chrome_extensions"
        extensions_dir.mkdir(parents=True)

        # Deliberately do NOT add API_KEY_2CAPTCHA to the child environment.
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir)

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )

        # The warning may land on stdout or stderr, so search both.
        combined_output = result.stdout + result.stderr
        assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
|
||||
|
||||
|
||||
def test_install_success_with_api_key():
    """Test that install succeeds when API key is configured"""
    with tempfile.TemporaryDirectory() as tmpdir:
        extensions_dir = Path(tmpdir) / "chrome_extensions"
        extensions_dir.mkdir(parents=True)

        # Provide a (fake) API key in the child environment.
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir)
        child_env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123"

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )

        # The confirmation may land on stdout or stderr, so search both.
        combined_output = result.stdout + result.stderr
        assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
|
||||
|
||||
|
||||
def test_config_script_structure():
    """Test that config script has proper structure"""
    # Static check: the script must exist and contain the expected markers.
    source = CONFIG_SCRIPT.read_text()

    # Should mention configuration marker file
    assert "CONFIG_MARKER" in source or "twocaptcha_configured" in source

    # Should mention API key
    assert "API_KEY_2CAPTCHA" in source

    # Should have main function or be executable
    assert "async function" in source or "main" in source
|
||||
# Allow running this test file directly (without invoking the pytest CLI).
if __name__ == '__main__':
    pytest.main([__file__, '-xvs'])
|
||||
|
||||
@@ -14,7 +14,7 @@ import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_ublock.*'), None)
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
|
||||
|
||||
|
||||
def test_install_script_exists():
|
||||
@@ -158,26 +158,221 @@ def test_large_extension_size():
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
# Shared chrome plugin hooks. NOTE: the previous assignments pointing at
# on_Crawl__00_chrome_install.py / on_Crawl__20_chrome_launch.bg.js were
# shadowed dead code left over from the hook-renaming diff; only the
# effective (renamed) paths are kept.
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
|
||||
|
||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
    """Launch Chromium and return (process, cdp_url) or raise on failure.

    Starts the chrome launch hook as a background `node` process with
    cwd=chrome_dir, then polls up to 20s for chrome_dir/cdp_url.txt.

    Raises:
        RuntimeError: if the hook exits early or no CDP URL appears in 20s.
    """
    # (the unused function-scope `import signal` was removed)
    import time

    chrome_dir.mkdir(parents=True, exist_ok=True)

    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Wait for Chromium to launch and CDP URL to be available
    cdp_url = None
    for i in range(20):
        if chrome_launch_process.poll() is not None:
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        cdp_file = chrome_dir / 'cdp_url.txt'
        if cdp_file.exists():
            cdp_url = cdp_file.read_text().strip()
            break
        time.sleep(1)

    if not cdp_url:
        chrome_launch_process.kill()
        raise RuntimeError("Chromium CDP URL not found after 20s")

    return chrome_launch_process, cdp_url
|
||||
|
||||
|
||||
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
    """Clean up Chromium process.

    Best-effort: SIGTERM the launch-hook process and wait up to 5s, then
    SIGKILL the Chromium PID recorded in chrome_dir/chrome.pid (if any).
    The first handler was a bare `except:`; it is narrowed so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    import signal

    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # Already dead, or refused to exit in time -- fall through to SIGKILL.
        pass
    chrome_pid_file = chrome_dir / 'chrome.pid'
    if chrome_pid_file.exists():
        try:
            chrome_pid = int(chrome_pid_file.read_text().strip())
            os.kill(chrome_pid, signal.SIGKILL)
        except (OSError, ValueError):
            # Process already gone, or the pid file held garbage.
            pass
|
||||
|
||||
|
||||
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
    """Check ad blocking effectiveness by counting ad elements on page.

    Writes a puppeteer-core driver script to script_dir/check_ads.js and runs
    it with node against the already-running Chromium at cdp_url.

    Returns dict with:
    - adElementsFound: int - number of ad-related elements found
    - adElementsVisible: int - number of visible ad elements
    - blockedRequests: int - number of blocked network requests (ads/trackers)
    - totalRequests: int - total network requests made
    - percentBlocked: int - percentage of ad elements hidden (0-100)

    Raises:
        RuntimeError: if the node script fails or prints no JSON result.
    """
    # NOTE: literal JS braces are doubled ({{ }}) because this is an f-string.
    test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');

(async () => {{
    const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});

    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    await page.setViewport({{ width: 1440, height: 900 }});

    // Track network requests
    let blockedRequests = 0;
    let totalRequests = 0;
    const adDomains = ['doubleclick', 'googlesyndication', 'googleadservices', 'facebook.com/tr',
        'analytics', 'adservice', 'advertising', 'taboola', 'outbrain', 'criteo',
        'amazon-adsystem', 'ads.yahoo', 'gemini.yahoo', 'yimg.com/cv/', 'beap.gemini'];

    page.on('request', request => {{
        totalRequests++;
        const url = request.url().toLowerCase();
        if (adDomains.some(d => url.includes(d))) {{
            // This is an ad request
        }}
    }});

    page.on('requestfailed', request => {{
        const url = request.url().toLowerCase();
        if (adDomains.some(d => url.includes(d))) {{
            blockedRequests++;
        }}
    }});

    console.error('Navigating to {test_url}...');
    await page.goto('{test_url}', {{ waitUntil: 'domcontentloaded', timeout: 60000 }});

    // Wait for page to fully render and ads to load
    await new Promise(r => setTimeout(r, 5000));

    // Check for ad elements in the DOM
    const result = await page.evaluate(() => {{
        // Common ad-related selectors
        const adSelectors = [
            // Generic ad containers
            '[class*="ad-"]', '[class*="ad_"]', '[class*="-ad"]', '[class*="_ad"]',
            '[id*="ad-"]', '[id*="ad_"]', '[id*="-ad"]', '[id*="_ad"]',
            '[class*="advertisement"]', '[id*="advertisement"]',
            '[class*="sponsored"]', '[id*="sponsored"]',
            // Google ads
            'ins.adsbygoogle', '[data-ad-client]', '[data-ad-slot]',
            // Yahoo specific
            '[class*="gemini"]', '[data-beacon]', '[class*="native-ad"]',
            '[class*="stream-ad"]', '[class*="LDRB"]', '[class*="ntv-ad"]',
            // iframes (often ads)
            'iframe[src*="ad"]', 'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]',
            // Common ad sizes
            '[style*="300px"][style*="250px"]', '[style*="728px"][style*="90px"]',
            '[style*="160px"][style*="600px"]', '[style*="320px"][style*="50px"]',
        ];

        let adElementsFound = 0;
        let adElementsVisible = 0;

        for (const selector of adSelectors) {{
            try {{
                const elements = document.querySelectorAll(selector);
                for (const el of elements) {{
                    adElementsFound++;
                    const style = window.getComputedStyle(el);
                    const rect = el.getBoundingClientRect();
                    const isVisible = style.display !== 'none' &&
                        style.visibility !== 'hidden' &&
                        style.opacity !== '0' &&
                        rect.width > 0 && rect.height > 0;
                    if (isVisible) {{
                        adElementsVisible++;
                    }}
                }}
            }} catch (e) {{
                // Invalid selector, skip
            }}
        }}

        return {{
            adElementsFound,
            adElementsVisible,
            pageTitle: document.title
        }};
    }});

    result.blockedRequests = blockedRequests;
    result.totalRequests = totalRequests;
    // Calculate how many ad elements were hidden (found but not visible)
    const hiddenAds = result.adElementsFound - result.adElementsVisible;
    result.percentBlocked = result.adElementsFound > 0
        ? Math.round((hiddenAds / result.adElementsFound) * 100)
        : 0;

    console.error('Ad blocking result:', JSON.stringify(result));
    browser.disconnect();
    console.log(JSON.stringify(result));
}})();
'''
    script_path = script_dir / 'check_ads.js'
    script_path.write_text(test_script)

    result = subprocess.run(
        ['node', str(script_path)],
        cwd=str(script_dir),
        capture_output=True,
        text=True,
        env=env,
        timeout=90
    )

    if result.returncode != 0:
        raise RuntimeError(f"Ad check script failed: {result.stderr}")

    # The driver logs to stderr and prints exactly one JSON object to stdout;
    # take the last JSON-looking line to be safe.
    output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
    if not output_lines:
        raise RuntimeError(f"No JSON output from ad check: {result.stdout}\nstderr: {result.stderr}")

    return json.loads(output_lines[-1])
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for tests.
|
||||
|
||||
Creates structure like:
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
bin/
|
||||
.bin/
|
||||
node_modules/
|
||||
chrome_extensions/
|
||||
personas/
|
||||
default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
"""
|
||||
import platform
|
||||
from datetime import datetime
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
@@ -188,18 +383,28 @@ def setup_test_env(tmpdir: Path) -> dict:
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
# Create proper directory structure
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / 'bin'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
chrome_extensions_dir = data_dir / 'chrome_extensions'
|
||||
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
@@ -210,12 +415,14 @@ def setup_test_env(tmpdir: Path) -> dict:
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=10, env=env
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
@@ -240,8 +447,8 @@ def setup_test_env(tmpdir: Path) -> dict:
|
||||
return env
|
||||
|
||||
|
||||
# Test URL: ad blocker test page that shows if ads are blocked
|
||||
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
|
||||
# Test URL: Yahoo has many ads that uBlock should block
|
||||
TEST_URL = 'https://www.yahoo.com/'
|
||||
|
||||
|
||||
@pytest.mark.timeout(15)
|
||||
@@ -290,14 +497,18 @@ def test_extension_loads_in_chromium():
|
||||
print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True)
|
||||
print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True)
|
||||
print("[test] Launching Chromium...", flush=True)
|
||||
data_dir = Path(env['DATA_DIR'])
|
||||
crawl_dir = data_dir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
|
||||
# Launch Chromium in crawls directory
|
||||
crawl_id = 'test-ublock'
|
||||
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
|
||||
crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
|
||||
cwd=str(crawl_dir),
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -457,161 +668,177 @@ const puppeteer = require('puppeteer-core');
|
||||
def test_blocks_ads_on_test_page():
|
||||
"""Live test: verify uBlock Origin blocks ads on a test page.
|
||||
|
||||
Uses Chromium with extensions loaded automatically via chrome hook.
|
||||
Tests against d3ward's ad blocker test page which checks ad domains.
|
||||
This test runs TWO browser sessions:
|
||||
1. WITHOUT extension - verifies ads are NOT blocked (baseline)
|
||||
2. WITH extension - verifies ads ARE blocked
|
||||
|
||||
This ensures we're actually testing the extension's effect, not just
|
||||
that a test page happens to show ads as blocked.
|
||||
"""
|
||||
import signal
|
||||
import time
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set up isolated env with proper directory structure
|
||||
env = setup_test_env(tmpdir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
env_base = setup_test_env(tmpdir)
|
||||
env_base['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
|
||||
# ============================================================
|
||||
# STEP 1: BASELINE - Run WITHOUT extension, verify ads are NOT blocked
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 1: BASELINE TEST (no extension)")
|
||||
print("="*60)
|
||||
|
||||
data_dir = Path(env_base['DATA_DIR'])
|
||||
|
||||
env_no_ext = env_base.copy()
|
||||
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
|
||||
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Launch baseline Chromium in crawls directory
|
||||
baseline_crawl_id = 'baseline-no-ext'
|
||||
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
|
||||
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
|
||||
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
|
||||
baseline_process = None
|
||||
|
||||
try:
|
||||
baseline_process, baseline_cdp_url = launch_chromium_session(
|
||||
env_no_ext, baseline_chrome_dir, baseline_crawl_id
|
||||
)
|
||||
print(f"Baseline Chromium launched: {baseline_cdp_url}")
|
||||
|
||||
# Wait a moment for browser to be ready
|
||||
time.sleep(2)
|
||||
|
||||
baseline_result = check_ad_blocking(
|
||||
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
|
||||
)
|
||||
|
||||
print(f"Baseline result: {baseline_result['adElementsVisible']} visible ads "
|
||||
f"(found {baseline_result['adElementsFound']} ad elements)")
|
||||
|
||||
finally:
|
||||
if baseline_process:
|
||||
kill_chromium_session(baseline_process, baseline_chrome_dir)
|
||||
|
||||
# Verify baseline shows ads ARE visible (not blocked)
|
||||
if baseline_result['adElementsFound'] == 0:
|
||||
pytest.skip(
|
||||
f"Cannot test extension: no ad elements found on {TEST_URL}. "
|
||||
f"The page may have changed or loaded differently."
|
||||
)
|
||||
|
||||
if baseline_result['adElementsVisible'] == 0:
|
||||
print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!")
|
||||
print("This suggests either:")
|
||||
print(" - There's another ad blocker interfering")
|
||||
print(" - Network-level ad blocking is in effect")
|
||||
|
||||
pytest.skip(
|
||||
f"Cannot test extension: baseline shows no visible ads "
|
||||
f"despite finding {baseline_result['adElementsFound']} ad elements."
|
||||
)
|
||||
|
||||
print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension")
|
||||
|
||||
# ============================================================
|
||||
# STEP 2: Install the uBlock extension
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 2: INSTALLING EXTENSION")
|
||||
print("="*60)
|
||||
|
||||
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
|
||||
|
||||
# Step 1: Install the uBlock extension
|
||||
result = subprocess.run(
|
||||
['node', str(INSTALL_SCRIPT)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=15
|
||||
env=env_base,
|
||||
timeout=60
|
||||
)
|
||||
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
|
||||
|
||||
# Verify extension cache was created
|
||||
cache_file = ext_dir / 'ublock.extension.json'
|
||||
assert cache_file.exists(), "Extension cache not created"
|
||||
ext_data = json.loads(cache_file.read_text())
|
||||
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
|
||||
|
||||
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
|
||||
data_dir = Path(env['DATA_DIR'])
|
||||
crawl_dir = data_dir / 'crawl'
|
||||
crawl_dir.mkdir()
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
# ============================================================
|
||||
# STEP 3: Run WITH extension, verify ads ARE blocked
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 3: TEST WITH EXTENSION")
|
||||
print("="*60)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
|
||||
cwd=str(crawl_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chrome to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
assert cdp_url, "Chrome CDP URL not found after 20s"
|
||||
print(f"Chrome launched with CDP URL: {cdp_url}")
|
||||
|
||||
# Check that extensions were loaded
|
||||
extensions_file = chrome_dir / 'extensions.json'
|
||||
if extensions_file.exists():
|
||||
loaded_exts = json.loads(extensions_file.read_text())
|
||||
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
|
||||
# Launch extension test Chromium in crawls directory
|
||||
ext_crawl_id = 'test-with-ext'
|
||||
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
|
||||
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
|
||||
ext_chrome_dir = ext_crawl_dir / 'chrome'
|
||||
env_base['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
|
||||
ext_process = None
|
||||
|
||||
try:
|
||||
# Step 3: Connect to Chrome and test ad blocking
|
||||
test_script = f'''
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
ext_process, ext_cdp_url = launch_chromium_session(
|
||||
env_base, ext_chrome_dir, ext_crawl_id
|
||||
)
|
||||
print(f"Extension Chromium launched: {ext_cdp_url}")
|
||||
|
||||
(async () => {{
|
||||
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
|
||||
# Check that extension was loaded
|
||||
extensions_file = ext_chrome_dir / 'extensions.json'
|
||||
if extensions_file.exists():
|
||||
loaded_exts = json.loads(extensions_file.read_text())
|
||||
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
|
||||
|
||||
// Wait for extension to initialize
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
# Wait for extension to initialize
|
||||
time.sleep(3)
|
||||
|
||||
// Check extension loaded by looking at targets
|
||||
const targets = browser.targets();
|
||||
const extTargets = targets.filter(t =>
|
||||
t.url().startsWith('chrome-extension://') ||
|
||||
t.type() === 'service_worker' ||
|
||||
t.type() === 'background_page'
|
||||
);
|
||||
console.error('Extension targets found:', extTargets.length);
|
||||
extTargets.forEach(t => console.error(' -', t.type(), t.url().substring(0, 60)));
|
||||
|
||||
const page = await browser.newPage();
|
||||
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
|
||||
await page.setViewport({{ width: 1440, height: 900 }});
|
||||
|
||||
console.error('Navigating to {TEST_URL}...');
|
||||
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }});
|
||||
|
||||
// Wait for the test page to run its checks
|
||||
await new Promise(r => setTimeout(r, 5000));
|
||||
|
||||
// The d3ward test page shows blocked percentage
|
||||
const result = await page.evaluate(() => {{
|
||||
const scoreEl = document.querySelector('#score');
|
||||
const score = scoreEl ? scoreEl.textContent : null;
|
||||
const blockedItems = document.querySelectorAll('.blocked').length;
|
||||
const totalItems = document.querySelectorAll('.testlist li').length;
|
||||
return {{
|
||||
score,
|
||||
blockedItems,
|
||||
totalItems,
|
||||
percentBlocked: totalItems > 0 ? Math.round((blockedItems / totalItems) * 100) : 0
|
||||
}};
|
||||
}});
|
||||
|
||||
console.error('Ad blocking result:', JSON.stringify(result));
|
||||
browser.disconnect();
|
||||
console.log(JSON.stringify(result));
|
||||
}})();
|
||||
'''
|
||||
script_path = tmpdir / 'test_ublock.js'
|
||||
script_path.write_text(test_script)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(script_path)],
|
||||
cwd=str(tmpdir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=10
|
||||
ext_result = check_ad_blocking(
|
||||
ext_cdp_url, TEST_URL, env_base, tmpdir
|
||||
)
|
||||
|
||||
print(f"stderr: {result.stderr}")
|
||||
print(f"stdout: {result.stdout}")
|
||||
|
||||
assert result.returncode == 0, f"Test failed: {result.stderr}"
|
||||
|
||||
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
|
||||
assert output_lines, f"No JSON output: {result.stdout}"
|
||||
|
||||
test_result = json.loads(output_lines[-1])
|
||||
|
||||
# uBlock should block most ad domains on the test page
|
||||
assert test_result['percentBlocked'] >= 50, \
|
||||
f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}"
|
||||
print(f"Extension result: {ext_result['adElementsVisible']} visible ads "
|
||||
f"(found {ext_result['adElementsFound']} ad elements)")
|
||||
|
||||
finally:
|
||||
# Clean up Chrome
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
if ext_process:
|
||||
kill_chromium_session(ext_process, ext_chrome_dir)
|
||||
|
||||
# ============================================================
|
||||
# STEP 4: Compare results
|
||||
# ============================================================
|
||||
print("\n" + "="*60)
|
||||
print("STEP 4: COMPARISON")
|
||||
print("="*60)
|
||||
print(f"Baseline (no extension): {baseline_result['adElementsVisible']} visible ads")
|
||||
print(f"With extension: {ext_result['adElementsVisible']} visible ads")
|
||||
|
||||
# Calculate reduction in visible ads
|
||||
ads_blocked = baseline_result['adElementsVisible'] - ext_result['adElementsVisible']
|
||||
reduction_percent = (ads_blocked / baseline_result['adElementsVisible'] * 100) if baseline_result['adElementsVisible'] > 0 else 0
|
||||
|
||||
print(f"Reduction: {ads_blocked} fewer visible ads ({reduction_percent:.0f}% reduction)")
|
||||
|
||||
# Extension should significantly reduce visible ads
|
||||
assert ext_result['adElementsVisible'] < baseline_result['adElementsVisible'], \
|
||||
f"uBlock should reduce visible ads.\n" \
|
||||
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
|
||||
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
|
||||
f"Expected fewer ads with extension."
|
||||
|
||||
# Extension should block at least 30% of ads
|
||||
assert reduction_percent >= 30, \
|
||||
f"uBlock should block at least 30% of ads.\n" \
|
||||
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
|
||||
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
|
||||
f"Reduction: only {reduction_percent:.0f}% (expected at least 30%)"
|
||||
|
||||
print(f"\n✓ SUCCESS: uBlock correctly blocks ads!")
|
||||
print(f" - Baseline: {baseline_result['adElementsVisible']} visible ads")
|
||||
print(f" - With extension: {ext_result['adElementsVisible']} visible ads")
|
||||
print(f" - Blocked: {ads_blocked} ads ({reduction_percent:.0f}% reduction)")
|
||||
|
||||
@@ -133,7 +133,7 @@ This plugin provides shared Chrome infrastructure for other plugins. It manages
|
||||
chrome/
|
||||
├── on_Crawl__00_chrome_install_config.py # Configure Chrome settings
|
||||
├── on_Crawl__00_chrome_install.py # Install Chrome binary
|
||||
├── on_Crawl__20_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg)
|
||||
├── on_Crawl__30_chrome_launch.bg.js # Launch Chrome (Crawl-level, bg)
|
||||
├── on_Snapshot__20_chrome_tab.bg.js # Open tab (Snapshot-level, bg)
|
||||
├── on_Snapshot__30_chrome_navigate.js # Navigate to URL (foreground)
|
||||
├── on_Snapshot__45_chrome_tab_cleanup.py # Close tab, kill bg hooks
|
||||
|
||||
Reference in New Issue
Block a user