extension test fixes

This commit is contained in:
Nick Sweeting
2025-12-30 18:28:14 -08:00
parent dd2302ad92
commit 42d3fb7025
12 changed files with 1512 additions and 688 deletions

View File

@@ -14,7 +14,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_ublock.*'), None)
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
def test_install_script_exists():
@@ -158,26 +158,221 @@ def test_large_extension_size():
PLUGINS_ROOT = PLUGIN_DIR.parent
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
import signal
import time
chrome_dir.mkdir(parents=True, exist_ok=True)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chromium to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
if not cdp_url:
chrome_launch_process.kill()
raise RuntimeError("Chromium CDP URL not found after 20s")
return chrome_launch_process, cdp_url
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
"""Clean up Chromium process."""
import signal
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
"""Check ad blocking effectiveness by counting ad elements on page.
Returns dict with:
- adElementsFound: int - number of ad-related elements found
- adElementsVisible: int - number of visible ad elements
- blockedRequests: int - number of blocked network requests (ads/trackers)
- totalRequests: int - total network requests made
- percentBlocked: int - percentage of ad elements hidden (0-100)
"""
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
// Track network requests
let blockedRequests = 0;
let totalRequests = 0;
const adDomains = ['doubleclick', 'googlesyndication', 'googleadservices', 'facebook.com/tr',
'analytics', 'adservice', 'advertising', 'taboola', 'outbrain', 'criteo',
'amazon-adsystem', 'ads.yahoo', 'gemini.yahoo', 'yimg.com/cv/', 'beap.gemini'];
page.on('request', request => {{
totalRequests++;
const url = request.url().toLowerCase();
if (adDomains.some(d => url.includes(d))) {{
// This is an ad request
}}
}});
page.on('requestfailed', request => {{
const url = request.url().toLowerCase();
if (adDomains.some(d => url.includes(d))) {{
blockedRequests++;
}}
}});
console.error('Navigating to {test_url}...');
await page.goto('{test_url}', {{ waitUntil: 'domcontentloaded', timeout: 60000 }});
// Wait for page to fully render and ads to load
await new Promise(r => setTimeout(r, 5000));
// Check for ad elements in the DOM
const result = await page.evaluate(() => {{
// Common ad-related selectors
const adSelectors = [
// Generic ad containers
'[class*="ad-"]', '[class*="ad_"]', '[class*="-ad"]', '[class*="_ad"]',
'[id*="ad-"]', '[id*="ad_"]', '[id*="-ad"]', '[id*="_ad"]',
'[class*="advertisement"]', '[id*="advertisement"]',
'[class*="sponsored"]', '[id*="sponsored"]',
// Google ads
'ins.adsbygoogle', '[data-ad-client]', '[data-ad-slot]',
// Yahoo specific
'[class*="gemini"]', '[data-beacon]', '[class*="native-ad"]',
'[class*="stream-ad"]', '[class*="LDRB"]', '[class*="ntv-ad"]',
// iframes (often ads)
'iframe[src*="ad"]', 'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]',
// Common ad sizes
'[style*="300px"][style*="250px"]', '[style*="728px"][style*="90px"]',
'[style*="160px"][style*="600px"]', '[style*="320px"][style*="50px"]',
];
let adElementsFound = 0;
let adElementsVisible = 0;
for (const selector of adSelectors) {{
try {{
const elements = document.querySelectorAll(selector);
for (const el of elements) {{
adElementsFound++;
const style = window.getComputedStyle(el);
const rect = el.getBoundingClientRect();
const isVisible = style.display !== 'none' &&
style.visibility !== 'hidden' &&
style.opacity !== '0' &&
rect.width > 0 && rect.height > 0;
if (isVisible) {{
adElementsVisible++;
}}
}}
}} catch (e) {{
// Invalid selector, skip
}}
}}
return {{
adElementsFound,
adElementsVisible,
pageTitle: document.title
}};
}});
result.blockedRequests = blockedRequests;
result.totalRequests = totalRequests;
// Calculate how many ad elements were hidden (found but not visible)
const hiddenAds = result.adElementsFound - result.adElementsVisible;
result.percentBlocked = result.adElementsFound > 0
? Math.round((hiddenAds / result.adElementsFound) * 100)
: 0;
console.error('Ad blocking result:', JSON.stringify(result));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
script_path = script_dir / 'check_ads.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir),
capture_output=True,
text=True,
env=env,
timeout=90
)
if result.returncode != 0:
raise RuntimeError(f"Ad check script failed: {result.stderr}")
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
if not output_lines:
raise RuntimeError(f"No JSON output from ad check: {result.stdout}\nstderr: {result.stderr}")
return json.loads(output_lines[-1])
def setup_test_env(tmpdir: Path) -> dict:
"""Set up isolated data/lib directory structure for tests.
Creates structure like:
Creates structure matching real ArchiveBox data dir:
<tmpdir>/data/
lib/
arm64-darwin/ (or x86_64-linux, etc.)
npm/
bin/
.bin/
node_modules/
chrome_extensions/
personas/
default/
chrome_extensions/
users/
testuser/
crawls/
snapshots/
Calls chrome install hook which handles puppeteer-core and chromium installation.
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
"""
import platform
from datetime import datetime
# Determine machine type (matches archivebox.config.paths.get_machine_type())
machine = platform.machine().lower()
@@ -188,18 +383,28 @@ def setup_test_env(tmpdir: Path) -> dict:
machine = 'x86_64'
machine_type = f"{machine}-{system}"
# Create proper directory structure
# Create proper directory structure matching real ArchiveBox layout
data_dir = tmpdir / 'data'
lib_dir = data_dir / 'lib' / machine_type
npm_dir = lib_dir / 'npm'
npm_bin_dir = npm_dir / 'bin'
npm_bin_dir = npm_dir / '.bin'
node_modules_dir = npm_dir / 'node_modules'
chrome_extensions_dir = data_dir / 'chrome_extensions'
# Extensions go under personas/Default/
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
# User data goes under users/{username}/
date_str = datetime.now().strftime('%Y%m%d')
users_dir = data_dir / 'users' / 'testuser'
crawls_dir = users_dir / 'crawls' / date_str
snapshots_dir = users_dir / 'snapshots' / date_str
# Create all directories
node_modules_dir.mkdir(parents=True, exist_ok=True)
npm_bin_dir.mkdir(parents=True, exist_ok=True)
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
crawls_dir.mkdir(parents=True, exist_ok=True)
snapshots_dir.mkdir(parents=True, exist_ok=True)
# Build complete env dict
env = os.environ.copy()
@@ -210,12 +415,14 @@ def setup_test_env(tmpdir: Path) -> dict:
'NPM_BIN_DIR': str(npm_bin_dir),
'NODE_MODULES_DIR': str(node_modules_dir),
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
'CRAWLS_DIR': str(crawls_dir),
'SNAPSHOTS_DIR': str(snapshots_dir),
})
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
result = subprocess.run(
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True, text=True, timeout=10, env=env
capture_output=True, text=True, timeout=120, env=env
)
if result.returncode != 0:
pytest.skip(f"Chrome install hook failed: {result.stderr}")
@@ -240,8 +447,8 @@ def setup_test_env(tmpdir: Path) -> dict:
return env
# Test URL: ad blocker test page that shows if ads are blocked
TEST_URL = 'https://d3ward.github.io/toolz/adblock.html'
# Test URL: Yahoo has many ads that uBlock should block
TEST_URL = 'https://www.yahoo.com/'
@pytest.mark.timeout(15)
@@ -290,14 +497,18 @@ def test_extension_loads_in_chromium():
print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True)
print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True)
print("[test] Launching Chromium...", flush=True)
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
# Launch Chromium in crawls directory
crawl_id = 'test-ublock'
crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id
crawl_dir.mkdir(parents=True, exist_ok=True)
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True, exist_ok=True)
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
cwd=str(crawl_dir),
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -457,161 +668,177 @@ const puppeteer = require('puppeteer-core');
def test_blocks_ads_on_test_page():
"""Live test: verify uBlock Origin blocks ads on a test page.
Uses Chromium with extensions loaded automatically via chrome hook.
Tests against d3ward's ad blocker test page which checks ad domains.
This test runs TWO browser sessions:
1. WITHOUT extension - verifies ads are NOT blocked (baseline)
2. WITH extension - verifies ads ARE blocked
This ensures we're actually testing the extension's effect, not just
that a test page happens to show ads as blocked.
"""
import signal
import time
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set up isolated env with proper directory structure
env = setup_test_env(tmpdir)
env['CHROME_HEADLESS'] = 'true'
env_base = setup_test_env(tmpdir)
env_base['CHROME_HEADLESS'] = 'true'
ext_dir = Path(env['CHROME_EXTENSIONS_DIR'])
# ============================================================
# STEP 1: BASELINE - Run WITHOUT extension, verify ads are NOT blocked
# ============================================================
print("\n" + "="*60)
print("STEP 1: BASELINE TEST (no extension)")
print("="*60)
data_dir = Path(env_base['DATA_DIR'])
env_no_ext = env_base.copy()
env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions')
(data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True)
# Launch baseline Chromium in crawls directory
baseline_crawl_id = 'baseline-no-ext'
baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id
baseline_crawl_dir.mkdir(parents=True, exist_ok=True)
baseline_chrome_dir = baseline_crawl_dir / 'chrome'
env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir)
baseline_process = None
try:
baseline_process, baseline_cdp_url = launch_chromium_session(
env_no_ext, baseline_chrome_dir, baseline_crawl_id
)
print(f"Baseline Chromium launched: {baseline_cdp_url}")
# Wait a moment for browser to be ready
time.sleep(2)
baseline_result = check_ad_blocking(
baseline_cdp_url, TEST_URL, env_no_ext, tmpdir
)
print(f"Baseline result: {baseline_result['adElementsVisible']} visible ads "
f"(found {baseline_result['adElementsFound']} ad elements)")
finally:
if baseline_process:
kill_chromium_session(baseline_process, baseline_chrome_dir)
# Verify baseline shows ads ARE visible (not blocked)
if baseline_result['adElementsFound'] == 0:
pytest.skip(
f"Cannot test extension: no ad elements found on {TEST_URL}. "
f"The page may have changed or loaded differently."
)
if baseline_result['adElementsVisible'] == 0:
print(f"\nWARNING: Baseline shows 0 visible ads despite finding {baseline_result['adElementsFound']} elements!")
print("This suggests either:")
print(" - There's another ad blocker interfering")
print(" - Network-level ad blocking is in effect")
pytest.skip(
f"Cannot test extension: baseline shows no visible ads "
f"despite finding {baseline_result['adElementsFound']} ad elements."
)
print(f"\n✓ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension")
# ============================================================
# STEP 2: Install the uBlock extension
# ============================================================
print("\n" + "="*60)
print("STEP 2: INSTALLING EXTENSION")
print("="*60)
ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR'])
# Step 1: Install the uBlock extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=15
env=env_base,
timeout=60
)
assert result.returncode == 0, f"Extension install failed: {result.stderr}"
# Verify extension cache was created
cache_file = ext_dir / 'ublock.extension.json'
assert cache_file.exists(), "Extension cache not created"
ext_data = json.loads(cache_file.read_text())
print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}")
# Step 2: Launch Chromium using the chrome hook (loads extensions automatically)
data_dir = Path(env['DATA_DIR'])
crawl_dir = data_dir / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
# ============================================================
# STEP 3: Run WITH extension, verify ads ARE blocked
# ============================================================
print("\n" + "="*60)
print("STEP 3: TEST WITH EXTENSION")
print("="*60)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-ublock'],
cwd=str(crawl_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch and CDP URL to be available
cdp_url = None
for i in range(20):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
cdp_file = chrome_dir / 'cdp_url.txt'
if cdp_file.exists():
cdp_url = cdp_file.read_text().strip()
break
time.sleep(1)
assert cdp_url, "Chrome CDP URL not found after 20s"
print(f"Chrome launched with CDP URL: {cdp_url}")
# Check that extensions were loaded
extensions_file = chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
# Launch extension test Chromium in crawls directory
ext_crawl_id = 'test-with-ext'
ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id
ext_crawl_dir.mkdir(parents=True, exist_ok=True)
ext_chrome_dir = ext_crawl_dir / 'chrome'
env_base['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir)
ext_process = None
try:
# Step 3: Connect to Chrome and test ad blocking
test_script = f'''
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
ext_process, ext_cdp_url = launch_chromium_session(
env_base, ext_chrome_dir, ext_crawl_id
)
print(f"Extension Chromium launched: {ext_cdp_url}")
(async () => {{
const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }});
# Check that extension was loaded
extensions_file = ext_chrome_dir / 'extensions.json'
if extensions_file.exists():
loaded_exts = json.loads(extensions_file.read_text())
print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}")
// Wait for extension to initialize
await new Promise(r => setTimeout(r, 500));
# Wait for extension to initialize
time.sleep(3)
// Check extension loaded by looking at targets
const targets = browser.targets();
const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
console.error('Extension targets found:', extTargets.length);
extTargets.forEach(t => console.error(' -', t.type(), t.url().substring(0, 60)));
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36');
await page.setViewport({{ width: 1440, height: 900 }});
console.error('Navigating to {TEST_URL}...');
await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 60000 }});
// Wait for the test page to run its checks
await new Promise(r => setTimeout(r, 5000));
// The d3ward test page shows blocked percentage
const result = await page.evaluate(() => {{
const scoreEl = document.querySelector('#score');
const score = scoreEl ? scoreEl.textContent : null;
const blockedItems = document.querySelectorAll('.blocked').length;
const totalItems = document.querySelectorAll('.testlist li').length;
return {{
score,
blockedItems,
totalItems,
percentBlocked: totalItems > 0 ? Math.round((blockedItems / totalItems) * 100) : 0
}};
}});
console.error('Ad blocking result:', JSON.stringify(result));
browser.disconnect();
console.log(JSON.stringify(result));
}})();
'''
script_path = tmpdir / 'test_ublock.js'
script_path.write_text(test_script)
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
capture_output=True,
text=True,
env=env,
timeout=10
ext_result = check_ad_blocking(
ext_cdp_url, TEST_URL, env_base, tmpdir
)
print(f"stderr: {result.stderr}")
print(f"stdout: {result.stdout}")
assert result.returncode == 0, f"Test failed: {result.stderr}"
output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')]
assert output_lines, f"No JSON output: {result.stdout}"
test_result = json.loads(output_lines[-1])
# uBlock should block most ad domains on the test page
assert test_result['percentBlocked'] >= 50, \
f"uBlock should block at least 50% of ads, only blocked {test_result['percentBlocked']}%. Result: {test_result}"
print(f"Extension result: {ext_result['adElementsVisible']} visible ads "
f"(found {ext_result['adElementsFound']} ad elements)")
finally:
# Clean up Chrome
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
chrome_pid_file = chrome_dir / 'chrome.pid'
if chrome_pid_file.exists():
try:
chrome_pid = int(chrome_pid_file.read_text().strip())
os.kill(chrome_pid, signal.SIGKILL)
except (OSError, ValueError):
pass
if ext_process:
kill_chromium_session(ext_process, ext_chrome_dir)
# ============================================================
# STEP 4: Compare results
# ============================================================
print("\n" + "="*60)
print("STEP 4: COMPARISON")
print("="*60)
print(f"Baseline (no extension): {baseline_result['adElementsVisible']} visible ads")
print(f"With extension: {ext_result['adElementsVisible']} visible ads")
# Calculate reduction in visible ads
ads_blocked = baseline_result['adElementsVisible'] - ext_result['adElementsVisible']
reduction_percent = (ads_blocked / baseline_result['adElementsVisible'] * 100) if baseline_result['adElementsVisible'] > 0 else 0
print(f"Reduction: {ads_blocked} fewer visible ads ({reduction_percent:.0f}% reduction)")
# Extension should significantly reduce visible ads
assert ext_result['adElementsVisible'] < baseline_result['adElementsVisible'], \
f"uBlock should reduce visible ads.\n" \
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Expected fewer ads with extension."
# Extension should block at least 30% of ads
assert reduction_percent >= 30, \
f"uBlock should block at least 30% of ads.\n" \
f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \
f"With extension: {ext_result['adElementsVisible']} visible ads\n" \
f"Reduction: only {reduction_percent:.0f}% (expected at least 30%)"
print(f"\n✓ SUCCESS: uBlock correctly blocks ads!")
print(f" - Baseline: {baseline_result['adElementsVisible']} visible ads")
print(f" - With extension: {ext_result['adElementsVisible']} visible ads")
print(f" - Blocked: {ads_blocked} ads ({reduction_percent:.0f}% reduction)")