mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Consolidate Chrome test helpers across all plugin tests
- Add setup_test_env, launch_chromium_session, kill_chromium_session to chrome_test_helpers.py for extension tests - Add chromium_session context manager for cleaner test code - Refactor ublock, istilldontcareaboutcookies, twocaptcha tests to use shared helpers (~450 lines removed) - Refactor screenshot, dom, pdf tests to use shared get_test_env and get_lib_dir (~60 lines removed) - Net reduction: 228 lines of duplicate code
This commit is contained in:
@@ -6,19 +6,35 @@ duplication across test files. It uses the JavaScript utilities from chrome_util
|
||||
where appropriate.
|
||||
|
||||
Usage:
|
||||
# For simple tests (screenshot, dom, pdf, etc.):
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_lib_dir,
|
||||
find_chromium_binary,
|
||||
)
|
||||
|
||||
# For extension tests (ublock, istilldontcareaboutcookies, twocaptcha):
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
)
|
||||
|
||||
# For tab-based tests (infiniscroll, modalcloser):
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_chrome_session,
|
||||
cleanup_chrome,
|
||||
find_chromium_binary,
|
||||
get_node_modules_dir,
|
||||
chrome_session,
|
||||
)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Optional
|
||||
from contextlib import contextmanager
|
||||
@@ -29,34 +45,48 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
|
||||
|
||||
# Hook script locations
|
||||
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
|
||||
|
||||
|
||||
def get_lib_dir() -> Path:
|
||||
"""Get LIB_DIR for tests, checking env first then ArchiveBox config.
|
||||
|
||||
Returns the path to the lib directory, checking:
|
||||
1. LIB_DIR environment variable
|
||||
2. ArchiveBox config STORAGE_CONFIG.LIB_DIR
|
||||
"""
|
||||
if os.environ.get('LIB_DIR'):
|
||||
return Path(os.environ['LIB_DIR'])
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
|
||||
def get_node_modules_dir() -> Path:
|
||||
"""Get NODE_MODULES_DIR for tests, checking env first.
|
||||
|
||||
Returns the path to the node_modules directory, checking:
|
||||
1. NODE_MODULES_DIR environment variable
|
||||
2. Computed from LIB_DIR via ArchiveBox config
|
||||
2. Computed from LIB_DIR
|
||||
"""
|
||||
if os.environ.get('NODE_MODULES_DIR'):
|
||||
return Path(os.environ['NODE_MODULES_DIR'])
|
||||
# Otherwise compute from LIB_DIR
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
lib_dir = get_lib_dir()
|
||||
return lib_dir / 'npm' / 'node_modules'
|
||||
|
||||
|
||||
def get_test_env() -> dict:
|
||||
"""Get environment dict with NODE_MODULES_DIR set correctly for tests.
|
||||
"""Get environment dict with NODE_MODULES_DIR and LIB_DIR set correctly for tests.
|
||||
|
||||
Returns a copy of os.environ with NODE_MODULES_DIR added/updated.
|
||||
Use this for all subprocess calls in plugin tests.
|
||||
Returns a copy of os.environ with NODE_MODULES_DIR and LIB_DIR added/updated.
|
||||
Use this for all subprocess calls in simple plugin tests (screenshot, dom, pdf).
|
||||
"""
|
||||
env = os.environ.copy()
|
||||
lib_dir = get_lib_dir()
|
||||
env['LIB_DIR'] = str(lib_dir)
|
||||
env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
|
||||
return env
|
||||
|
||||
@@ -113,6 +143,219 @@ def get_extensions_dir() -> str:
|
||||
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Extension Test Helpers
|
||||
# Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for extension tests.
|
||||
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
.bin/
|
||||
node_modules/
|
||||
personas/
|
||||
Default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
|
||||
Args:
|
||||
tmpdir: Base temporary directory for the test
|
||||
|
||||
Returns:
|
||||
Environment dict with all paths set, or pytest.skip() if Chrome install fails
|
||||
"""
|
||||
import pytest
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
# Only set headless if not already in environment (allow override for debugging)
|
||||
if 'CHROME_HEADLESS' not in os.environ:
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
|
||||
|
||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple[subprocess.Popen, str]:
|
||||
"""Launch Chromium and return (process, cdp_url).
|
||||
|
||||
This launches Chrome using the chrome launch hook and waits for the CDP URL
|
||||
to become available. Use this for extension tests that need direct CDP access.
|
||||
|
||||
Args:
|
||||
env: Environment dict (from setup_test_env)
|
||||
chrome_dir: Directory for Chrome to write its files (cdp_url.txt, chrome.pid, etc.)
|
||||
crawl_id: ID for the crawl
|
||||
|
||||
Returns:
|
||||
Tuple of (chrome_launch_process, cdp_url)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If Chrome fails to launch or CDP URL not available after 20s
|
||||
"""
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not cdp_url:
|
||||
chrome_launch_process.kill()
|
||||
raise RuntimeError("Chromium CDP URL not found after 20s")
|
||||
|
||||
return chrome_launch_process, cdp_url
|
||||
|
||||
|
||||
def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None:
|
||||
"""Clean up Chromium process launched by launch_chromium_session.
|
||||
|
||||
Args:
|
||||
chrome_launch_process: The Popen object from launch_chromium_session
|
||||
chrome_dir: The chrome directory containing chrome.pid
|
||||
"""
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except Exception:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
@contextmanager
|
||||
def chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
||||
"""Context manager for Chromium sessions with automatic cleanup.
|
||||
|
||||
Usage:
|
||||
with chromium_session(env, chrome_dir, 'test-crawl') as (process, cdp_url):
|
||||
# Use cdp_url to connect with puppeteer
|
||||
pass
|
||||
# Chromium automatically cleaned up
|
||||
|
||||
Args:
|
||||
env: Environment dict (from setup_test_env)
|
||||
chrome_dir: Directory for Chrome files
|
||||
crawl_id: ID for the crawl
|
||||
|
||||
Yields:
|
||||
Tuple of (chrome_launch_process, cdp_url)
|
||||
"""
|
||||
chrome_launch_process = None
|
||||
try:
|
||||
chrome_launch_process, cdp_url = launch_chromium_session(env, chrome_dir, crawl_id)
|
||||
yield chrome_launch_process, cdp_url
|
||||
finally:
|
||||
if chrome_launch_process:
|
||||
kill_chromium_session(chrome_launch_process, chrome_dir)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Tab-based Test Helpers
|
||||
# Used by tab-based tests (infiniscroll, modalcloser)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def setup_chrome_session(
|
||||
tmpdir: Path,
|
||||
crawl_id: str = 'test-crawl',
|
||||
|
||||
@@ -20,6 +20,11 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_lib_dir,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
@@ -27,22 +32,9 @@ DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
|
||||
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
|
||||
@@ -14,6 +14,14 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None)
|
||||
@@ -124,107 +132,6 @@ def test_no_configuration_required():
|
||||
assert "API" not in (result.stdout + result.stderr) or result.returncode == 0
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for tests.
|
||||
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
.bin/
|
||||
node_modules/
|
||||
personas/
|
||||
Default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
"""
|
||||
import platform
|
||||
from datetime import datetime
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
|
||||
TEST_URL = 'https://www.filmin.es/'
|
||||
|
||||
|
||||
@@ -420,54 +327,6 @@ const puppeteer = require('puppeteer-core');
|
||||
pass
|
||||
|
||||
|
||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
||||
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not cdp_url:
|
||||
chrome_launch_process.kill()
|
||||
raise RuntimeError("Chromium CDP URL not found after 20s")
|
||||
|
||||
return chrome_launch_process, cdp_url
|
||||
|
||||
|
||||
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
|
||||
"""Clean up Chromium process."""
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
||||
"""Check if cookie consent elements are visible on a page.
|
||||
|
||||
|
||||
@@ -21,6 +21,11 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_lib_dir,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
@@ -28,22 +33,9 @@ PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
|
||||
@@ -20,28 +20,20 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_lib_dir,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Get LIB_DIR for NODE_MODULES_DIR
|
||||
def get_lib_dir():
|
||||
"""Get LIB_DIR for tests."""
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
||||
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
def get_test_env():
|
||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
||||
env['LIB_DIR'] = str(LIB_DIR)
|
||||
return env
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
|
||||
@@ -16,184 +16,25 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
|
||||
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
|
||||
TEST_URL = 'https://2captcha.com/demo/recaptcha-v2'
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for tests.
|
||||
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
.bin/
|
||||
node_modules/
|
||||
personas/
|
||||
default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
"""
|
||||
import platform
|
||||
from datetime import datetime
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
# Only set headless if not already in environment (allow override for debugging)
|
||||
if 'CHROME_HEADLESS' not in os.environ:
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
|
||||
|
||||
def launch_chrome(env: dict, chrome_dir: Path, crawl_id: str):
|
||||
"""Launch Chromium and return (process, cdp_url)."""
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
cdp_url = None
|
||||
extensions_ready = False
|
||||
for _ in range(30):
|
||||
if process.poll() is not None:
|
||||
stdout, stderr = process.communicate()
|
||||
raise RuntimeError(f"Chromium failed:\n{stdout}\n{stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
ext_file = chrome_dir / 'extensions.json'
|
||||
if cdp_file.exists() and not cdp_url:
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
if ext_file.exists():
|
||||
extensions_ready = True
|
||||
if cdp_url and extensions_ready:
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not cdp_url:
|
||||
process.kill()
|
||||
stdout, stderr = process.communicate()
|
||||
raise RuntimeError(f"CDP URL not found after 30s.\nstdout: {stdout}\nstderr: {stderr}")
|
||||
|
||||
# Print chrome launch hook output for debugging
|
||||
import select
|
||||
if hasattr(select, 'poll'):
|
||||
# Read any available stderr without blocking
|
||||
import fcntl
|
||||
import os as os_module
|
||||
fd = process.stderr.fileno()
|
||||
fl = fcntl.fcntl(fd, fcntl.F_GETFL)
|
||||
fcntl.fcntl(fd, fcntl.F_SETFL, fl | os_module.O_NONBLOCK)
|
||||
try:
|
||||
stderr_output = process.stderr.read()
|
||||
if stderr_output:
|
||||
print(f"[Chrome Launch Hook Output]\n{stderr_output}")
|
||||
except:
|
||||
pass
|
||||
|
||||
return process, cdp_url
|
||||
|
||||
|
||||
def kill_chrome(process, chrome_dir: Path):
|
||||
"""Kill Chromium process."""
|
||||
try:
|
||||
process.send_signal(signal.SIGTERM)
|
||||
process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
pid_file = chrome_dir / 'chrome.pid'
|
||||
if pid_file.exists():
|
||||
try:
|
||||
os.kill(int(pid_file.read_text().strip()), signal.SIGKILL)
|
||||
except:
|
||||
pass
|
||||
# Alias for backward compatibility with existing test names
|
||||
launch_chrome = launch_chromium_session
|
||||
kill_chrome = kill_chromium_session
|
||||
|
||||
|
||||
class TestTwoCaptcha:
|
||||
|
||||
@@ -12,6 +12,14 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
setup_test_env,
|
||||
launch_chromium_session,
|
||||
kill_chromium_session,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
PLUGINS_ROOT,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None)
|
||||
@@ -157,64 +165,6 @@ def test_large_extension_size():
|
||||
assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
||||
|
||||
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
|
||||
|
||||
def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str):
|
||||
"""Launch Chromium and return (process, cdp_url) or raise on failure."""
|
||||
import signal
|
||||
import time
|
||||
|
||||
chrome_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chromium to launch and CDP URL to be available
|
||||
cdp_url = None
|
||||
for i in range(20):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
cdp_file = chrome_dir / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
cdp_url = cdp_file.read_text().strip()
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
if not cdp_url:
|
||||
chrome_launch_process.kill()
|
||||
raise RuntimeError("Chromium CDP URL not found after 20s")
|
||||
|
||||
return chrome_launch_process, cdp_url
|
||||
|
||||
|
||||
def kill_chromium_session(chrome_launch_process, chrome_dir: Path):
|
||||
"""Clean up Chromium process."""
|
||||
import signal
|
||||
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
chrome_pid_file = chrome_dir / 'chrome.pid'
|
||||
if chrome_pid_file.exists():
|
||||
try:
|
||||
chrome_pid = int(chrome_pid_file.read_text().strip())
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict:
|
||||
"""Check ad blocking effectiveness by counting ad elements on page.
|
||||
|
||||
@@ -350,103 +300,6 @@ const puppeteer = require('puppeteer-core');
|
||||
return json.loads(output_lines[-1])
|
||||
|
||||
|
||||
def setup_test_env(tmpdir: Path) -> dict:
|
||||
"""Set up isolated data/lib directory structure for tests.
|
||||
|
||||
Creates structure matching real ArchiveBox data dir:
|
||||
<tmpdir>/data/
|
||||
lib/
|
||||
arm64-darwin/ (or x86_64-linux, etc.)
|
||||
npm/
|
||||
.bin/
|
||||
node_modules/
|
||||
personas/
|
||||
default/
|
||||
chrome_extensions/
|
||||
users/
|
||||
testuser/
|
||||
crawls/
|
||||
snapshots/
|
||||
|
||||
Calls chrome install hook which handles puppeteer-core and chromium installation.
|
||||
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
|
||||
"""
|
||||
import platform
|
||||
from datetime import datetime
|
||||
|
||||
# Determine machine type (matches archivebox.config.paths.get_machine_type())
|
||||
machine = platform.machine().lower()
|
||||
system = platform.system().lower()
|
||||
if machine in ('arm64', 'aarch64'):
|
||||
machine = 'arm64'
|
||||
elif machine in ('x86_64', 'amd64'):
|
||||
machine = 'x86_64'
|
||||
machine_type = f"{machine}-{system}"
|
||||
|
||||
# Create proper directory structure matching real ArchiveBox layout
|
||||
data_dir = tmpdir / 'data'
|
||||
lib_dir = data_dir / 'lib' / machine_type
|
||||
npm_dir = lib_dir / 'npm'
|
||||
npm_bin_dir = npm_dir / '.bin'
|
||||
node_modules_dir = npm_dir / 'node_modules'
|
||||
|
||||
# Extensions go under personas/Default/
|
||||
chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions'
|
||||
|
||||
# User data goes under users/{username}/
|
||||
date_str = datetime.now().strftime('%Y%m%d')
|
||||
users_dir = data_dir / 'users' / 'testuser'
|
||||
crawls_dir = users_dir / 'crawls' / date_str
|
||||
snapshots_dir = users_dir / 'snapshots' / date_str
|
||||
|
||||
# Create all directories
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
npm_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
crawls_dir.mkdir(parents=True, exist_ok=True)
|
||||
snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build complete env dict
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
'DATA_DIR': str(data_dir),
|
||||
'LIB_DIR': str(lib_dir),
|
||||
'MACHINE_TYPE': machine_type,
|
||||
'NPM_BIN_DIR': str(npm_bin_dir),
|
||||
'NODE_MODULES_DIR': str(node_modules_dir),
|
||||
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir),
|
||||
'CRAWLS_DIR': str(crawls_dir),
|
||||
'SNAPSHOTS_DIR': str(snapshots_dir),
|
||||
})
|
||||
|
||||
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
|
||||
result = subprocess.run(
|
||||
['python', str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True, text=True, timeout=120, env=env
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Chrome install hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output to get CHROME_BINARY
|
||||
chrome_binary = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get('type') == 'Binary' and data.get('abspath'):
|
||||
chrome_binary = data['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if not chrome_binary or not Path(chrome_binary).exists():
|
||||
pytest.skip(f"Chromium binary not found: {chrome_binary}")
|
||||
|
||||
env['CHROME_BINARY'] = chrome_binary
|
||||
return env
|
||||
|
||||
|
||||
# Test URL: Yahoo has many ads that uBlock should block
|
||||
TEST_URL = 'https://www.yahoo.com/'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user