Reduce code duplication between Chrome utilities (#1737)

This change consolidates duplicated logic between chrome_utils.js and
extension installer hooks, as well as between Python plugin tests:

JavaScript changes:
- Add getExtensionsDir() to centralize extension directory path
calculation
- Add installExtensionWithCache() to handle extension install + cache
workflow
- Add CLI commands for new utilities
- Refactor all 3 extension installers (ublock,
istilldontcareaboutcookies, twocaptcha) to use shared utilities,
reducing each from ~115 lines to ~60
- Update chrome_launch hook to use getExtensionsDir()

Python test changes:
- Add chrome_test_helpers.py with shared Chrome session management
utilities
- Refactor infiniscroll and modalcloser tests to use shared helpers
- setup_chrome_session(), cleanup_chrome(), get_test_env() now
centralized
- Add chrome_session() context manager for automatic cleanup

Net result: ~208 lines of duplicated code removed while maintaining the
same functionality.

<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->

# Summary

<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->

# Related issues

<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
This commit is contained in:
Nick Sweeting
2025-12-31 00:19:44 -08:00
committed by GitHub
8 changed files with 469 additions and 401 deletions

View File

@@ -1312,6 +1312,99 @@ function findChromium() {
return null;
}
// ============================================================================
// Shared Extension Installer Utilities
// ============================================================================
/**
 * Resolve the directory where Chrome extensions are stored.
 *
 * Single place where this path is computed; used by the extension installer
 * helpers and the chrome launch hook.
 *
 * Resolution order:
 *   1. CHROME_EXTENSIONS_DIR, when getEnv() returns a truthy value (used verbatim)
 *   2. <DATA_DIR>/personas/<ACTIVE_PERSONA>/chrome_extensions, where getEnv() is
 *      asked for DATA_DIR with default './data' and ACTIVE_PERSONA with default
 *      'Default'
 *
 * The result is not resolved against the working directory, so it is only
 * absolute when the values it is built from are absolute.
 *
 * @returns {string} - Path to the extensions directory
 */
function getExtensionsDir() {
    const dataRoot = getEnv('DATA_DIR', './data');
    const personaName = getEnv('ACTIVE_PERSONA', 'Default');
    const overrideDir = getEnv('CHROME_EXTENSIONS_DIR');

    // An explicit override wins over the persona-derived location.
    if (overrideDir) {
        return overrideDir;
    }

    return path.join(dataRoot, 'personas', personaName, 'chrome_extensions');
}
/**
 * Install a Chrome extension, reusing previously cached metadata when possible.
 *
 * Main entry point for extension installer hooks. Steps:
 *   1. Look for `<extensionsDir>/<name>.extension.json`. The cached metadata is
 *      reused only when it parses to an object with a non-empty string
 *      `unpacked_path`, that directory still contains a manifest.json, and
 *      (when both sides carry one) its `webstore_id` matches the requested
 *      extension. A cache written for a different extension under the same
 *      name is therefore treated as stale instead of being returned.
 *   2. Otherwise install via loadOrInstallExtension().
 *   3. Write the resulting metadata back to the cache file. This is best
 *      effort: a failed write only produces a warning.
 *
 * Warnings and errors are always printed; `quiet` only silences the
 * informational console.log lines.
 *
 * @param {Object} extension - Extension metadata object
 * @param {string} extension.webstore_id - Chrome Web Store extension ID
 * @param {string} extension.name - Human-readable extension name (used for cache file)
 * @param {Object} [options] - Options (null/undefined is treated as {})
 * @param {string} [options.extensionsDir] - Override extensions directory
 * @param {boolean} [options.quiet=false] - Suppress info logging
 * @returns {Promise<Object|null>} - Cached or freshly installed extension
 *   metadata, or null when loadOrInstallExtension() returns a falsy value.
 *   Errors thrown by loadOrInstallExtension() are not caught here.
 */
async function installExtensionWithCache(extension, options = {}) {
    const {
        extensionsDir = getExtensionsDir(),
        quiet = false,
    } = options || {};

    const cacheFile = path.join(extensionsDir, `${extension.name}.extension.json`);
    const corruptedWarning = `[⚠️] Extension cache corrupted for ${extension.name}, re-installing...`;

    // Check if extension is already cached and valid
    if (fs.existsSync(cacheFile)) {
        try {
            const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));

            // Validate the shape explicitly: an empty unpacked_path would make
            // path.join() probe "manifest.json" in the current working directory.
            const hasUnpackedPath = cached !== null &&
                typeof cached === 'object' &&
                typeof cached.unpacked_path === 'string' &&
                cached.unpacked_path !== '';

            if (!hasUnpackedPath) {
                console.warn(corruptedWarning);
            } else {
                // The cache file is keyed by name only, so make sure it was
                // written for the same webstore extension before trusting it.
                const sameExtension = !cached.webstore_id ||
                    !extension.webstore_id ||
                    cached.webstore_id === extension.webstore_id;
                const manifestPath = path.join(cached.unpacked_path, 'manifest.json');

                if (sameExtension && fs.existsSync(manifestPath)) {
                    if (!quiet) {
                        console.log(`[*] ${extension.name} extension already installed (using cache)`);
                    }
                    return cached;
                }
            }
        } catch (e) {
            // Cache file unreadable or not valid JSON, re-install
            console.warn(corruptedWarning);
        }
    }

    // Install extension
    if (!quiet) {
        console.log(`[*] Installing ${extension.name} extension...`);
    }

    const installedExt = await loadOrInstallExtension(extension, extensionsDir);

    if (!installedExt) {
        console.error(`[❌] Failed to install ${extension.name} extension`);
        return null;
    }

    // Write cache file (best effort: the install itself already succeeded)
    try {
        await fs.promises.mkdir(extensionsDir, { recursive: true });
        await fs.promises.writeFile(cacheFile, JSON.stringify(installedExt, null, 2));
        if (!quiet) {
            console.log(`[+] Extension metadata written to ${cacheFile}`);
        }
    } catch (e) {
        console.warn(`[⚠️] Failed to write cache file: ${e.message}`);
    }

    if (!quiet) {
        console.log(`[+] ${extension.name} extension installed`);
    }

    return installedExt;
}
// Export all functions
module.exports = {
// Environment helpers
@@ -1349,6 +1442,9 @@ module.exports = {
getExtensionPaths,
waitForExtensionTarget,
getExtensionTargets,
// Shared extension installer utilities
getExtensionsDir,
installExtensionWithCache,
// Deprecated - use enableExtensions option instead
getExtensionLaunchArgs,
};
@@ -1371,6 +1467,8 @@ if (require.main === module) {
console.log(' loadExtensionManifest <path>');
console.log(' getExtensionLaunchArgs <extensions_json>');
console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
console.log(' getExtensionsDir');
console.log(' installExtensionWithCache <webstore_id> <name>');
process.exit(1);
}
@@ -1483,6 +1581,26 @@ if (require.main === module) {
break;
}
case 'getExtensionsDir': {
console.log(getExtensionsDir());
break;
}
case 'installExtensionWithCache': {
const [webstore_id, name] = commandArgs;
if (!webstore_id || !name) {
console.error('Usage: installExtensionWithCache <webstore_id> <name>');
process.exit(1);
}
const ext = await installExtensionWithCache({ webstore_id, name });
if (ext) {
console.log(JSON.stringify(ext, null, 2));
} else {
process.exit(1);
}
break;
}
default:
console.error(`Unknown command: ${command}`);
process.exit(1);

View File

@@ -38,6 +38,7 @@ const {
killChrome,
getEnv,
writePidWithMtime,
getExtensionsDir,
} = require('./chrome_utils.js');
// Extractor metadata
@@ -115,8 +116,7 @@ async function main() {
if (version) console.error(`[*] Version: ${version}`);
// Load installed extensions
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
const extensionsDir = getExtensionsDir();
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
if (userDataDir) {

View File

@@ -0,0 +1,276 @@
"""
Shared Chrome test helpers for plugin integration tests.
This module provides common utilities for Chrome-based plugin tests, reducing
duplication across test files. It uses the JavaScript utilities from chrome_utils.js
where appropriate.
Usage:
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
setup_chrome_session,
cleanup_chrome,
find_chromium_binary,
get_node_modules_dir,
)
"""
import os
import signal
import subprocess
import time
from pathlib import Path
from typing import Tuple, Optional
from contextlib import contextmanager
# Plugin directory locations, derived from this file's own path
# (this module is imported as archivebox.plugins.chrome.tests.chrome_test_helpers,
# so two levels up from this file is the chrome plugin directory).
CHROME_PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
# Hook script locations inside the chrome plugin directory.
# The navigate hook is located by glob (its numeric prefix and file extension
# are not hard-coded); it is None when no file matches.
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
# JavaScript utility module that the helpers in this file run through `node`.
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
def get_node_modules_dir() -> Path:
    """Resolve the node_modules directory that plugin tests should use.

    Lookup order:
        1. The NODE_MODULES_DIR environment variable, when set and non-empty.
        2. ``<LIB_DIR>/npm/node_modules``, where LIB_DIR is taken from the
           LIB_DIR environment variable or, failing that, from
           ``STORAGE_CONFIG.LIB_DIR``.

    Returns:
        Path to the node_modules directory (not checked for existence).
    """
    env_override = os.environ.get('NODE_MODULES_DIR')
    if env_override:
        return Path(env_override)

    # Imported only on this path, so the environment override above works
    # without loading the ArchiveBox config module.
    from archivebox.config.common import STORAGE_CONFIG

    lib_root = os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)
    return Path(lib_root) / 'npm' / 'node_modules'
def get_test_env() -> dict:
    """Build the environment mapping to pass to subprocesses in plugin tests.

    Returns:
        A new dict holding a snapshot of ``os.environ`` with NODE_MODULES_DIR
        set (or overwritten) to the value of ``get_node_modules_dir()``.
        Mutating the returned dict does not affect ``os.environ``.
    """
    return {**os.environ, 'NODE_MODULES_DIR': str(get_node_modules_dir())}
def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]:
    """Locate the Chromium binary by running chrome_utils.js ``findChromium``.

    The lookup itself is delegated to the JavaScript side: this helper runs
    ``node chrome_utils.js findChromium <search_dir>`` with a 10 second
    timeout and returns whatever path the command prints.
    NOTE(review): the search order (env override, installed browsers, system
    locations) lives in chrome_utils.js findChromium() - check there.

    Args:
        data_dir: Directory handed to findChromium as its search directory.
            Falls back to the DATA_DIR environment variable, then to '.'.

    Returns:
        The stripped stdout of the command when it exits 0 and prints
        something, otherwise None.
    """
    search_dir = data_dir or os.environ.get('DATA_DIR', '.')
    command = ['node', str(CHROME_UTILS), 'findChromium', str(search_dir)]
    completed = subprocess.run(command, capture_output=True, text=True, timeout=10)

    binary_path = completed.stdout.strip()
    if completed.returncode != 0 or not binary_path:
        return None
    return binary_path
def get_extensions_dir() -> str:
    """Get the Chrome extensions directory via chrome_utils.js getExtensionsDir().

    Runs ``node chrome_utils.js getExtensionsDir`` (10 second timeout, with the
    test environment from ``get_test_env()``) and returns the path it prints.

    If the JS call cannot be used - node cannot be started, the call times
    out, it exits non-zero, or it prints nothing - the path is computed here
    with the same precedence as the JS implementation:
        1. CHROME_EXTENSIONS_DIR env var
        2. DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions
           (DATA_DIR defaults to './data', ACTIVE_PERSONA to 'Default')

    Returns:
        Path to extensions directory
    """
    env = get_test_env()
    try:
        result = subprocess.run(
            ['node', str(CHROME_UTILS), 'getExtensionsDir'],
            capture_output=True,
            text=True,
            timeout=10,
            env=env,
        )
    except (OSError, subprocess.SubprocessError):
        # node missing / not executable, or the call timed out: without this
        # the fallback below could never run for those failures.
        result = None

    if result is not None and result.returncode == 0:
        js_path = result.stdout.strip()
        if js_path:
            return js_path

    # Fallback mirrors getExtensionsDir() in chrome_utils.js, including the
    # explicit CHROME_EXTENSIONS_DIR override.
    override = os.environ.get('CHROME_EXTENSIONS_DIR')
    if override:
        return override
    data_dir = os.environ.get('DATA_DIR', './data')
    persona = os.environ.get('ACTIVE_PERSONA', 'Default')
    return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
def setup_chrome_session(
    tmpdir: Path,
    crawl_id: str = 'test-crawl',
    snapshot_id: str = 'test-snapshot',
    test_url: str = 'about:blank',
    navigate: bool = True,
    timeout: int = 15,
) -> Tuple[subprocess.Popen, int, Path]:
    """Set up a Chrome session with tab and optional navigation.

    Creates ``<tmpdir>/crawl/chrome`` and ``<tmpdir>/snapshot/chrome``, starts
    the chrome launch hook in the former, waits for it to write
    ``cdp_url.txt``, reads the browser PID from ``chrome.pid``, runs the tab
    hook in the latter and, if requested, the navigate hook.

    Navigation is skipped when ``navigate`` is False, when no navigate hook
    was found, or when ``test_url`` is 'about:blank'.

    If anything fails after the launch hook has been started, the launch
    process (and Chrome, once its PID is known) is shut down before the
    exception is re-raised, so a failed setup does not leak processes.

    Args:
        tmpdir: Temporary directory for test files (must already exist)
        crawl_id: ID to use for the crawl
        snapshot_id: ID to use for the snapshot
        test_url: URL to navigate to (if navigate=True)
        navigate: Whether to navigate to the URL after creating tab
        timeout: Seconds to wait for Chrome to start

    Returns:
        Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)

    Raises:
        RuntimeError: If Chrome fails to start, or the tab/navigate hook
            exits non-zero
        subprocess.TimeoutExpired: If the tab hook (60s) or navigate hook
            (120s) does not finish in time
    """
    crawl_dir = Path(tmpdir) / 'crawl'
    crawl_dir.mkdir(exist_ok=True)
    chrome_dir = crawl_dir / 'chrome'
    chrome_dir.mkdir(exist_ok=True)

    env = get_test_env()
    env['CHROME_HEADLESS'] = 'true'

    # Launch Chrome at crawl level.
    # NOTE(review): stdout/stderr are pipes that nothing reads while the hook
    # runs; a very chatty hook could block on a full pipe - confirm the hook's
    # output volume stays small.
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    chrome_pid: Optional[int] = None
    try:
        # Wait for Chrome to launch (polled once per second)
        cdp_url_file = chrome_dir / 'cdp_url.txt'
        for _ in range(timeout):
            if chrome_launch_process.poll() is not None:
                stdout, stderr = chrome_launch_process.communicate()
                raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
            if cdp_url_file.exists():
                break
            time.sleep(1)

        if not cdp_url_file.exists():
            raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")

        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())

        # Create snapshot directory structure
        snapshot_dir = Path(tmpdir) / 'snapshot'
        snapshot_dir.mkdir(exist_ok=True)
        snapshot_chrome_dir = snapshot_dir / 'chrome'
        snapshot_chrome_dir.mkdir(exist_ok=True)

        # Create tab (the tab hook additionally gets CRAWL_OUTPUT_DIR)
        tab_env = env.copy()
        tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
        result = subprocess.run(
            ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
            cwd=str(snapshot_chrome_dir),
            capture_output=True,
            text=True,
            timeout=60,
            env=tab_env
        )
        if result.returncode != 0:
            raise RuntimeError(f"Tab creation failed: {result.stderr}")

        # Navigate to URL if requested
        if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
            result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env
            )
            if result.returncode != 0:
                raise RuntimeError(f"Navigation failed: {result.stderr}")

        return chrome_launch_process, chrome_pid, snapshot_chrome_dir
    except BaseException:
        # Single cleanup point for every failure path (including hook
        # timeouts and KeyboardInterrupt).
        if chrome_pid is not None:
            cleanup_chrome(chrome_launch_process, chrome_pid)
        elif chrome_launch_process.poll() is None:
            # Chrome's PID is not known yet: stop the launch hook itself.
            chrome_launch_process.terminate()
            try:
                chrome_launch_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                chrome_launch_process.kill()
        raise
def cleanup_chrome(chrome_launch_process: Optional[subprocess.Popen], chrome_pid: Optional[int]) -> None:
    """Clean up Chrome processes (best effort, never raises for dead processes).

    Sends SIGTERM to the chrome launch hook and waits up to 5 seconds for it
    to exit; if it is still running after that, it is killed. Afterwards
    SIGKILL is sent to the Chrome PID. Errors caused by processes that are
    already gone are ignored, and a ``None`` process or PID is skipped.

    Args:
        chrome_launch_process: The Popen object for the chrome launch hook
        chrome_pid: The PID of the Chrome process
    """
    if chrome_launch_process is not None:
        try:
            chrome_launch_process.send_signal(signal.SIGTERM)
            chrome_launch_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            # The hook did not exit on SIGTERM: escalate instead of leaving
            # it running after the test.
            try:
                chrome_launch_process.kill()
                chrome_launch_process.wait(timeout=5)
            except Exception:
                pass
        except Exception:
            # Already exited or otherwise not signalable; nothing to do.
            pass

    if chrome_pid:
        try:
            os.kill(chrome_pid, signal.SIGKILL)
        except OSError:
            pass
@contextmanager
def chrome_session(
    tmpdir: Path,
    crawl_id: str = 'test-crawl',
    snapshot_id: str = 'test-snapshot',
    test_url: str = 'about:blank',
    navigate: bool = True,
    timeout: int = 15,
):
    """Context manager wrapping setup_chrome_session() / cleanup_chrome().

    Usage:
        with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir):
            ...  # run tests against the live session
        # cleanup_chrome() has run here, even if the body raised

    All arguments are forwarded unchanged to setup_chrome_session().

    Args:
        tmpdir: Temporary directory for test files
        crawl_id: ID to use for the crawl
        snapshot_id: ID to use for the snapshot
        test_url: URL to navigate to (if navigate=True)
        navigate: Whether to navigate to the URL after creating tab
        timeout: Seconds to wait for Chrome to start

    Yields:
        Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
    """
    # If setup raises, nothing is yielded and this function does no cleanup.
    launch_process, browser_pid, session_dir = setup_chrome_session(
        tmpdir=tmpdir,
        crawl_id=crawl_id,
        snapshot_id=snapshot_id,
        test_url=test_url,
        navigate=navigate,
        timeout=timeout,
    )
    try:
        yield launch_process, browser_pid, session_dir
    finally:
        if launch_process and browser_pid:
            cleanup_chrome(launch_process, browser_pid)

View File

@@ -14,7 +14,6 @@ Tests verify:
import json
import os
import re
import signal
import subprocess
import time
import tempfile
@@ -22,37 +21,19 @@ from pathlib import Path
import pytest
# Import shared Chrome test helpers
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
setup_chrome_session,
cleanup_chrome,
)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
def get_node_modules_dir():
"""Get NODE_MODULES_DIR for tests, checking env first."""
# Check if NODE_MODULES_DIR is already set in environment
if os.environ.get('NODE_MODULES_DIR'):
return Path(os.environ['NODE_MODULES_DIR'])
# Otherwise compute from LIB_DIR
from archivebox.config.common import STORAGE_CONFIG
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
return lib_dir / 'npm' / 'node_modules'
NODE_MODULES_DIR = get_node_modules_dir()
def get_test_env():
"""Get environment with NODE_MODULES_DIR set correctly."""
env = os.environ.copy()
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
return env
def test_hook_script_exists():
"""Verify on_Snapshot hook exists."""
assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found"
@@ -117,95 +98,18 @@ def test_fails_gracefully_without_chrome_session():
f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
def setup_chrome_session(tmpdir):
"""Helper to set up Chrome session with tab and navigation."""
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch
for i in range(15):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
if (chrome_dir / 'cdp_url.txt').exists():
break
time.sleep(1)
if not (chrome_dir / 'cdp_url.txt').exists():
raise RuntimeError("Chrome CDP URL not found after 15s")
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
# Create snapshot directory structure
snapshot_dir = Path(tmpdir) / 'snapshot'
snapshot_dir.mkdir()
snapshot_chrome_dir = snapshot_dir / 'chrome'
snapshot_chrome_dir.mkdir()
# Create tab
tab_env = env.copy()
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll', '--crawl-id=test-infiniscroll'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
env=tab_env
)
if result.returncode != 0:
raise RuntimeError(f"Tab creation failed: {result.stderr}")
# Navigate to URL
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
if result.returncode != 0:
raise RuntimeError(f"Navigation failed: {result.stderr}")
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
def cleanup_chrome(chrome_launch_process, chrome_pid):
"""Helper to clean up Chrome processes."""
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
try:
os.kill(chrome_pid, signal.SIGKILL)
except OSError:
pass
def test_scrolls_page_and_outputs_stats():
"""Integration test: scroll page and verify JSONL output format."""
with tempfile.TemporaryDirectory() as tmpdir:
chrome_launch_process = None
chrome_pid = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-infiniscroll',
snapshot_id='snap-infiniscroll',
test_url=TEST_URL,
)
# Create infiniscroll output directory (sibling to chrome)
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
@@ -265,7 +169,12 @@ def test_config_scroll_limit_honored():
chrome_launch_process = None
chrome_pid = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-scroll-limit',
snapshot_id='snap-limit',
test_url=TEST_URL,
)
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
infiniscroll_dir.mkdir()
@@ -317,7 +226,12 @@ def test_config_timeout_honored():
chrome_launch_process = None
chrome_pid = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-timeout',
snapshot_id='snap-timeout',
test_url=TEST_URL,
)
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
infiniscroll_dir.mkdir()

View File

@@ -17,11 +17,8 @@
* - Works on thousands of websites out of the box
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome/chrome_utils.js');
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {
@@ -29,69 +26,17 @@ const EXTENSION = {
name: 'istilldontcareaboutcookies',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
* Install the I Still Don't Care About Cookies extension
*/
async function installCookiesExtension() {
console.log('[*] Installing I Still Don\'t Care About Cookies extension...');
// Install the extension
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
if (!extension) {
console.error('[❌] Failed to install I Still Don\'t Care About Cookies extension');
return null;
}
console.log('[+] I Still Don\'t Care About Cookies extension installed');
console.log('[+] Cookie banners will be automatically dismissed during archiving');
return extension;
}
/**
* Main entry point - install extension before archiving
*
* Note: This extension works out of the box with no configuration needed.
* It automatically detects and dismisses cookie banners on page load.
*/
/**
* Main entry point - install extension before archiving
*/
async function main() {
// Check if extension is already cached
const cacheFile = path.join(EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
const extension = await installExtensionWithCache(EXTENSION);
if (fs.existsSync(cacheFile)) {
try {
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
console.log('[*] I Still Don\'t Care About Cookies extension already installed (using cache)');
return cached;
}
} catch (e) {
// Cache file corrupted, re-install
console.warn('[⚠️] Extension cache corrupted, re-installing...');
}
}
// Install extension
const extension = await installCookiesExtension();
// Export extension metadata for chrome plugin to load
if (extension) {
// Write extension info to a cache file that chrome plugin can read
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
await fs.promises.writeFile(
cacheFile,
JSON.stringify(extension, null, 2)
);
console.log(`[+] Extension metadata written to ${cacheFile}`);
console.log('[+] Cookie banners will be automatically dismissed during archiving');
}
return extension;
@@ -100,7 +45,6 @@ async function main() {
// Export functions for use by other plugins
module.exports = {
EXTENSION,
installCookiesExtension,
};
// Run if executed directly

View File

@@ -22,38 +22,20 @@ from pathlib import Path
import pytest
# Import shared Chrome test helpers
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
setup_chrome_session,
cleanup_chrome,
)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/'
def get_node_modules_dir():
"""Get NODE_MODULES_DIR for tests, checking env first."""
# Check if NODE_MODULES_DIR is already set in environment
if os.environ.get('NODE_MODULES_DIR'):
return Path(os.environ['NODE_MODULES_DIR'])
# Otherwise compute from LIB_DIR
from archivebox.config.common import STORAGE_CONFIG
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
return lib_dir / 'npm' / 'node_modules'
NODE_MODULES_DIR = get_node_modules_dir()
def get_test_env():
"""Get environment with NODE_MODULES_DIR set correctly."""
env = os.environ.copy()
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
return env
def test_hook_script_exists():
"""Verify on_Snapshot hook exists."""
assert MODALCLOSER_HOOK is not None, "Modalcloser hook not found"
@@ -118,76 +100,6 @@ def test_fails_gracefully_without_chrome_session():
f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
def setup_chrome_session(tmpdir):
"""Helper to set up Chrome session with tab."""
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-modalcloser'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch
for i in range(15):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
if (chrome_dir / 'cdp_url.txt').exists():
break
time.sleep(1)
if not (chrome_dir / 'cdp_url.txt').exists():
raise RuntimeError("Chrome CDP URL not found after 15s")
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
# Create snapshot directory structure
snapshot_dir = Path(tmpdir) / 'snapshot'
snapshot_dir.mkdir()
snapshot_chrome_dir = snapshot_dir / 'chrome'
snapshot_chrome_dir.mkdir()
# Create tab
tab_env = env.copy()
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser', '--crawl-id=test-modalcloser'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
env=tab_env
)
if result.returncode != 0:
raise RuntimeError(f"Tab creation failed: {result.stderr}")
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
def cleanup_chrome(chrome_launch_process, chrome_pid):
"""Helper to clean up Chrome processes."""
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
try:
os.kill(chrome_pid, signal.SIGKILL)
except OSError:
pass
def test_background_script_handles_sigterm():
"""Test that background script runs and handles SIGTERM correctly."""
with tempfile.TemporaryDirectory() as tmpdir:
@@ -195,7 +107,12 @@ def test_background_script_handles_sigterm():
chrome_pid = None
modalcloser_process = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-modalcloser',
snapshot_id='snap-modalcloser',
test_url=TEST_URL,
)
# Create modalcloser output directory (sibling to chrome)
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
@@ -265,7 +182,12 @@ def test_dialog_handler_logs_dialogs():
chrome_pid = None
modalcloser_process = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-dialog',
snapshot_id='snap-dialog',
test_url=TEST_URL,
)
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
modalcloser_dir.mkdir()
@@ -313,7 +235,12 @@ def test_config_poll_interval():
chrome_pid = None
modalcloser_process = None
try:
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
Path(tmpdir),
crawl_id='test-poll',
snapshot_id='snap-poll',
test_url=TEST_URL,
)
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
modalcloser_dir.mkdir()

View File

@@ -16,11 +16,8 @@
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome/chrome_utils.js');
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {
@@ -28,76 +25,25 @@ const EXTENSION = {
name: 'twocaptcha',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
* Install and configure the 2captcha extension
*/
async function installCaptchaExtension() {
console.log('[*] Installing 2captcha extension...');
// Install the extension
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
if (!extension) {
console.error('[❌] Failed to install 2captcha extension');
return null;
}
// Check if API key is configured
const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
} else {
console.log('[+] 2captcha extension installed and API key configured');
}
return extension;
}
/**
* Note: 2captcha configuration is now handled by chrome plugin
* Main entry point - install extension before archiving
*
* Note: 2captcha configuration is handled by on_Crawl__25_configure_twocaptcha_extension_options.js
* during first-time browser setup to avoid repeated configuration on every snapshot.
* The API key is injected via chrome.storage API once per browser session.
*/
/**
* Main entry point - install extension before archiving
*/
async function main() {
// Check if extension is already cached
const cacheFile = path.join(EXTENSIONS_DIR, 'twocaptcha.extension.json');
const extension = await installExtensionWithCache(EXTENSION);
if (fs.existsSync(cacheFile)) {
try {
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
console.log('[*] 2captcha extension already installed (using cache)');
return cached;
}
} catch (e) {
// Cache file corrupted, re-install
console.warn('[⚠️] Extension cache corrupted, re-installing...');
}
}
// Install extension
const extension = await installCaptchaExtension();
// Export extension metadata for chrome plugin to load
if (extension) {
// Write extension info to a cache file that chrome plugin can read
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
await fs.promises.writeFile(
cacheFile,
JSON.stringify(extension, null, 2)
);
console.log(`[+] Extension metadata written to ${cacheFile}`);
// Check if API key is configured
const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
} else {
console.log('[+] 2captcha extension installed and API key configured');
}
}
return extension;
@@ -106,7 +52,6 @@ async function main() {
// Export functions for use by other plugins
module.exports = {
EXTENSION,
installCaptchaExtension,
};
// Run if executed directly

View File

@@ -18,11 +18,8 @@
* - Uses efficient blocking with filter lists
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome/chrome_utils.js');
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {
@@ -30,69 +27,17 @@ const EXTENSION = {
name: 'ublock',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
* Install the uBlock Origin extension
*/
async function installUblockExtension() {
console.log('[*] Installing uBlock Origin extension...');
// Install the extension
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
if (!extension) {
console.error('[❌] Failed to install uBlock Origin extension');
return null;
}
console.log('[+] uBlock Origin extension installed');
console.log('[+] Ads and trackers will be blocked during archiving');
return extension;
}
/**
* Main entry point - install extension before archiving
*
* Note: uBlock Origin works automatically with default filter lists.
* No configuration needed - blocks ads, trackers, and malware domains out of the box.
*/
/**
* Main entry point - install extension before archiving
*/
async function main() {
// Check if extension is already cached
const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');
const extension = await installExtensionWithCache(EXTENSION);
if (fs.existsSync(cacheFile)) {
try {
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
console.log('[*] uBlock Origin extension already installed (using cache)');
return cached;
}
} catch (e) {
// Cache file corrupted, re-install
console.warn('[⚠️] Extension cache corrupted, re-installing...');
}
}
// Install extension
const extension = await installUblockExtension();
// Export extension metadata for chrome plugin to load
if (extension) {
// Write extension info to a cache file that chrome plugin can read
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
await fs.promises.writeFile(
cacheFile,
JSON.stringify(extension, null, 2)
);
console.log(`[+] Extension metadata written to ${cacheFile}`);
console.log('[+] Ads and trackers will be blocked during archiving');
}
return extension;
@@ -101,7 +46,6 @@ async function main() {
// Export functions for use by other plugins
module.exports = {
EXTENSION,
installUblockExtension,
};
// Run if executed directly