mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Reduce code duplication between Chrome utilities (#1737)
This change consolidates duplicated logic between chrome_utils.js and the extension installer hooks, as well as between the Python plugin tests.

JavaScript changes:
- Add getExtensionsDir() to centralize extension directory path calculation
- Add installExtensionWithCache() to handle the extension install + cache workflow
- Add CLI commands for the new utilities
- Refactor all 3 extension installers (ublock, istilldontcareaboutcookies, twocaptcha) to use the shared utilities, reducing each from ~115 lines to ~60
- Update the chrome_launch hook to use getExtensionsDir()

Python test changes:
- Add chrome_test_helpers.py with shared Chrome session management utilities
- Refactor the infiniscroll and modalcloser tests to use the shared helpers
- setup_chrome_session(), cleanup_chrome(), and get_test_env() are now centralized
- Add a chrome_session() context manager for automatic cleanup

Net result: ~208 lines of code removed while maintaining the same functionality.

<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes. -->

# Summary
<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->

# Related issues
<!-- e.g. #123 or Roadmap goal # https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes these areas
- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
This commit is contained in:
@@ -1312,6 +1312,99 @@ function findChromium() {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Shared Extension Installer Utilities
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/**
 * Get the extensions directory path.
 * Centralized path calculation used by extension installers and chrome launch.
 *
 * Path is derived from environment variables in this priority:
 *   1. CHROME_EXTENSIONS_DIR (explicit override, returned verbatim)
 *   2. DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions (default), with
 *      './data' and 'Default' passed to getEnv() as the respective defaults
 *
 * The result is NOT resolved against the current working directory: when
 * DATA_DIR is unset or relative (or the override is relative), the returned
 * path is relative too. Callers that need an absolute path must resolve it.
 *
 * @returns {string} - Path to the extensions directory (may be relative)
 */
function getExtensionsDir() {
    const dataDir = getEnv('DATA_DIR', './data');
    const persona = getEnv('ACTIVE_PERSONA', 'Default');
    // Any truthy override wins; an empty/unset value falls through to the default layout.
    return getEnv('CHROME_EXTENSIONS_DIR') ||
        path.join(dataDir, 'personas', persona, 'chrome_extensions');
}
|
||||||
|
|
||||||
|
/**
 * Install a Chrome extension with caching support.
 *
 * This is the main entry point for extension installer hooks. It handles:
 * - Checking for cached extension metadata
 * - Installing the extension if not cached
 * - Writing cache file for future runs
 *
 * The cache file is `<extensionsDir>/<extension.name>.extension.json`. A cached
 * entry is only reused when the `unpacked_path` it records still contains a
 * `manifest.json`; otherwise the extension is installed again via
 * loadOrInstallExtension(). A failure to write the cache file is reported as a
 * warning and does not fail the install.
 *
 * @param {Object} extension - Extension metadata object
 * @param {string} extension.webstore_id - Chrome Web Store extension ID
 * @param {string} extension.name - Human-readable extension name (used for cache file)
 * @param {Object} [options] - Options
 * @param {string} [options.extensionsDir] - Override extensions directory
 * @param {boolean} [options.quiet=false] - Suppress info logging (warnings and errors are still printed)
 * @returns {Promise<Object|null>} - Installed (or cached) extension metadata, or null on failure
 */
async function installExtensionWithCache(extension, options = {}) {
    const {
        extensionsDir = getExtensionsDir(),
        quiet = false,
    } = options;

    const cacheFile = path.join(extensionsDir, `${extension.name}.extension.json`);

    // Check if extension is already cached and valid
    if (fs.existsSync(cacheFile)) {
        try {
            const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
            // path.join throws on a missing/non-string unpacked_path, which lands in
            // the catch below and is treated the same as unparseable JSON.
            const manifestPath = path.join(cached.unpacked_path, 'manifest.json');

            if (fs.existsSync(manifestPath)) {
                if (!quiet) {
                    console.log(`[*] ${extension.name} extension already installed (using cache)`);
                }
                return cached;
            }
            // Cache is readable but the unpacked files are gone: fall through and re-install.
        } catch (e) {
            // Cache file corrupted, re-install
            console.warn(`[⚠️] Extension cache corrupted for ${extension.name}, re-installing...`);
        }
    }

    // Install extension
    if (!quiet) {
        console.log(`[*] Installing ${extension.name} extension...`);
    }

    const installedExt = await loadOrInstallExtension(extension, extensionsDir);

    if (!installedExt) {
        console.error(`[❌] Failed to install ${extension.name} extension`);
        return null;
    }

    // Write cache file (best-effort: the install itself already succeeded)
    try {
        await fs.promises.mkdir(extensionsDir, { recursive: true });
        await fs.promises.writeFile(cacheFile, JSON.stringify(installedExt, null, 2));
        if (!quiet) {
            console.log(`[+] Extension metadata written to ${cacheFile}`);
        }
    } catch (e) {
        console.warn(`[⚠️] Failed to write cache file: ${e.message}`);
    }

    if (!quiet) {
        console.log(`[+] ${extension.name} extension installed`);
    }

    return installedExt;
}
|
||||||
|
|
||||||
// Export all functions
|
// Export all functions
|
||||||
module.exports = {
|
module.exports = {
|
||||||
// Environment helpers
|
// Environment helpers
|
||||||
@@ -1349,6 +1442,9 @@ module.exports = {
|
|||||||
getExtensionPaths,
|
getExtensionPaths,
|
||||||
waitForExtensionTarget,
|
waitForExtensionTarget,
|
||||||
getExtensionTargets,
|
getExtensionTargets,
|
||||||
|
// Shared extension installer utilities
|
||||||
|
getExtensionsDir,
|
||||||
|
installExtensionWithCache,
|
||||||
// Deprecated - use enableExtensions option instead
|
// Deprecated - use enableExtensions option instead
|
||||||
getExtensionLaunchArgs,
|
getExtensionLaunchArgs,
|
||||||
};
|
};
|
||||||
@@ -1371,6 +1467,8 @@ if (require.main === module) {
|
|||||||
console.log(' loadExtensionManifest <path>');
|
console.log(' loadExtensionManifest <path>');
|
||||||
console.log(' getExtensionLaunchArgs <extensions_json>');
|
console.log(' getExtensionLaunchArgs <extensions_json>');
|
||||||
console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
|
console.log(' loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
|
||||||
|
console.log(' getExtensionsDir');
|
||||||
|
console.log(' installExtensionWithCache <webstore_id> <name>');
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1483,6 +1581,26 @@ if (require.main === module) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case 'getExtensionsDir': {
|
||||||
|
console.log(getExtensionsDir());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'installExtensionWithCache': {
|
||||||
|
const [webstore_id, name] = commandArgs;
|
||||||
|
if (!webstore_id || !name) {
|
||||||
|
console.error('Usage: installExtensionWithCache <webstore_id> <name>');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
const ext = await installExtensionWithCache({ webstore_id, name });
|
||||||
|
if (ext) {
|
||||||
|
console.log(JSON.stringify(ext, null, 2));
|
||||||
|
} else {
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
console.error(`Unknown command: ${command}`);
|
console.error(`Unknown command: ${command}`);
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ const {
|
|||||||
killChrome,
|
killChrome,
|
||||||
getEnv,
|
getEnv,
|
||||||
writePidWithMtime,
|
writePidWithMtime,
|
||||||
|
getExtensionsDir,
|
||||||
} = require('./chrome_utils.js');
|
} = require('./chrome_utils.js');
|
||||||
|
|
||||||
// Extractor metadata
|
// Extractor metadata
|
||||||
@@ -115,8 +116,7 @@ async function main() {
|
|||||||
if (version) console.error(`[*] Version: ${version}`);
|
if (version) console.error(`[*] Version: ${version}`);
|
||||||
|
|
||||||
// Load installed extensions
|
// Load installed extensions
|
||||||
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
|
const extensionsDir = getExtensionsDir();
|
||||||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
|
|
||||||
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
|
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
|
||||||
|
|
||||||
if (userDataDir) {
|
if (userDataDir) {
|
||||||
|
|||||||
276
archivebox/plugins/chrome/tests/chrome_test_helpers.py
Normal file
276
archivebox/plugins/chrome/tests/chrome_test_helpers.py
Normal file
@@ -0,0 +1,276 @@
|
|||||||
|
"""
|
||||||
|
Shared Chrome test helpers for plugin integration tests.
|
||||||
|
|
||||||
|
This module provides common utilities for Chrome-based plugin tests, reducing
|
||||||
|
duplication across test files. It uses the JavaScript utilities from chrome_utils.js
|
||||||
|
where appropriate.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_test_env,
|
||||||
|
setup_chrome_session,
|
||||||
|
cleanup_chrome,
|
||||||
|
find_chromium_binary,
|
||||||
|
get_node_modules_dir,
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple, Optional
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
|
||||||
|
# Plugin directory locations (this file lives in <plugins>/chrome/tests/)
CHROME_PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent

# Hook script locations
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
# The navigate hook is located by glob; this is None when no matching file exists.
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
|
||||||
|
|
||||||
|
|
||||||
|
def get_node_modules_dir() -> Path:
    """Get NODE_MODULES_DIR for tests, checking env first.

    Returns the path to the node_modules directory, checking:
    1. NODE_MODULES_DIR environment variable
    2. LIB_DIR environment variable (node_modules is LIB_DIR/npm/node_modules)
    3. LIB_DIR from the ArchiveBox STORAGE_CONFIG
    """
    env_node_modules = os.environ.get('NODE_MODULES_DIR')
    if env_node_modules:
        return Path(env_node_modules)

    lib_dir = os.environ.get('LIB_DIR')
    if not lib_dir:
        # Imported lazily so the ArchiveBox config is only loaded when neither
        # environment variable already answers the question.
        from archivebox.config.common import STORAGE_CONFIG
        lib_dir = str(STORAGE_CONFIG.LIB_DIR)
    return Path(lib_dir) / 'npm' / 'node_modules'
|
||||||
|
|
||||||
|
|
||||||
|
def get_test_env() -> dict:
    """Get environment dict with NODE_MODULES_DIR set correctly for tests.

    Returns a copy of os.environ with NODE_MODULES_DIR added/updated from
    get_node_modules_dir(); os.environ itself is not modified.
    Use this for all subprocess calls in plugin tests.
    """
    env = os.environ.copy()
    env['NODE_MODULES_DIR'] = str(get_node_modules_dir())
    return env
|
||||||
|
|
||||||
|
|
||||||
|
def find_chromium_binary(data_dir: Optional[str] = None) -> Optional[str]:
    """Find the Chromium binary using chrome_utils.js findChromium().

    Runs `node chrome_utils.js findChromium <dir>` and returns its trimmed
    stdout. Per this module's notes, findChromium() checks:
    - CHROME_BINARY env var
    - @puppeteer/browsers install locations
    - System Chromium locations
    - Falls back to Chrome (with warning)

    Args:
        data_dir: Directory where chromium was installed (contains chromium/ subdir).
            Defaults to the DATA_DIR environment variable, or '.' if unset.

    Returns:
        Path to Chromium binary or None if not found. None is also returned
        when node cannot be executed or the lookup does not finish within 10s.
    """
    search_dir = data_dir or os.environ.get('DATA_DIR', '.')
    try:
        result = subprocess.run(
            ['node', str(CHROME_UTILS), 'findChromium', str(search_dir)],
            capture_output=True,
            text=True,
            timeout=10
        )
    except (OSError, subprocess.TimeoutExpired):
        # node missing/not executable, or the helper hung: report "not found"
        # as the return contract promises instead of crashing the caller.
        return None

    binary_path = result.stdout.strip()
    if result.returncode == 0 and binary_path:
        return binary_path
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_extensions_dir() -> str:
    """Get the Chrome extensions directory using chrome_utils.js getExtensionsDir().

    Runs `node chrome_utils.js getExtensionsDir` with the test environment and
    returns its trimmed stdout. The path calculation checks:
    - CHROME_EXTENSIONS_DIR env var
    - DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions

    If the JS call cannot be run, times out, fails, or prints nothing, the same
    calculation is done here in Python from os.environ (defaults: DATA_DIR
    './data', ACTIVE_PERSONA 'Default').

    Returns:
        Path to extensions directory
    """
    try:
        result = subprocess.run(
            ['node', str(CHROME_UTILS), 'getExtensionsDir'],
            capture_output=True,
            text=True,
            timeout=10,
            env=get_test_env()
        )
        js_path = result.stdout.strip() if result.returncode == 0 else ''
    except (OSError, subprocess.TimeoutExpired):
        # node missing or hung: fall through to the Python computation below
        js_path = ''
    if js_path:
        return js_path

    # Fallback to default computation if JS call fails. Honor the explicit
    # override first so both code paths agree on the priority order.
    explicit_dir = os.environ.get('CHROME_EXTENSIONS_DIR')
    if explicit_dir:
        return explicit_dir
    data_dir = os.environ.get('DATA_DIR', './data')
    persona = os.environ.get('ACTIVE_PERSONA', 'Default')
    return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
|
||||||
|
|
||||||
|
|
||||||
|
def _terminate_launch_process(process: subprocess.Popen) -> None:
    """Best-effort stop of the chrome launch hook: SIGTERM first, SIGKILL if it lingers."""
    if process.poll() is not None:
        return
    try:
        process.terminate()
        process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        process.kill()
        process.wait()
    except OSError:
        # Process vanished between poll() and the signal; nothing left to stop.
        pass


def setup_chrome_session(
    tmpdir: Path,
    crawl_id: str = 'test-crawl',
    snapshot_id: str = 'test-snapshot',
    test_url: str = 'about:blank',
    navigate: bool = True,
    timeout: int = 15,
) -> Tuple[subprocess.Popen, int, Path]:
    """Set up a Chrome session with tab and optional navigation.

    Creates the directory structure (tmpdir/crawl/chrome and
    tmpdir/snapshot/chrome), launches Chrome via the crawl-level launch hook,
    creates a tab via the tab hook, and optionally navigates to the test URL.
    Navigation is skipped when navigate is False, when no navigate hook was
    found, or when test_url is 'about:blank'.

    If any step after the launch fails, the launch hook (and Chrome, once its
    PID is known) is stopped before the exception propagates, so a failed setup
    does not leave browser processes behind.

    Args:
        tmpdir: Temporary directory for test files (must already exist)
        crawl_id: ID to use for the crawl
        snapshot_id: ID to use for the snapshot
        test_url: URL to navigate to (if navigate=True)
        navigate: Whether to navigate to the URL after creating tab
        timeout: Seconds to wait for Chrome to start (polled once per second)

    Returns:
        Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)

    Raises:
        RuntimeError: If Chrome fails to start or tab creation/navigation fails
        subprocess.TimeoutExpired: If the tab or navigate hook exceeds its time limit
    """
    crawl_dir = Path(tmpdir) / 'crawl'
    crawl_dir.mkdir(exist_ok=True)
    chrome_dir = crawl_dir / 'chrome'
    chrome_dir.mkdir(exist_ok=True)

    env = get_test_env()
    env['CHROME_HEADLESS'] = 'true'

    # Launch Chrome at crawl level
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
        cwd=str(chrome_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )

    # Wait for Chrome to launch: the hook signals readiness by writing cdp_url.txt
    cdp_url_file = chrome_dir / 'cdp_url.txt'
    for _ in range(timeout):
        if chrome_launch_process.poll() is not None:
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if cdp_url_file.exists():
            break
        time.sleep(1)

    if not cdp_url_file.exists():
        # The hook is still running but never became ready; stop it rather than leak it.
        _terminate_launch_process(chrome_launch_process)
        raise RuntimeError(f"Chrome CDP URL not found after {timeout}s")

    try:
        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
    except (OSError, ValueError) as exc:
        _terminate_launch_process(chrome_launch_process)
        raise RuntimeError(f"Chrome PID file missing or invalid: {exc}") from exc

    try:
        # Create snapshot directory structure
        snapshot_dir = Path(tmpdir) / 'snapshot'
        snapshot_dir.mkdir(exist_ok=True)
        snapshot_chrome_dir = snapshot_dir / 'chrome'
        snapshot_chrome_dir.mkdir(exist_ok=True)

        # Create tab (the tab hook additionally receives CRAWL_OUTPUT_DIR)
        tab_env = env.copy()
        tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
        result = subprocess.run(
            ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
            cwd=str(snapshot_chrome_dir),
            capture_output=True,
            text=True,
            timeout=60,
            env=tab_env
        )
        if result.returncode != 0:
            raise RuntimeError(f"Tab creation failed: {result.stderr}")

        # Navigate to URL if requested
        if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
            result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env
            )
            if result.returncode != 0:
                raise RuntimeError(f"Navigation failed: {result.stderr}")
    except Exception:
        # Covers hook failures and hook timeouts alike: never leave Chrome running.
        cleanup_chrome(chrome_launch_process, chrome_pid)
        raise

    return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int) -> None:
    """Clean up Chrome processes.

    Sends SIGTERM to the chrome_launch_process and waits up to 5s for it to
    exit, escalating to a hard kill if it is still running after that. Then
    sends SIGKILL to the Chrome PID. Ignores errors if processes are already dead.

    Args:
        chrome_launch_process: The Popen object for the chrome launch hook
        chrome_pid: The PID of the Chrome process (skipped if None)
    """
    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Hook did not exit on SIGTERM; do not leave it running.
        try:
            chrome_launch_process.kill()
            chrome_launch_process.wait(timeout=5)
        except Exception:
            pass
    except Exception:
        # Best-effort cleanup: the process may already be gone.
        pass

    if chrome_pid is None:
        return
    try:
        os.kill(chrome_pid, signal.SIGKILL)
    except OSError:
        pass
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def chrome_session(
    tmpdir: Path,
    crawl_id: str = 'test-crawl',
    snapshot_id: str = 'test-snapshot',
    test_url: str = 'about:blank',
    navigate: bool = True,
    timeout: int = 15,
):
    """Context manager for Chrome sessions with automatic cleanup.

    Calls setup_chrome_session() with the given arguments, yields its result,
    and calls cleanup_chrome() when the with-block exits, whether normally or
    through an exception. If setup itself raises, there is nothing to clean up
    here and the exception propagates unchanged.

    Usage:
        with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir):
            # Run tests with chrome session
            pass
        # Chrome automatically cleaned up

    Args:
        tmpdir: Temporary directory for test files
        crawl_id: ID to use for the crawl
        snapshot_id: ID to use for the snapshot
        test_url: URL to navigate to (if navigate=True)
        navigate: Whether to navigate to the URL after creating tab
        timeout: Seconds to wait for Chrome to start

    Yields:
        Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir)
    """
    chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
        tmpdir=tmpdir,
        crawl_id=crawl_id,
        snapshot_id=snapshot_id,
        test_url=test_url,
        navigate=navigate,
        timeout=timeout,
    )
    try:
        yield chrome_launch_process, chrome_pid, snapshot_chrome_dir
    finally:
        if chrome_launch_process and chrome_pid:
            cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||||
@@ -14,7 +14,6 @@ Tests verify:
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import signal
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
import tempfile
|
import tempfile
|
||||||
@@ -22,37 +21,19 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
# Import shared Chrome test helpers
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_test_env,
|
||||||
|
setup_chrome_session,
|
||||||
|
cleanup_chrome,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
|
||||||
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
|
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
|
||||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
|
||||||
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
|
|
||||||
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
|
|
||||||
TEST_URL = 'https://www.singsing.movie/'
|
TEST_URL = 'https://www.singsing.movie/'
|
||||||
|
|
||||||
|
|
||||||
def get_node_modules_dir():
|
|
||||||
"""Get NODE_MODULES_DIR for tests, checking env first."""
|
|
||||||
# Check if NODE_MODULES_DIR is already set in environment
|
|
||||||
if os.environ.get('NODE_MODULES_DIR'):
|
|
||||||
return Path(os.environ['NODE_MODULES_DIR'])
|
|
||||||
# Otherwise compute from LIB_DIR
|
|
||||||
from archivebox.config.common import STORAGE_CONFIG
|
|
||||||
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
|
||||||
return lib_dir / 'npm' / 'node_modules'
|
|
||||||
|
|
||||||
|
|
||||||
NODE_MODULES_DIR = get_node_modules_dir()
|
|
||||||
|
|
||||||
|
|
||||||
def get_test_env():
|
|
||||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
|
||||||
env = os.environ.copy()
|
|
||||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
|
||||||
return env
|
|
||||||
|
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
"""Verify on_Snapshot hook exists."""
|
"""Verify on_Snapshot hook exists."""
|
||||||
assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found"
|
assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found"
|
||||||
@@ -117,95 +98,18 @@ def test_fails_gracefully_without_chrome_session():
|
|||||||
f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
|
f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
|
||||||
|
|
||||||
|
|
||||||
def setup_chrome_session(tmpdir):
|
|
||||||
"""Helper to set up Chrome session with tab and navigation."""
|
|
||||||
crawl_dir = Path(tmpdir) / 'crawl'
|
|
||||||
crawl_dir.mkdir()
|
|
||||||
chrome_dir = crawl_dir / 'chrome'
|
|
||||||
chrome_dir.mkdir()
|
|
||||||
|
|
||||||
env = get_test_env()
|
|
||||||
env['CHROME_HEADLESS'] = 'true'
|
|
||||||
|
|
||||||
# Launch Chrome at crawl level
|
|
||||||
chrome_launch_process = subprocess.Popen(
|
|
||||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
|
|
||||||
cwd=str(chrome_dir),
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
text=True,
|
|
||||||
env=env
|
|
||||||
)
|
|
||||||
|
|
||||||
# Wait for Chrome to launch
|
|
||||||
for i in range(15):
|
|
||||||
if chrome_launch_process.poll() is not None:
|
|
||||||
stdout, stderr = chrome_launch_process.communicate()
|
|
||||||
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
|
||||||
if (chrome_dir / 'cdp_url.txt').exists():
|
|
||||||
break
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
if not (chrome_dir / 'cdp_url.txt').exists():
|
|
||||||
raise RuntimeError("Chrome CDP URL not found after 15s")
|
|
||||||
|
|
||||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
|
||||||
|
|
||||||
# Create snapshot directory structure
|
|
||||||
snapshot_dir = Path(tmpdir) / 'snapshot'
|
|
||||||
snapshot_dir.mkdir()
|
|
||||||
snapshot_chrome_dir = snapshot_dir / 'chrome'
|
|
||||||
snapshot_chrome_dir.mkdir()
|
|
||||||
|
|
||||||
# Create tab
|
|
||||||
tab_env = env.copy()
|
|
||||||
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
|
||||||
result = subprocess.run(
|
|
||||||
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll', '--crawl-id=test-infiniscroll'],
|
|
||||||
cwd=str(snapshot_chrome_dir),
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=60,
|
|
||||||
env=tab_env
|
|
||||||
)
|
|
||||||
if result.returncode != 0:
|
|
||||||
raise RuntimeError(f"Tab creation failed: {result.stderr}")
|
|
||||||
|
|
||||||
# Navigate to URL
|
|
||||||
result = subprocess.run(
|
|
||||||
['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
|
|
||||||
cwd=str(snapshot_chrome_dir),
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=120,
|
|
||||||
env=env
|
|
||||||
)
|
|
||||||
if result.returncode != 0:
|
|
||||||
raise RuntimeError(f"Navigation failed: {result.stderr}")
|
|
||||||
|
|
||||||
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup_chrome(chrome_launch_process, chrome_pid):
|
|
||||||
"""Helper to clean up Chrome processes."""
|
|
||||||
try:
|
|
||||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
|
||||||
chrome_launch_process.wait(timeout=5)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
os.kill(chrome_pid, signal.SIGKILL)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def test_scrolls_page_and_outputs_stats():
|
def test_scrolls_page_and_outputs_stats():
|
||||||
"""Integration test: scroll page and verify JSONL output format."""
|
"""Integration test: scroll page and verify JSONL output format."""
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
chrome_launch_process = None
|
chrome_launch_process = None
|
||||||
chrome_pid = None
|
chrome_pid = None
|
||||||
try:
|
try:
|
||||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||||
|
Path(tmpdir),
|
||||||
|
crawl_id='test-infiniscroll',
|
||||||
|
snapshot_id='snap-infiniscroll',
|
||||||
|
test_url=TEST_URL,
|
||||||
|
)
|
||||||
|
|
||||||
# Create infiniscroll output directory (sibling to chrome)
|
# Create infiniscroll output directory (sibling to chrome)
|
||||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||||
@@ -265,7 +169,12 @@ def test_config_scroll_limit_honored():
|
|||||||
chrome_launch_process = None
|
chrome_launch_process = None
|
||||||
chrome_pid = None
|
chrome_pid = None
|
||||||
try:
|
try:
|
||||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||||
|
Path(tmpdir),
|
||||||
|
crawl_id='test-scroll-limit',
|
||||||
|
snapshot_id='snap-limit',
|
||||||
|
test_url=TEST_URL,
|
||||||
|
)
|
||||||
|
|
||||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||||
infiniscroll_dir.mkdir()
|
infiniscroll_dir.mkdir()
|
||||||
@@ -317,7 +226,12 @@ def test_config_timeout_honored():
|
|||||||
chrome_launch_process = None
|
chrome_launch_process = None
|
||||||
chrome_pid = None
|
chrome_pid = None
|
||||||
try:
|
try:
|
||||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||||
|
Path(tmpdir),
|
||||||
|
crawl_id='test-timeout',
|
||||||
|
snapshot_id='snap-timeout',
|
||||||
|
test_url=TEST_URL,
|
||||||
|
)
|
||||||
|
|
||||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||||
infiniscroll_dir.mkdir()
|
infiniscroll_dir.mkdir()
|
||||||
|
|||||||
@@ -17,11 +17,8 @@
|
|||||||
* - Works on thousands of websites out of the box
|
* - Works on thousands of websites out of the box
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const path = require('path');
|
|
||||||
const fs = require('fs');
|
|
||||||
|
|
||||||
// Import extension utilities
|
// Import extension utilities
|
||||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
|
||||||
|
|
||||||
// Extension metadata
|
// Extension metadata
|
||||||
const EXTENSION = {
|
const EXTENSION = {
|
||||||
@@ -29,69 +26,17 @@ const EXTENSION = {
|
|||||||
name: 'istilldontcareaboutcookies',
|
name: 'istilldontcareaboutcookies',
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get extensions directory from environment or use default
|
|
||||||
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
|
||||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Install the I Still Don't Care About Cookies extension
|
|
||||||
*/
|
|
||||||
async function installCookiesExtension() {
|
|
||||||
console.log('[*] Installing I Still Don\'t Care About Cookies extension...');
|
|
||||||
|
|
||||||
// Install the extension
|
|
||||||
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
|
||||||
|
|
||||||
if (!extension) {
|
|
||||||
console.error('[❌] Failed to install I Still Don\'t Care About Cookies extension');
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log('[+] I Still Don\'t Care About Cookies extension installed');
|
|
||||||
console.log('[+] Cookie banners will be automatically dismissed during archiving');
|
|
||||||
|
|
||||||
return extension;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Main entry point - install extension before archiving
|
||||||
|
*
|
||||||
* Note: This extension works out of the box with no configuration needed.
|
* Note: This extension works out of the box with no configuration needed.
|
||||||
* It automatically detects and dismisses cookie banners on page load.
|
* It automatically detects and dismisses cookie banners on page load.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
|
||||||
* Main entry point - install extension before archiving
|
|
||||||
*/
|
|
||||||
async function main() {
|
async function main() {
|
||||||
// Check if extension is already cached
|
const extension = await installExtensionWithCache(EXTENSION);
|
||||||
const cacheFile = path.join(EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
|
|
||||||
|
|
||||||
if (fs.existsSync(cacheFile)) {
|
|
||||||
try {
|
|
||||||
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
|
||||||
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
|
||||||
|
|
||||||
if (fs.existsSync(manifestPath)) {
|
|
||||||
console.log('[*] I Still Don\'t Care About Cookies extension already installed (using cache)');
|
|
||||||
return cached;
|
|
||||||
}
|
|
||||||
} catch (e) {
|
|
||||||
// Cache file corrupted, re-install
|
|
||||||
console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Install extension
|
|
||||||
const extension = await installCookiesExtension();
|
|
||||||
|
|
||||||
// Export extension metadata for chrome plugin to load
|
|
||||||
if (extension) {
|
if (extension) {
|
||||||
// Write extension info to a cache file that chrome plugin can read
|
console.log('[+] Cookie banners will be automatically dismissed during archiving');
|
||||||
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
|
||||||
await fs.promises.writeFile(
|
|
||||||
cacheFile,
|
|
||||||
JSON.stringify(extension, null, 2)
|
|
||||||
);
|
|
||||||
console.log(`[+] Extension metadata written to ${cacheFile}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return extension;
|
return extension;
|
||||||
@@ -100,7 +45,6 @@ async function main() {
|
|||||||
// Export functions for use by other plugins
|
// Export functions for use by other plugins
|
||||||
module.exports = {
|
module.exports = {
|
||||||
EXTENSION,
|
EXTENSION,
|
||||||
installCookiesExtension,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Run if executed directly
|
// Run if executed directly
|
||||||
|
|||||||
@@ -22,38 +22,20 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
# Import shared Chrome test helpers
|
||||||
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||||
|
get_test_env,
|
||||||
|
setup_chrome_session,
|
||||||
|
cleanup_chrome,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
|
||||||
MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
|
MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None)
|
||||||
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
|
|
||||||
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
|
|
||||||
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
|
|
||||||
TEST_URL = 'https://www.singsing.movie/'
|
TEST_URL = 'https://www.singsing.movie/'
|
||||||
COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/'
|
COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/'
|
||||||
|
|
||||||
|
|
||||||
def get_node_modules_dir():
|
|
||||||
"""Get NODE_MODULES_DIR for tests, checking env first."""
|
|
||||||
# Check if NODE_MODULES_DIR is already set in environment
|
|
||||||
if os.environ.get('NODE_MODULES_DIR'):
|
|
||||||
return Path(os.environ['NODE_MODULES_DIR'])
|
|
||||||
# Otherwise compute from LIB_DIR
|
|
||||||
from archivebox.config.common import STORAGE_CONFIG
|
|
||||||
lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
|
|
||||||
return lib_dir / 'npm' / 'node_modules'
|
|
||||||
|
|
||||||
|
|
||||||
NODE_MODULES_DIR = get_node_modules_dir()
|
|
||||||
|
|
||||||
|
|
||||||
def get_test_env():
|
|
||||||
"""Get environment with NODE_MODULES_DIR set correctly."""
|
|
||||||
env = os.environ.copy()
|
|
||||||
env['NODE_MODULES_DIR'] = str(NODE_MODULES_DIR)
|
|
||||||
return env
|
|
||||||
|
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
"""Verify on_Snapshot hook exists."""
|
"""Verify on_Snapshot hook exists."""
|
||||||
assert MODALCLOSER_HOOK is not None, "Modalcloser hook not found"
|
assert MODALCLOSER_HOOK is not None, "Modalcloser hook not found"
|
||||||
@@ -118,76 +100,6 @@ def test_fails_gracefully_without_chrome_session():
|
|||||||
f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
|
f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
|
||||||
|
|
||||||
|
|
||||||
def setup_chrome_session(tmpdir):
|
|
||||||
"""Helper to set up Chrome session with tab."""
|
|
||||||
crawl_dir = Path(tmpdir) / 'crawl'
|
|
||||||
crawl_dir.mkdir()
|
|
||||||
chrome_dir = crawl_dir / 'chrome'
|
|
||||||
chrome_dir.mkdir()
|
|
||||||
|
|
||||||
env = get_test_env()
|
|
||||||
env['CHROME_HEADLESS'] = 'true'
|
|
||||||
|
|
||||||
# Launch Chrome at crawl level
|
|
||||||
chrome_launch_process = subprocess.Popen(
|
|
||||||
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-modalcloser'],
|
|
||||||
cwd=str(chrome_dir),
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
text=True,
|
|
||||||
env=env
|
|
||||||
)
|
|
||||||
|
|
||||||
# Wait for Chrome to launch
|
|
||||||
for i in range(15):
|
|
||||||
if chrome_launch_process.poll() is not None:
|
|
||||||
stdout, stderr = chrome_launch_process.communicate()
|
|
||||||
raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
|
||||||
if (chrome_dir / 'cdp_url.txt').exists():
|
|
||||||
break
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
if not (chrome_dir / 'cdp_url.txt').exists():
|
|
||||||
raise RuntimeError("Chrome CDP URL not found after 15s")
|
|
||||||
|
|
||||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
|
||||||
|
|
||||||
# Create snapshot directory structure
|
|
||||||
snapshot_dir = Path(tmpdir) / 'snapshot'
|
|
||||||
snapshot_dir.mkdir()
|
|
||||||
snapshot_chrome_dir = snapshot_dir / 'chrome'
|
|
||||||
snapshot_chrome_dir.mkdir()
|
|
||||||
|
|
||||||
# Create tab
|
|
||||||
tab_env = env.copy()
|
|
||||||
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
|
||||||
result = subprocess.run(
|
|
||||||
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser', '--crawl-id=test-modalcloser'],
|
|
||||||
cwd=str(snapshot_chrome_dir),
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=60,
|
|
||||||
env=tab_env
|
|
||||||
)
|
|
||||||
if result.returncode != 0:
|
|
||||||
raise RuntimeError(f"Tab creation failed: {result.stderr}")
|
|
||||||
|
|
||||||
return chrome_launch_process, chrome_pid, snapshot_chrome_dir
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup_chrome(chrome_launch_process, chrome_pid):
|
|
||||||
"""Helper to clean up Chrome processes."""
|
|
||||||
try:
|
|
||||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
|
||||||
chrome_launch_process.wait(timeout=5)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
try:
|
|
||||||
os.kill(chrome_pid, signal.SIGKILL)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def test_background_script_handles_sigterm():
|
def test_background_script_handles_sigterm():
|
||||||
"""Test that background script runs and handles SIGTERM correctly."""
|
"""Test that background script runs and handles SIGTERM correctly."""
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
@@ -195,7 +107,12 @@ def test_background_script_handles_sigterm():
|
|||||||
chrome_pid = None
|
chrome_pid = None
|
||||||
modalcloser_process = None
|
modalcloser_process = None
|
||||||
try:
|
try:
|
||||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||||
|
Path(tmpdir),
|
||||||
|
crawl_id='test-modalcloser',
|
||||||
|
snapshot_id='snap-modalcloser',
|
||||||
|
test_url=TEST_URL,
|
||||||
|
)
|
||||||
|
|
||||||
# Create modalcloser output directory (sibling to chrome)
|
# Create modalcloser output directory (sibling to chrome)
|
||||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||||
@@ -265,7 +182,12 @@ def test_dialog_handler_logs_dialogs():
|
|||||||
chrome_pid = None
|
chrome_pid = None
|
||||||
modalcloser_process = None
|
modalcloser_process = None
|
||||||
try:
|
try:
|
||||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||||
|
Path(tmpdir),
|
||||||
|
crawl_id='test-dialog',
|
||||||
|
snapshot_id='snap-dialog',
|
||||||
|
test_url=TEST_URL,
|
||||||
|
)
|
||||||
|
|
||||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||||
modalcloser_dir.mkdir()
|
modalcloser_dir.mkdir()
|
||||||
@@ -313,7 +235,12 @@ def test_config_poll_interval():
|
|||||||
chrome_pid = None
|
chrome_pid = None
|
||||||
modalcloser_process = None
|
modalcloser_process = None
|
||||||
try:
|
try:
|
||||||
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
|
chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(
|
||||||
|
Path(tmpdir),
|
||||||
|
crawl_id='test-poll',
|
||||||
|
snapshot_id='snap-poll',
|
||||||
|
test_url=TEST_URL,
|
||||||
|
)
|
||||||
|
|
||||||
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser'
|
||||||
modalcloser_dir.mkdir()
|
modalcloser_dir.mkdir()
|
||||||
|
|||||||
@@ -16,11 +16,8 @@
|
|||||||
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
|
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const path = require('path');
|
|
||||||
const fs = require('fs');
|
|
||||||
|
|
||||||
// Import extension utilities
|
// Import extension utilities
|
||||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
|
||||||
|
|
||||||
// Extension metadata
|
// Extension metadata
|
||||||
const EXTENSION = {
|
const EXTENSION = {
|
||||||
@@ -28,76 +25,25 @@ const EXTENSION = {
|
|||||||
name: 'twocaptcha',
|
name: 'twocaptcha',
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get extensions directory from environment or use default
|
|
||||||
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
|
||||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Install and configure the 2captcha extension
|
* Main entry point - install extension before archiving
|
||||||
*/
|
*
|
||||||
async function installCaptchaExtension() {
|
* Note: 2captcha configuration is handled by on_Crawl__25_configure_twocaptcha_extension_options.js
|
||||||
console.log('[*] Installing 2captcha extension...');
|
|
||||||
|
|
||||||
// Install the extension
|
|
||||||
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
|
||||||
|
|
||||||
if (!extension) {
|
|
||||||
console.error('[❌] Failed to install 2captcha extension');
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if API key is configured
|
|
||||||
const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
|
|
||||||
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
|
|
||||||
console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
|
|
||||||
console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
|
|
||||||
} else {
|
|
||||||
console.log('[+] 2captcha extension installed and API key configured');
|
|
||||||
}
|
|
||||||
|
|
||||||
return extension;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Note: 2captcha configuration is now handled by chrome plugin
|
|
||||||
* during first-time browser setup to avoid repeated configuration on every snapshot.
|
* during first-time browser setup to avoid repeated configuration on every snapshot.
|
||||||
* The API key is injected via chrome.storage API once per browser session.
|
* The API key is injected via chrome.storage API once per browser session.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
|
||||||
* Main entry point - install extension before archiving
|
|
||||||
*/
|
|
||||||
async function main() {
|
async function main() {
|
||||||
// Check if extension is already cached
|
const extension = await installExtensionWithCache(EXTENSION);
|
||||||
const cacheFile = path.join(EXTENSIONS_DIR, 'twocaptcha.extension.json');
|
|
||||||
|
|
||||||
if (fs.existsSync(cacheFile)) {
|
|
||||||
try {
|
|
||||||
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
|
||||||
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
|
||||||
|
|
||||||
if (fs.existsSync(manifestPath)) {
|
|
||||||
console.log('[*] 2captcha extension already installed (using cache)');
|
|
||||||
return cached;
|
|
||||||
}
|
|
||||||
} catch (e) {
|
|
||||||
// Cache file corrupted, re-install
|
|
||||||
console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Install extension
|
|
||||||
const extension = await installCaptchaExtension();
|
|
||||||
|
|
||||||
// Export extension metadata for chrome plugin to load
|
|
||||||
if (extension) {
|
if (extension) {
|
||||||
// Write extension info to a cache file that chrome plugin can read
|
// Check if API key is configured
|
||||||
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA;
|
||||||
await fs.promises.writeFile(
|
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
|
||||||
cacheFile,
|
console.warn('[⚠️] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured');
|
||||||
JSON.stringify(extension, null, 2)
|
console.warn('[⚠️] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving');
|
||||||
);
|
} else {
|
||||||
console.log(`[+] Extension metadata written to ${cacheFile}`);
|
console.log('[+] 2captcha extension installed and API key configured');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return extension;
|
return extension;
|
||||||
@@ -106,7 +52,6 @@ async function main() {
|
|||||||
// Export functions for use by other plugins
|
// Export functions for use by other plugins
|
||||||
module.exports = {
|
module.exports = {
|
||||||
EXTENSION,
|
EXTENSION,
|
||||||
installCaptchaExtension,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Run if executed directly
|
// Run if executed directly
|
||||||
|
|||||||
@@ -18,11 +18,8 @@
|
|||||||
* - Uses efficient blocking with filter lists
|
* - Uses efficient blocking with filter lists
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const path = require('path');
|
|
||||||
const fs = require('fs');
|
|
||||||
|
|
||||||
// Import extension utilities
|
// Import extension utilities
|
||||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
|
||||||
|
|
||||||
// Extension metadata
|
// Extension metadata
|
||||||
const EXTENSION = {
|
const EXTENSION = {
|
||||||
@@ -30,69 +27,17 @@ const EXTENSION = {
|
|||||||
name: 'ublock',
|
name: 'ublock',
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get extensions directory from environment or use default
|
|
||||||
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
|
||||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Install the uBlock Origin extension
|
|
||||||
*/
|
|
||||||
async function installUblockExtension() {
|
|
||||||
console.log('[*] Installing uBlock Origin extension...');
|
|
||||||
|
|
||||||
// Install the extension
|
|
||||||
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
|
||||||
|
|
||||||
if (!extension) {
|
|
||||||
console.error('[❌] Failed to install uBlock Origin extension');
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log('[+] uBlock Origin extension installed');
|
|
||||||
console.log('[+] Ads and trackers will be blocked during archiving');
|
|
||||||
|
|
||||||
return extension;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Main entry point - install extension before archiving
|
||||||
|
*
|
||||||
* Note: uBlock Origin works automatically with default filter lists.
|
* Note: uBlock Origin works automatically with default filter lists.
|
||||||
* No configuration needed - blocks ads, trackers, and malware domains out of the box.
|
* No configuration needed - blocks ads, trackers, and malware domains out of the box.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
|
||||||
* Main entry point - install extension before archiving
|
|
||||||
*/
|
|
||||||
async function main() {
|
async function main() {
|
||||||
// Check if extension is already cached
|
const extension = await installExtensionWithCache(EXTENSION);
|
||||||
const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');
|
|
||||||
|
|
||||||
if (fs.existsSync(cacheFile)) {
|
|
||||||
try {
|
|
||||||
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
|
||||||
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
|
||||||
|
|
||||||
if (fs.existsSync(manifestPath)) {
|
|
||||||
console.log('[*] uBlock Origin extension already installed (using cache)');
|
|
||||||
return cached;
|
|
||||||
}
|
|
||||||
} catch (e) {
|
|
||||||
// Cache file corrupted, re-install
|
|
||||||
console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Install extension
|
|
||||||
const extension = await installUblockExtension();
|
|
||||||
|
|
||||||
// Export extension metadata for chrome plugin to load
|
|
||||||
if (extension) {
|
if (extension) {
|
||||||
// Write extension info to a cache file that chrome plugin can read
|
console.log('[+] Ads and trackers will be blocked during archiving');
|
||||||
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
|
||||||
await fs.promises.writeFile(
|
|
||||||
cacheFile,
|
|
||||||
JSON.stringify(extension, null, 2)
|
|
||||||
);
|
|
||||||
console.log(`[+] Extension metadata written to ${cacheFile}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return extension;
|
return extension;
|
||||||
@@ -101,7 +46,6 @@ async function main() {
|
|||||||
// Export functions for use by other plugins
|
// Export functions for use by other plugins
|
||||||
module.exports = {
|
module.exports = {
|
||||||
EXTENSION,
|
EXTENSION,
|
||||||
installUblockExtension,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Run if executed directly
|
// Run if executed directly
|
||||||
|
|||||||
Reference in New Issue
Block a user