Clean up on_Crawl hooks: remove duplicates and standardize naming

Deleted dead/duplicate hooks:
- wget/on_Crawl__10_install_wget.py (duplicate of __10_wget_validate_config.py)
- chrome/on_Crawl__00_chrome_install.py (simpler version, kept full one)
- chrome/on_Crawl__20_chrome_launch.bg.js (legacy, kept __30 version)
- singlefile/on_Crawl__20_install_singlefile_extension.js (disabled/dead)
- istilldontcareaboutcookies/on_Crawl__20_install_*.js (legacy)
- ublock/on_Crawl__03_ublock.js (legacy, kept __20 version)
- Entire captcha2/ plugin (legacy version of twocaptcha/)

Renamed hooks to follow consistent pattern: on_Crawl__XX_<plugin>_<action>.<ext>
Priority bands:
  00-09: Binary/extension installation
  10-19: Config validation
  20-29: Browser launch and post-launch config

Final hooks:
  00 ripgrep_install.py, 01 chrome_install.py
  02 istilldontcareaboutcookies_install.js
  03 ublock_install.js, 04 singlefile_install.js
  05 twocaptcha_install.js
  10 chrome_validate.py, 11 wget_validate.py
  20 chrome_launch.bg.js, 25 twocaptcha_config.js
This commit is contained in:
Claude
2025-12-31 22:47:36 +00:00
parent f12c3b4b55
commit 4c77949197
21 changed files with 109 additions and 1729 deletions

View File

@@ -1,21 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"CAPTCHA2_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_CAPTCHA2"],
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
},
"CAPTCHA2_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for CAPTCHA solving in seconds"
}
}
}

View File

@@ -1,121 +0,0 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Plugin
*
* Installs and configures the 2captcha Chrome extension for automatic
* CAPTCHA solving during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
*
* Priority: 01 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {
webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
name: 'captcha2',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Install the 2captcha extension and report whether an API key is configured.
 *
 * @returns {Promise<object|null>} Extension metadata on success, null on failure.
 */
async function installCaptchaExtension() {
  console.log('[*] Installing 2captcha extension...');

  // Delegate the actual download/unpack to the shared chrome extension utilities.
  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!installed) {
    console.error('[❌] Failed to install 2captcha extension');
    return null;
  }

  // The extension is useless without an API key, so surface a warning early.
  const key = process.env.API_KEY_2CAPTCHA;
  const keyMissing = !key || key === 'YOUR_API_KEY_HERE';
  if (keyMissing) {
    console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
  } else {
    console.log('[+] 2captcha extension installed and API key configured');
  }
  return installed;
}
/**
 * Note: 2captcha configuration is now handled by chrome plugin
 * during first-time browser setup to avoid repeated configuration on every snapshot.
 * The API key is injected via chrome.storage API once per browser session.
 */

/**
 * Entry point: reuse a cached extension install when possible, otherwise
 * download the extension and persist its metadata for the chrome plugin.
 *
 * @returns {Promise<object|null>} Extension metadata, or null if install failed.
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');

  // Fast path: a previous run already installed and cached the extension.
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        console.log('[*] 2captcha extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Cache file corrupted, re-install
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }

  // Slow path: install (or re-install) the extension.
  const extension = await installCaptchaExtension();

  // Persist metadata so the chrome plugin can find and load the unpacked extension.
  if (extension) {
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(cacheFile, JSON.stringify(extension, null, 2));
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }
  return extension;
}
// Export functions for use by other plugins
module.exports = {
EXTENSION,
installCaptchaExtension,
};
// Run if executed directly
if (require.main === module) {
main().then(() => {
console.log('[✓] 2captcha extension setup complete');
process.exit(0);
}).catch(err => {
console.error('[❌] 2captcha extension setup failed:', err);
process.exit(1);
});
}

View File

@@ -1,279 +0,0 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Configuration
*
* Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
* Runs once per crawl to inject API key into extension storage.
*
 * Priority: 21 (after chrome_launch at 20)
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - chrome plugin must have loaded extensions (extensions.json must exist)
*/
const path = require('path');
const fs = require('fs');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Resolve the crawl-level chrome/ session directory from the environment
// (CRAWL_OUTPUT_DIR is exported by hooks.py); null when not running under a crawl.
function getCrawlChromeSessionDir() {
  const crawlOutputDir = process.env.CRAWL_OUTPUT_DIR || '';
  return crawlOutputDir ? path.join(crawlOutputDir, 'chrome') : null;
}
const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured');
// Read a trimmed environment variable; falsy (unset/empty) values fall back
// to defaultValue before trimming.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Parse --key=value CLI flags into { key_with_underscores: value } pairs;
// bare flags (no "=value", or an empty value) become boolean true.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const body = token.slice(2);
    const eq = body.indexOf('=');
    const key = (eq === -1 ? body : body.slice(0, eq)).replace(/-/g, '_');
    const value = eq === -1 ? '' : body.slice(eq + 1);
    parsed[key] = value || true;
  }
  return parsed;
}
/**
 * Inject the 2captcha API key into the extension running in the crawl's
 * shared Chrome session.
 *
 * Strategy (first one that works wins):
 *   1. Evaluate chrome.storage writes inside the extension's background
 *      page / service worker.
 *   2. Open the extension's options page, fill the API key input, click a
 *      save/apply button, and mirror the key into chrome.storage.
 *
 * A marker file (CONFIG_MARKER) in the chrome session directory prevents
 * reconfiguring the same browser session on subsequent runs.
 *
 * @returns {Promise<{success: boolean, skipped?: boolean, method?: string, error?: string}>}
 */
async function configure2Captcha() {
  // Check if already configured in this session
  if (fs.existsSync(CONFIG_MARKER)) {
    console.error('[*] 2captcha already configured in this browser session');
    return { success: true, skipped: true };
  }

  // Check if API key is set
  const apiKey = getEnv('API_KEY_2CAPTCHA');
  if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
    console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
    return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
  }

  // Load extensions metadata written by the chrome plugin at launch time.
  const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
  if (!fs.existsSync(extensionsFile)) {
    return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
  }
  const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
  const captchaExt = extensions.find(ext => ext.name === 'captcha2');
  if (!captchaExt) {
    console.error('[*] 2captcha extension not installed, skipping configuration');
    return { success: true, skipped: true };
  }

  console.error('[*] Configuring 2captcha extension with API key...');
  try {
    // Connect to the existing Chrome session via CDP
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) {
      return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
    }
    const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
    try {
      // Method 1: Try to inject via extension background page
      if (captchaExt.target && captchaExt.target_ctx) {
        console.error('[*] Attempting to configure via extension background page...');
        // Reconnect to the browser to get fresh target context
        const targets = await browser.targets();
        const extTarget = targets.find(t =>
          t.url().startsWith(`chrome-extension://${captchaExt.id}`)
        );
        if (extTarget) {
          const extContext = await extTarget.worker() || await extTarget.page();
          if (extContext) {
            await extContext.evaluate((key) => {
              // Try all common storage key patterns used by extension versions.
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
                chrome.storage.sync.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
              }
              // Also try localStorage as fallback
              if (typeof localStorage !== 'undefined') {
                localStorage.setItem('apiKey', key);
                localStorage.setItem('2captcha_apikey', key);
                localStorage.setItem('solver-api-key', key);
              }
            }, apiKey);
            console.error('[+] 2captcha API key configured successfully via background page');
            // Mark as configured
            fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
            return { success: true, method: 'background_page' };
          }
        }
      }

      // Method 2: Try to configure via options page
      console.error('[*] Attempting to configure via options page...');
      const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
      const configPage = await browser.newPage();
      try {
        await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
        const configured = await configPage.evaluate((key) => {
          // Try to find API key input field
          const selectors = [
            'input[name*="apikey" i]',
            'input[id*="apikey" i]',
            'input[name*="api-key" i]',
            'input[id*="api-key" i]',
            'input[name*="key" i]',
            'input[placeholder*="api" i]',
            'input[type="text"]',
          ];
          for (const selector of selectors) {
            const input = document.querySelector(selector);
            if (input) {
              input.value = key;
              input.dispatchEvent(new Event('input', { bubbles: true }));
              input.dispatchEvent(new Event('change', { bubbles: true }));
              // BUG FIX: the previous code used jQuery-style selectors like
              // 'button:contains("Save")', which are invalid CSS and make
              // document.querySelector() throw a DOMException, aborting this
              // entire evaluate() call (so the storage fallback never ran).
              // Match save/apply buttons by type or visible text instead.
              const candidates = Array.from(
                document.querySelectorAll('button, input[type="submit"]')
              );
              const saveBtn = candidates.find(btn =>
                btn.type === 'submit' ||
                /save|apply/i.test(btn.textContent || btn.value || '')
              );
              if (saveBtn) saveBtn.click();
              // Also save to storage
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
              }
              return true;
            }
          }
          // Fallback: Just save to storage
          if (typeof chrome !== 'undefined' && chrome.storage) {
            chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            return true;
          }
          return false;
        }, apiKey);
        await configPage.close();
        if (configured) {
          console.error('[+] 2captcha API key configured successfully via options page');
          // Mark as configured
          fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
          return { success: true, method: 'options_page' };
        }
      } catch (e) {
        console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
        try {
          await configPage.close();
        } catch (e2) {}
      }
      return { success: false, error: 'Could not configure via any method' };
    } finally {
      // Disconnect (not close) so the shared crawl browser keeps running.
      browser.disconnect();
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
/**
 * CLI entry point for the crawl-level 2captcha configuration hook.
 *
 * Config hooks don't emit JSONL - they're utility hooks for setup; the exit
 * code is the contract: 0 when configuration succeeded or was skipped, 1 on
 * failure or bad invocation.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    // BUG FIX: the usage line previously hardcoded an on_Snapshot__21_* name
    // even though this is an on_Crawl hook; derive the real script name so
    // the message stays correct across renames.
    const scriptName = path.basename(process.argv[1] || 'captcha2_config.js');
    console.error(`Usage: ${scriptName} --url=<url> --snapshot-id=<uuid>`);
    process.exit(1);
  }
  let status = 'failed';
  let error = '';
  try {
    const result = await configure2Captcha();
    if (result.skipped) {
      status = 'skipped';
    } else if (result.success) {
      status = 'succeeded';
    } else {
      status = 'failed';
      error = result.error || 'Configuration failed';
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  if (error) {
    console.error(`ERROR: ${error}`);
  }
  // Exit code indicates success/failure (skipped counts as success).
  process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
}
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1,184 +0,0 @@
"""
Unit tests for captcha2 plugin
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
# Plugin root is two directory levels up from this test file.
PLUGIN_DIR = Path(__file__).parent.parent
# Locate hook scripts by glob so tests survive priority-number renames.
# next(..., None) yields None when nothing matches; the *_exists tests below
# turn that into a test failure rather than a collection-time crash.
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
def test_install_script_exists():
    """Verify the captcha2 install hook exists in the plugin directory."""
    # next(..., None) returns None when the glob matched nothing; assert that
    # explicitly so we fail cleanly instead of AttributeError on None.exists().
    assert INSTALL_SCRIPT is not None, "No on_Crawl__*_captcha2.* install script found"
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_config_script_exists():
    """Verify the captcha2 config hook exists in the plugin directory."""
    # Guard against a None from the glob lookup before calling .exists().
    assert CONFIG_SCRIPT is not None, "No on_Crawl__*_captcha2_config.* script found"
    assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
def test_extension_metadata():
    """Test that captcha2 extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        # Load the install script as a CommonJS module and dump its EXTENSION
        # metadata as JSON (no actual installation is performed).
        js_snippet = (
            f"const ext = require('{INSTALL_SCRIPT}'); "
            "console.log(JSON.stringify(ext.EXTENSION))"
        )
        result = subprocess.run(
            ["node", "-e", js_snippet],
            capture_output=True,
            text=True,
            env=env,
        )
        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"

        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert metadata["name"] == "captcha2"
def test_install_creates_cache():
    """Test that install creates extension cache"""
    with tempfile.TemporaryDirectory() as tmpdir:
        extensions_dir = Path(tmpdir) / "chrome_extensions"
        extensions_dir.mkdir(parents=True)

        env = dict(
            os.environ,
            CHROME_EXTENSIONS_DIR=str(extensions_dir),
            API_KEY_2CAPTCHA="test_api_key",
        )

        # Run the install hook as a subprocess, the same way the executor would.
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        # The hook must announce either a fresh install or a cache hit.
        expected_lines = (
            "[*] Installing 2captcha extension",
            "[*] 2captcha extension already installed",
        )
        assert any(line in result.stdout for line in expected_lines)

        # The metadata cache file must exist and describe the extension.
        cache_file = extensions_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should be created"
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert cache_data["name"] == "captcha2"
        for key in ("unpacked_path", "version"):
            assert key in cache_data
def test_install_twice_uses_cache():
    """Test that running install twice uses existing cache on second run"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_api_key"

        # First install - downloads the extension
        result1 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        assert result1.returncode == 0, f"First install failed: {result1.stderr}"

        # Verify cache was created
        cache_file = ext_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should exist after first install"

        # Second install - should use cache
        result2 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"
        # BUG FIX: the original assertion ended with `or result2.returncode == 0`,
        # which made it vacuously true (returncode 0 was already asserted above).
        # Require the output to actually indicate cache reuse.
        assert "already installed" in result2.stdout or "cache" in result2.stdout.lower(), (
            f"Second install did not report cache reuse: {result2.stdout}"
        )
def test_install_warns_without_api_key():
    """Test that install warns when API key not configured"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # BUG FIX: the inherited environment may already carry a real
        # API_KEY_2CAPTCHA on a developer machine or in CI; remove it so the
        # "not configured" code path is actually exercised.
        env.pop("API_KEY_2CAPTCHA", None)

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should warn about missing API key
        combined_output = result.stdout + result.stderr
        assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
def test_install_success_with_api_key():
    """Test that install succeeds when API key is configured"""
    with tempfile.TemporaryDirectory() as tmpdir:
        extensions_dir = Path(tmpdir) / "chrome_extensions"
        extensions_dir.mkdir(parents=True)

        env = dict(
            os.environ,
            CHROME_EXTENSIONS_DIR=str(extensions_dir),
            API_KEY_2CAPTCHA="test_valid_api_key_123",
        )

        # Invoke the install hook the same way the hook executor does.
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        # The hook should acknowledge the configured API key in its output.
        combined_output = result.stdout + result.stderr
        markers = ("API key configured", "API_KEY_2CAPTCHA")
        assert any(marker in combined_output for marker in markers)
def test_config_script_structure():
    """Test that config script has proper structure"""
    # Guard against the CONFIG_SCRIPT glob matching nothing; without this,
    # read_text() would raise AttributeError on None instead of failing clearly.
    assert CONFIG_SCRIPT is not None, "No on_Crawl__*_captcha2_config.* script found"
    script_content = CONFIG_SCRIPT.read_text()
    # Should mention configuration marker file
    assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content
    # Should mention API key
    assert "API_KEY_2CAPTCHA" in script_content
    # Should have main function or be executable
    assert "async function" in script_content or "main" in script_content

View File

@@ -1,184 +0,0 @@
#!/usr/bin/env python3
"""
Install hook for Chrome/Chromium and puppeteer-core.
Runs at crawl start to install/find Chromium and puppeteer-core.
Outputs JSONL for Binary and Machine config updates.
Respects CHROME_BINARY env var for custom binary paths.
Uses `npx @puppeteer/browsers install chromium@<pinned build>` and parses its output.
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""
import os
import sys
import json
import subprocess
from pathlib import Path
def get_chrome_version(binary_path: str) -> str | None:
"""Get Chrome/Chromium version string."""
try:
result = subprocess.run(
[binary_path, '--version'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def install_puppeteer_core() -> bool:
    """Ensure puppeteer-core (and @puppeteer/browsers) exist in NODE_MODULES_DIR.

    Returns True when nothing needs doing (no isolated node_modules configured,
    or the package is already present) or when the npm install succeeds;
    False when the install was attempted and failed.
    """
    node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
    if not node_modules_dir:
        # No isolated node_modules, skip (will use global)
        return True

    node_modules_path = Path(node_modules_dir)
    if (node_modules_path / 'puppeteer-core').exists():
        return True

    # npm's --prefix expects the directory that CONTAINS node_modules.
    npm_prefix = node_modules_path.parent
    print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
    try:
        proc = subprocess.run(
            ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except Exception as e:
        print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
        return False

    if proc.returncode == 0:
        print(f"[+] puppeteer-core installed", file=sys.stderr)
        return True
    print(f"[!] Failed to install puppeteer-core: {proc.stderr}", file=sys.stderr)
    return False
def install_chromium() -> dict | None:
    """Install Chromium via ``npx @puppeteer/browsers`` and locate its binary.

    The installer prints a result line of the form:
        "chromium@<version> <path_to_binary>"
    e.g.: "chromium@1563297 /Users/x/.cache/puppeteer/chromium/.../Chromium"

    BUG FIX: npx can also emit progress/notice lines on stdout, so we parse
    the LAST non-empty line rather than treating the whole stream as one line
    (the old split(' ', 1) on the full stdout broke in that case).

    Note: npx is fast when chromium is already cached - it returns the path
    without re-downloading.

    Returns a dict with name/abspath/version/binprovider, or None on failure.
    """
    try:
        print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
        # Use --path to install to puppeteer's standard cache location
        cache_path = os.path.expanduser('~/.cache/puppeteer')
        result = subprocess.run(
            ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
            capture_output=True,
            text=True,
            stdin=subprocess.DEVNULL,
            timeout=300
        )
        if result.returncode != 0:
            print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
            return None
        # Parse the final "chromium@<version> /path/to/Chromium" line.
        lines = [ln.strip() for ln in result.stdout.splitlines() if ln.strip()]
        output = lines[-1] if lines else ''
        parts = output.split(' ', 1)
        if len(parts) != 2:
            print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
            return None
        version_str = parts[0]  # e.g. "chromium@1563297"
        binary_path = parts[1].strip()
        if not binary_path or not os.path.exists(binary_path):
            print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
            return None
        # Extract the numeric build id from "chromium@NNNNN".
        version = version_str.split('@')[1] if '@' in version_str else None
        print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
        return {
            'name': 'chromium',
            'abspath': binary_path,
            'version': version,
            'binprovider': 'puppeteer',
        }
    except subprocess.TimeoutExpired:
        print("[!] Chromium install timed out", file=sys.stderr)
    except FileNotFoundError:
        print("[!] npx not found - is Node.js installed?", file=sys.stderr)
    except Exception as e:
        print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
    return None
def main():
    """Emit JSONL describing the Chromium binary for this machine.

    Resolution order:
      1. CHROME_BINARY env var, if it points at an executable file.
      2. Chromium installed/located via @puppeteer/browsers.
    Also installs puppeteer-core into NODE_MODULES_DIR when configured.
    Exits 0 on success, 1 when no Chromium binary could be found.
    """
    # Best-effort: a failed puppeteer-core install shouldn't block detection.
    install_puppeteer_core()

    # Honor an explicitly configured binary first.
    configured_binary = os.environ.get('CHROME_BINARY', '').strip()
    binary_is_usable = (
        configured_binary
        and os.path.isfile(configured_binary)
        and os.access(configured_binary, os.X_OK)
    )
    if binary_is_usable:
        print(json.dumps({
            'type': 'Binary',
            'name': 'chromium',
            'abspath': configured_binary,
            'version': get_chrome_version(configured_binary),
            'binprovider': 'env',
        }))
        sys.exit(0)

    # Fall back to installing/finding Chromium via puppeteer.
    result = install_chromium()
    if not result or not result.get('abspath'):
        print("Chromium binary not found", file=sys.stderr)
        sys.exit(1)

    print(json.dumps({
        'type': 'Binary',
        'name': result['name'],
        'abspath': result['abspath'],
        'version': result['version'],
        'binprovider': result['binprovider'],
    }))
    # Persist the discovered path (and build id) into the Machine config.
    print(json.dumps({
        'type': 'Machine',
        '_method': 'update',
        'key': 'config/CHROME_BINARY',
        'value': result['abspath'],
    }))
    if result['version']:
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/CHROMIUM_VERSION',
            'value': result['version'],
        }))
    sys.exit(0)

if __name__ == '__main__':
    main()

View File

@@ -8,8 +8,8 @@
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Creates chrome/ directory under crawl output dir with:
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
* - port.txt: Debug port number
@@ -38,11 +38,12 @@ const {
killChrome,
getEnv,
writePidWithMtime,
getExtensionsDir,
} = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_launch';
const OUTPUT_DIR = 'chrome';
const OUTPUT_DIR = '.';
// Global state for cleanup
let chromePid = null;
@@ -115,8 +116,12 @@ async function main() {
if (version) console.error(`[*] Version: ${version}`);
// Load installed extensions
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
const extensionsDir = getExtensionsDir();
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
if (userDataDir) {
console.error(`[*] Using user data dir: ${userDataDir}`);
}
const installedExtensions = [];
const extensionPaths = [];
@@ -143,17 +148,18 @@ async function main() {
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
}
// Write hook's own PID
const hookStartTime = Date.now() / 1000;
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
// Launch Chromium using consolidated function
// userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
const result = await launchChromium({
binary,
outputDir: OUTPUT_DIR,
userDataDir,
extensionPaths,
});
@@ -165,14 +171,6 @@ async function main() {
chromePid = result.pid;
const cdpUrl = result.cdpUrl;
// Write extensions metadata
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}
// Connect puppeteer for extension verification
console.error(`[*] Connecting puppeteer to CDP...`);
const browser = await puppeteer.connect({
@@ -181,30 +179,102 @@ async function main() {
});
browserInstance = browser;
// Verify extensions loaded
// Get actual extension IDs from chrome://extensions page
if (extensionPaths.length > 0) {
await new Promise(r => setTimeout(r, 3000));
await new Promise(r => setTimeout(r, 2000));
const targets = browser.targets();
console.error(`[*] All browser targets (${targets.length}):`);
for (const t of targets) {
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
try {
const extPage = await browser.newPage();
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
await new Promise(r => setTimeout(r, 2000));
// Parse extension info from the page
const extensionsFromPage = await extPage.evaluate(() => {
const extensions = [];
// Extensions manager uses shadow DOM
const manager = document.querySelector('extensions-manager');
if (!manager || !manager.shadowRoot) return extensions;
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
if (!itemList || !itemList.shadowRoot) return extensions;
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
for (const item of items) {
const id = item.getAttribute('id');
const nameEl = item.shadowRoot?.querySelector('#name');
const name = nameEl?.textContent?.trim() || '';
if (id && name) {
extensions.push({ id, name });
}
}
return extensions;
});
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
for (const e of extensionsFromPage) {
console.error(` - ${e.id}: "${e.name}"`);
}
// Match extensions by name (strict matching)
for (const ext of installedExtensions) {
// Read the extension's manifest to get its display name
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
let manifestName = manifest.name || '';
// Resolve message placeholder (e.g., __MSG_extName__)
if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
const defaultLocale = manifest.default_locale || 'en';
const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
if (fs.existsSync(messagesPath)) {
try {
const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
if (messages[msgKey] && messages[msgKey].message) {
manifestName = messages[msgKey].message;
}
} catch (e) {
console.error(`[!] Failed to read messages.json: ${e.message}`);
}
}
}
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
// Find matching extension from page by exact name match first
let match = extensionsFromPage.find(e => e.name === manifestName);
// If no exact match, try case-insensitive exact match
if (!match) {
match = extensionsFromPage.find(e =>
e.name.toLowerCase() === manifestName.toLowerCase()
);
}
if (match) {
ext.id = match.id;
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
} else {
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
}
}
}
await extPage.close();
} catch (e) {
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
}
const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
// Filter out built-in extensions
// Fallback: check browser targets
const targets = browser.targets();
const builtinIds = [
'nkeimhogjdpnpccoofpliimaahmaaome',
'fignfifoniblkonapihmkfakmlgkbkcf',
'ahfgeienlihckogmohjhadlkjgocpleb',
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
];
const customExtTargets = extTargets.filter(t => {
const customExtTargets = targets.filter(t => {
const url = t.url();
if (!url.startsWith('chrome-extension://')) return false;
const extId = url.split('://')[1].split('/')[0];
@@ -216,7 +286,7 @@ async function main() {
for (const target of customExtTargets) {
const url = target.url();
const extId = url.split('://')[1].split('/')[0];
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
console.error(`[+] Extension target: ${extId} (${target.type()})`);
}
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
@@ -225,6 +295,14 @@ async function main() {
}
}
// Write extensions metadata with actual IDs
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}
console.error(`[+] Chromium session started for crawl ${crawlId}`);
console.error(`[+] CDP URL: ${cdpUrl}`);
console.error(`[+] PID: ${chromePid}`);

View File

@@ -1,323 +0,0 @@
#!/usr/bin/env node
/**
* Launch a shared Chromium browser session for the entire crawl.
*
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
* Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
*
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
* - port.txt: Debug port number
* - extensions.json: Loaded extensions metadata
*
* Environment variables:
* NODE_MODULES_DIR: Path to node_modules directory for module resolution
* CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_HEADLESS: Run in headless mode (default: true)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
*/
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) {
module.paths.unshift(process.env.NODE_MODULES_DIR);
}
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const {
findChromium,
launchChromium,
killChrome,
getEnv,
writePidWithMtime,
getExtensionsDir,
} = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_launch';
// Executor already runs this hook inside its output dir (see file header),
// so all artifacts (cdp_url.txt, chrome.pid, extensions.json) are written in place.
const OUTPUT_DIR = '.';
// Global state for cleanup: populated by main() after launch/connect,
// consumed by cleanup() on SIGTERM/SIGINT.
let chromePid = null;
let browserInstance = null;
// Parse `--key=value` CLI flags into a plain object.
// Dashes in keys are normalized to underscores; a bare `--flag` (or a flag
// with an empty value) maps to boolean `true`. Non-flag tokens are ignored.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const eq = token.indexOf('=');
    const rawKey = eq === -1 ? token.slice(2) : token.slice(2, eq);
    const value = eq === -1 ? '' : token.slice(eq + 1);
    parsed[rawKey.replace(/-/g, '_')] = value || true;
  }
  return parsed;
}
// Teardown handler for SIGTERM/SIGINT: attempt a graceful puppeteer close
// first, then force-kill the raw Chromium process, then exit cleanly.
async function cleanup() {
  console.error('[*] Cleaning up Chrome session...');
  // Graceful path: ask puppeteer to close the browser connection/process.
  if (browserInstance) {
    try {
      console.error('[*] Closing browser gracefully...');
      await browserInstance.close();
      browserInstance = null;
      console.error('[+] Browser closed gracefully');
    } catch (err) {
      console.error(`[!] Graceful close failed: ${err.message}`);
    }
  }
  // Hard path: kill the launched Chromium PID if we recorded one.
  if (chromePid) {
    await killChrome(chromePid, OUTPUT_DIR);
  }
  process.exit(0);
}
// Register signal handlers
process.on('SIGTERM', cleanup);
process.on('SIGINT', cleanup);
/**
 * Launch a shared Chromium session for the crawl and stay alive until signaled.
 *
 * Steps: locate a Chromium binary, discover cached extensions, launch via
 * launchChromium(), connect puppeteer over CDP, resolve real extension IDs
 * from chrome://extensions, write extensions.json, then park forever so the
 * SIGTERM handler can clean up.
 */
async function main() {
  const args = parseArgs();
  const crawlId = args.crawl_id;
  try {
    const binary = findChromium();
    if (!binary) {
      // Machine-readable dependency hints parsed by the hook executor.
      console.error('ERROR: Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chromium');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
      process.exit(1);
    }
    // Get Chromium version (best-effort; failures are silently ignored).
    let version = '';
    try {
      const { execSync } = require('child_process');
      version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 })
        .trim()
        .slice(0, 64);
    } catch (e) {}
    console.error(`[*] Using browser: ${binary}`);
    if (version) console.error(`[*] Version: ${version}`);
    // Load installed extensions: each *.extension.json cache record points at
    // an unpacked extension directory written by the install hooks.
    const extensionsDir = getExtensionsDir();
    const userDataDir = getEnv('CHROME_USER_DATA_DIR');
    if (userDataDir) {
      console.error(`[*] Using user data dir: ${userDataDir}`);
    }
    const installedExtensions = [];
    const extensionPaths = [];
    if (fs.existsSync(extensionsDir)) {
      const files = fs.readdirSync(extensionsDir);
      for (const file of files) {
        if (file.endsWith('.extension.json')) {
          try {
            const extPath = path.join(extensionsDir, file);
            const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
            // Only load records whose unpacked directory still exists on disk.
            if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
              installedExtensions.push(extData);
              extensionPaths.push(extData.unpacked_path);
              console.error(`[*] Loading extension: ${extData.name || file}`);
            }
          } catch (e) {
            console.warn(`[!] Skipping invalid extension cache: ${file}`);
          }
        }
      }
    }
    if (installedExtensions.length > 0) {
      console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
    }
    // Note: PID file is written by run_hook() with hook-specific name
    // Snapshot.cleanup() kills all *.pid processes when done
    if (!fs.existsSync(OUTPUT_DIR)) {
      fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }
    // Launch Chromium using consolidated function
    // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
    const result = await launchChromium({
      binary,
      outputDir: OUTPUT_DIR,
      userDataDir,
      extensionPaths,
    });
    if (!result.success) {
      console.error(`ERROR: ${result.error}`);
      process.exit(1);
    }
    chromePid = result.pid;
    const cdpUrl = result.cdpUrl;
    // Connect puppeteer for extension verification
    console.error(`[*] Connecting puppeteer to CDP...`);
    const browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
      defaultViewport: null,
    });
    browserInstance = browser;
    // Get actual extension IDs from chrome://extensions page.
    // IDs of unpacked extensions are derived from their path at load time,
    // so we have to read them back out of the browser itself.
    if (extensionPaths.length > 0) {
      // Fixed 2s delays give Chromium time to register the extensions;
      // NOTE(review): these are heuristic waits, not event-driven.
      await new Promise(r => setTimeout(r, 2000));
      try {
        const extPage = await browser.newPage();
        await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
        await new Promise(r => setTimeout(r, 2000));
        // Parse extension info from the page
        const extensionsFromPage = await extPage.evaluate(() => {
          const extensions = [];
          // Extensions manager uses shadow DOM
          const manager = document.querySelector('extensions-manager');
          if (!manager || !manager.shadowRoot) return extensions;
          const itemList = manager.shadowRoot.querySelector('extensions-item-list');
          if (!itemList || !itemList.shadowRoot) return extensions;
          const items = itemList.shadowRoot.querySelectorAll('extensions-item');
          for (const item of items) {
            const id = item.getAttribute('id');
            const nameEl = item.shadowRoot?.querySelector('#name');
            const name = nameEl?.textContent?.trim() || '';
            if (id && name) {
              extensions.push({ id, name });
            }
          }
          return extensions;
        });
        console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
        for (const e of extensionsFromPage) {
          console.error(`  - ${e.id}: "${e.name}"`);
        }
        // Match extensions by name (strict matching)
        for (const ext of installedExtensions) {
          // Read the extension's manifest to get its display name
          const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
          if (fs.existsSync(manifestPath)) {
            const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
            let manifestName = manifest.name || '';
            // Resolve message placeholder (e.g., __MSG_extName__)
            if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
              const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
              const defaultLocale = manifest.default_locale || 'en';
              const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
              if (fs.existsSync(messagesPath)) {
                try {
                  const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
                  if (messages[msgKey] && messages[msgKey].message) {
                    manifestName = messages[msgKey].message;
                  }
                } catch (e) {
                  console.error(`[!] Failed to read messages.json: ${e.message}`);
                }
              }
            }
            console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
            // Find matching extension from page by exact name match first
            let match = extensionsFromPage.find(e => e.name === manifestName);
            // If no exact match, try case-insensitive exact match
            if (!match) {
              match = extensionsFromPage.find(e =>
                e.name.toLowerCase() === manifestName.toLowerCase()
              );
            }
            if (match) {
              // Persist the real runtime ID onto the cached metadata record.
              ext.id = match.id;
              console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
            } else {
              console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
            }
          }
        }
        await extPage.close();
      } catch (e) {
        console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
      }
      // Fallback: check browser targets
      const targets = browser.targets();
      // Built-in Chromium component extensions to exclude from the count.
      const builtinIds = [
        'nkeimhogjdpnpccoofpliimaahmaaome',
        'fignfifoniblkonapihmkfakmlgkbkcf',
        'ahfgeienlihckogmohjhadlkjgocpleb',
        'mhjfbmdgcfjbbpaeojofohoefgiehjai',
      ];
      const customExtTargets = targets.filter(t => {
        const url = t.url();
        if (!url.startsWith('chrome-extension://')) return false;
        const extId = url.split('://')[1].split('/')[0];
        return !builtinIds.includes(extId);
      });
      console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`);
      for (const target of customExtTargets) {
        const url = target.url();
        const extId = url.split('://')[1].split('/')[0];
        console.error(`[+] Extension target: ${extId} (${target.type()})`);
      }
      if (customExtTargets.length === 0 && extensionPaths.length > 0) {
        console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
        console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
      }
    }
    // Write extensions metadata with actual IDs
    if (installedExtensions.length > 0) {
      fs.writeFileSync(
        path.join(OUTPUT_DIR, 'extensions.json'),
        JSON.stringify(installedExtensions, null, 2)
      );
    }
    console.error(`[+] Chromium session started for crawl ${crawlId}`);
    console.error(`[+] CDP URL: ${cdpUrl}`);
    console.error(`[+] PID: ${chromePid}`);
    // Stay alive to handle cleanup on SIGTERM
    console.log('[*] Chromium launch hook staying alive to handle cleanup...');
    setInterval(() => {}, 1000000);
  } catch (e) {
    console.error(`ERROR: ${e.name}: ${e.message}`);
    process.exit(1);
  }
}
// Top-level entry point: treat any unhandled rejection from main() as fatal.
main().catch((e) => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1,59 +0,0 @@
#!/usr/bin/env node
/**
* I Still Don't Care About Cookies Extension Plugin
*
* Installs and configures the "I still don't care about cookies" Chrome extension
* for automatic cookie consent banner dismissal during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
*
* Priority: 02 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Dismisses cookie consent popups
* - Removes cookie banners
* - Accepts necessary cookies to proceed with browsing
* - Works on thousands of websites out of the box
*/
// Import extension utilities
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
// Extension metadata: Chrome Web Store ID and the local cache name used by
// installExtensionWithCache() to locate/install the unpacked extension.
const EXTENSION = {
  webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
  name: 'istilldontcareaboutcookies',
};
/**
 * Entry point: install (or reuse a cached copy of) the extension before the
 * crawl's Chrome session starts.
 *
 * The extension needs no configuration — it detects and dismisses cookie
 * consent banners automatically on page load.
 *
 * @returns {Promise<Object|null>} extension metadata, or null/undefined on failure
 */
async function main() {
  const ext = await installExtensionWithCache(EXTENSION);
  if (!ext) {
    return ext;
  }
  console.log('[+] Cookie banners will be automatically dismissed during archiving');
  return ext;
}
// Expose the extension metadata so sibling plugins can reference it.
module.exports = { EXTENSION };

// CLI entry point: run the installer only when invoked directly, not when required.
if (require.main === module) {
  main()
    .then(() => {
      console.log('[✓] I Still Don\'t Care About Cookies extension setup complete');
      process.exit(0);
    })
    .catch(err => {
      console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err);
      process.exit(1);
    });
}

View File

@@ -1,281 +0,0 @@
#!/usr/bin/env node
/**
* SingleFile Extension Plugin
*
* DISABLED: Extension functionality commented out - using single-file-cli only
*
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
* Falls back to single-file-cli if the extension is not available.
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Saves complete web pages as single HTML files
* - Inlines all resources (CSS, JS, images, fonts)
* - Preserves page fidelity better than wget/curl
* - Works with SPAs and dynamically loaded content
*/
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec } = require('child_process');
const execAsync = promisify(exec);
// DISABLED: Extension functionality - using single-file-cli only
// // Import extension utilities
// const extensionUtils = require('../chrome/chrome_utils.js');
// // Extension metadata
// const EXTENSION = {
// webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
// name: 'singlefile',
// };
// // Get extensions directory from environment or use default
// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
// Hooks already run inside the snapshot's output directory, so write in place.
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';
// DISABLED: Extension functionality - using single-file-cli only
// /**
// * Install the SingleFile extension
// */
// async function installSinglefileExtension() {
// console.log('[*] Installing SingleFile extension...');
// // Install the extension
// const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
// if (!extension) {
// console.error('[❌] Failed to install SingleFile extension');
// return null;
// }
// console.log('[+] SingleFile extension installed');
// console.log('[+] Web pages will be saved as single HTML files');
// return extension;
// }
// /**
// * Wait for a specified amount of time
// */
// function wait(ms) {
// return new Promise(resolve => setTimeout(resolve, ms));
// }
// /**
// * Save a page using the SingleFile extension
// *
// * @param {Object} page - Puppeteer page object
// * @param {Object} extension - Extension metadata with dispatchAction method
// * @param {Object} options - Additional options
// * @returns {Promise<string|null>} - Path to saved file or null on failure
// */
// async function saveSinglefileWithExtension(page, extension, options = {}) {
// if (!extension || !extension.version) {
// throw new Error('SingleFile extension not found or not loaded');
// }
// const url = await page.url();
// // Check for unsupported URL schemes
// const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
// const scheme = url.split(':')[0];
// if (URL_SCHEMES_IGNORED.includes(scheme)) {
// console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
// return null;
// }
// // Ensure downloads directory exists
// await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
// // Get list of existing files to ignore
// const files_before = new Set(
// (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
// .filter(fn => fn.endsWith('.html'))
// );
// // Output directory is current directory (hook already runs in output dir)
// const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
// console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
// // Bring page to front (extension action button acts on foreground tab)
// await page.bringToFront();
// // Trigger the extension's action (toolbar button click)
// await extension.dispatchAction();
// // Wait for file to appear in downloads directory
// const check_delay = 3000; // 3 seconds
// const max_tries = 10;
// let files_new = [];
// for (let attempt = 0; attempt < max_tries; attempt++) {
// await wait(check_delay);
// const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
// .filter(fn => fn.endsWith('.html'));
// files_new = files_after.filter(file => !files_before.has(file));
// if (files_new.length === 0) {
// continue;
// }
// // Find the matching file by checking if it contains the URL in the HTML header
// for (const file of files_new) {
// const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
// const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
// const dl_header = dl_text.split('meta charset')[0];
// if (dl_header.includes(`url: ${url}`)) {
// console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
// await fs.promises.rename(dl_path, out_path);
// return out_path;
// }
// }
// }
// console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
// console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
// return null;
// }
/**
 * Save a page using single-file-cli (fallback method)
 *
 * @param {string} url - URL to archive
 * @param {Object} options - Additional options
 * @param {string} [options.userAgent] - Browser user agent string to use
 * @param {string} [options.cookiesFile] - Path to a cookies file to load
 * @param {boolean} [options.ignoreSSL] - Ignore invalid TLS certificates
 * @param {number} [options.timeout] - Max runtime in ms (default 120000)
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
  console.log('[*] Falling back to single-file-cli...');
  // Find single-file binary on PATH; bail out early with an install hint.
  let binary = null;
  try {
    const { stdout } = await execAsync('which single-file');
    binary = stdout.trim();
  } catch (err) {
    console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
    return null;
  }
  // Output directory is current directory (hook already runs in output dir)
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Build argv: optional flags are spliced in before the positional url/out_path pair.
  const cmd = [
    binary,
    '--browser-headless',
    url,
    out_path,
  ];
  if (options.userAgent) {
    cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
  }
  if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
    cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
  }
  if (options.ignoreSSL) {
    cmd.splice(2, 0, '--browser-ignore-insecure-certs');
  }
  // BUGFIX: previously the argv was joined with plain spaces and handed to a
  // shell, so URLs or paths containing spaces, `&`, `;`, or quotes broke the
  // command (and allowed shell injection from the URL). Single-quote each
  // argument with POSIX-safe escaping before joining.
  const shellQuote = (arg) => `'${String(arg).replace(/'/g, `'\\''`)}'`;
  try {
    const timeout = options.timeout || 120000;
    await execAsync(cmd.map(shellQuote).join(' '), { timeout });
    if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
      console.log(`[+] SingleFile saved via CLI: ${out_path}`);
      return out_path;
    }
    console.error('[❌] SingleFile CLI completed but no output file found');
    return null;
  } catch (err) {
    console.error(`[❌] SingleFile CLI error: ${err.message}`);
    return null;
  }
}
// DISABLED: Extension functionality - using single-file-cli only
// /**
// * Main entry point - install extension before archiving
// */
// async function main() {
// // Check if extension is already cached
// const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
// if (fs.existsSync(cacheFile)) {
// try {
// const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
// const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
// if (fs.existsSync(manifestPath)) {
// console.log('[*] SingleFile extension already installed (using cache)');
// return cached;
// }
// } catch (e) {
// // Cache file corrupted, re-install
// console.warn('[⚠️] Extension cache corrupted, re-installing...');
// }
// }
// // Install extension
// const extension = await installSinglefileExtension();
// // Export extension metadata for chrome plugin to load
// if (extension) {
// // Write extension info to a cache file that chrome plugin can read
// await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
// await fs.promises.writeFile(
// cacheFile,
// JSON.stringify(extension, null, 2)
// );
// console.log(`[+] Extension metadata written to ${cacheFile}`);
// }
// return extension;
// }
// Export functions for use by other plugins
module.exports = {
// DISABLED: Extension functionality - using single-file-cli only
// EXTENSION,
// installSinglefileExtension,
// saveSinglefileWithExtension,
saveSinglefileWithCLI,
};
// DISABLED: Extension functionality - using single-file-cli only
// // Run if executed directly
// if (require.main === module) {
// main().then(() => {
// console.log('[✓] SingleFile extension setup complete');
// process.exit(0);
// }).catch(err => {
// console.error('[❌] SingleFile extension setup failed:', err);
// process.exit(1);
// });
// }
// No-op when run directly (extension install disabled)
if (require.main === module) {
console.log('[*] SingleFile extension install disabled - using single-file-cli only');
process.exit(0);
}

View File

@@ -1,116 +0,0 @@
#!/usr/bin/env node
/**
* uBlock Origin Extension Plugin
*
* Installs and configures the uBlock Origin Chrome extension for ad blocking
* and privacy protection during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
*
* Priority: 03 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:
* - Blocks ads, trackers, and malware domains
* - Reduces page load time and bandwidth usage
* - Improves privacy during archiving
* - Removes clutter from archived pages
* - Uses efficient blocking with filter lists
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata: Chrome Web Store ID and the local cache name used when
// installing/locating the unpacked extension.
const EXTENSION = {
  webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
  name: 'ublock',
};
// Directory where unpacked extensions are cached, per persona.
// Overridable via CHROME_EXTENSIONS_DIR; defaults under DATA_DIR/personas/<persona>/.
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
  path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Download (or reuse) the uBlock Origin extension into the persona's
 * extension cache directory.
 *
 * @returns {Promise<Object|null>} extension metadata, or null on failure
 */
async function installUblockExtension() {
  console.log('[*] Installing uBlock Origin extension...');
  const ext = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!ext) {
    console.error('[❌] Failed to install uBlock Origin extension');
    return null;
  }
  console.log('[+] uBlock Origin extension installed');
  console.log('[+] Ads and trackers will be blocked during archiving');
  return ext;
}
/**
* Note: uBlock Origin works automatically with default filter lists.
* No configuration needed - blocks ads, trackers, and malware domains out of the box.
*/
/**
 * Entry point: return cached extension metadata when the cached unpacked copy
 * is still present on disk; otherwise install fresh and persist the metadata
 * so the chrome plugin can load the extension at launch.
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');

  // Fast path: a readable cache entry whose unpacked manifest still exists.
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      if (fs.existsSync(path.join(cached.unpacked_path, 'manifest.json'))) {
        console.log('[*] uBlock Origin extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Unreadable or corrupt cache record: fall through and reinstall.
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }

  // Slow path: install and write the metadata cache for the chrome plugin.
  const extension = await installUblockExtension();
  if (extension) {
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(cacheFile, JSON.stringify(extension, null, 2));
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }
  return extension;
}
// Expose metadata and the installer so sibling plugins can reuse them.
module.exports = { EXTENSION, installUblockExtension };

// CLI entry point: run the installer only when invoked directly, not when required.
if (require.main === module) {
  main()
    .then(() => {
      console.log('[✓] uBlock Origin extension setup complete');
      process.exit(0);
    })
    .catch(err => {
      console.error('[❌] uBlock Origin extension setup failed:', err);
      process.exit(1);
    });
}

View File

@@ -1,130 +0,0 @@
#!/usr/bin/env python3
"""
Validate and compute derived wget config values.
This hook runs early in the Crawl lifecycle to:
1. Validate config values with warnings (not hard errors)
2. Compute derived values (USE_WGET from WGET_ENABLED)
3. Check binary availability and version
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- Binary JSONL records to stdout when binaries are found
"""
import json
import os
import shutil
import subprocess
import sys
from abx_pkg import Binary, EnvProvider
# Read config from environment (already validated by JSONSchema)

def get_env(name: str, default: str = '') -> str:
    """Return the named environment variable, stripped, or ``default``."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized or missing values yield ``default``."""
    val = get_env(name, '').lower()
    if val in ('true', '1', 'yes', 'on'):
        return True
    if val in ('false', '0', 'no', 'off'):
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var; non-numeric values yield ``default``."""
    try:
        return int(get_env(name, str(default)))
    except ValueError:
        return default
def output_binary(binary: Binary, name: str):
    """Emit one Binary JSONL record to stdout for the hook executor to collect.

    Only duck-typed attributes of ``binary`` are read: abspath, version, sha256.
    """
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
def main():
    """Validate wget config, compute derived values, and report binary info.

    Prints ``COMPUTED:KEY=VALUE`` lines to stdout (parsed by hooks.py into the
    hook environment), ``WARNING:``/``ERROR:`` lines to stderr, and a Binary
    JSONL record when the wget binary is found. Exits 1 iff any hard error.
    """
    warnings = []
    errors = []
    computed = {}
    # Get config values.
    # NOTE: WGET_SAVE_WARC is part of this plugin's schema but is consumed by
    # the archiving hook, not here — the previously-unused local read of it
    # was removed.
    wget_enabled = get_env_bool('WGET_ENABLED', True)
    wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    wget_binary = get_env('WGET_BINARY', 'wget')
    # Compute derived values (USE_WGET for backward compatibility)
    use_wget = wget_enabled
    computed['USE_WGET'] = str(use_wget).lower()
    # Validate timeout with warning (not error)
    if use_wget and wget_timeout < 20:
        warnings.append(
            f"WGET_TIMEOUT={wget_timeout} is very low. "
            "wget may fail to archive sites if set to less than ~20 seconds. "
            "Consider setting WGET_TIMEOUT=60 or higher."
        )
    # Check binary availability using abx-pkg (env PATH lookup only)
    provider = EnvProvider()
    try:
        binary = Binary(name=wget_binary, binproviders=[provider]).load()
        binary_path = str(binary.abspath) if binary.abspath else ''
    except Exception:
        binary = None
        binary_path = ''
    if not binary_path:
        # Missing binary is only a hard error when wget is actually enabled.
        if use_wget:
            errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.")
        computed['WGET_BINARY'] = ''
    else:
        computed['WGET_BINARY'] = binary_path
        wget_version = str(binary.version) if binary.version else 'unknown'
        computed['WGET_VERSION'] = wget_version
        # Output Binary JSONL record
        output_binary(binary, name='wget')
    # Check for compression support (wget >= 1.19.2 accepts --compression=auto)
    if computed.get('WGET_BINARY'):
        try:
            result = subprocess.run(
                [computed['WGET_BINARY'], '--compression=auto', '--help'],
                capture_output=True, timeout=5
            )
            computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
        except Exception:
            computed['WGET_AUTO_COMPRESSION'] = 'false'
    # Output results
    # Format: KEY=VALUE lines that hooks.py will parse and add to env
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")
    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)
    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)
    # Exit with error if any hard errors
    sys.exit(1 if errors else 0)
# Allow running this hook directly as a script.
if __name__ == '__main__':
    main()