From 4c77949197cd2481e0ff48df083263dc0b9cb8ae Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 31 Dec 2025 22:47:36 +0000 Subject: [PATCH] Clean up on_Crawl hooks: remove duplicates and standardize naming Deleted dead/duplicate hooks: - wget/on_Crawl__10_wget_validate_config.py (duplicate of __10_install_wget.py, which was renamed to __11_wget_validate.py) - chrome/on_Crawl__00_chrome_install.py (simpler version; kept the full one) - chrome/on_Crawl__30_chrome_launch.bg.js (content folded back into the __20 version, which was kept) - singlefile/on_Crawl__20_install_singlefile_extension.js (disabled/dead) - istilldontcareaboutcookies/on_Crawl__20_install_*.js (legacy) - ublock/on_Crawl__03_ublock.js (legacy; kept the __20 version, renamed to __03_ublock_install.js) - Entire captcha2/ plugin (legacy version of twocaptcha/) Renamed hooks to follow a consistent pattern: on_Crawl__XX_<name>_<action>.<ext> (e.g. on_Crawl__01_chrome_install.py). Priority bands: 00-09: Binary/extension installation; 10-19: Config validation; 20-29: Browser launch and post-launch config. Final hooks: 00 ripgrep_install.py, 01 chrome_install.py, 02 istilldontcareaboutcookies_install.js, 03 ublock_install.js, 04 singlefile_install.js, 05 twocaptcha_install.js, 10 chrome_validate.py, 11 wget_validate.py, 20 chrome_launch.bg.js, 25 twocaptcha_config.js --- archivebox/plugins/captcha2/config.json | 21 -- .../plugins/captcha2/on_Crawl__01_captcha2.js | 121 ------- .../captcha2/on_Crawl__11_captcha2_config.js | 279 --------------- .../plugins/captcha2/templates/icon.html | 0 .../plugins/captcha2/tests/test_captcha2.py | 184 ---------- .../chrome/on_Crawl__00_chrome_install.py | 184 ---------- ...mium.py => on_Crawl__01_chrome_install.py} | 0 ...fig.py => on_Crawl__10_chrome_validate.py} | 0 .../chrome/on_Crawl__20_chrome_launch.bg.js | 140 ++++++-- .../chrome/on_Crawl__30_chrome_launch.bg.js | 323 ------------------ ..._02_istilldontcareaboutcookies_install.js} | 0 ...ll_istilldontcareaboutcookies_extension.js | 59 ---- ...rep.py => on_Crawl__00_ripgrep_install.py} | 0 ....js => on_Crawl__04_singlefile_install.js} | 0 ..._Crawl__20_install_singlefile_extension.js | 281 --------------- ....js => on_Crawl__05_twocaptcha_install.js} | 0 ...s.js => on_Crawl__25_twocaptcha_config.js} | 0 .../plugins/ublock/on_Crawl__03_ublock.js | 116 ------- ...sion.js => on_Crawl__03_ublock_install.js} | 0 .../wget/on_Crawl__10_wget_validate_config.py | 130 ------- ..._wget.py => on_Crawl__11_wget_validate.py} | 0 21 files changed, 109 insertions(+), 1729 deletions(-) delete mode 100644 archivebox/plugins/captcha2/config.json delete mode 100755 archivebox/plugins/captcha2/on_Crawl__01_captcha2.js delete mode 100755 archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js delete mode 100644 archivebox/plugins/captcha2/templates/icon.html delete mode 100644 archivebox/plugins/captcha2/tests/test_captcha2.py delete mode 100644 archivebox/plugins/chrome/on_Crawl__00_chrome_install.py rename archivebox/plugins/chrome/{on_Crawl__00_install_puppeteer_chromium.py => on_Crawl__01_chrome_install.py} (100%) rename archivebox/plugins/chrome/{on_Crawl__10_chrome_validate_config.py => on_Crawl__10_chrome_validate.py} (100%) delete mode 100644 archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js rename archivebox/plugins/istilldontcareaboutcookies/{on_Crawl__02_istilldontcareaboutcookies.js => on_Crawl__02_istilldontcareaboutcookies_install.js} (100%) delete mode 100755 archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js rename archivebox/plugins/search_backend_ripgrep/{on_Crawl__00_install_ripgrep.py => on_Crawl__00_ripgrep_install.py} (100%) rename 
archivebox/plugins/singlefile/{on_Crawl__04_singlefile.js => on_Crawl__04_singlefile_install.js} (100%) delete mode 100755 archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js rename archivebox/plugins/twocaptcha/{on_Crawl__20_install_twocaptcha_extension.js => on_Crawl__05_twocaptcha_install.js} (100%) rename archivebox/plugins/twocaptcha/{on_Crawl__25_configure_twocaptcha_extension_options.js => on_Crawl__25_twocaptcha_config.js} (100%) delete mode 100755 archivebox/plugins/ublock/on_Crawl__03_ublock.js rename archivebox/plugins/ublock/{on_Crawl__20_install_ublock_extension.js => on_Crawl__03_ublock_install.js} (100%) delete mode 100644 archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py rename archivebox/plugins/wget/{on_Crawl__10_install_wget.py => on_Crawl__11_wget_validate.py} (100%) diff --git a/archivebox/plugins/captcha2/config.json b/archivebox/plugins/captcha2/config.json deleted file mode 100644 index ba1a1383..00000000 --- a/archivebox/plugins/captcha2/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "CAPTCHA2_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["USE_CAPTCHA2"], - "description": "Enable Captcha2 browser extension for CAPTCHA solving" - }, - "CAPTCHA2_TIMEOUT": { - "type": "integer", - "default": 60, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for CAPTCHA solving in seconds" - } - } -} diff --git a/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js b/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js deleted file mode 100755 index 45fb8956..00000000 --- a/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env node -/** - * 2Captcha Extension Plugin - * - * Installs and configures the 2captcha Chrome extension for automatic - * CAPTCHA solving during page archiving. - * - * Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo - * Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer - * - * Priority: 01 (early) - Must install before Chrome session starts at Crawl level - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * Requirements: - * - API_KEY_2CAPTCHA environment variable must be set - * - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc. 
- */ - -const path = require('path'); -const fs = require('fs'); - -// Import extension utilities -const extensionUtils = require('../chrome/chrome_utils.js'); - -// Extension metadata -const EXTENSION = { - webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', - name: 'captcha2', -}; - -// Get extensions directory from environment or use default -const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); - -/** - * Install and configure the 2captcha extension - */ -async function installCaptchaExtension() { - console.log('[*] Installing 2captcha extension...'); - - // Install the extension - const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); - - if (!extension) { - console.error('[❌] Failed to install 2captcha extension'); - return null; - } - - // Check if API key is configured - const apiKey = process.env.API_KEY_2CAPTCHA; - if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') { - console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured'); - console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving'); - } else { - console.log('[+] 2captcha extension installed and API key configured'); - } - - return extension; -} - -/** - * Note: 2captcha configuration is now handled by chrome plugin - * during first-time browser setup to avoid repeated configuration on every snapshot. - * The API key is injected via chrome.storage API once per browser session. - */ - -/** - * Main entry point - install extension before archiving - */ -async function main() { - // Check if extension is already cached - const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json'); - - if (fs.existsSync(cacheFile)) { - try { - const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); - const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); - - if (fs.existsSync(manifestPath)) { - console.log('[*] 2captcha extension already installed (using cache)'); - return cached; - } - } catch (e) { - // Cache file corrupted, re-install - console.warn('[⚠️] Extension cache corrupted, re-installing...'); - } - } - - // Install extension - const extension = await installCaptchaExtension(); - - // Export extension metadata for chrome plugin to load - if (extension) { - // Write extension info to a cache file that chrome plugin can read - await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); - await fs.promises.writeFile( - cacheFile, - JSON.stringify(extension, null, 2) - ); - console.log(`[+] Extension metadata written to ${cacheFile}`); - } - - return extension; -} - -// Export functions for use by other plugins -module.exports = { - EXTENSION, - installCaptchaExtension, -}; - -// Run if executed directly -if (require.main === module) { - main().then(() => { - console.log('[✓] 2captcha extension setup complete'); - process.exit(0); - }).catch(err => { - console.error('[❌] 2captcha extension setup failed:', err); - process.exit(1); - }); -} diff --git a/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js b/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js deleted file mode 100755 index cf528a1b..00000000 --- a/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js +++ /dev/null @@ -1,279 +0,0 @@ -#!/usr/bin/env node -/** - * 2Captcha Extension Configuration - * - * Configures the 2captcha extension with API key after Crawl-level Chrome session starts. 
- * Runs once per crawl to inject API key into extension storage. - * - * Priority: 11 (after chrome_launch at 20) - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * Requirements: - * - API_KEY_2CAPTCHA environment variable must be set - * - chrome plugin must have loaded extensions (extensions.json must exist) - */ - -const path = require('path'); -const fs = require('fs'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); - -// Get crawl's chrome directory from environment variable set by hooks.py -function getCrawlChromeSessionDir() { - const crawlOutputDir = process.env.CRAWL_OUTPUT_DIR || ''; - if (!crawlOutputDir) { - return null; - } - return path.join(crawlOutputDir, 'chrome'); -} - -const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome'; -const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured'); - -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} - -// Parse command line arguments -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -async function configure2Captcha() { - // Check if already configured in this session - if (fs.existsSync(CONFIG_MARKER)) { - console.error('[*] 2captcha already configured in this browser session'); - return { success: true, skipped: true }; - } - - // Check if API key is set - const apiKey = getEnv('API_KEY_2CAPTCHA'); - if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') { - console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured'); - console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving'); - return { success: false, error: 'API_KEY_2CAPTCHA not configured' }; - } - - // Load extensions metadata - const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json'); - if (!fs.existsSync(extensionsFile)) { - return { success: false, error: 'extensions.json not found - chrome plugin must run first' }; - } - - const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8')); - const captchaExt = extensions.find(ext => ext.name === 'captcha2'); - - if (!captchaExt) { - console.error('[*] 2captcha extension not installed, skipping configuration'); - return { success: true, skipped: true }; - } - - console.error('[*] Configuring 2captcha extension with API key...'); - - try { - // Connect to the existing Chrome session via CDP - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - if (!fs.existsSync(cdpFile)) { - return { success: false, error: 'CDP URL not found - chrome plugin must run first' }; - } - - const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim(); - const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); - - try { - // Method 1: Try to inject via extension background page - if (captchaExt.target && captchaExt.target_ctx) { - console.error('[*] Attempting to configure via extension background page...'); - - // Reconnect to the browser to get fresh target context - const targets = await browser.targets(); - const extTarget = targets.find(t => - t.url().startsWith(`chrome-extension://${captchaExt.id}`) - ); - - if (extTarget) { - const extContext = await 
extTarget.worker() || await extTarget.page(); - - if (extContext) { - await extContext.evaluate((key) => { - // Try all common storage patterns - if (typeof chrome !== 'undefined' && chrome.storage) { - chrome.storage.local.set({ - apiKey: key, - api_key: key, - '2captcha_apikey': key, - apikey: key, - 'solver-api-key': key, - }); - chrome.storage.sync.set({ - apiKey: key, - api_key: key, - '2captcha_apikey': key, - apikey: key, - 'solver-api-key': key, - }); - } - - // Also try localStorage as fallback - if (typeof localStorage !== 'undefined') { - localStorage.setItem('apiKey', key); - localStorage.setItem('2captcha_apikey', key); - localStorage.setItem('solver-api-key', key); - } - }, apiKey); - - console.error('[+] 2captcha API key configured successfully via background page'); - - // Mark as configured - fs.writeFileSync(CONFIG_MARKER, new Date().toISOString()); - - return { success: true, method: 'background_page' }; - } - } - } - - // Method 2: Try to configure via options page - console.error('[*] Attempting to configure via options page...'); - const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`; - const configPage = await browser.newPage(); - - try { - await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 }); - - const configured = await configPage.evaluate((key) => { - // Try to find API key input field - const selectors = [ - 'input[name*="apikey" i]', - 'input[id*="apikey" i]', - 'input[name*="api-key" i]', - 'input[id*="api-key" i]', - 'input[name*="key" i]', - 'input[placeholder*="api" i]', - 'input[type="text"]', - ]; - - for (const selector of selectors) { - const input = document.querySelector(selector); - if (input) { - input.value = key; - input.dispatchEvent(new Event('input', { bubbles: true })); - input.dispatchEvent(new Event('change', { bubbles: true })); - - // Try to find and click save button - const saveSelectors = [ - 'button[type="submit"]', - 'input[type="submit"]', - 'button:contains("Save")', - 'button:contains("Apply")', - ]; - - for (const btnSel of saveSelectors) { - const btn = document.querySelector(btnSel); - if (btn) { - btn.click(); - break; - } - } - - // Also save to storage - if (typeof chrome !== 'undefined' && chrome.storage) { - chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); - chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); - } - - return true; - } - } - - // Fallback: Just save to storage - if (typeof chrome !== 'undefined' && chrome.storage) { - chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); - chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); - return true; - } - - return false; - }, apiKey); - - await configPage.close(); - - if (configured) { - console.error('[+] 2captcha API key configured successfully via options page'); - - // Mark as configured - fs.writeFileSync(CONFIG_MARKER, new Date().toISOString()); - - return { success: true, method: 'options_page' }; - } - } catch (e) { - console.warn(`[⚠️] Failed to configure via options page: ${e.message}`); - try { - await configPage.close(); - } catch (e2) {} - } - - return { success: false, error: 'Could not configure via any method' }; - } finally { - browser.disconnect(); - } - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - 
console.error('Usage: on_Snapshot__21_captcha2_config.js --url= --snapshot-id='); - process.exit(1); - } - - const startTs = new Date(); - let status = 'failed'; - let error = ''; - - try { - const result = await configure2Captcha(); - - if (result.skipped) { - status = 'skipped'; - } else if (result.success) { - status = 'succeeded'; - } else { - status = 'failed'; - error = result.error || 'Configuration failed'; - } - } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - const duration = (endTs - startTs) / 1000; - - if (error) { - console.error(`ERROR: ${error}`); - } - - // Config hooks don't emit JSONL - they're utility hooks for setup - // Exit code indicates success/failure - - process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1); -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/captcha2/templates/icon.html b/archivebox/plugins/captcha2/templates/icon.html deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/captcha2/tests/test_captcha2.py b/archivebox/plugins/captcha2/tests/test_captcha2.py deleted file mode 100644 index bc08a072..00000000 --- a/archivebox/plugins/captcha2/tests/test_captcha2.py +++ /dev/null @@ -1,184 +0,0 @@ -""" -Unit tests for captcha2 plugin - -Tests invoke the plugin hooks as external processes and verify outputs/side effects. -""" - -import json -import os -import subprocess -import tempfile -from pathlib import Path - -import pytest - - -PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None) -CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None) - - -def test_install_script_exists(): - """Verify install script exists""" - assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}" - - -def test_config_script_exists(): - """Verify config script exists""" - assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}" - - -def test_extension_metadata(): - """Test that captcha2 extension has correct metadata""" - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions") - - # Just check the script can be loaded - result = subprocess.run( - ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"], - capture_output=True, - text=True, - env=env - ) - - assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}" - - metadata = json.loads(result.stdout) - assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo" - assert metadata["name"] == "captcha2" - - -def test_install_creates_cache(): - """Test that install creates extension cache""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - env["API_KEY_2CAPTCHA"] = "test_api_key" - - # Run install script - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Check output mentions installation - assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout - - # Check cache file was created - cache_file = ext_dir / "captcha2.extension.json" - assert cache_file.exists(), "Cache file 
should be created" - - # Verify cache content - cache_data = json.loads(cache_file.read_text()) - assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo" - assert cache_data["name"] == "captcha2" - assert "unpacked_path" in cache_data - assert "version" in cache_data - - -def test_install_twice_uses_cache(): - """Test that running install twice uses existing cache on second run""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - env["API_KEY_2CAPTCHA"] = "test_api_key" - - # First install - downloads the extension - result1 = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - assert result1.returncode == 0, f"First install failed: {result1.stderr}" - - # Verify cache was created - cache_file = ext_dir / "captcha2.extension.json" - assert cache_file.exists(), "Cache file should exist after first install" - - # Second install - should use cache - result2 = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=30 - ) - assert result2.returncode == 0, f"Second install failed: {result2.stderr}" - - # Second run should mention cache reuse - assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0 - - -def test_install_warns_without_api_key(): - """Test that install warns when API key not configured""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - # Don't set API_KEY_2CAPTCHA - - # Run install script - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Should warn about missing API key - combined_output = result.stdout + result.stderr - assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output - - -def test_install_success_with_api_key(): - """Test that install succeeds when API key is configured""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123" - - # Run install script - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Should mention API key configured - combined_output = result.stdout + result.stderr - assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output - - -def test_config_script_structure(): - """Test that config script has proper structure""" - # Verify the script exists and contains expected markers - script_content = CONFIG_SCRIPT.read_text() - - # Should mention configuration marker file - assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content - - # Should mention API key - assert "API_KEY_2CAPTCHA" in script_content - - # Should have main function or be executable - assert "async function" in script_content or "main" in script_content diff --git a/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py b/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py deleted file mode 100644 index 4c6bbbdd..00000000 --- 
a/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python3 -""" -Install hook for Chrome/Chromium and puppeteer-core. - -Runs at crawl start to install/find Chromium and puppeteer-core. -Outputs JSONL for Binary and Machine config updates. -Respects CHROME_BINARY env var for custom binary paths. -Uses `npx @puppeteer/browsers install chromium@latest` and parses output. - -NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for ---load-extension and --disable-extensions-except flags, which are needed for -loading unpacked extensions in headless mode. -""" - -import os -import sys -import json -import subprocess -from pathlib import Path - - -def get_chrome_version(binary_path: str) -> str | None: - """Get Chrome/Chromium version string.""" - try: - result = subprocess.run( - [binary_path, '--version'], - capture_output=True, - text=True, - timeout=5 - ) - if result.returncode == 0: - return result.stdout.strip() - except Exception: - pass - return None - - -def install_puppeteer_core() -> bool: - """Install puppeteer-core to NODE_MODULES_DIR if not present.""" - node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip() - if not node_modules_dir: - # No isolated node_modules, skip (will use global) - return True - - node_modules_path = Path(node_modules_dir) - if (node_modules_path / 'puppeteer-core').exists(): - return True - - # Get npm prefix from NODE_MODULES_DIR (parent of node_modules) - npm_prefix = node_modules_path.parent - - try: - print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr) - result = subprocess.run( - ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'], - capture_output=True, - text=True, - timeout=60 - ) - if result.returncode == 0: - print(f"[+] puppeteer-core installed", file=sys.stderr) - return True - else: - print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr) - return False - except Exception as e: - print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr) - return False - - -def install_chromium() -> dict | None: - """Install Chromium using @puppeteer/browsers and parse output for binary path. - - Output format: "chromium@ " - e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium" - - Note: npx is fast when chromium is already cached - it returns the path without re-downloading. - """ - try: - print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr) - - # Use --path to install to puppeteer's standard cache location - cache_path = os.path.expanduser('~/.cache/puppeteer') - - result = subprocess.run( - ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'], - capture_output=True, - text=True, - stdin=subprocess.DEVNULL, - timeout=300 - ) - - if result.returncode != 0: - print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr) - return None - - # Parse output: "chromium@1563294 /path/to/Chromium" - output = result.stdout.strip() - parts = output.split(' ', 1) - if len(parts) != 2: - print(f"[!] Failed to parse install output: {output}", file=sys.stderr) - return None - - version_str = parts[0] # "chromium@1563294" - binary_path = parts[1].strip() - - if not binary_path or not os.path.exists(binary_path): - print(f"[!] 
Binary not found at: {binary_path}", file=sys.stderr) - return None - - # Extract version number - version = version_str.split('@')[1] if '@' in version_str else None - - print(f"[+] Chromium installed: {binary_path}", file=sys.stderr) - - return { - 'name': 'chromium', - 'abspath': binary_path, - 'version': version, - 'binprovider': 'puppeteer', - } - - except subprocess.TimeoutExpired: - print("[!] Chromium install timed out", file=sys.stderr) - except FileNotFoundError: - print("[!] npx not found - is Node.js installed?", file=sys.stderr) - except Exception as e: - print(f"[!] Failed to install Chromium: {e}", file=sys.stderr) - - return None - - -def main(): - # Install puppeteer-core if NODE_MODULES_DIR is set - install_puppeteer_core() - - # Check if CHROME_BINARY is already set and valid - configured_binary = os.environ.get('CHROME_BINARY', '').strip() - if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK): - version = get_chrome_version(configured_binary) - print(json.dumps({ - 'type': 'Binary', - 'name': 'chromium', - 'abspath': configured_binary, - 'version': version, - 'binprovider': 'env', - })) - sys.exit(0) - - # Install/find Chromium via puppeteer - result = install_chromium() - - if result and result.get('abspath'): - print(json.dumps({ - 'type': 'Binary', - 'name': result['name'], - 'abspath': result['abspath'], - 'version': result['version'], - 'binprovider': result['binprovider'], - })) - - print(json.dumps({ - 'type': 'Machine', - '_method': 'update', - 'key': 'config/CHROME_BINARY', - 'value': result['abspath'], - })) - - if result['version']: - print(json.dumps({ - 'type': 'Machine', - '_method': 'update', - 'key': 'config/CHROMIUM_VERSION', - 'value': result['version'], - })) - - sys.exit(0) - else: - print("Chromium binary not found", file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/chrome/on_Crawl__00_install_puppeteer_chromium.py b/archivebox/plugins/chrome/on_Crawl__01_chrome_install.py similarity index 100% rename from archivebox/plugins/chrome/on_Crawl__00_install_puppeteer_chromium.py rename to archivebox/plugins/chrome/on_Crawl__01_chrome_install.py diff --git a/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py b/archivebox/plugins/chrome/on_Crawl__10_chrome_validate.py similarity index 100% rename from archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py rename to archivebox/plugins/chrome/on_Crawl__10_chrome_validate.py diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js index c2d62775..0799f3ad 100644 --- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js @@ -8,8 +8,8 @@ * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for * --load-extension and --disable-extensions-except flags. 
* - * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id= --source-url= - * Output: Creates chrome/ directory under crawl output dir with: + * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id= --source-url= + * Output: Writes to current directory (executor creates chrome/ dir): * - cdp_url.txt: WebSocket URL for CDP connection * - chrome.pid: Chromium process ID (for cleanup) * - port.txt: Debug port number @@ -38,11 +38,12 @@ const { killChrome, getEnv, writePidWithMtime, + getExtensionsDir, } = require('./chrome_utils.js'); // Extractor metadata const PLUGIN_NAME = 'chrome_launch'; -const OUTPUT_DIR = 'chrome'; +const OUTPUT_DIR = '.'; // Global state for cleanup let chromePid = null; @@ -115,8 +116,12 @@ async function main() { if (version) console.error(`[*] Version: ${version}`); // Load installed extensions - const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') || - path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions'); + const extensionsDir = getExtensionsDir(); + const userDataDir = getEnv('CHROME_USER_DATA_DIR'); + + if (userDataDir) { + console.error(`[*] Using user data dir: ${userDataDir}`); + } const installedExtensions = []; const extensionPaths = []; @@ -143,17 +148,18 @@ async function main() { console.error(`[+] Found ${installedExtensions.length} extension(s) to load`); } - // Write hook's own PID - const hookStartTime = Date.now() / 1000; + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done if (!fs.existsSync(OUTPUT_DIR)) { fs.mkdirSync(OUTPUT_DIR, { recursive: true }); } - writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime); // Launch Chromium using consolidated function + // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set const result = await launchChromium({ binary, outputDir: OUTPUT_DIR, + userDataDir, extensionPaths, }); @@ -165,14 +171,6 @@ async function main() { chromePid = result.pid; const cdpUrl = result.cdpUrl; - // Write extensions metadata - if (installedExtensions.length > 0) { - fs.writeFileSync( - path.join(OUTPUT_DIR, 'extensions.json'), - JSON.stringify(installedExtensions, null, 2) - ); - } - // Connect puppeteer for extension verification console.error(`[*] Connecting puppeteer to CDP...`); const browser = await puppeteer.connect({ @@ -181,30 +179,102 @@ async function main() { }); browserInstance = browser; - // Verify extensions loaded + // Get actual extension IDs from chrome://extensions page if (extensionPaths.length > 0) { - await new Promise(r => setTimeout(r, 3000)); + await new Promise(r => setTimeout(r, 2000)); - const targets = browser.targets(); - console.error(`[*] All browser targets (${targets.length}):`); - for (const t of targets) { - console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`); + try { + const extPage = await browser.newPage(); + await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 }); + await new Promise(r => setTimeout(r, 2000)); + + // Parse extension info from the page + const extensionsFromPage = await extPage.evaluate(() => { + const extensions = []; + // Extensions manager uses shadow DOM + const manager = document.querySelector('extensions-manager'); + if (!manager || !manager.shadowRoot) return extensions; + + const itemList = manager.shadowRoot.querySelector('extensions-item-list'); + if (!itemList || !itemList.shadowRoot) return extensions; + + const items = 
itemList.shadowRoot.querySelectorAll('extensions-item'); + for (const item of items) { + const id = item.getAttribute('id'); + const nameEl = item.shadowRoot?.querySelector('#name'); + const name = nameEl?.textContent?.trim() || ''; + if (id && name) { + extensions.push({ id, name }); + } + } + return extensions; + }); + + console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`); + for (const e of extensionsFromPage) { + console.error(` - ${e.id}: "${e.name}"`); + } + + // Match extensions by name (strict matching) + for (const ext of installedExtensions) { + // Read the extension's manifest to get its display name + const manifestPath = path.join(ext.unpacked_path, 'manifest.json'); + if (fs.existsSync(manifestPath)) { + const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); + let manifestName = manifest.name || ''; + + // Resolve message placeholder (e.g., __MSG_extName__) + if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) { + const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__ + const defaultLocale = manifest.default_locale || 'en'; + const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json'); + if (fs.existsSync(messagesPath)) { + try { + const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8')); + if (messages[msgKey] && messages[msgKey].message) { + manifestName = messages[msgKey].message; + } + } catch (e) { + console.error(`[!] Failed to read messages.json: ${e.message}`); + } + } + } + + console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`); + + // Find matching extension from page by exact name match first + let match = extensionsFromPage.find(e => e.name === manifestName); + + // If no exact match, try case-insensitive exact match + if (!match) { + match = extensionsFromPage.find(e => + e.name.toLowerCase() === manifestName.toLowerCase() + ); + } + + if (match) { + ext.id = match.id; + console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`); + } else { + console.error(`[!] No match found for: ${ext.name} (${manifestName})`); + } + } + } + + await extPage.close(); + } catch (e) { + console.error(`[!] 
Failed to get extensions from chrome://extensions: ${e.message}`); } - const extTargets = targets.filter(t => - t.url().startsWith('chrome-extension://') || - t.type() === 'service_worker' || - t.type() === 'background_page' - ); - - // Filter out built-in extensions + // Fallback: check browser targets + const targets = browser.targets(); const builtinIds = [ 'nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf', 'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai', ]; - const customExtTargets = extTargets.filter(t => { + const customExtTargets = targets.filter(t => { const url = t.url(); if (!url.startsWith('chrome-extension://')) return false; const extId = url.split('://')[1].split('/')[0]; @@ -216,7 +286,7 @@ async function main() { for (const target of customExtTargets) { const url = target.url(); const extId = url.split('://')[1].split('/')[0]; - console.error(`[+] Extension loaded: ${extId} (${target.type()})`); + console.error(`[+] Extension target: ${extId} (${target.type()})`); } if (customExtTargets.length === 0 && extensionPaths.length > 0) { @@ -225,6 +295,14 @@ async function main() { } } + // Write extensions metadata with actual IDs + if (installedExtensions.length > 0) { + fs.writeFileSync( + path.join(OUTPUT_DIR, 'extensions.json'), + JSON.stringify(installedExtensions, null, 2) + ); + } + console.error(`[+] Chromium session started for crawl ${crawlId}`); console.error(`[+] CDP URL: ${cdpUrl}`); console.error(`[+] PID: ${chromePid}`); diff --git a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js deleted file mode 100644 index 0799f3ad..00000000 --- a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js +++ /dev/null @@ -1,323 +0,0 @@ -#!/usr/bin/env node -/** - * Launch a shared Chromium browser session for the entire crawl. - * - * This runs once per crawl and keeps Chromium alive for all snapshots to share. - * Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js. - * - * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for - * --load-extension and --disable-extensions-except flags. 
- * - * Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id= --source-url= - * Output: Writes to current directory (executor creates chrome/ dir): - * - cdp_url.txt: WebSocket URL for CDP connection - * - chrome.pid: Chromium process ID (for cleanup) - * - port.txt: Debug port number - * - extensions.json: Loaded extensions metadata - * - * Environment variables: - * NODE_MODULES_DIR: Path to node_modules directory for module resolution - * CHROME_BINARY: Path to Chromium binary (falls back to auto-detection) - * CHROME_RESOLUTION: Page resolution (default: 1440,2000) - * CHROME_HEADLESS: Run in headless mode (default: true) - * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) - * CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions - */ - -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) { - module.paths.unshift(process.env.NODE_MODULES_DIR); -} - -const fs = require('fs'); -const path = require('path'); -const puppeteer = require('puppeteer-core'); -const { - findChromium, - launchChromium, - killChrome, - getEnv, - writePidWithMtime, - getExtensionsDir, -} = require('./chrome_utils.js'); - -// Extractor metadata -const PLUGIN_NAME = 'chrome_launch'; -const OUTPUT_DIR = '.'; - -// Global state for cleanup -let chromePid = null; -let browserInstance = null; - -// Parse command line arguments -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach((arg) => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -// Cleanup handler for SIGTERM -async function cleanup() { - console.error('[*] Cleaning up Chrome session...'); - - // Try graceful browser close first - if (browserInstance) { - try { - console.error('[*] Closing browser gracefully...'); - await browserInstance.close(); - browserInstance = null; - console.error('[+] Browser closed gracefully'); - } catch (e) { - console.error(`[!] 
Graceful close failed: ${e.message}`); - } - } - - // Kill Chrome process - if (chromePid) { - await killChrome(chromePid, OUTPUT_DIR); - } - - process.exit(0); -} - -// Register signal handlers -process.on('SIGTERM', cleanup); -process.on('SIGINT', cleanup); - -async function main() { - const args = parseArgs(); - const crawlId = args.crawl_id; - - try { - const binary = findChromium(); - if (!binary) { - console.error('ERROR: Chromium binary not found'); - console.error('DEPENDENCY_NEEDED=chromium'); - console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew'); - console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest'); - process.exit(1); - } - - // Get Chromium version - let version = ''; - try { - const { execSync } = require('child_process'); - version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }) - .trim() - .slice(0, 64); - } catch (e) {} - - console.error(`[*] Using browser: ${binary}`); - if (version) console.error(`[*] Version: ${version}`); - - // Load installed extensions - const extensionsDir = getExtensionsDir(); - const userDataDir = getEnv('CHROME_USER_DATA_DIR'); - - if (userDataDir) { - console.error(`[*] Using user data dir: ${userDataDir}`); - } - - const installedExtensions = []; - const extensionPaths = []; - if (fs.existsSync(extensionsDir)) { - const files = fs.readdirSync(extensionsDir); - for (const file of files) { - if (file.endsWith('.extension.json')) { - try { - const extPath = path.join(extensionsDir, file); - const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8')); - if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) { - installedExtensions.push(extData); - extensionPaths.push(extData.unpacked_path); - console.error(`[*] Loading extension: ${extData.name || file}`); - } - } catch (e) { - console.warn(`[!] 
Skipping invalid extension cache: ${file}`); - } - } - } - } - - if (installedExtensions.length > 0) { - console.error(`[+] Found ${installedExtensions.length} extension(s) to load`); - } - - // Note: PID file is written by run_hook() with hook-specific name - // Snapshot.cleanup() kills all *.pid processes when done - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } - - // Launch Chromium using consolidated function - // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set - const result = await launchChromium({ - binary, - outputDir: OUTPUT_DIR, - userDataDir, - extensionPaths, - }); - - if (!result.success) { - console.error(`ERROR: ${result.error}`); - process.exit(1); - } - - chromePid = result.pid; - const cdpUrl = result.cdpUrl; - - // Connect puppeteer for extension verification - console.error(`[*] Connecting puppeteer to CDP...`); - const browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - defaultViewport: null, - }); - browserInstance = browser; - - // Get actual extension IDs from chrome://extensions page - if (extensionPaths.length > 0) { - await new Promise(r => setTimeout(r, 2000)); - - try { - const extPage = await browser.newPage(); - await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 }); - await new Promise(r => setTimeout(r, 2000)); - - // Parse extension info from the page - const extensionsFromPage = await extPage.evaluate(() => { - const extensions = []; - // Extensions manager uses shadow DOM - const manager = document.querySelector('extensions-manager'); - if (!manager || !manager.shadowRoot) return extensions; - - const itemList = manager.shadowRoot.querySelector('extensions-item-list'); - if (!itemList || !itemList.shadowRoot) return extensions; - - const items = itemList.shadowRoot.querySelectorAll('extensions-item'); - for (const item of items) { - const id = item.getAttribute('id'); - const nameEl = item.shadowRoot?.querySelector('#name'); - const name = nameEl?.textContent?.trim() || ''; - if (id && name) { - extensions.push({ id, name }); - } - } - return extensions; - }); - - console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`); - for (const e of extensionsFromPage) { - console.error(` - ${e.id}: "${e.name}"`); - } - - // Match extensions by name (strict matching) - for (const ext of installedExtensions) { - // Read the extension's manifest to get its display name - const manifestPath = path.join(ext.unpacked_path, 'manifest.json'); - if (fs.existsSync(manifestPath)) { - const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); - let manifestName = manifest.name || ''; - - // Resolve message placeholder (e.g., __MSG_extName__) - if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) { - const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__ - const defaultLocale = manifest.default_locale || 'en'; - const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json'); - if (fs.existsSync(messagesPath)) { - try { - const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8')); - if (messages[msgKey] && messages[msgKey].message) { - manifestName = messages[msgKey].message; - } - } catch (e) { - console.error(`[!] 
Failed to read messages.json: ${e.message}`); - } - } - } - - console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`); - - // Find matching extension from page by exact name match first - let match = extensionsFromPage.find(e => e.name === manifestName); - - // If no exact match, try case-insensitive exact match - if (!match) { - match = extensionsFromPage.find(e => - e.name.toLowerCase() === manifestName.toLowerCase() - ); - } - - if (match) { - ext.id = match.id; - console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`); - } else { - console.error(`[!] No match found for: ${ext.name} (${manifestName})`); - } - } - } - - await extPage.close(); - } catch (e) { - console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`); - } - - // Fallback: check browser targets - const targets = browser.targets(); - const builtinIds = [ - 'nkeimhogjdpnpccoofpliimaahmaaome', - 'fignfifoniblkonapihmkfakmlgkbkcf', - 'ahfgeienlihckogmohjhadlkjgocpleb', - 'mhjfbmdgcfjbbpaeojofohoefgiehjai', - ]; - const customExtTargets = targets.filter(t => { - const url = t.url(); - if (!url.startsWith('chrome-extension://')) return false; - const extId = url.split('://')[1].split('/')[0]; - return !builtinIds.includes(extId); - }); - - console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`); - - for (const target of customExtTargets) { - const url = target.url(); - const extId = url.split('://')[1].split('/')[0]; - console.error(`[+] Extension target: ${extId} (${target.type()})`); - } - - if (customExtTargets.length === 0 && extensionPaths.length > 0) { - console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`); - console.error(`[!] 
Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`); - } - } - - // Write extensions metadata with actual IDs - if (installedExtensions.length > 0) { - fs.writeFileSync( - path.join(OUTPUT_DIR, 'extensions.json'), - JSON.stringify(installedExtensions, null, 2) - ); - } - - console.error(`[+] Chromium session started for crawl ${crawlId}`); - console.error(`[+] CDP URL: ${cdpUrl}`); - console.error(`[+] PID: ${chromePid}`); - - // Stay alive to handle cleanup on SIGTERM - console.log('[*] Chromium launch hook staying alive to handle cleanup...'); - setInterval(() => {}, 1000000); - - } catch (e) { - console.error(`ERROR: ${e.name}: ${e.message}`); - process.exit(1); - } -} - -main().catch((e) => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies_install.js similarity index 100% rename from archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js rename to archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies_install.js diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js deleted file mode 100755 index 2a8053cd..00000000 --- a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env node -/** - * I Still Don't Care About Cookies Extension Plugin - * - * Installs and configures the "I still don't care about cookies" Chrome extension - * for automatic cookie consent banner dismissal during page archiving. - * - * Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm - * - * Priority: 02 (early) - Must install before Chrome session starts at Crawl level - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * This extension automatically: - * - Dismisses cookie consent popups - * - Removes cookie banners - * - Accepts necessary cookies to proceed with browsing - * - Works on thousands of websites out of the box - */ - -// Import extension utilities -const { installExtensionWithCache } = require('../chrome/chrome_utils.js'); - -// Extension metadata -const EXTENSION = { - webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', - name: 'istilldontcareaboutcookies', -}; - -/** - * Main entry point - install extension before archiving - * - * Note: This extension works out of the box with no configuration needed. - * It automatically detects and dismisses cookie banners on page load. 
- */ -async function main() { - const extension = await installExtensionWithCache(EXTENSION); - - if (extension) { - console.log('[+] Cookie banners will be automatically dismissed during archiving'); - } - - return extension; -} - -// Export functions for use by other plugins -module.exports = { - EXTENSION, -}; - -// Run if executed directly -if (require.main === module) { - main().then(() => { - console.log('[✓] I Still Don\'t Care About Cookies extension setup complete'); - process.exit(0); - }).catch(err => { - console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err); - process.exit(1); - }); -} diff --git a/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_install_ripgrep.py b/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_ripgrep_install.py similarity index 100% rename from archivebox/plugins/search_backend_ripgrep/on_Crawl__00_install_ripgrep.py rename to archivebox/plugins/search_backend_ripgrep/on_Crawl__00_ripgrep_install.py diff --git a/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js b/archivebox/plugins/singlefile/on_Crawl__04_singlefile_install.js similarity index 100% rename from archivebox/plugins/singlefile/on_Crawl__04_singlefile.js rename to archivebox/plugins/singlefile/on_Crawl__04_singlefile_install.js diff --git a/archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js b/archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js deleted file mode 100755 index 59bbda46..00000000 --- a/archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js +++ /dev/null @@ -1,281 +0,0 @@ -#!/usr/bin/env node -/** - * SingleFile Extension Plugin - * - * DISABLED: Extension functionality commented out - using single-file-cli only - * - * Installs and uses the SingleFile Chrome extension for archiving complete web pages. - * Falls back to single-file-cli if the extension is not available. 
- *
- * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
- *
- * Priority: 04 (early) - Must install before Chrome session starts at Crawl level
- * Hook: on_Crawl (runs once per crawl, not per snapshot)
- *
- * This extension automatically:
- * - Saves complete web pages as single HTML files
- * - Inlines all resources (CSS, JS, images, fonts)
- * - Preserves page fidelity better than wget/curl
- * - Works with SPAs and dynamically loaded content
- */
-
-const path = require('path');
-const fs = require('fs');
-const { promisify } = require('util');
-const { exec } = require('child_process');
-
-const execAsync = promisify(exec);
-
-// DISABLED: Extension functionality - using single-file-cli only
-// // Import extension utilities
-// const extensionUtils = require('../chrome/chrome_utils.js');
-
-// // Extension metadata
-// const EXTENSION = {
-//     webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
-//     name: 'singlefile',
-// };
-
-// // Get extensions directory from environment or use default
-// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
-//     path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
-
-// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
-//     path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
-
-const OUTPUT_DIR = '.';
-const OUTPUT_FILE = 'singlefile.html';
-
-// DISABLED: Extension functionality - using single-file-cli only
-// /**
-//  * Install the SingleFile extension
-//  */
-// async function installSinglefileExtension() {
-//     console.log('[*] Installing SingleFile extension...');
-
-//     // Install the extension
-//     const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
-
-//     if (!extension) {
-//         console.error('[❌] Failed to install SingleFile extension');
-//         return null;
-//     }
-
-//     console.log('[+] SingleFile extension installed');
-//     console.log('[+] Web pages will be saved as single HTML files');
-
-//     return extension;
-// }
-
-// /**
-//  * Wait for a specified amount of time
-//  */
-// function wait(ms) {
-//     return new Promise(resolve => setTimeout(resolve, ms));
-// }
-
-// /**
-//  * Save a page using the SingleFile extension
-//  *
-//  * @param {Object} page - Puppeteer page object
-//  * @param {Object} extension - Extension metadata with dispatchAction method
-//  * @param {Object} options - Additional options
-//  * @returns {Promise} - Path to saved file or null on failure
-//  */
-// async function saveSinglefileWithExtension(page, extension, options = {}) {
-//     if (!extension || !extension.version) {
-//         throw new Error('SingleFile extension not found or not loaded');
-//     }
-
-//     const url = await page.url();
-
-//     // Check for unsupported URL schemes
-//     const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
-//     const scheme = url.split(':')[0];
-//     if (URL_SCHEMES_IGNORED.includes(scheme)) {
-//         console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
-//         return null;
-//     }
-
-//     // Ensure downloads directory exists
-//     await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
-
-//     // Get list of existing files to ignore
-//     const files_before = new Set(
-//         (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
-//             .filter(fn => fn.endsWith('.html'))
-//     );
-
-//     // Output directory is current directory (hook already runs in output dir)
-//     const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
-
-//     console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
-
-//     // Bring page to front (extension action button acts on foreground tab)
-//     await page.bringToFront();
-
-//     // Trigger the extension's action (toolbar button click)
-//     await extension.dispatchAction();
-
-//     // Wait for file to appear in downloads directory
-//     const check_delay = 3000; // 3 seconds
-//     const max_tries = 10;
-//     let files_new = [];
-
-//     for (let attempt = 0; attempt < max_tries; attempt++) {
-//         await wait(check_delay);
-
-//         const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
-//             .filter(fn => fn.endsWith('.html'));
-
-//         files_new = files_after.filter(file => !files_before.has(file));
-
-//         if (files_new.length === 0) {
-//             continue;
-//         }
-
-//         // Find the matching file by checking if it contains the URL in the HTML header
-//         for (const file of files_new) {
-//             const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
-//             const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
-//             const dl_header = dl_text.split('meta charset')[0];
-
-//             if (dl_header.includes(`url: ${url}`)) {
-//                 console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
-//                 await fs.promises.rename(dl_path, out_path);
-//                 return out_path;
-//             }
-//         }
-//     }
-
-//     console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
-//     console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
-//     return null;
-// }
-
-/**
- * Save a page using single-file-cli (fallback method)
- *
- * @param {string} url - URL to archive
- * @param {Object} options - Additional options
- * @returns {Promise} - Path to saved file or null on failure
- */
-async function saveSinglefileWithCLI(url, options = {}) {
-    console.log('[*] Falling back to single-file-cli...');
-
-    // Find single-file binary
-    let binary = null;
-    try {
-        const { stdout } = await execAsync('which single-file');
-        binary = stdout.trim();
-    } catch (err) {
-        console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
-        return null;
-    }
-
-    // Output directory is current directory (hook already runs in output dir)
-    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
-
-    // Build command
-    const cmd = [
-        binary,
-        '--browser-headless',
-        url,
-        out_path,
-    ];
-
-    // Add optional args
-    if (options.userAgent) {
-        cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
-    }
-    if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
-        cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
-    }
-    if (options.ignoreSSL) {
-        cmd.splice(2, 0, '--browser-ignore-insecure-certs');
-    }
-
-    // Execute
-    try {
-        const timeout = options.timeout || 120000;
-        await execAsync(cmd.join(' '), { timeout });
-
-        if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
-            console.log(`[+] SingleFile saved via CLI: ${out_path}`);
-            return out_path;
-        }
-
-        console.error('[❌] SingleFile CLI completed but no output file found');
-        return null;
-    } catch (err) {
-        console.error(`[❌] SingleFile CLI error: ${err.message}`);
-        return null;
-    }
-}
-
-// DISABLED: Extension functionality - using single-file-cli only
-// /**
-//  * Main entry point - install extension before archiving
-//  */
-// async function main() {
-//     // Check if extension is already cached
-//     const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
-
-//     if (fs.existsSync(cacheFile)) {
-//         try {
-//             const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
-//             const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
-
-//             if (fs.existsSync(manifestPath)) {
-//                 console.log('[*] SingleFile extension already installed (using cache)');
-//                 return cached;
-//             }
-//         } catch (e) {
-//             // Cache file corrupted, re-install
-//             console.warn('[⚠️] Extension cache corrupted, re-installing...');
-//         }
-//     }
-
-//     // Install extension
-//     const extension = await installSinglefileExtension();
-
-//     // Export extension metadata for chrome plugin to load
-//     if (extension) {
-//         // Write extension info to a cache file that chrome plugin can read
-//         await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
-//         await fs.promises.writeFile(
-//             cacheFile,
-//             JSON.stringify(extension, null, 2)
-//         );
-//         console.log(`[+] Extension metadata written to ${cacheFile}`);
-//     }
-
-//     return extension;
-// }
-
-// Export functions for use by other plugins
-module.exports = {
-    // DISABLED: Extension functionality - using single-file-cli only
-    // EXTENSION,
-    // installSinglefileExtension,
-    // saveSinglefileWithExtension,
-    saveSinglefileWithCLI,
-};
-
-// DISABLED: Extension functionality - using single-file-cli only
-// // Run if executed directly
-// if (require.main === module) {
-//     main().then(() => {
-//         console.log('[✓] SingleFile extension setup complete');
-//         process.exit(0);
-//     }).catch(err => {
-//         console.error('[❌] SingleFile extension setup failed:', err);
-//         process.exit(1);
-//     });
-// }
-
-// No-op when run directly (extension install disabled)
-if (require.main === module) {
-    console.log('[*] SingleFile extension install disabled - using single-file-cli only');
-    process.exit(0);
-}
diff --git a/archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js b/archivebox/plugins/twocaptcha/on_Crawl__05_twocaptcha_install.js
similarity index 100%
rename from archivebox/plugins/twocaptcha/on_Crawl__20_install_twocaptcha_extension.js
rename to archivebox/plugins/twocaptcha/on_Crawl__05_twocaptcha_install.js
diff --git a/archivebox/plugins/twocaptcha/on_Crawl__25_configure_twocaptcha_extension_options.js b/archivebox/plugins/twocaptcha/on_Crawl__25_twocaptcha_config.js
similarity index 100%
rename from archivebox/plugins/twocaptcha/on_Crawl__25_configure_twocaptcha_extension_options.js
rename to archivebox/plugins/twocaptcha/on_Crawl__25_twocaptcha_config.js
diff --git a/archivebox/plugins/ublock/on_Crawl__03_ublock.js b/archivebox/plugins/ublock/on_Crawl__03_ublock.js
deleted file mode 100755
index b8a0219c..00000000
--- a/archivebox/plugins/ublock/on_Crawl__03_ublock.js
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/usr/bin/env node
-/**
- * uBlock Origin Extension Plugin
- *
- * Installs and configures the uBlock Origin Chrome extension for ad blocking
- * and privacy protection during page archiving.
- *
- * Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
- *
- * Priority: 03 (early) - Must install before Chrome session starts at Crawl level
- * Hook: on_Crawl (runs once per crawl, not per snapshot)
- *
- * This extension automatically:
- * - Blocks ads, trackers, and malware domains
- * - Reduces page load time and bandwidth usage
- * - Improves privacy during archiving
- * - Removes clutter from archived pages
- * - Uses efficient blocking with filter lists
- */
-
-const path = require('path');
-const fs = require('fs');
-
-// Import extension utilities
-const extensionUtils = require('../chrome/chrome_utils.js');
-
-// Extension metadata
-const EXTENSION = {
-    webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
-    name: 'ublock',
-};
-
-// Get extensions directory from environment or use default
-const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
-    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
-
-/**
- * Install the uBlock Origin extension
- */
-async function installUblockExtension() {
-    console.log('[*] Installing uBlock Origin extension...');
-
-    // Install the extension
-    const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
-
-    if (!extension) {
-        console.error('[❌] Failed to install uBlock Origin extension');
-        return null;
-    }
-
-    console.log('[+] uBlock Origin extension installed');
-    console.log('[+] Ads and trackers will be blocked during archiving');
-
-    return extension;
-}
-
-/**
- * Note: uBlock Origin works automatically with default filter lists.
- * No configuration needed - blocks ads, trackers, and malware domains out of the box.
- */
-
-/**
- * Main entry point - install extension before archiving
- */
-async function main() {
-    // Check if extension is already cached
-    const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');
-
-    if (fs.existsSync(cacheFile)) {
-        try {
-            const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
-            const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
-
-            if (fs.existsSync(manifestPath)) {
-                console.log('[*] uBlock Origin extension already installed (using cache)');
-                return cached;
-            }
-        } catch (e) {
-            // Cache file corrupted, re-install
-            console.warn('[⚠️] Extension cache corrupted, re-installing...');
-        }
-    }
-
-    // Install extension
-    const extension = await installUblockExtension();
-
-    // Export extension metadata for chrome plugin to load
-    if (extension) {
-        // Write extension info to a cache file that chrome plugin can read
-        await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
-        await fs.promises.writeFile(
-            cacheFile,
-            JSON.stringify(extension, null, 2)
-        );
-        console.log(`[+] Extension metadata written to ${cacheFile}`);
-    }
-
-    return extension;
-}
-
-// Export functions for use by other plugins
-module.exports = {
-    EXTENSION,
-    installUblockExtension,
-};
-
-// Run if executed directly
-if (require.main === module) {
-    main().then(() => {
-        console.log('[✓] uBlock Origin extension setup complete');
-        process.exit(0);
-    }).catch(err => {
-        console.error('[❌] uBlock Origin extension setup failed:', err);
-        process.exit(1);
-    });
-}
diff --git a/archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js b/archivebox/plugins/ublock/on_Crawl__03_ublock_install.js
similarity index 100%
rename from archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
rename to archivebox/plugins/ublock/on_Crawl__03_ublock_install.js
diff --git a/archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py b/archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py
deleted file mode 100644
index d3116ed3..00000000
--- a/archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-"""
-Validate and compute derived wget config values.
-
-This hook runs early in the Crawl lifecycle to:
-1. Validate config values with warnings (not hard errors)
-2. Compute derived values (USE_WGET from WGET_ENABLED)
-3. Check binary availability and version
-
-Output:
-    - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
-    - Binary JSONL records to stdout when binaries are found
-"""
-
-import json
-import os
-import shutil
-import subprocess
-import sys
-
-from abx_pkg import Binary, EnvProvider
-
-
-# Read config from environment (already validated by JSONSchema)
-def get_env(name: str, default: str = '') -> str:
-    return os.environ.get(name, default).strip()
-
-def get_env_bool(name: str, default: bool = False) -> bool:
-    val = get_env(name, '').lower()
-    if val in ('true', '1', 'yes', 'on'):
-        return True
-    if val in ('false', '0', 'no', 'off'):
-        return False
-    return default
-
-def get_env_int(name: str, default: int = 0) -> int:
-    try:
-        return int(get_env(name, str(default)))
-    except ValueError:
-        return default
-
-
-def output_binary(binary: Binary, name: str):
-    """Output Binary JSONL record to stdout."""
-    machine_id = os.environ.get('MACHINE_ID', '')
-
-    record = {
-        'type': 'Binary',
-        'name': name,
-        'abspath': str(binary.abspath),
-        'version': str(binary.version) if binary.version else '',
-        'sha256': binary.sha256 or '',
-        'binprovider': 'env',
-        'machine_id': machine_id,
-    }
-    print(json.dumps(record))
-
-
-def main():
-    warnings = []
-    errors = []
-    computed = {}
-
-    # Get config values
-    wget_enabled = get_env_bool('WGET_ENABLED', True)
-    wget_save_warc = get_env_bool('WGET_SAVE_WARC', True)
-    wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
-    wget_binary = get_env('WGET_BINARY', 'wget')
-
-    # Compute derived values (USE_WGET for backward compatibility)
-    use_wget = wget_enabled
-    computed['USE_WGET'] = str(use_wget).lower()
-
-    # Validate timeout with warning (not error)
-    if use_wget and wget_timeout < 20:
-        warnings.append(
-            f"WGET_TIMEOUT={wget_timeout} is very low. "
-            "wget may fail to archive sites if set to less than ~20 seconds. "
-            "Consider setting WGET_TIMEOUT=60 or higher."
-        )
-
-    # Check binary availability using abx-pkg
-    provider = EnvProvider()
-    try:
-        binary = Binary(name=wget_binary, binproviders=[provider]).load()
-        binary_path = str(binary.abspath) if binary.abspath else ''
-    except Exception:
-        binary = None
-        binary_path = ''
-
-    if not binary_path:
-        if use_wget:
-            errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.")
-        computed['WGET_BINARY'] = ''
-    else:
-        computed['WGET_BINARY'] = binary_path
-        wget_version = str(binary.version) if binary.version else 'unknown'
-        computed['WGET_VERSION'] = wget_version
-
-        # Output Binary JSONL record
-        output_binary(binary, name='wget')
-
-    # Check for compression support
-    if computed.get('WGET_BINARY'):
-        try:
-            result = subprocess.run(
-                [computed['WGET_BINARY'], '--compression=auto', '--help'],
-                capture_output=True, timeout=5
-            )
-            computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
-        except Exception:
-            computed['WGET_AUTO_COMPRESSION'] = 'false'
-
-    # Output results
-    # Format: KEY=VALUE lines that hooks.py will parse and add to env
-    for key, value in computed.items():
-        print(f"COMPUTED:{key}={value}")
-
-    for warning in warnings:
-        print(f"WARNING:{warning}", file=sys.stderr)
-
-    for error in errors:
-        print(f"ERROR:{error}", file=sys.stderr)
-
-    # Exit with error if any hard errors
-    sys.exit(1 if errors else 0)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/archivebox/plugins/wget/on_Crawl__10_install_wget.py b/archivebox/plugins/wget/on_Crawl__11_wget_validate.py
similarity index 100%
rename from archivebox/plugins/wget/on_Crawl__10_install_wget.py
rename to archivebox/plugins/wget/on_Crawl__11_wget_validate.py
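
Note for reviewers: the deleted on_Crawl__10_wget_validate_config.py above documents the hook output
contract these on_Crawl hooks rely on — COMPUTED:KEY=VALUE lines on stdout get folded back into the
env, bare JSON lines are Binary JSONL records, and WARNING:/ERROR: lines go to stderr. A minimal
sketch of a consumer follows; the function name run_crawl_hook and the exact parsing rules are
illustrative assumptions, not the actual hooks.py implementation.

import json
import os
import subprocess

def run_crawl_hook(hook_path, env):
    """Hypothetical consumer: run one on_Crawl hook script (assumed executable
    with a shebang) and return (updated_env, binary_records)."""
    result = subprocess.run(
        [hook_path],
        capture_output=True,
        text=True,
        env={**os.environ, **env},
    )
    updated, binaries = dict(env), []
    for line in result.stdout.splitlines():
        if line.startswith('COMPUTED:'):
            # e.g. "COMPUTED:USE_WGET=true" -> env['USE_WGET'] = 'true'
            key, _, value = line[len('COMPUTED:'):].partition('=')
            updated[key] = value
        elif line.startswith('{'):
            # Bare JSON lines are JSONL records; keep the Binary ones
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'Binary':
                binaries.append(record)
    for line in result.stderr.splitlines():
        if line.startswith(('WARNING:', 'ERROR:')):
            print(line)  # surface hook diagnostics to the operator
    if result.returncode != 0:
        # Hooks signal hard config errors via a nonzero exit code
        raise RuntimeError(f'{hook_path} exited with {result.returncode}')
    return updated, binaries

For example, running the (now-deleted) wget validate hook with WGET_ENABLED=true and no wget on PATH
would raise after printing the ERROR: line, while a normal run would return USE_WGET, WGET_BINARY,
WGET_VERSION, and WGET_AUTO_COMPRESSION in updated_env plus one Binary record for wget.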