new jsonl cli interface

This commit is contained in:
Nick Sweeting
2025-12-30 16:12:53 -08:00
parent ba8c28a866
commit dd2302ad92
37 changed files with 2919 additions and 1602 deletions

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"CAPTCHA2_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_CAPTCHA2"],
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
},
"CAPTCHA2_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for CAPTCHA solving in seconds"
}
}
}

View File

@@ -0,0 +1,121 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Plugin
*
* Installs and configures the 2captcha Chrome extension for automatic
* CAPTCHA solving during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
*
* Priority: 01 (early) - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome/chrome_utils.js');
// Extension metadata
const EXTENSION = {
webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
name: 'twocaptcha',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
* Install and configure the 2captcha extension
*/
async function installCaptchaExtension() {
console.log('[*] Installing 2captcha extension...');
// Install the extension
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
if (!extension) {
console.error('[❌] Failed to install 2captcha extension');
return null;
}
// Check if API key is configured
const apiKey = process.env.API_KEY_2CAPTCHA;
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
} else {
console.log('[+] 2captcha extension installed and API key configured');
}
return extension;
}
/**
* Note: 2captcha configuration is now handled by chrome plugin
* during first-time browser setup to avoid repeated configuration on every snapshot.
* The API key is injected via chrome.storage API once per browser session.
*/
/**
* Main entry point - install extension before archiving
*/
async function main() {
// Check if extension is already cached
const cacheFile = path.join(EXTENSIONS_DIR, 'twocaptcha.extension.json');
if (fs.existsSync(cacheFile)) {
try {
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
console.log('[*] 2captcha extension already installed (using cache)');
return cached;
}
} catch (e) {
// Cache file corrupted, re-install
console.warn('[⚠️] Extension cache corrupted, re-installing...');
}
}
// Install extension
const extension = await installCaptchaExtension();
// Export extension metadata for chrome plugin to load
if (extension) {
// Write extension info to a cache file that chrome plugin can read
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
await fs.promises.writeFile(
cacheFile,
JSON.stringify(extension, null, 2)
);
console.log(`[+] Extension metadata written to ${cacheFile}`);
}
return extension;
}
// Export functions for use by other plugins
module.exports = {
EXTENSION,
installCaptchaExtension,
};
// Run if executed directly
if (require.main === module) {
main().then(() => {
console.log('[✓] 2captcha extension setup complete');
process.exit(0);
}).catch(err => {
console.error('[❌] 2captcha extension setup failed:', err);
process.exit(1);
});
}

View File

@@ -0,0 +1,279 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Configuration
*
* Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
* Runs once per crawl to inject API key into extension storage.
*
* Priority: 11 (after chrome_launch at 20)
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - chrome plugin must have loaded extensions (extensions.json must exist)
*/
const path = require('path');
const fs = require('fs');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Get crawl's chrome directory from environment variable set by hooks.py
function getCrawlChromeSessionDir() {
const crawlOutputDir = process.env.CRAWL_OUTPUT_DIR || '';
if (!crawlOutputDir) {
return null;
}
return path.join(crawlOutputDir, 'chrome');
}
const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.twocaptcha_configured');
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
// Parse command line arguments
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
if (arg.startsWith('--')) {
const [key, ...valueParts] = arg.slice(2).split('=');
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
}
});
return args;
}
async function configure2Captcha() {
// Check if already configured in this session
if (fs.existsSync(CONFIG_MARKER)) {
console.error('[*] 2captcha already configured in this browser session');
return { success: true, skipped: true };
}
// Check if API key is set
const apiKey = getEnv('API_KEY_2CAPTCHA');
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
}
// Load extensions metadata
const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
if (!fs.existsSync(extensionsFile)) {
return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
}
const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
if (!captchaExt) {
console.error('[*] 2captcha extension not installed, skipping configuration');
return { success: true, skipped: true };
}
console.error('[*] Configuring 2captcha extension with API key...');
try {
// Connect to the existing Chrome session via CDP
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
if (!fs.existsSync(cdpFile)) {
return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
}
const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
try {
// Method 1: Try to inject via extension background page
if (captchaExt.target && captchaExt.target_ctx) {
console.error('[*] Attempting to configure via extension background page...');
// Reconnect to the browser to get fresh target context
const targets = await browser.targets();
const extTarget = targets.find(t =>
t.url().startsWith(`chrome-extension://${captchaExt.id}`)
);
if (extTarget) {
const extContext = await extTarget.worker() || await extTarget.page();
if (extContext) {
await extContext.evaluate((key) => {
// Try all common storage patterns
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({
apiKey: key,
api_key: key,
'2captcha_apikey': key,
apikey: key,
'solver-api-key': key,
});
chrome.storage.sync.set({
apiKey: key,
api_key: key,
'2captcha_apikey': key,
apikey: key,
'solver-api-key': key,
});
}
// Also try localStorage as fallback
if (typeof localStorage !== 'undefined') {
localStorage.setItem('apiKey', key);
localStorage.setItem('2captcha_apikey', key);
localStorage.setItem('solver-api-key', key);
}
}, apiKey);
console.error('[+] 2captcha API key configured successfully via background page');
// Mark as configured
fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
return { success: true, method: 'background_page' };
}
}
}
// Method 2: Try to configure via options page
console.error('[*] Attempting to configure via options page...');
const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
const configPage = await browser.newPage();
try {
await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
const configured = await configPage.evaluate((key) => {
// Try to find API key input field
const selectors = [
'input[name*="apikey" i]',
'input[id*="apikey" i]',
'input[name*="api-key" i]',
'input[id*="api-key" i]',
'input[name*="key" i]',
'input[placeholder*="api" i]',
'input[type="text"]',
];
for (const selector of selectors) {
const input = document.querySelector(selector);
if (input) {
input.value = key;
input.dispatchEvent(new Event('input', { bubbles: true }));
input.dispatchEvent(new Event('change', { bubbles: true }));
// Try to find and click save button
const saveSelectors = [
'button[type="submit"]',
'input[type="submit"]',
'button:contains("Save")',
'button:contains("Apply")',
];
for (const btnSel of saveSelectors) {
const btn = document.querySelector(btnSel);
if (btn) {
btn.click();
break;
}
}
// Also save to storage
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
}
return true;
}
}
// Fallback: Just save to storage
if (typeof chrome !== 'undefined' && chrome.storage) {
chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
return true;
}
return false;
}, apiKey);
await configPage.close();
if (configured) {
console.error('[+] 2captcha API key configured successfully via options page');
// Mark as configured
fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
return { success: true, method: 'options_page' };
}
} catch (e) {
console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
try {
await configPage.close();
} catch (e2) {}
}
return { success: false, error: 'Could not configure via any method' };
} finally {
browser.disconnect();
}
} catch (e) {
return { success: false, error: `${e.name}: ${e.message}` };
}
}
async function main() {
const args = parseArgs();
const url = args.url;
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__21_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}
const startTs = new Date();
let status = 'failed';
let error = '';
try {
const result = await configure2Captcha();
if (result.skipped) {
status = 'skipped';
} else if (result.success) {
status = 'succeeded';
} else {
status = 'failed';
error = result.error || 'Configuration failed';
}
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
}
const endTs = new Date();
const duration = (endTs - startTs) / 1000;
if (error) {
console.error(`ERROR: ${error}`);
}
// Config hooks don't emit JSONL - they're utility hooks for setup
// Exit code indicates success/failure
process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
}
main().catch(e => {
console.error(`Fatal error: ${e.message}`);
process.exit(1);
});

View File

@@ -0,0 +1,184 @@
"""
Unit tests for twocaptcha plugin
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_twocaptcha_extension.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_configure_twocaptcha_extension_options.*'), None)
def test_install_script_exists():
"""Verify install script exists"""
assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_config_script_exists():
"""Verify config script exists"""
assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
def test_extension_metadata():
"""Test that twocaptcha extension has correct metadata"""
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
# Just check the script can be loaded
result = subprocess.run(
["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
capture_output=True,
text=True,
env=env
)
assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
metadata = json.loads(result.stdout)
assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
assert metadata["name"] == "twocaptcha"
def test_install_creates_cache():
"""Test that install creates extension cache"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
env["API_KEY_2CAPTCHA"] = "test_api_key"
# Run install script
result = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
# Check output mentions installation
assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout
# Check cache file was created
cache_file = ext_dir / "twocaptcha.extension.json"
assert cache_file.exists(), "Cache file should be created"
# Verify cache content
cache_data = json.loads(cache_file.read_text())
assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
assert cache_data["name"] == "twocaptcha"
assert "unpacked_path" in cache_data
assert "version" in cache_data
def test_install_twice_uses_cache():
"""Test that running install twice uses existing cache on second run"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
env["API_KEY_2CAPTCHA"] = "test_api_key"
# First install - downloads the extension
result1 = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
assert result1.returncode == 0, f"First install failed: {result1.stderr}"
# Verify cache was created
cache_file = ext_dir / "twocaptcha.extension.json"
assert cache_file.exists(), "Cache file should exist after first install"
# Second install - should use cache
result2 = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=30
)
assert result2.returncode == 0, f"Second install failed: {result2.stderr}"
# Second run should mention cache reuse
assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0
def test_install_warns_without_api_key():
"""Test that install warns when API key not configured"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
# Don't set API_KEY_2CAPTCHA
# Run install script
result = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
# Should warn about missing API key
combined_output = result.stdout + result.stderr
assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
def test_install_success_with_api_key():
"""Test that install succeeds when API key is configured"""
with tempfile.TemporaryDirectory() as tmpdir:
ext_dir = Path(tmpdir) / "chrome_extensions"
ext_dir.mkdir(parents=True)
env = os.environ.copy()
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123"
# Run install script
result = subprocess.run(
["node", str(INSTALL_SCRIPT)],
capture_output=True,
text=True,
env=env,
timeout=60
)
# Should mention API key configured
combined_output = result.stdout + result.stderr
assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
def test_config_script_structure():
"""Test that config script has proper structure"""
# Verify the script exists and contains expected markers
script_content = CONFIG_SCRIPT.read_text()
# Should mention configuration marker file
assert "CONFIG_MARKER" in script_content or "twocaptcha_configured" in script_content
# Should mention API key
assert "API_KEY_2CAPTCHA" in script_content
# Should have main function or be executable
assert "async function" in script_content or "main" in script_content