mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-02 17:05:38 +10:00
Clean up on_Crawl hooks and remove dead code (#1751)
Deleted dead/duplicate hooks:
- wget/on_Crawl__10_install_wget.py (duplicate of
__10_wget_validate_config.py)
- chrome/on_Crawl__00_chrome_install.py (simpler version, kept full one)
- chrome/on_Crawl__20_chrome_launch.bg.js (legacy, kept __30 version)
- singlefile/on_Crawl__20_install_singlefile_extension.js
(disabled/dead)
- istilldontcareaboutcookies/on_Crawl__20_install_*.js (legacy)
- ublock/on_Crawl__03_ublock.js (legacy, kept __20 version)
- Entire captcha2/ plugin (legacy version of twocaptcha/)
Renamed hooks to follow consistent pattern:
on_Crawl__XX_<plugin>_<action>.<ext>
Priority bands:
00-09: Binary/extension installation 10-19: Config validation 20-29:
Browser launch and post-launch config
Final hooks:
00 ripgrep_install.py, 01 chrome_install.py 02
istilldontcareaboutcookies_install.js 03 ublock_install.js, 04
singlefile_install.js 05 twocaptcha_install.js 10 chrome_validate.py, 11
wget_validate.py 20 chrome_launch.bg.js, 25 twocaptcha_config.js
<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->
# Summary
<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->
# Related issues
<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->
# Changes these areas
- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Cleaned up Crawl-level hooks by removing legacy/duplicate code and
standardizing hook names and priorities. Chrome launch is now a single,
updated hook with better extension detection and cleaner outputs.
- **Refactors**
- Removed dead hooks (legacy chrome install/launch, singlefile
extension, old ublock/cookies scripts, duplicate wget validate) and the
legacy captcha2 plugin in favor of twocaptcha.
- Renamed hooks to on_Crawl__XX_<plugin>_<action> with priority bands:
00-09 install, 10-19 validate, 20-29 launch/config.
- Consolidated Chrome launch into on_Crawl__20_chrome_launch.bg.js;
writes outputs to the current dir, resolves real extension IDs via
chrome://extensions, and records extensions.json after verification.
- **Migration**
- If you used captcha2, switch to the twocaptcha hooks
(on_Crawl__05_twocaptcha_install.js and
on_Crawl__25_twocaptcha_config.js).
- Update any docs/scripts that reference old hook filenames.
<sup>Written for commit 4c77949197.
Summary will update on new commits.</sup>
<!-- End of auto-generated description by cubic. -->
This commit is contained in:
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"CAPTCHA2_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["USE_CAPTCHA2"],
|
||||
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
|
||||
},
|
||||
"CAPTCHA2_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for CAPTCHA solving in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,121 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* 2Captcha Extension Plugin
|
||||
*
|
||||
* Installs and configures the 2captcha Chrome extension for automatic
|
||||
* CAPTCHA solving during page archiving.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
|
||||
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
|
||||
*
|
||||
* Priority: 01 (early) - Must install before Chrome session starts at Crawl level
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* Requirements:
|
||||
* - API_KEY_2CAPTCHA environment variable must be set
|
||||
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
|
||||
name: 'captcha2',
|
||||
};
|
||||
|
||||
// Get extensions directory from environment or use default
|
||||
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
/**
|
||||
* Install and configure the 2captcha extension
|
||||
*/
|
||||
/**
 * Download/install the 2captcha extension and report API-key status.
 *
 * Returns the extension metadata object produced by
 * extensionUtils.loadOrInstallExtension(), or null when installation fails.
 * A missing API key is a configuration problem, not an install failure,
 * so it only produces warnings.
 */
async function installCaptchaExtension() {
  console.log('[*] Installing 2captcha extension...');

  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!installed) {
    console.error('[❌] Failed to install 2captcha extension');
    return null;
  }

  // Placeholder value counts as "not configured".
  const apiKey = process.env.API_KEY_2CAPTCHA;
  const keyConfigured = Boolean(apiKey) && apiKey !== 'YOUR_API_KEY_HERE';
  if (keyConfigured) {
    console.log('[+] 2captcha extension installed and API key configured');
  } else {
    console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
  }

  return installed;
}
|
||||
|
||||
/**
|
||||
* Note: 2captcha configuration is now handled by chrome plugin
|
||||
* during first-time browser setup to avoid repeated configuration on every snapshot.
|
||||
* The API key is injected via chrome.storage API once per browser session.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Main entry point - install extension before archiving
|
||||
*/
|
||||
/**
 * Entry point: reuse a previously cached extension install when its manifest
 * is still on disk, otherwise (re)install and write the metadata cache file
 * that the chrome plugin reads later.
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');

  // Fast path: a previous run left a valid cache entry behind.
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        console.log('[*] 2captcha extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Unreadable/invalid cache JSON - fall through and reinstall.
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }

  const extension = await installCaptchaExtension();

  // Persist metadata so the chrome plugin can load the extension later.
  if (extension) {
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(cacheFile, JSON.stringify(extension, null, 2));
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }

  return extension;
}
|
||||
|
||||
// Export functions for use by other plugins
|
||||
// Export metadata and installer for reuse by other plugins (e.g. chrome).
module.exports = { EXTENSION, installCaptchaExtension };
|
||||
|
||||
// Run if executed directly
|
||||
// Run if executed directly.
if (require.main === module) {
  main().then((extension) => {
    // FIX: previously this exited 0 unconditionally; main() resolves to null
    // when installation failed, so surface that through the exit code the
    // same way the config hook does.
    if (!extension) {
      console.error('[❌] 2captcha extension setup failed:', 'extension was not installed');
      process.exit(1);
    }
    console.log('[✓] 2captcha extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] 2captcha extension setup failed:', err);
    process.exit(1);
  });
}
|
||||
@@ -1,279 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* 2Captcha Extension Configuration
|
||||
*
|
||||
* Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
|
||||
* Runs once per crawl to inject API key into extension storage.
|
||||
*
|
||||
 * Priority: 21 (after chrome_launch at 20)
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* Requirements:
|
||||
* - API_KEY_2CAPTCHA environment variable must be set
|
||||
* - chrome plugin must have loaded extensions (extensions.json must exist)
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Get crawl's chrome directory from environment variable set by hooks.py
|
||||
// Resolve the crawl's chrome/ session directory from CRAWL_OUTPUT_DIR
// (exported by hooks.py); returns null when the variable is unset or empty.
function getCrawlChromeSessionDir() {
  const crawlOutputDir = process.env.CRAWL_OUTPUT_DIR || '';
  return crawlOutputDir ? path.join(crawlOutputDir, 'chrome') : null;
}
|
||||
|
||||
const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
|
||||
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured');
|
||||
|
||||
// Get environment variable with default
|
||||
// Read an environment variable, falling back to defaultValue when the
// variable is unset OR empty (|| semantics), and trim surrounding whitespace.
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
|
||||
|
||||
// Parse command line arguments
|
||||
// Parse --key=value / --flag style CLI arguments into a plain object,
// normalizing dashes in keys to underscores (--snapshot-id -> snapshot_id).
// Value-less flags become `true`; values may themselves contain '='.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...valueParts] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = valueParts.join('=') || true;
  }
  return parsed;
}
|
||||
|
||||
/**
 * Configure the already-loaded 2captcha extension with the user's API key.
 *
 * Preconditions (produced by earlier hooks in the same crawl, under
 * CHROME_SESSION_DIR):
 *   - extensions.json  - extension metadata written by the chrome launch hook
 *   - cdp_url.txt      - CDP websocket URL of the live browser session
 *
 * Returns one of:
 *   { success: true,  skipped: true }   - marker present / extension absent
 *   { success: true,  method: '...' }   - 'background_page' or 'options_page'
 *   { success: false, error: '...' }    - could not configure
 *
 * On success a marker file (CONFIG_MARKER) is written so reruns within the
 * same browser session short-circuit.
 */
async function configure2Captcha() {
  // Check if already configured in this session (marker-file short-circuit)
  if (fs.existsSync(CONFIG_MARKER)) {
    console.error('[*] 2captcha already configured in this browser session');
    return { success: true, skipped: true };
  }

  // Check if API key is set; the placeholder value counts as unset
  const apiKey = getEnv('API_KEY_2CAPTCHA');
  if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
    console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
    return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
  }

  // Load extensions metadata recorded by the chrome launch hook
  const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
  if (!fs.existsSync(extensionsFile)) {
    return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
  }

  const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
  const captchaExt = extensions.find(ext => ext.name === 'captcha2');

  if (!captchaExt) {
    // Absence of the extension is not an error - nothing to configure
    console.error('[*] 2captcha extension not installed, skipping configuration');
    return { success: true, skipped: true };
  }

  console.error('[*] Configuring 2captcha extension with API key...');

  try {
    // Connect to the existing Chrome session via CDP
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) {
      return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
    }

    const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

    try {
      // Method 1: Try to inject via extension background page / service worker
      if (captchaExt.target && captchaExt.target_ctx) {
        console.error('[*] Attempting to configure via extension background page...');

        // Look the target up again on this connection to get a fresh context
        const targets = await browser.targets();
        const extTarget = targets.find(t =>
          t.url().startsWith(`chrome-extension://${captchaExt.id}`)
        );

        if (extTarget) {
          // worker() covers service-worker targets, page() background pages
          const extContext = await extTarget.worker() || await extTarget.page();

          if (extContext) {
            await extContext.evaluate((key) => {
              // Try all common storage key spellings used by 2captcha builds
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
                chrome.storage.sync.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
              }

              // Also try localStorage as fallback
              if (typeof localStorage !== 'undefined') {
                localStorage.setItem('apiKey', key);
                localStorage.setItem('2captcha_apikey', key);
                localStorage.setItem('solver-api-key', key);
              }
            }, apiKey);

            console.error('[+] 2captcha API key configured successfully via background page');

            // Mark as configured so later runs in this session skip
            fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

            return { success: true, method: 'background_page' };
          }
        }
      }

      // Method 2: Try to configure via the extension's options page UI
      console.error('[*] Attempting to configure via options page...');
      const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
      const configPage = await browser.newPage();

      try {
        await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });

        const configured = await configPage.evaluate((key) => {
          // Try to find an API key input field (broadest selector last)
          const selectors = [
            'input[name*="apikey" i]',
            'input[id*="apikey" i]',
            'input[name*="api-key" i]',
            'input[id*="api-key" i]',
            'input[name*="key" i]',
            'input[placeholder*="api" i]',
            'input[type="text"]',
          ];

          for (const selector of selectors) {
            const input = document.querySelector(selector);
            if (input) {
              input.value = key;
              input.dispatchEvent(new Event('input', { bubbles: true }));
              input.dispatchEvent(new Event('change', { bubbles: true }));

              // Try to find and click a save button.
              // NOTE(review): ':contains()' is not a valid CSS selector -
              // querySelector throws a SyntaxError when those entries are
              // reached, which rejects this evaluate() and falls into the
              // catch below. Confirm and replace with a textContent scan.
              const saveSelectors = [
                'button[type="submit"]',
                'input[type="submit"]',
                'button:contains("Save")',
                'button:contains("Apply")',
              ];

              for (const btnSel of saveSelectors) {
                const btn = document.querySelector(btnSel);
                if (btn) {
                  btn.click();
                  break;
                }
              }

              // Also save to storage in case the form alone is not enough
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
              }

              return true;
            }
          }

          // Fallback: no input field found - just save to storage
          if (typeof chrome !== 'undefined' && chrome.storage) {
            chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            return true;
          }

          return false;
        }, apiKey);

        await configPage.close();

        if (configured) {
          console.error('[+] 2captcha API key configured successfully via options page');

          // Mark as configured
          fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

          return { success: true, method: 'options_page' };
        }
      } catch (e) {
        console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
        try {
          await configPage.close();
        } catch (e2) {}
      }

      return { success: false, error: 'Could not configure via any method' };
    } finally {
      // Detach only - the browser session is shared with other hooks
      browser.disconnect();
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
|
||||
|
||||
async function main() {
|
||||
const args = parseArgs();
|
||||
const url = args.url;
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__21_captcha2_config.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const startTs = new Date();
|
||||
let status = 'failed';
|
||||
let error = '';
|
||||
|
||||
try {
|
||||
const result = await configure2Captcha();
|
||||
|
||||
if (result.skipped) {
|
||||
status = 'skipped';
|
||||
} else if (result.success) {
|
||||
status = 'succeeded';
|
||||
} else {
|
||||
status = 'failed';
|
||||
error = result.error || 'Configuration failed';
|
||||
}
|
||||
} catch (e) {
|
||||
error = `${e.name}: ${e.message}`;
|
||||
status = 'failed';
|
||||
}
|
||||
|
||||
const endTs = new Date();
|
||||
const duration = (endTs - startTs) / 1000;
|
||||
|
||||
if (error) {
|
||||
console.error(`ERROR: ${error}`);
|
||||
}
|
||||
|
||||
// Config hooks don't emit JSONL - they're utility hooks for setup
|
||||
// Exit code indicates success/failure
|
||||
|
||||
process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
|
||||
}
|
||||
|
||||
// Top-level launch: any unhandled rejection is fatal for the hook.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});
|
||||
@@ -1,184 +0,0 @@
|
||||
"""
|
||||
Unit tests for captcha2 plugin
|
||||
|
||||
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
|
||||
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
|
||||
|
||||
|
||||
def test_install_script_exists():
    """Verify the install hook script exists in the plugin directory."""
    # FIX: next(glob, None) can yield None, in which case .exists() raised
    # AttributeError instead of a clear assertion failure.
    assert INSTALL_SCRIPT is not None, "Install script not found: no on_Crawl__*_captcha2.* in plugin dir"
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
|
||||
|
||||
|
||||
def test_config_script_exists():
    """Verify the config hook script exists in the plugin directory."""
    # FIX: next(glob, None) can yield None, in which case .exists() raised
    # AttributeError instead of a clear assertion failure.
    assert CONFIG_SCRIPT is not None, "Config script not found: no on_Crawl__*_captcha2_config.* in plugin dir"
    assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
|
||||
|
||||
|
||||
def test_extension_metadata():
    """Require() the install script under node and check that its exported
    EXTENSION metadata carries the expected webstore id and name."""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        # Point the script at a throwaway dir so it cannot touch real data.
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        # Just check the script can be loaded (require must not trigger main()).
        # NOTE(review): INSTALL_SCRIPT is interpolated into a JS string literal;
        # backslashes in Windows paths would break it - confirm POSIX-only CI.
        result = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env
        )

        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"

        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert metadata["name"] == "captcha2"
|
||||
|
||||
|
||||
def test_install_creates_cache():
    """Run the install hook end-to-end and verify it writes
    captcha2.extension.json with the expected metadata fields.

    NOTE(review): needs `node` on PATH and (on a cold cache) network access to
    download the extension - confirm CI provides both.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_api_key"

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Check output mentions installation (fresh install or cache hit)
        assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout

        # Check cache file was created
        cache_file = ext_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        # Verify cache content matches the EXTENSION metadata in the script
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert cache_data["name"] == "captcha2"
        assert "unpacked_path" in cache_data
        assert "version" in cache_data
|
||||
|
||||
|
||||
def test_install_twice_uses_cache():
    """Run the install hook twice; the second run must hit the on-disk cache.

    The first run downloads the extension and writes captcha2.extension.json;
    the second must report the cached install instead of re-downloading.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_api_key"

        # First install - downloads the extension
        result1 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        assert result1.returncode == 0, f"First install failed: {result1.stderr}"

        # Verify cache was created
        cache_file = ext_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should exist after first install"

        # Second install - should use cache
        result2 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"

        # FIX: the assertion previously ended with `or result2.returncode == 0`,
        # which was already asserted true above, making the whole check vacuous.
        # The second run must actually mention cache reuse.
        assert "already installed" in result2.stdout or "cache" in result2.stdout.lower()
|
||||
|
||||
|
||||
def test_install_warns_without_api_key():
    """Run the install hook with no API key and verify it prints a warning."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # FIX: os.environ.copy() inherits a host-exported API_KEY_2CAPTCHA,
        # which would silently invalidate this test - remove it explicitly.
        env.pop("API_KEY_2CAPTCHA", None)

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should warn about missing API key (warnings may land on either stream)
        combined_output = result.stdout + result.stderr
        assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
|
||||
|
||||
|
||||
def test_install_success_with_api_key():
    """Run the install hook with an API key set and verify the output
    acknowledges the configured key."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123"

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should mention API key configured (either stream)
        combined_output = result.stdout + result.stderr
        assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
|
||||
|
||||
|
||||
def test_config_script_structure():
    """Static sanity check: the config script contains the markers the rest
    of the plugin relies on (no node execution required)."""
    # Verify the script exists and contains expected markers
    script_content = CONFIG_SCRIPT.read_text()

    # Should mention the configuration marker file used for idempotency
    assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content

    # Should mention the API key env var
    assert "API_KEY_2CAPTCHA" in script_content

    # Should have a main function or be executable
    assert "async function" in script_content or "main" in script_content
|
||||
@@ -1,184 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install hook for Chrome/Chromium and puppeteer-core.
|
||||
|
||||
Runs at crawl start to install/find Chromium and puppeteer-core.
|
||||
Outputs JSONL for Binary and Machine config updates.
|
||||
Respects CHROME_BINARY env var for custom binary paths.
|
||||
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
|
||||
|
||||
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
--load-extension and --disable-extensions-except flags, which are needed for
|
||||
loading unpacked extensions in headless mode.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_chrome_version(binary_path: str) -> str | None:
|
||||
"""Get Chrome/Chromium version string."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[binary_path, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5
|
||||
)
|
||||
if result.returncode == 0:
|
||||
return result.stdout.strip()
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def install_puppeteer_core() -> bool:
    """Ensure puppeteer-core (and @puppeteer/browsers) exist under NODE_MODULES_DIR.

    Returns True when already installed or when NODE_MODULES_DIR is unset
    (global node_modules will be used); False when the npm install fails.
    """
    node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
    if not node_modules_dir:
        # No isolated node_modules configured - fall back to globals.
        return True

    node_modules_path = Path(node_modules_dir)
    if (node_modules_path / 'puppeteer-core').exists():
        return True

    # npm installs into <prefix>/node_modules, so the prefix is the parent dir.
    npm_prefix = node_modules_path.parent
    print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
    try:
        proc = subprocess.run(
            ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except Exception as e:
        print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
        return False
    if proc.returncode != 0:
        print(f"[!] Failed to install puppeteer-core: {proc.stderr}", file=sys.stderr)
        return False
    print(f"[+] puppeteer-core installed", file=sys.stderr)
    return True
|
||||
|
||||
|
||||
def install_chromium() -> dict | None:
    """Install Chromium using @puppeteer/browsers and parse output for binary path.

    Output format: "chromium@<version> <path_to_binary>"
    e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"

    Note: npx is fast when chromium is already cached - it returns the path without re-downloading.

    Returns a dict with name/abspath/version/binprovider keys, or None on any
    failure (non-zero exit, unparseable output, missing binary, timeout, or
    npx not installed).
    """
    try:
        print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)

        # Use --path to install to puppeteer's standard cache location
        cache_path = os.path.expanduser('~/.cache/puppeteer')

        # Pinned revision; stdin is closed so npx cannot hang on a prompt.
        result = subprocess.run(
            ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
            capture_output=True,
            text=True,
            stdin=subprocess.DEVNULL,
            timeout=300
        )

        if result.returncode != 0:
            print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
            return None

        # Parse output: "chromium@1563294 /path/to/Chromium"
        # NOTE(review): assumes the "<name>@<rev> <path>" line is the entire
        # stdout; if npx ever prints extra lines first, this parse would need
        # to target the last line instead - confirm against npx behavior.
        output = result.stdout.strip()
        parts = output.split(' ', 1)
        if len(parts) != 2:
            print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
            return None

        version_str = parts[0]  # "chromium@1563294"
        binary_path = parts[1].strip()

        if not binary_path or not os.path.exists(binary_path):
            print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
            return None

        # Extract the revision number after the '@'
        version = version_str.split('@')[1] if '@' in version_str else None

        print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)

        return {
            'name': 'chromium',
            'abspath': binary_path,
            'version': version,
            'binprovider': 'puppeteer',
        }

    except subprocess.TimeoutExpired:
        print("[!] Chromium install timed out", file=sys.stderr)
    except FileNotFoundError:
        print("[!] npx not found - is Node.js installed?", file=sys.stderr)
    except Exception as e:
        print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)

    # All error paths fall through to here
    return None
|
||||
|
||||
|
||||
def main():
    """Emit JSONL records describing the Chromium binary for this machine.

    Order of operations:
      1. Best-effort install of puppeteer-core (failure is non-fatal).
      2. If CHROME_BINARY points at an executable file, report it with
         binprovider 'env' and exit 0.
      3. Otherwise install/locate Chromium via @puppeteer/browsers, emit the
         Binary record plus Machine config updates, and exit 0; exit 1 if no
         binary could be found.
    """
    # Install puppeteer-core if NODE_MODULES_DIR is set (result intentionally ignored)
    install_puppeteer_core()

    # Check if CHROME_BINARY is already set and points at an executable file
    configured_binary = os.environ.get('CHROME_BINARY', '').strip()
    if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
        version = get_chrome_version(configured_binary)
        print(json.dumps({
            'type': 'Binary',
            'name': 'chromium',
            'abspath': configured_binary,
            'version': version,
            'binprovider': 'env',
        }))
        sys.exit(0)

    # Install/find Chromium via puppeteer
    result = install_chromium()

    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'Binary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'binprovider': result['binprovider'],
        }))

        # Persist the discovered path so later hooks read it from config
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/CHROME_BINARY',
            'value': result['abspath'],
        }))

        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/CHROMIUM_VERSION',
                'value': result['version'],
            }))

        sys.exit(0)
    else:
        print("Chromium binary not found", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -8,8 +8,8 @@
|
||||
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
* --load-extension and --disable-extensions-except flags.
|
||||
*
|
||||
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Output: Creates chrome/ directory under crawl output dir with:
|
||||
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Output: Writes to current directory (executor creates chrome/ dir):
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - chrome.pid: Chromium process ID (for cleanup)
|
||||
* - port.txt: Debug port number
|
||||
@@ -38,11 +38,12 @@ const {
|
||||
killChrome,
|
||||
getEnv,
|
||||
writePidWithMtime,
|
||||
getExtensionsDir,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'chrome_launch';
|
||||
const OUTPUT_DIR = 'chrome';
|
||||
const OUTPUT_DIR = '.';
|
||||
|
||||
// Global state for cleanup
|
||||
let chromePid = null;
|
||||
@@ -115,8 +116,12 @@ async function main() {
|
||||
if (version) console.error(`[*] Version: ${version}`);
|
||||
|
||||
// Load installed extensions
|
||||
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
|
||||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
|
||||
const extensionsDir = getExtensionsDir();
|
||||
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
|
||||
|
||||
if (userDataDir) {
|
||||
console.error(`[*] Using user data dir: ${userDataDir}`);
|
||||
}
|
||||
|
||||
const installedExtensions = [];
|
||||
const extensionPaths = [];
|
||||
@@ -143,17 +148,18 @@ async function main() {
|
||||
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
|
||||
}
|
||||
|
||||
// Write hook's own PID
|
||||
const hookStartTime = Date.now() / 1000;
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
||||
|
||||
// Launch Chromium using consolidated function
|
||||
// userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
|
||||
const result = await launchChromium({
|
||||
binary,
|
||||
outputDir: OUTPUT_DIR,
|
||||
userDataDir,
|
||||
extensionPaths,
|
||||
});
|
||||
|
||||
@@ -165,14 +171,6 @@ async function main() {
|
||||
chromePid = result.pid;
|
||||
const cdpUrl = result.cdpUrl;
|
||||
|
||||
// Write extensions metadata
|
||||
if (installedExtensions.length > 0) {
|
||||
fs.writeFileSync(
|
||||
path.join(OUTPUT_DIR, 'extensions.json'),
|
||||
JSON.stringify(installedExtensions, null, 2)
|
||||
);
|
||||
}
|
||||
|
||||
// Connect puppeteer for extension verification
|
||||
console.error(`[*] Connecting puppeteer to CDP...`);
|
||||
const browser = await puppeteer.connect({
|
||||
@@ -181,30 +179,102 @@ async function main() {
|
||||
});
|
||||
browserInstance = browser;
|
||||
|
||||
// Verify extensions loaded
|
||||
// Get actual extension IDs from chrome://extensions page
|
||||
if (extensionPaths.length > 0) {
|
||||
await new Promise(r => setTimeout(r, 3000));
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
const targets = browser.targets();
|
||||
console.error(`[*] All browser targets (${targets.length}):`);
|
||||
for (const t of targets) {
|
||||
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
|
||||
try {
|
||||
const extPage = await browser.newPage();
|
||||
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
// Parse extension info from the page
|
||||
const extensionsFromPage = await extPage.evaluate(() => {
|
||||
const extensions = [];
|
||||
// Extensions manager uses shadow DOM
|
||||
const manager = document.querySelector('extensions-manager');
|
||||
if (!manager || !manager.shadowRoot) return extensions;
|
||||
|
||||
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
|
||||
if (!itemList || !itemList.shadowRoot) return extensions;
|
||||
|
||||
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
|
||||
for (const item of items) {
|
||||
const id = item.getAttribute('id');
|
||||
const nameEl = item.shadowRoot?.querySelector('#name');
|
||||
const name = nameEl?.textContent?.trim() || '';
|
||||
if (id && name) {
|
||||
extensions.push({ id, name });
|
||||
}
|
||||
}
|
||||
return extensions;
|
||||
});
|
||||
|
||||
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
|
||||
for (const e of extensionsFromPage) {
|
||||
console.error(` - ${e.id}: "${e.name}"`);
|
||||
}
|
||||
|
||||
// Match extensions by name (strict matching)
|
||||
for (const ext of installedExtensions) {
|
||||
// Read the extension's manifest to get its display name
|
||||
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
|
||||
if (fs.existsSync(manifestPath)) {
|
||||
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
|
||||
let manifestName = manifest.name || '';
|
||||
|
||||
// Resolve message placeholder (e.g., __MSG_extName__)
|
||||
if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
|
||||
const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
|
||||
const defaultLocale = manifest.default_locale || 'en';
|
||||
const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
|
||||
if (fs.existsSync(messagesPath)) {
|
||||
try {
|
||||
const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
|
||||
if (messages[msgKey] && messages[msgKey].message) {
|
||||
manifestName = messages[msgKey].message;
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to read messages.json: ${e.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
|
||||
|
||||
// Find matching extension from page by exact name match first
|
||||
let match = extensionsFromPage.find(e => e.name === manifestName);
|
||||
|
||||
// If no exact match, try case-insensitive exact match
|
||||
if (!match) {
|
||||
match = extensionsFromPage.find(e =>
|
||||
e.name.toLowerCase() === manifestName.toLowerCase()
|
||||
);
|
||||
}
|
||||
|
||||
if (match) {
|
||||
ext.id = match.id;
|
||||
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
|
||||
} else {
|
||||
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await extPage.close();
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
|
||||
}
|
||||
|
||||
const extTargets = targets.filter(t =>
|
||||
t.url().startsWith('chrome-extension://') ||
|
||||
t.type() === 'service_worker' ||
|
||||
t.type() === 'background_page'
|
||||
);
|
||||
|
||||
// Filter out built-in extensions
|
||||
// Fallback: check browser targets
|
||||
const targets = browser.targets();
|
||||
const builtinIds = [
|
||||
'nkeimhogjdpnpccoofpliimaahmaaome',
|
||||
'fignfifoniblkonapihmkfakmlgkbkcf',
|
||||
'ahfgeienlihckogmohjhadlkjgocpleb',
|
||||
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
|
||||
];
|
||||
const customExtTargets = extTargets.filter(t => {
|
||||
const customExtTargets = targets.filter(t => {
|
||||
const url = t.url();
|
||||
if (!url.startsWith('chrome-extension://')) return false;
|
||||
const extId = url.split('://')[1].split('/')[0];
|
||||
@@ -216,7 +286,7 @@ async function main() {
|
||||
for (const target of customExtTargets) {
|
||||
const url = target.url();
|
||||
const extId = url.split('://')[1].split('/')[0];
|
||||
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
|
||||
console.error(`[+] Extension target: ${extId} (${target.type()})`);
|
||||
}
|
||||
|
||||
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
|
||||
@@ -225,6 +295,14 @@ async function main() {
|
||||
}
|
||||
}
|
||||
|
||||
// Write extensions metadata with actual IDs
|
||||
if (installedExtensions.length > 0) {
|
||||
fs.writeFileSync(
|
||||
path.join(OUTPUT_DIR, 'extensions.json'),
|
||||
JSON.stringify(installedExtensions, null, 2)
|
||||
);
|
||||
}
|
||||
|
||||
console.error(`[+] Chromium session started for crawl ${crawlId}`);
|
||||
console.error(`[+] CDP URL: ${cdpUrl}`);
|
||||
console.error(`[+] PID: ${chromePid}`);
|
||||
|
||||
@@ -1,323 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Launch a shared Chromium browser session for the entire crawl.
|
||||
*
|
||||
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
|
||||
* Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
|
||||
*
|
||||
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
* --load-extension and --disable-extensions-except flags.
|
||||
*
|
||||
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Output: Writes to current directory (executor creates chrome/ dir):
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - chrome.pid: Chromium process ID (for cleanup)
|
||||
* - port.txt: Debug port number
|
||||
* - extensions.json: Loaded extensions metadata
|
||||
*
|
||||
* Environment variables:
|
||||
* NODE_MODULES_DIR: Path to node_modules directory for module resolution
|
||||
* CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
|
||||
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
|
||||
*/
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const {
|
||||
findChromium,
|
||||
launchChromium,
|
||||
killChrome,
|
||||
getEnv,
|
||||
writePidWithMtime,
|
||||
getExtensionsDir,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'chrome_launch';
|
||||
const OUTPUT_DIR = '.';
|
||||
|
||||
// Global state for cleanup
|
||||
let chromePid = null;
|
||||
let browserInstance = null;
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach((arg) => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
// Cleanup handler for SIGTERM
|
||||
async function cleanup() {
|
||||
console.error('[*] Cleaning up Chrome session...');
|
||||
|
||||
// Try graceful browser close first
|
||||
if (browserInstance) {
|
||||
try {
|
||||
console.error('[*] Closing browser gracefully...');
|
||||
await browserInstance.close();
|
||||
browserInstance = null;
|
||||
console.error('[+] Browser closed gracefully');
|
||||
} catch (e) {
|
||||
console.error(`[!] Graceful close failed: ${e.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Kill Chrome process
|
||||
if (chromePid) {
|
||||
await killChrome(chromePid, OUTPUT_DIR);
|
||||
}
|
||||
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// Register signal handlers
|
||||
process.on('SIGTERM', cleanup);
|
||||
process.on('SIGINT', cleanup);
|
||||
|
||||
async function main() {
|
||||
const args = parseArgs();
|
||||
const crawlId = args.crawl_id;
|
||||
|
||||
try {
|
||||
const binary = findChromium();
|
||||
if (!binary) {
|
||||
console.error('ERROR: Chromium binary not found');
|
||||
console.error('DEPENDENCY_NEEDED=chromium');
|
||||
console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
|
||||
console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Get Chromium version
|
||||
let version = '';
|
||||
try {
|
||||
const { execSync } = require('child_process');
|
||||
version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 })
|
||||
.trim()
|
||||
.slice(0, 64);
|
||||
} catch (e) {}
|
||||
|
||||
console.error(`[*] Using browser: ${binary}`);
|
||||
if (version) console.error(`[*] Version: ${version}`);
|
||||
|
||||
// Load installed extensions
|
||||
const extensionsDir = getExtensionsDir();
|
||||
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
|
||||
|
||||
if (userDataDir) {
|
||||
console.error(`[*] Using user data dir: ${userDataDir}`);
|
||||
}
|
||||
|
||||
const installedExtensions = [];
|
||||
const extensionPaths = [];
|
||||
if (fs.existsSync(extensionsDir)) {
|
||||
const files = fs.readdirSync(extensionsDir);
|
||||
for (const file of files) {
|
||||
if (file.endsWith('.extension.json')) {
|
||||
try {
|
||||
const extPath = path.join(extensionsDir, file);
|
||||
const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
|
||||
if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
|
||||
installedExtensions.push(extData);
|
||||
extensionPaths.push(extData.unpacked_path);
|
||||
console.error(`[*] Loading extension: ${extData.name || file}`);
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn(`[!] Skipping invalid extension cache: ${file}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (installedExtensions.length > 0) {
|
||||
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
|
||||
}
|
||||
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
|
||||
// Launch Chromium using consolidated function
|
||||
// userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
|
||||
const result = await launchChromium({
|
||||
binary,
|
||||
outputDir: OUTPUT_DIR,
|
||||
userDataDir,
|
||||
extensionPaths,
|
||||
});
|
||||
|
||||
if (!result.success) {
|
||||
console.error(`ERROR: ${result.error}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
chromePid = result.pid;
|
||||
const cdpUrl = result.cdpUrl;
|
||||
|
||||
// Connect puppeteer for extension verification
|
||||
console.error(`[*] Connecting puppeteer to CDP...`);
|
||||
const browser = await puppeteer.connect({
|
||||
browserWSEndpoint: cdpUrl,
|
||||
defaultViewport: null,
|
||||
});
|
||||
browserInstance = browser;
|
||||
|
||||
// Get actual extension IDs from chrome://extensions page
|
||||
if (extensionPaths.length > 0) {
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
try {
|
||||
const extPage = await browser.newPage();
|
||||
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
// Parse extension info from the page
|
||||
const extensionsFromPage = await extPage.evaluate(() => {
|
||||
const extensions = [];
|
||||
// Extensions manager uses shadow DOM
|
||||
const manager = document.querySelector('extensions-manager');
|
||||
if (!manager || !manager.shadowRoot) return extensions;
|
||||
|
||||
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
|
||||
if (!itemList || !itemList.shadowRoot) return extensions;
|
||||
|
||||
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
|
||||
for (const item of items) {
|
||||
const id = item.getAttribute('id');
|
||||
const nameEl = item.shadowRoot?.querySelector('#name');
|
||||
const name = nameEl?.textContent?.trim() || '';
|
||||
if (id && name) {
|
||||
extensions.push({ id, name });
|
||||
}
|
||||
}
|
||||
return extensions;
|
||||
});
|
||||
|
||||
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
|
||||
for (const e of extensionsFromPage) {
|
||||
console.error(` - ${e.id}: "${e.name}"`);
|
||||
}
|
||||
|
||||
// Match extensions by name (strict matching)
|
||||
for (const ext of installedExtensions) {
|
||||
// Read the extension's manifest to get its display name
|
||||
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
|
||||
if (fs.existsSync(manifestPath)) {
|
||||
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
|
||||
let manifestName = manifest.name || '';
|
||||
|
||||
// Resolve message placeholder (e.g., __MSG_extName__)
|
||||
if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
|
||||
const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
|
||||
const defaultLocale = manifest.default_locale || 'en';
|
||||
const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
|
||||
if (fs.existsSync(messagesPath)) {
|
||||
try {
|
||||
const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
|
||||
if (messages[msgKey] && messages[msgKey].message) {
|
||||
manifestName = messages[msgKey].message;
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to read messages.json: ${e.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
|
||||
|
||||
// Find matching extension from page by exact name match first
|
||||
let match = extensionsFromPage.find(e => e.name === manifestName);
|
||||
|
||||
// If no exact match, try case-insensitive exact match
|
||||
if (!match) {
|
||||
match = extensionsFromPage.find(e =>
|
||||
e.name.toLowerCase() === manifestName.toLowerCase()
|
||||
);
|
||||
}
|
||||
|
||||
if (match) {
|
||||
ext.id = match.id;
|
||||
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
|
||||
} else {
|
||||
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await extPage.close();
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
|
||||
}
|
||||
|
||||
// Fallback: check browser targets
|
||||
const targets = browser.targets();
|
||||
const builtinIds = [
|
||||
'nkeimhogjdpnpccoofpliimaahmaaome',
|
||||
'fignfifoniblkonapihmkfakmlgkbkcf',
|
||||
'ahfgeienlihckogmohjhadlkjgocpleb',
|
||||
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
|
||||
];
|
||||
const customExtTargets = targets.filter(t => {
|
||||
const url = t.url();
|
||||
if (!url.startsWith('chrome-extension://')) return false;
|
||||
const extId = url.split('://')[1].split('/')[0];
|
||||
return !builtinIds.includes(extId);
|
||||
});
|
||||
|
||||
console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`);
|
||||
|
||||
for (const target of customExtTargets) {
|
||||
const url = target.url();
|
||||
const extId = url.split('://')[1].split('/')[0];
|
||||
console.error(`[+] Extension target: ${extId} (${target.type()})`);
|
||||
}
|
||||
|
||||
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
|
||||
console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
|
||||
console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
|
||||
}
|
||||
}
|
||||
|
||||
// Write extensions metadata with actual IDs
|
||||
if (installedExtensions.length > 0) {
|
||||
fs.writeFileSync(
|
||||
path.join(OUTPUT_DIR, 'extensions.json'),
|
||||
JSON.stringify(installedExtensions, null, 2)
|
||||
);
|
||||
}
|
||||
|
||||
console.error(`[+] Chromium session started for crawl ${crawlId}`);
|
||||
console.error(`[+] CDP URL: ${cdpUrl}`);
|
||||
console.error(`[+] PID: ${chromePid}`);
|
||||
|
||||
// Stay alive to handle cleanup on SIGTERM
|
||||
console.log('[*] Chromium launch hook staying alive to handle cleanup...');
|
||||
setInterval(() => {}, 1000000);
|
||||
|
||||
} catch (e) {
|
||||
console.error(`ERROR: ${e.name}: ${e.message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
main().catch((e) => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
@@ -1,59 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* I Still Don't Care About Cookies Extension Plugin
|
||||
*
|
||||
* Installs and configures the "I still don't care about cookies" Chrome extension
|
||||
* for automatic cookie consent banner dismissal during page archiving.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
|
||||
*
|
||||
* Priority: 02 (early) - Must install before Chrome session starts at Crawl level
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Dismisses cookie consent popups
|
||||
* - Removes cookie banners
|
||||
* - Accepts necessary cookies to proceed with browsing
|
||||
* - Works on thousands of websites out of the box
|
||||
*/
|
||||
|
||||
// Import extension utilities
|
||||
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
|
||||
name: 'istilldontcareaboutcookies',
|
||||
};
|
||||
|
||||
/**
|
||||
* Main entry point - install extension before archiving
|
||||
*
|
||||
* Note: This extension works out of the box with no configuration needed.
|
||||
* It automatically detects and dismisses cookie banners on page load.
|
||||
*/
|
||||
async function main() {
|
||||
const extension = await installExtensionWithCache(EXTENSION);
|
||||
|
||||
if (extension) {
|
||||
console.log('[+] Cookie banners will be automatically dismissed during archiving');
|
||||
}
|
||||
|
||||
return extension;
|
||||
}
|
||||
|
||||
// Export functions for use by other plugins
|
||||
module.exports = {
|
||||
EXTENSION,
|
||||
};
|
||||
|
||||
// Run if executed directly
|
||||
if (require.main === module) {
|
||||
main().then(() => {
|
||||
console.log('[✓] I Still Don\'t Care About Cookies extension setup complete');
|
||||
process.exit(0);
|
||||
}).catch(err => {
|
||||
console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
@@ -1,281 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* SingleFile Extension Plugin
|
||||
*
|
||||
* DISABLED: Extension functionality commented out - using single-file-cli only
|
||||
*
|
||||
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
|
||||
* Falls back to single-file-cli if the extension is not available.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
|
||||
*
|
||||
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Saves complete web pages as single HTML files
|
||||
* - Inlines all resources (CSS, JS, images, fonts)
|
||||
* - Preserves page fidelity better than wget/curl
|
||||
* - Works with SPAs and dynamically loaded content
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const { promisify } = require('util');
|
||||
const { exec } = require('child_process');
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// // Import extension utilities
|
||||
// const extensionUtils = require('../chrome/chrome_utils.js');
|
||||
|
||||
// // Extension metadata
|
||||
// const EXTENSION = {
|
||||
// webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
|
||||
// name: 'singlefile',
|
||||
// };
|
||||
|
||||
// // Get extensions directory from environment or use default
|
||||
// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
|
||||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
|
||||
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'singlefile.html';
|
||||
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// /**
|
||||
// * Install the SingleFile extension
|
||||
// */
|
||||
// async function installSinglefileExtension() {
|
||||
// console.log('[*] Installing SingleFile extension...');
|
||||
|
||||
// // Install the extension
|
||||
// const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
||||
|
||||
// if (!extension) {
|
||||
// console.error('[❌] Failed to install SingleFile extension');
|
||||
// return null;
|
||||
// }
|
||||
|
||||
// console.log('[+] SingleFile extension installed');
|
||||
// console.log('[+] Web pages will be saved as single HTML files');
|
||||
|
||||
// return extension;
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * Wait for a specified amount of time
|
||||
// */
|
||||
// function wait(ms) {
|
||||
// return new Promise(resolve => setTimeout(resolve, ms));
|
||||
// }
|
||||
|
||||
// /**
|
||||
// * Save a page using the SingleFile extension
|
||||
// *
|
||||
// * @param {Object} page - Puppeteer page object
|
||||
// * @param {Object} extension - Extension metadata with dispatchAction method
|
||||
// * @param {Object} options - Additional options
|
||||
// * @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
// */
|
||||
// async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
// if (!extension || !extension.version) {
|
||||
// throw new Error('SingleFile extension not found or not loaded');
|
||||
// }
|
||||
|
||||
// const url = await page.url();
|
||||
|
||||
// // Check for unsupported URL schemes
|
||||
// const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
|
||||
// const scheme = url.split(':')[0];
|
||||
// if (URL_SCHEMES_IGNORED.includes(scheme)) {
|
||||
// console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
|
||||
// return null;
|
||||
// }
|
||||
|
||||
// // Ensure downloads directory exists
|
||||
// await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
|
||||
|
||||
// // Get list of existing files to ignore
|
||||
// const files_before = new Set(
|
||||
// (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
// .filter(fn => fn.endsWith('.html'))
|
||||
// );
|
||||
|
||||
// // Output directory is current directory (hook already runs in output dir)
|
||||
// const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
|
||||
|
||||
// // Bring page to front (extension action button acts on foreground tab)
|
||||
// await page.bringToFront();
|
||||
|
||||
// // Trigger the extension's action (toolbar button click)
|
||||
// await extension.dispatchAction();
|
||||
|
||||
// // Wait for file to appear in downloads directory
|
||||
// const check_delay = 3000; // 3 seconds
|
||||
// const max_tries = 10;
|
||||
// let files_new = [];
|
||||
|
||||
// for (let attempt = 0; attempt < max_tries; attempt++) {
|
||||
// await wait(check_delay);
|
||||
|
||||
// const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
// .filter(fn => fn.endsWith('.html'));
|
||||
|
||||
// files_new = files_after.filter(file => !files_before.has(file));
|
||||
|
||||
// if (files_new.length === 0) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
// // Find the matching file by checking if it contains the URL in the HTML header
|
||||
// for (const file of files_new) {
|
||||
// const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
|
||||
// const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
||||
// const dl_header = dl_text.split('meta charset')[0];
|
||||
|
||||
// if (dl_header.includes(`url: ${url}`)) {
|
||||
// console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
|
||||
// await fs.promises.rename(dl_path, out_path);
|
||||
// return out_path;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
|
||||
// console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
|
||||
// return null;
|
||||
// }
|
||||
|
||||
/**
|
||||
* Save a page using single-file-cli (fallback method)
|
||||
*
|
||||
* @param {string} url - URL to archive
|
||||
* @param {Object} options - Additional options
|
||||
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
*/
|
||||
async function saveSinglefileWithCLI(url, options = {}) {
|
||||
console.log('[*] Falling back to single-file-cli...');
|
||||
|
||||
// Find single-file binary
|
||||
let binary = null;
|
||||
try {
|
||||
const { stdout } = await execAsync('which single-file');
|
||||
binary = stdout.trim();
|
||||
} catch (err) {
|
||||
console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
|
||||
return null;
|
||||
}
|
||||
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Build command
|
||||
const cmd = [
|
||||
binary,
|
||||
'--browser-headless',
|
||||
url,
|
||||
out_path,
|
||||
];
|
||||
|
||||
// Add optional args
|
||||
if (options.userAgent) {
|
||||
cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
|
||||
}
|
||||
if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
|
||||
cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
|
||||
}
|
||||
if (options.ignoreSSL) {
|
||||
cmd.splice(2, 0, '--browser-ignore-insecure-certs');
|
||||
}
|
||||
|
||||
// Execute
|
||||
try {
|
||||
const timeout = options.timeout || 120000;
|
||||
await execAsync(cmd.join(' '), { timeout });
|
||||
|
||||
if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
|
||||
console.log(`[+] SingleFile saved via CLI: ${out_path}`);
|
||||
return out_path;
|
||||
}
|
||||
|
||||
console.error('[❌] SingleFile CLI completed but no output file found');
|
||||
return null;
|
||||
} catch (err) {
|
||||
console.error(`[❌] SingleFile CLI error: ${err.message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// /**
|
||||
// * Main entry point - install extension before archiving
|
||||
// */
|
||||
// async function main() {
|
||||
// // Check if extension is already cached
|
||||
// const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
|
||||
|
||||
// if (fs.existsSync(cacheFile)) {
|
||||
// try {
|
||||
// const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
||||
// const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
||||
|
||||
// if (fs.existsSync(manifestPath)) {
|
||||
// console.log('[*] SingleFile extension already installed (using cache)');
|
||||
// return cached;
|
||||
// }
|
||||
// } catch (e) {
|
||||
// // Cache file corrupted, re-install
|
||||
// console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
||||
// }
|
||||
// }
|
||||
|
||||
// // Install extension
|
||||
// const extension = await installSinglefileExtension();
|
||||
|
||||
// // Export extension metadata for chrome plugin to load
|
||||
// if (extension) {
|
||||
// // Write extension info to a cache file that chrome plugin can read
|
||||
// await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
||||
// await fs.promises.writeFile(
|
||||
// cacheFile,
|
||||
// JSON.stringify(extension, null, 2)
|
||||
// );
|
||||
// console.log(`[+] Extension metadata written to ${cacheFile}`);
|
||||
// }
|
||||
|
||||
// return extension;
|
||||
// }
|
||||
|
||||
// Export functions for use by other plugins
|
||||
module.exports = {
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// EXTENSION,
|
||||
// installSinglefileExtension,
|
||||
// saveSinglefileWithExtension,
|
||||
saveSinglefileWithCLI,
|
||||
};
|
||||
|
||||
// DISABLED: Extension functionality - using single-file-cli only
|
||||
// // Run if executed directly
|
||||
// if (require.main === module) {
|
||||
// main().then(() => {
|
||||
// console.log('[✓] SingleFile extension setup complete');
|
||||
// process.exit(0);
|
||||
// }).catch(err => {
|
||||
// console.error('[❌] SingleFile extension setup failed:', err);
|
||||
// process.exit(1);
|
||||
// });
|
||||
// }
|
||||
|
||||
// No-op when run directly (extension install disabled)
|
||||
if (require.main === module) {
|
||||
console.log('[*] SingleFile extension install disabled - using single-file-cli only');
|
||||
process.exit(0);
|
||||
}
|
||||
@@ -1,116 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* uBlock Origin Extension Plugin
|
||||
*
|
||||
* Installs and configures the uBlock Origin Chrome extension for ad blocking
|
||||
* and privacy protection during page archiving.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
|
||||
*
|
||||
* Priority: 03 (early) - Must install before Chrome session starts at Crawl level
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Blocks ads, trackers, and malware domains
|
||||
* - Reduces page load time and bandwidth usage
|
||||
* - Improves privacy during archiving
|
||||
* - Removes clutter from archived pages
|
||||
* - Uses efficient blocking with filter lists
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Identifies the uBlock Origin package in the Chrome Web Store.
const EXTENSION = {
  name: 'ublock',
  webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
};
|
||||
|
||||
// Where unpacked extensions are stored: an explicit CHROME_EXTENSIONS_DIR
// override wins, otherwise fall back to the active persona's data directory.
// (|| rather than ?? is deliberate: empty-string env vars also fall through.)
const EXTENSIONS_DIR =
  process.env.CHROME_EXTENSIONS_DIR ||
  path.join(
    process.env.DATA_DIR || './data',
    'personas',
    process.env.ACTIVE_PERSONA || 'Default',
    'chrome_extensions',
  );
|
||||
|
||||
/**
 * Install the uBlock Origin extension via the shared chrome extension utils.
 *
 * @returns {Promise<object|null>} extension metadata on success, null on failure
 */
async function installUblockExtension() {
  console.log('[*] Installing uBlock Origin extension...');

  const ext = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!ext) {
    console.error('[❌] Failed to install uBlock Origin extension');
    return null;
  }

  console.log('[+] uBlock Origin extension installed');
  console.log('[+] Ads and trackers will be blocked during archiving');
  return ext;
}
|
||||
|
||||
/**
|
||||
* Note: uBlock Origin works automatically with default filter lists.
|
||||
* No configuration needed - blocks ads, trackers, and malware domains out of the box.
|
||||
*/
|
||||
|
||||
/**
 * Entry point: ensure uBlock Origin is installed before archiving starts.
 *
 * Reuses a cached install when its metadata file parses and the unpacked
 * directory still contains a manifest; otherwise (re)installs and writes
 * fresh metadata JSON for the chrome plugin to pick up.
 *
 * @returns {Promise<object|null>} extension metadata, or null if install failed
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');

  // Fast path: trust the cache only if it parses AND still points at a
  // manifest on disk (the unpacked dir may have been deleted separately).
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      if (fs.existsSync(path.join(cached.unpacked_path, 'manifest.json'))) {
        console.log('[*] uBlock Origin extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Unreadable/corrupt cache is non-fatal: fall through and reinstall.
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }

  const extension = await installUblockExtension();

  if (extension) {
    // Persist metadata where the chrome plugin expects to find it.
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(cacheFile, JSON.stringify(extension, null, 2));
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }

  return extension;
}
|
||||
|
||||
// Public surface consumed by other plugins (e.g. the chrome launch hook).
module.exports = { EXTENSION, installUblockExtension };
|
||||
|
||||
// CLI entrypoint: install the extension, then exit with a status code so
// hooks.py can detect success/failure.
if (require.main === module) {
  main()
    .then(() => {
      console.log('[✓] uBlock Origin extension setup complete');
      process.exit(0);
    })
    .catch((err) => {
      console.error('[❌] uBlock Origin extension setup failed:', err);
      process.exit(1);
    });
}
|
||||
@@ -1,130 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate and compute derived wget config values.
|
||||
|
||||
This hook runs early in the Crawl lifecycle to:
|
||||
1. Validate config values with warnings (not hard errors)
|
||||
2. Compute derived values (USE_WGET from WGET_ENABLED)
|
||||
3. Check binary availability and version
|
||||
|
||||
Output:
|
||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||
- Binary JSONL records to stdout when binaries are found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
# Read config from environment (already validated by JSONSchema)
def get_env(name: str, default: str = '') -> str:
    """Return environment variable `name` with surrounding whitespace stripped,
    or `default` when the variable is unset."""
    value = os.environ.get(name, default)
    return value.strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean-ish environment variable.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive); any other
    value (including unset) yields `default`.
    """
    raw = get_env(name, '').lower()
    truthy = {'true', '1', 'yes', 'on'}
    falsy = {'false', '0', 'no', 'off'}
    if raw in truthy:
        return True
    if raw in falsy:
        return False
    return default
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer environment variable, returning `default` when the
    value is unset or not a valid integer."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
def output_binary(binary: Binary, name: str):
    """Print a Binary JSONL record to stdout for hooks.py to collect.

    Missing version/sha256 become empty strings; MACHINE_ID is taken from
    the environment (empty if unset).
    """
    print(json.dumps({
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }))
|
||||
|
||||
|
||||
def main():
    """Validate wget config, compute derived values, and report the binary.

    Emits:
    - ``COMPUTED:KEY=VALUE`` lines on stdout (parsed by hooks.py into env)
    - a Binary JSONL record on stdout when the wget binary is found
    - ``WARNING:``/``ERROR:`` lines on stderr

    Exits non-zero iff any hard error was recorded.
    """
    warnings = []
    errors = []
    computed = {}

    # Read config from the environment (already schema-validated upstream).
    # NOTE: removed unused WGET_SAVE_WARC read — it was never referenced here.
    wget_enabled = get_env_bool('WGET_ENABLED', True)
    wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    wget_binary = get_env('WGET_BINARY', 'wget')

    # Derived value kept for backward compatibility (USE_WGET mirrors WGET_ENABLED).
    use_wget = wget_enabled
    computed['USE_WGET'] = str(use_wget).lower()

    # A very low timeout is only a warning, not a hard error.
    if use_wget and wget_timeout < 20:
        warnings.append(
            f"WGET_TIMEOUT={wget_timeout} is very low. "
            "wget may fail to archive sites if set to less than ~20 seconds. "
            "Consider setting WGET_TIMEOUT=60 or higher."
        )

    # Resolve the binary on $PATH via abx-pkg; any failure means "not found".
    provider = EnvProvider()
    try:
        binary = Binary(name=wget_binary, binproviders=[provider]).load()
        binary_path = str(binary.abspath) if binary.abspath else ''
    except Exception:
        binary = None
        binary_path = ''

    if not binary_path:
        # Missing binary is only fatal when wget is actually enabled.
        if use_wget:
            errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.")
        computed['WGET_BINARY'] = ''
    else:
        computed['WGET_BINARY'] = binary_path
        computed['WGET_VERSION'] = str(binary.version) if binary.version else 'unknown'

        # Output Binary JSONL record
        output_binary(binary, name='wget')

    # Probe whether this wget build accepts --compression=auto; a non-zero
    # exit (or any failure to run) means no compression support.
    if computed.get('WGET_BINARY'):
        try:
            result = subprocess.run(
                [computed['WGET_BINARY'], '--compression=auto', '--help'],
                capture_output=True, timeout=5
            )
            computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
        except Exception:
            computed['WGET_AUTO_COMPRESSION'] = 'false'

    # COMPUTED:KEY=VALUE lines are parsed by hooks.py and merged into env.
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")

    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)

    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)

    # Hard errors make the whole hook (and thus the crawl step) fail.
    sys.exit(1 if errors else 0)
|
||||
|
||||
|
||||
# Hook entrypoint: run validation when executed directly by hooks.py.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user