mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 01:15:57 +10:00
Clean up on_Crawl hooks and remove dead code (#1751)
Deleted dead/duplicate hooks:
- wget/on_Crawl__10_install_wget.py (duplicate of
__10_wget_validate_config.py)
- chrome/on_Crawl__00_chrome_install.py (simpler version, kept full one)
- chrome/on_Crawl__20_chrome_launch.bg.js (legacy, kept __30 version)
- singlefile/on_Crawl__20_install_singlefile_extension.js
(disabled/dead)
- istilldontcareaboutcookies/on_Crawl__20_install_*.js (legacy)
- ublock/on_Crawl__03_ublock.js (legacy, kept __20 version)
- Entire captcha2/ plugin (legacy version of twocaptcha/)
Renamed hooks to follow consistent pattern:
on_Crawl__XX_<plugin>_<action>.<ext>
Priority bands:
00-09: Binary/extension installation 10-19: Config validation 20-29:
Browser launch and post-launch config
Final hooks:
00 ripgrep_install.py, 01 chrome_install.py 02
istilldontcareaboutcookies_install.js 03 ublock_install.js, 04
singlefile_install.js 05 twocaptcha_install.js 10 chrome_validate.py, 11
wget_validate.py 20 chrome_launch.bg.js, 25 twocaptcha_config.js
<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->
# Summary
<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->
# Related issues
<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->
# Changes these areas
- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Cleaned up Crawl-level hooks by removing legacy/duplicate code and
standardizing hook names and priorities. Chrome launch is now a single,
updated hook with better extension detection and cleaner outputs.
- **Refactors**
- Removed dead hooks (legacy chrome install/launch, singlefile
extension, old ublock/cookies scripts, duplicate wget validate) and the
legacy captcha2 plugin in favor of twocaptcha.
- Renamed hooks to on_Crawl__XX_<plugin>_<action> with priority bands:
00-09 install, 10-19 validate, 20-29 launch/config.
- Consolidated Chrome launch into on_Crawl__20_chrome_launch.bg.js;
writes outputs to the current dir, resolves real extension IDs via
chrome://extensions, and records extensions.json after verification.
- **Migration**
- If you used captcha2, switch to the twocaptcha hooks
(on_Crawl__05_twocaptcha_install.js and
on_Crawl__25_twocaptcha_config.js).
- Update any docs/scripts that reference old hook filenames.
<sup>Written for commit 4c77949197.
Summary will update on new commits.</sup>
<!-- End of auto-generated description by cubic. -->
This commit is contained in:
@@ -1,21 +0,0 @@
|
|||||||
{
|
|
||||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required_plugins": ["chrome"],
|
|
||||||
"properties": {
|
|
||||||
"CAPTCHA2_ENABLED": {
|
|
||||||
"type": "boolean",
|
|
||||||
"default": true,
|
|
||||||
"x-aliases": ["USE_CAPTCHA2"],
|
|
||||||
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
|
|
||||||
},
|
|
||||||
"CAPTCHA2_TIMEOUT": {
|
|
||||||
"type": "integer",
|
|
||||||
"default": 60,
|
|
||||||
"minimum": 5,
|
|
||||||
"x-fallback": "TIMEOUT",
|
|
||||||
"description": "Timeout for CAPTCHA solving in seconds"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,121 +0,0 @@
|
|||||||
#!/usr/bin/env node
|
|
||||||
/**
|
|
||||||
* 2Captcha Extension Plugin
|
|
||||||
*
|
|
||||||
* Installs and configures the 2captcha Chrome extension for automatic
|
|
||||||
* CAPTCHA solving during page archiving.
|
|
||||||
*
|
|
||||||
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
|
|
||||||
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
|
|
||||||
*
|
|
||||||
* Priority: 01 (early) - Must install before Chrome session starts at Crawl level
|
|
||||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
|
||||||
*
|
|
||||||
* Requirements:
|
|
||||||
* - API_KEY_2CAPTCHA environment variable must be set
|
|
||||||
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
|
|
||||||
*/
|
|
||||||
|
|
||||||
const path = require('path');
|
|
||||||
const fs = require('fs');
|
|
||||||
|
|
||||||
// Import extension utilities
|
|
||||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
|
||||||
|
|
||||||
// Extension metadata
|
|
||||||
// Metadata for the 2captcha solver extension on the Chrome Web Store.
const EXTENSION = {
  webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
  name: 'captcha2',
};

// Directory where unpacked extensions are cached.  CHROME_EXTENSIONS_DIR
// overrides the default persona-scoped location under DATA_DIR.
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Install and configure the 2captcha extension
|
|
||||||
*/
|
|
||||||
/**
 * Install the 2captcha extension into EXTENSIONS_DIR (or reuse an already
 * downloaded copy) and warn when no usable API key is configured.
 *
 * @returns {Promise<object|null>} extension metadata, or null on install failure
 */
async function installCaptchaExtension() {
  console.log('[*] Installing 2captcha extension...');

  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!installed) {
    console.error('[❌] Failed to install 2captcha extension');
    return null;
  }

  // The extension is useless without a real API key, so surface that early.
  const apiKey = process.env.API_KEY_2CAPTCHA;
  const hasUsableKey = Boolean(apiKey) && apiKey !== 'YOUR_API_KEY_HERE';
  if (hasUsableKey) {
    console.log('[+] 2captcha extension installed and API key configured');
  } else {
    console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
  }

  return installed;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Note: 2captcha configuration is now handled by chrome plugin
|
|
||||||
* during first-time browser setup to avoid repeated configuration on every snapshot.
|
|
||||||
* The API key is injected via chrome.storage API once per browser session.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Main entry point - install extension before archiving
|
|
||||||
*/
|
|
||||||
// Return cached metadata when cacheFile points at a still-valid unpacked
// extension (manifest.json present on disk), otherwise null.
function readValidCache(cacheFile) {
  if (!fs.existsSync(cacheFile)) return null;
  try {
    const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
    const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
    if (fs.existsSync(manifestPath)) {
      return cached;
    }
  } catch (e) {
    // Cache file corrupted or missing fields — fall through to re-install.
    console.warn('[⚠️] Extension cache corrupted, re-installing...');
  }
  return null;
}

/**
 * Main entry point: reuse the cached extension when valid, otherwise install
 * it and persist its metadata for the chrome plugin to load later.
 *
 * @returns {Promise<object|null>} extension metadata, or null on failure
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');

  const cached = readValidCache(cacheFile);
  if (cached) {
    console.log('[*] 2captcha extension already installed (using cache)');
    return cached;
  }

  const extension = await installCaptchaExtension();

  // Write extension info to a cache file that the chrome plugin can read.
  if (extension) {
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(cacheFile, JSON.stringify(extension, null, 2));
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }

  return extension;
}
|
|
||||||
|
|
||||||
// Export functions for use by other plugins (e.g. the chrome launch hook).
module.exports = {
  EXTENSION,
  installCaptchaExtension,
};

// Run if executed directly: install the extension, then exit 0 on success
// or 1 on failure so hooks.py can detect the outcome.
if (require.main === module) {
  main().then(() => {
    console.log('[✓] 2captcha extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] 2captcha extension setup failed:', err);
    process.exit(1);
  });
}
|
|
||||||
@@ -1,279 +0,0 @@
|
|||||||
#!/usr/bin/env node
|
|
||||||
/**
|
|
||||||
* 2Captcha Extension Configuration
|
|
||||||
*
|
|
||||||
* Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
|
|
||||||
* Runs once per crawl to inject API key into extension storage.
|
|
||||||
*
|
|
||||||
* Priority: 21 (after chrome_launch at 20)
|
|
||||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
|
||||||
*
|
|
||||||
* Requirements:
|
|
||||||
* - API_KEY_2CAPTCHA environment variable must be set
|
|
||||||
* - chrome plugin must have loaded extensions (extensions.json must exist)
|
|
||||||
*/
|
|
||||||
|
|
||||||
const path = require('path');
|
|
||||||
const fs = require('fs');
|
|
||||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
|
||||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
|
||||||
const puppeteer = require('puppeteer-core');
|
|
||||||
|
|
||||||
// Get crawl's chrome directory from environment variable set by hooks.py
|
|
||||||
/**
 * Resolve this crawl's chrome session directory from the environment.
 *
 * CRAWL_OUTPUT_DIR is set by hooks.py for each crawl.
 *
 * @returns {string|null} `<CRAWL_OUTPUT_DIR>/chrome`, or null when unset/empty
 */
function getCrawlChromeSessionDir() {
  const outputDir = process.env.CRAWL_OUTPUT_DIR || '';
  return outputDir ? path.join(outputDir, 'chrome') : null;
}
|
|
||||||
|
|
||||||
// Shared chrome session dir for this crawl.  The '../chrome' fallback assumes
// the hook's CWD is a sibling of the chrome plugin's output dir — TODO confirm.
const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
// Marker file recording that this browser session was already configured,
// so the hook is idempotent across re-runs within one crawl.
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured');
|
|
||||||
|
|
||||||
// Get environment variable with default
|
|
||||||
/**
 * Read an environment variable (trimmed), falling back to a default when
 * the variable is unset or empty.
 *
 * @param {string} name - environment variable name
 * @param {string} [defaultValue=''] - value used when unset/empty
 * @returns {string} trimmed value
 */
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
|
|
||||||
|
|
||||||
// Parse command line arguments
|
|
||||||
/**
 * Parse `--key=value` / `--flag` style CLI arguments into an object.
 * Dashes in key names become underscores; bare flags become `true`.
 *
 * @returns {Object<string, string|true>} parsed arguments
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    // Re-join on '=' so values containing '=' survive the split.
    const [rawKey, ...valueParts] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = valueParts.join('=') || true;
  }
  return parsed;
}
|
|
||||||
|
|
||||||
/**
 * Inject the 2captcha API key into the already-running crawl Chrome session.
 *
 * Preconditions (produced by the chrome plugin): CHROME_SESSION_DIR must
 * contain extensions.json and cdp_url.txt.  Idempotent per browser session
 * via the CONFIG_MARKER file.
 *
 * @returns {Promise<{success: boolean, skipped?: boolean, method?: string, error?: string}>}
 */
async function configure2Captcha() {
  // Check if already configured in this session
  if (fs.existsSync(CONFIG_MARKER)) {
    console.error('[*] 2captcha already configured in this browser session');
    return { success: true, skipped: true };
  }

  // Check if API key is set
  const apiKey = getEnv('API_KEY_2CAPTCHA');
  if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
    console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
    return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
  }

  // Load extensions metadata written by the chrome plugin after launch
  const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
  if (!fs.existsSync(extensionsFile)) {
    return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
  }

  const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
  const captchaExt = extensions.find(ext => ext.name === 'captcha2');

  if (!captchaExt) {
    console.error('[*] 2captcha extension not installed, skipping configuration');
    return { success: true, skipped: true };
  }

  console.error('[*] Configuring 2captcha extension with API key...');

  try {
    // Connect to the existing Chrome session via CDP
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) {
      return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
    }

    const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

    try {
      // Method 1: Try to inject via extension background page
      if (captchaExt.target && captchaExt.target_ctx) {
        console.error('[*] Attempting to configure via extension background page...');

        // Reconnect to the browser to get fresh target context
        const targets = await browser.targets();
        const extTarget = targets.find(t =>
          t.url().startsWith(`chrome-extension://${captchaExt.id}`)
        );

        if (extTarget) {
          // MV3 extensions expose a service worker, MV2 a background page.
          const extContext = await extTarget.worker() || await extTarget.page();

          if (extContext) {
            await extContext.evaluate((key) => {
              // The exact storage key varies by extension version, so set all
              // common patterns in both local and sync storage.
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
                chrome.storage.sync.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
              }

              // Also try localStorage as fallback
              if (typeof localStorage !== 'undefined') {
                localStorage.setItem('apiKey', key);
                localStorage.setItem('2captcha_apikey', key);
                localStorage.setItem('solver-api-key', key);
              }
            }, apiKey);

            console.error('[+] 2captcha API key configured successfully via background page');

            // Mark as configured
            fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

            return { success: true, method: 'background_page' };
          }
        }
      }

      // Method 2: Try to configure via options page
      console.error('[*] Attempting to configure via options page...');
      const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
      const configPage = await browser.newPage();

      try {
        await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });

        const configured = await configPage.evaluate((key) => {
          // Try to find API key input field
          const selectors = [
            'input[name*="apikey" i]',
            'input[id*="apikey" i]',
            'input[name*="api-key" i]',
            'input[id*="api-key" i]',
            'input[name*="key" i]',
            'input[placeholder*="api" i]',
            'input[type="text"]',
          ];

          for (const selector of selectors) {
            const input = document.querySelector(selector);
            if (input) {
              input.value = key;
              input.dispatchEvent(new Event('input', { bubbles: true }));
              input.dispatchEvent(new Event('change', { bubbles: true }));

              // Try to find and click a save/apply button.
              // BUG FIX: the previous version used jQuery-style
              // 'button:contains("Save")' selectors, which are NOT valid CSS —
              // document.querySelector() throws a SyntaxError on them, which
              // aborted the entire evaluate() before the storage writes below
              // could run.  Match on button text content instead.
              let saveBtn = document.querySelector('button[type="submit"], input[type="submit"]');
              if (!saveBtn) {
                for (const btn of document.querySelectorAll('button')) {
                  if (/save|apply/i.test(btn.textContent || '')) {
                    saveBtn = btn;
                    break;
                  }
                }
              }
              if (saveBtn) {
                saveBtn.click();
              }

              // Also save to storage
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
              }

              return true;
            }
          }

          // Fallback: Just save to storage
          if (typeof chrome !== 'undefined' && chrome.storage) {
            chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            return true;
          }

          return false;
        }, apiKey);

        await configPage.close();

        if (configured) {
          console.error('[+] 2captcha API key configured successfully via options page');

          // Mark as configured
          fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

          return { success: true, method: 'options_page' };
        }
      } catch (e) {
        console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
        try {
          await configPage.close();
        } catch (e2) {}
      }

      return { success: false, error: 'Could not configure via any method' };
    } finally {
      // Detach from CDP without killing the shared crawl browser.
      browser.disconnect();
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
|
|
||||||
|
|
||||||
/**
 * CLI entry point: parse arguments, run the configuration once, and exit
 * 0 when configuration succeeded or was skipped, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    // NOTE(review): the usage string references an on_Snapshot hook name, but
    // the file header documents this as an on_Crawl hook — confirm which
    // filename/arguments hooks.py actually passes.
    console.error('Usage: on_Snapshot__21_captcha2_config.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  // BUG FIX: removed dead timing code (startTs/endTs/duration were computed
  // but never used anywhere).
  let status = 'failed';
  let error = '';

  try {
    const result = await configure2Captcha();

    if (result.skipped) {
      status = 'skipped';
    } else if (result.success) {
      status = 'succeeded';
    } else {
      error = result.error || 'Configuration failed';
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
  }

  if (error) {
    console.error(`ERROR: ${error}`);
  }

  // Config hooks don't emit JSONL - they're utility hooks for setup;
  // the exit code alone indicates success/failure.
  process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
}
|
|
||||||
|
|
||||||
// Top-level runner: surface any unexpected fatal error and exit non-zero.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
"""
|
|
||||||
Unit tests for captcha2 plugin
|
|
||||||
|
|
||||||
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
# Plugin root directory (tests live in <plugin>/tests/).
PLUGIN_DIR = Path(__file__).parent.parent
# Hook scripts are located by glob so the tests survive priority renumbering.
# Note: next(..., None) yields None when no hook matches.
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
|
|
||||||
|
|
||||||
|
|
||||||
def test_install_script_exists():
    """Verify the install hook script exists in the plugin directory."""
    # BUG FIX: INSTALL_SCRIPT is None when the glob matched nothing, which
    # previously raised AttributeError instead of a clear assertion failure.
    assert INSTALL_SCRIPT is not None, "No on_Crawl__*_captcha2.* install script found"
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
|
|
||||||
|
|
||||||
|
|
||||||
def test_config_script_exists():
    """Verify the config hook script exists in the plugin directory."""
    # BUG FIX: CONFIG_SCRIPT is None when the glob matched nothing, which
    # previously raised AttributeError instead of a clear assertion failure.
    assert CONFIG_SCRIPT is not None, "No on_Crawl__*_captcha2_config.* script found"
    assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
|
|
||||||
|
|
||||||
|
|
||||||
def test_extension_metadata():
    """Test that captcha2 extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        # Load the module in node and print its EXTENSION metadata as JSON —
        # this only checks the script is importable, no install happens.
        proc = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=child_env,
        )

        assert proc.returncode == 0, f"Failed to load extension metadata: {proc.stderr}"

        metadata = json.loads(proc.stdout)
        assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert metadata["name"] == "captcha2"
|
|
||||||
|
|
||||||
|
|
||||||
def test_install_creates_cache():
    """Test that install creates extension cache"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        child_env["API_KEY_2CAPTCHA"] = "test_api_key"

        # Run the install hook as a standalone node script.
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )

        # Output must mention a fresh install or a cached one.
        assert "[*] Installing 2captcha extension" in proc.stdout or "[*] 2captcha extension already installed" in proc.stdout

        # The hook must write its metadata cache next to the extensions.
        cache_file = ext_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        # The cache must describe the known webstore extension.
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert cache_data["name"] == "captcha2"
        assert "unpacked_path" in cache_data
        assert "version" in cache_data
|
|
||||||
|
|
||||||
|
|
||||||
def test_install_twice_uses_cache():
    """Test that running install twice uses existing cache on second run"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        env["API_KEY_2CAPTCHA"] = "test_api_key"

        # First install - downloads the extension
        result1 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        assert result1.returncode == 0, f"First install failed: {result1.stderr}"

        # Verify cache was created
        cache_file = ext_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should exist after first install"

        # Second install - should use cache
        result2 = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"

        # Second run should mention cache reuse.
        # BUG FIX: the original assertion ended with `or result2.returncode == 0`,
        # which was already asserted above and made the check vacuously true.
        assert "already installed" in result2.stdout or "cache" in result2.stdout.lower(), \
            f"Second run did not report cache reuse: {result2.stdout}"
|
|
||||||
|
|
||||||
|
|
||||||
def test_install_warns_without_api_key():
    """Test that install warns when API key not configured"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # BUG FIX: os.environ.copy() may carry a real API_KEY_2CAPTCHA from the
        # developer's shell; remove it so the "not configured" path is
        # reliably exercised.
        env.pop("API_KEY_2CAPTCHA", None)

        # Run install script
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should warn about missing API key
        combined_output = result.stdout + result.stderr
        assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
|
|
||||||
|
|
||||||
|
|
||||||
def test_install_success_with_api_key():
    """Test that install succeeds when API key is configured"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        child_env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123"

        # Run the install hook with a configured API key.
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )

        # The hook should report the key as configured somewhere in its output.
        combined_output = proc.stdout + proc.stderr
        assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
|
|
||||||
|
|
||||||
|
|
||||||
def test_config_script_structure():
    """Test that config script has proper structure"""
    # BUG FIX: CONFIG_SCRIPT is None when the glob matched nothing, which
    # previously raised AttributeError on read_text() instead of a clear
    # assertion failure.
    assert CONFIG_SCRIPT is not None, "No on_Crawl__*_captcha2_config.* script found"

    # Verify the script exists and contains expected markers
    script_content = CONFIG_SCRIPT.read_text()

    # Should mention configuration marker file
    assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content

    # Should mention API key
    assert "API_KEY_2CAPTCHA" in script_content

    # Should have main function or be executable
    assert "async function" in script_content or "main" in script_content
|
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install hook for Chrome/Chromium and puppeteer-core.
|
|
||||||
|
|
||||||
Runs at crawl start to install/find Chromium and puppeteer-core.
|
|
||||||
Outputs JSONL for Binary and Machine config updates.
|
|
||||||
Respects CHROME_BINARY env var for custom binary paths.
|
|
||||||
Uses `npx @puppeteer/browsers install chromium@<pinned build>` and parses its output.
|
|
||||||
|
|
||||||
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
|
||||||
--load-extension and --disable-extensions-except flags, which are needed for
|
|
||||||
loading unpacked extensions in headless mode.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
import subprocess
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def get_chrome_version(binary_path: str) -> str | None:
|
|
||||||
"""Get Chrome/Chromium version string."""
|
|
||||||
try:
|
|
||||||
result = subprocess.run(
|
|
||||||
[binary_path, '--version'],
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=5
|
|
||||||
)
|
|
||||||
if result.returncode == 0:
|
|
||||||
return result.stdout.strip()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def install_puppeteer_core() -> bool:
    """Ensure puppeteer-core is available in NODE_MODULES_DIR.

    Returns True when nothing needs doing (no isolated node_modules is
    configured, or the package already exists) or when `npm install`
    succeeds; False when the install fails.
    """
    node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
    if not node_modules_dir:
        # No isolated node_modules configured — the globally-installed
        # package (if any) will be used instead.
        return True

    node_modules_path = Path(node_modules_dir)
    if (node_modules_path / 'puppeteer-core').exists():
        return True

    # npm's --prefix expects the parent directory of node_modules/.
    npm_prefix = node_modules_path.parent

    try:
        print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
        proc = subprocess.run(
            ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except Exception as e:
        print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
        return False

    if proc.returncode != 0:
        print(f"[!] Failed to install puppeteer-core: {proc.stderr}", file=sys.stderr)
        return False

    print(f"[+] puppeteer-core installed", file=sys.stderr)
    return True
|
|
||||||
|
|
||||||
|
|
||||||
def install_chromium() -> dict | None:
    """Install (or locate an already-cached) Chromium via `npx @puppeteer/browsers`.

    The installer's final stdout line has the form:
        "chromium@<version> <path_to_binary>"
    e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"

    Returns a Binary-style dict (name/abspath/version/binprovider) or None on
    any failure.  Note: npx is fast when chromium is already cached - it
    returns the path without re-downloading.
    """
    try:
        print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)

        # Use --path to install to puppeteer's standard cache location
        cache_path = os.path.expanduser('~/.cache/puppeteer')

        result = subprocess.run(
            ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
            capture_output=True,
            text=True,
            stdin=subprocess.DEVNULL,
            timeout=300
        )

        if result.returncode != 0:
            print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
            return None

        # BUG FIX: parse only the LAST non-empty stdout line — npx can emit
        # banner/progress lines before the final "chromium@<ver> <path>" line,
        # which broke the previous whole-stdout split.
        lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
        output = lines[-1] if lines else ''
        # maxsplit=1 keeps binary paths containing spaces intact.
        parts = output.split(' ', 1)
        if len(parts) != 2:
            print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
            return None

        version_str = parts[0]  # "chromium@1563294"
        binary_path = parts[1].strip()

        if not binary_path or not os.path.exists(binary_path):
            print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
            return None

        # Extract the build number after the '@'
        version = version_str.split('@')[1] if '@' in version_str else None

        print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)

        return {
            'name': 'chromium',
            'abspath': binary_path,
            'version': version,
            'binprovider': 'puppeteer',
        }

    except subprocess.TimeoutExpired:
        print("[!] Chromium install timed out", file=sys.stderr)
    except FileNotFoundError:
        print("[!] npx not found - is Node.js installed?", file=sys.stderr)
    except Exception as e:
        print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)

    return None
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
# Install puppeteer-core if NODE_MODULES_DIR is set
|
|
||||||
install_puppeteer_core()
|
|
||||||
|
|
||||||
# Check if CHROME_BINARY is already set and valid
|
|
||||||
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
|
|
||||||
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
|
|
||||||
version = get_chrome_version(configured_binary)
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Binary',
|
|
||||||
'name': 'chromium',
|
|
||||||
'abspath': configured_binary,
|
|
||||||
'version': version,
|
|
||||||
'binprovider': 'env',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# Install/find Chromium via puppeteer
|
|
||||||
result = install_chromium()
|
|
||||||
|
|
||||||
if result and result.get('abspath'):
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Binary',
|
|
||||||
'name': result['name'],
|
|
||||||
'abspath': result['abspath'],
|
|
||||||
'version': result['version'],
|
|
||||||
'binprovider': result['binprovider'],
|
|
||||||
}))
|
|
||||||
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Machine',
|
|
||||||
'_method': 'update',
|
|
||||||
'key': 'config/CHROME_BINARY',
|
|
||||||
'value': result['abspath'],
|
|
||||||
}))
|
|
||||||
|
|
||||||
if result['version']:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Machine',
|
|
||||||
'_method': 'update',
|
|
||||||
'key': 'config/CHROMIUM_VERSION',
|
|
||||||
'value': result['version'],
|
|
||||||
}))
|
|
||||||
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print("Chromium binary not found", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
@@ -8,8 +8,8 @@
|
|||||||
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||||
* --load-extension and --disable-extensions-except flags.
|
* --load-extension and --disable-extensions-except flags.
|
||||||
*
|
*
|
||||||
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||||
* Output: Creates chrome/ directory under crawl output dir with:
|
* Output: Writes to current directory (executor creates chrome/ dir):
|
||||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||||
* - chrome.pid: Chromium process ID (for cleanup)
|
* - chrome.pid: Chromium process ID (for cleanup)
|
||||||
* - port.txt: Debug port number
|
* - port.txt: Debug port number
|
||||||
@@ -38,11 +38,12 @@ const {
|
|||||||
killChrome,
|
killChrome,
|
||||||
getEnv,
|
getEnv,
|
||||||
writePidWithMtime,
|
writePidWithMtime,
|
||||||
|
getExtensionsDir,
|
||||||
} = require('./chrome_utils.js');
|
} = require('./chrome_utils.js');
|
||||||
|
|
||||||
// Extractor metadata
|
// Extractor metadata
|
||||||
const PLUGIN_NAME = 'chrome_launch';
|
const PLUGIN_NAME = 'chrome_launch';
|
||||||
const OUTPUT_DIR = 'chrome';
|
const OUTPUT_DIR = '.';
|
||||||
|
|
||||||
// Global state for cleanup
|
// Global state for cleanup
|
||||||
let chromePid = null;
|
let chromePid = null;
|
||||||
@@ -115,8 +116,12 @@ async function main() {
|
|||||||
if (version) console.error(`[*] Version: ${version}`);
|
if (version) console.error(`[*] Version: ${version}`);
|
||||||
|
|
||||||
// Load installed extensions
|
// Load installed extensions
|
||||||
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
|
const extensionsDir = getExtensionsDir();
|
||||||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
|
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
|
||||||
|
|
||||||
|
if (userDataDir) {
|
||||||
|
console.error(`[*] Using user data dir: ${userDataDir}`);
|
||||||
|
}
|
||||||
|
|
||||||
const installedExtensions = [];
|
const installedExtensions = [];
|
||||||
const extensionPaths = [];
|
const extensionPaths = [];
|
||||||
@@ -143,17 +148,18 @@ async function main() {
|
|||||||
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
|
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write hook's own PID
|
// Note: PID file is written by run_hook() with hook-specific name
|
||||||
const hookStartTime = Date.now() / 1000;
|
// Snapshot.cleanup() kills all *.pid processes when done
|
||||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||||
}
|
}
|
||||||
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
|
||||||
|
|
||||||
// Launch Chromium using consolidated function
|
// Launch Chromium using consolidated function
|
||||||
|
// userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
|
||||||
const result = await launchChromium({
|
const result = await launchChromium({
|
||||||
binary,
|
binary,
|
||||||
outputDir: OUTPUT_DIR,
|
outputDir: OUTPUT_DIR,
|
||||||
|
userDataDir,
|
||||||
extensionPaths,
|
extensionPaths,
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -165,14 +171,6 @@ async function main() {
|
|||||||
chromePid = result.pid;
|
chromePid = result.pid;
|
||||||
const cdpUrl = result.cdpUrl;
|
const cdpUrl = result.cdpUrl;
|
||||||
|
|
||||||
// Write extensions metadata
|
|
||||||
if (installedExtensions.length > 0) {
|
|
||||||
fs.writeFileSync(
|
|
||||||
path.join(OUTPUT_DIR, 'extensions.json'),
|
|
||||||
JSON.stringify(installedExtensions, null, 2)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Connect puppeteer for extension verification
|
// Connect puppeteer for extension verification
|
||||||
console.error(`[*] Connecting puppeteer to CDP...`);
|
console.error(`[*] Connecting puppeteer to CDP...`);
|
||||||
const browser = await puppeteer.connect({
|
const browser = await puppeteer.connect({
|
||||||
@@ -181,30 +179,102 @@ async function main() {
|
|||||||
});
|
});
|
||||||
browserInstance = browser;
|
browserInstance = browser;
|
||||||
|
|
||||||
// Verify extensions loaded
|
// Get actual extension IDs from chrome://extensions page
|
||||||
if (extensionPaths.length > 0) {
|
if (extensionPaths.length > 0) {
|
||||||
await new Promise(r => setTimeout(r, 3000));
|
await new Promise(r => setTimeout(r, 2000));
|
||||||
|
|
||||||
const targets = browser.targets();
|
try {
|
||||||
console.error(`[*] All browser targets (${targets.length}):`);
|
const extPage = await browser.newPage();
|
||||||
for (const t of targets) {
|
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
|
||||||
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
|
await new Promise(r => setTimeout(r, 2000));
|
||||||
|
|
||||||
|
// Parse extension info from the page
|
||||||
|
const extensionsFromPage = await extPage.evaluate(() => {
|
||||||
|
const extensions = [];
|
||||||
|
// Extensions manager uses shadow DOM
|
||||||
|
const manager = document.querySelector('extensions-manager');
|
||||||
|
if (!manager || !manager.shadowRoot) return extensions;
|
||||||
|
|
||||||
|
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
|
||||||
|
if (!itemList || !itemList.shadowRoot) return extensions;
|
||||||
|
|
||||||
|
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
|
||||||
|
for (const item of items) {
|
||||||
|
const id = item.getAttribute('id');
|
||||||
|
const nameEl = item.shadowRoot?.querySelector('#name');
|
||||||
|
const name = nameEl?.textContent?.trim() || '';
|
||||||
|
if (id && name) {
|
||||||
|
extensions.push({ id, name });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return extensions;
|
||||||
|
});
|
||||||
|
|
||||||
|
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
|
||||||
|
for (const e of extensionsFromPage) {
|
||||||
|
console.error(` - ${e.id}: "${e.name}"`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Match extensions by name (strict matching)
|
||||||
|
for (const ext of installedExtensions) {
|
||||||
|
// Read the extension's manifest to get its display name
|
||||||
|
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
|
||||||
|
if (fs.existsSync(manifestPath)) {
|
||||||
|
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
|
||||||
|
let manifestName = manifest.name || '';
|
||||||
|
|
||||||
|
// Resolve message placeholder (e.g., __MSG_extName__)
|
||||||
|
if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
|
||||||
|
const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
|
||||||
|
const defaultLocale = manifest.default_locale || 'en';
|
||||||
|
const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
|
||||||
|
if (fs.existsSync(messagesPath)) {
|
||||||
|
try {
|
||||||
|
const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
|
||||||
|
if (messages[msgKey] && messages[msgKey].message) {
|
||||||
|
manifestName = messages[msgKey].message;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`[!] Failed to read messages.json: ${e.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
|
||||||
|
|
||||||
|
// Find matching extension from page by exact name match first
|
||||||
|
let match = extensionsFromPage.find(e => e.name === manifestName);
|
||||||
|
|
||||||
|
// If no exact match, try case-insensitive exact match
|
||||||
|
if (!match) {
|
||||||
|
match = extensionsFromPage.find(e =>
|
||||||
|
e.name.toLowerCase() === manifestName.toLowerCase()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (match) {
|
||||||
|
ext.id = match.id;
|
||||||
|
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
|
||||||
|
} else {
|
||||||
|
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
await extPage.close();
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const extTargets = targets.filter(t =>
|
// Fallback: check browser targets
|
||||||
t.url().startsWith('chrome-extension://') ||
|
const targets = browser.targets();
|
||||||
t.type() === 'service_worker' ||
|
|
||||||
t.type() === 'background_page'
|
|
||||||
);
|
|
||||||
|
|
||||||
// Filter out built-in extensions
|
|
||||||
const builtinIds = [
|
const builtinIds = [
|
||||||
'nkeimhogjdpnpccoofpliimaahmaaome',
|
'nkeimhogjdpnpccoofpliimaahmaaome',
|
||||||
'fignfifoniblkonapihmkfakmlgkbkcf',
|
'fignfifoniblkonapihmkfakmlgkbkcf',
|
||||||
'ahfgeienlihckogmohjhadlkjgocpleb',
|
'ahfgeienlihckogmohjhadlkjgocpleb',
|
||||||
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
|
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
|
||||||
];
|
];
|
||||||
const customExtTargets = extTargets.filter(t => {
|
const customExtTargets = targets.filter(t => {
|
||||||
const url = t.url();
|
const url = t.url();
|
||||||
if (!url.startsWith('chrome-extension://')) return false;
|
if (!url.startsWith('chrome-extension://')) return false;
|
||||||
const extId = url.split('://')[1].split('/')[0];
|
const extId = url.split('://')[1].split('/')[0];
|
||||||
@@ -216,7 +286,7 @@ async function main() {
|
|||||||
for (const target of customExtTargets) {
|
for (const target of customExtTargets) {
|
||||||
const url = target.url();
|
const url = target.url();
|
||||||
const extId = url.split('://')[1].split('/')[0];
|
const extId = url.split('://')[1].split('/')[0];
|
||||||
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
|
console.error(`[+] Extension target: ${extId} (${target.type()})`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
|
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
|
||||||
@@ -225,6 +295,14 @@ async function main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Write extensions metadata with actual IDs
|
||||||
|
if (installedExtensions.length > 0) {
|
||||||
|
fs.writeFileSync(
|
||||||
|
path.join(OUTPUT_DIR, 'extensions.json'),
|
||||||
|
JSON.stringify(installedExtensions, null, 2)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
console.error(`[+] Chromium session started for crawl ${crawlId}`);
|
console.error(`[+] Chromium session started for crawl ${crawlId}`);
|
||||||
console.error(`[+] CDP URL: ${cdpUrl}`);
|
console.error(`[+] CDP URL: ${cdpUrl}`);
|
||||||
console.error(`[+] PID: ${chromePid}`);
|
console.error(`[+] PID: ${chromePid}`);
|
||||||
|
|||||||
@@ -1,323 +0,0 @@
|
|||||||
#!/usr/bin/env node
|
|
||||||
/**
|
|
||||||
* Launch a shared Chromium browser session for the entire crawl.
|
|
||||||
*
|
|
||||||
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
|
|
||||||
* Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
|
|
||||||
*
|
|
||||||
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
|
||||||
* --load-extension and --disable-extensions-except flags.
|
|
||||||
*
|
|
||||||
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
|
||||||
* Output: Writes to current directory (executor creates chrome/ dir):
|
|
||||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
|
||||||
* - chrome.pid: Chromium process ID (for cleanup)
|
|
||||||
* - port.txt: Debug port number
|
|
||||||
* - extensions.json: Loaded extensions metadata
|
|
||||||
*
|
|
||||||
* Environment variables:
|
|
||||||
* NODE_MODULES_DIR: Path to node_modules directory for module resolution
|
|
||||||
* CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
|
|
||||||
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
|
|
||||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
|
||||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
|
||||||
* CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
|
||||||
if (process.env.NODE_MODULES_DIR) {
|
|
||||||
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
|
||||||
}
|
|
||||||
|
|
||||||
const fs = require('fs');
|
|
||||||
const path = require('path');
|
|
||||||
const puppeteer = require('puppeteer-core');
|
|
||||||
const {
|
|
||||||
findChromium,
|
|
||||||
launchChromium,
|
|
||||||
killChrome,
|
|
||||||
getEnv,
|
|
||||||
writePidWithMtime,
|
|
||||||
getExtensionsDir,
|
|
||||||
} = require('./chrome_utils.js');
|
|
||||||
|
|
||||||
// Extractor metadata
|
|
||||||
const PLUGIN_NAME = 'chrome_launch';
|
|
||||||
const OUTPUT_DIR = '.';
|
|
||||||
|
|
||||||
// Global state for cleanup
|
|
||||||
let chromePid = null;
|
|
||||||
let browserInstance = null;
|
|
||||||
|
|
||||||
// Parse command line arguments
|
|
||||||
function parseArgs() {
|
|
||||||
const args = {};
|
|
||||||
process.argv.slice(2).forEach((arg) => {
|
|
||||||
if (arg.startsWith('--')) {
|
|
||||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
|
||||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return args;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Cleanup handler for SIGTERM
|
|
||||||
async function cleanup() {
|
|
||||||
console.error('[*] Cleaning up Chrome session...');
|
|
||||||
|
|
||||||
// Try graceful browser close first
|
|
||||||
if (browserInstance) {
|
|
||||||
try {
|
|
||||||
console.error('[*] Closing browser gracefully...');
|
|
||||||
await browserInstance.close();
|
|
||||||
browserInstance = null;
|
|
||||||
console.error('[+] Browser closed gracefully');
|
|
||||||
} catch (e) {
|
|
||||||
console.error(`[!] Graceful close failed: ${e.message}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Kill Chrome process
|
|
||||||
if (chromePid) {
|
|
||||||
await killChrome(chromePid, OUTPUT_DIR);
|
|
||||||
}
|
|
||||||
|
|
||||||
process.exit(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register signal handlers
|
|
||||||
process.on('SIGTERM', cleanup);
|
|
||||||
process.on('SIGINT', cleanup);
|
|
||||||
|
|
||||||
async function main() {
|
|
||||||
const args = parseArgs();
|
|
||||||
const crawlId = args.crawl_id;
|
|
||||||
|
|
||||||
try {
|
|
||||||
const binary = findChromium();
|
|
||||||
if (!binary) {
|
|
||||||
console.error('ERROR: Chromium binary not found');
|
|
||||||
console.error('DEPENDENCY_NEEDED=chromium');
|
|
||||||
console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
|
|
||||||
console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
|
|
||||||
process.exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get Chromium version
|
|
||||||
let version = '';
|
|
||||||
try {
|
|
||||||
const { execSync } = require('child_process');
|
|
||||||
version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 })
|
|
||||||
.trim()
|
|
||||||
.slice(0, 64);
|
|
||||||
} catch (e) {}
|
|
||||||
|
|
||||||
console.error(`[*] Using browser: ${binary}`);
|
|
||||||
if (version) console.error(`[*] Version: ${version}`);
|
|
||||||
|
|
||||||
// Load installed extensions
|
|
||||||
const extensionsDir = getExtensionsDir();
|
|
||||||
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
|
|
||||||
|
|
||||||
if (userDataDir) {
|
|
||||||
console.error(`[*] Using user data dir: ${userDataDir}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
const installedExtensions = [];
|
|
||||||
const extensionPaths = [];
|
|
||||||
if (fs.existsSync(extensionsDir)) {
|
|
||||||
const files = fs.readdirSync(extensionsDir);
|
|
||||||
for (const file of files) {
|
|
||||||
if (file.endsWith('.extension.json')) {
|
|
||||||
try {
|
|
||||||
const extPath = path.join(extensionsDir, file);
|
|
||||||
const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
|
|
||||||
if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
|
|
||||||
installedExtensions.push(extData);
|
|
||||||
extensionPaths.push(extData.unpacked_path);
|
|
||||||
console.error(`[*] Loading extension: ${extData.name || file}`);
|
|
||||||
}
|
|
||||||
} catch (e) {
|
|
||||||
console.warn(`[!] Skipping invalid extension cache: ${file}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (installedExtensions.length > 0) {
|
|
||||||
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Note: PID file is written by run_hook() with hook-specific name
|
|
||||||
// Snapshot.cleanup() kills all *.pid processes when done
|
|
||||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
|
||||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Launch Chromium using consolidated function
|
|
||||||
// userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
|
|
||||||
const result = await launchChromium({
|
|
||||||
binary,
|
|
||||||
outputDir: OUTPUT_DIR,
|
|
||||||
userDataDir,
|
|
||||||
extensionPaths,
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!result.success) {
|
|
||||||
console.error(`ERROR: ${result.error}`);
|
|
||||||
process.exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
chromePid = result.pid;
|
|
||||||
const cdpUrl = result.cdpUrl;
|
|
||||||
|
|
||||||
// Connect puppeteer for extension verification
|
|
||||||
console.error(`[*] Connecting puppeteer to CDP...`);
|
|
||||||
const browser = await puppeteer.connect({
|
|
||||||
browserWSEndpoint: cdpUrl,
|
|
||||||
defaultViewport: null,
|
|
||||||
});
|
|
||||||
browserInstance = browser;
|
|
||||||
|
|
||||||
// Get actual extension IDs from chrome://extensions page
|
|
||||||
if (extensionPaths.length > 0) {
|
|
||||||
await new Promise(r => setTimeout(r, 2000));
|
|
||||||
|
|
||||||
try {
|
|
||||||
const extPage = await browser.newPage();
|
|
||||||
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
|
|
||||||
await new Promise(r => setTimeout(r, 2000));
|
|
||||||
|
|
||||||
// Parse extension info from the page
|
|
||||||
const extensionsFromPage = await extPage.evaluate(() => {
|
|
||||||
const extensions = [];
|
|
||||||
// Extensions manager uses shadow DOM
|
|
||||||
const manager = document.querySelector('extensions-manager');
|
|
||||||
if (!manager || !manager.shadowRoot) return extensions;
|
|
||||||
|
|
||||||
const itemList = manager.shadowRoot.querySelector('extensions-item-list');
|
|
||||||
if (!itemList || !itemList.shadowRoot) return extensions;
|
|
||||||
|
|
||||||
const items = itemList.shadowRoot.querySelectorAll('extensions-item');
|
|
||||||
for (const item of items) {
|
|
||||||
const id = item.getAttribute('id');
|
|
||||||
const nameEl = item.shadowRoot?.querySelector('#name');
|
|
||||||
const name = nameEl?.textContent?.trim() || '';
|
|
||||||
if (id && name) {
|
|
||||||
extensions.push({ id, name });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return extensions;
|
|
||||||
});
|
|
||||||
|
|
||||||
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
|
|
||||||
for (const e of extensionsFromPage) {
|
|
||||||
console.error(` - ${e.id}: "${e.name}"`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Match extensions by name (strict matching)
|
|
||||||
for (const ext of installedExtensions) {
|
|
||||||
// Read the extension's manifest to get its display name
|
|
||||||
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
|
|
||||||
if (fs.existsSync(manifestPath)) {
|
|
||||||
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
|
|
||||||
let manifestName = manifest.name || '';
|
|
||||||
|
|
||||||
// Resolve message placeholder (e.g., __MSG_extName__)
|
|
||||||
if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
|
|
||||||
const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
|
|
||||||
const defaultLocale = manifest.default_locale || 'en';
|
|
||||||
const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
|
|
||||||
if (fs.existsSync(messagesPath)) {
|
|
||||||
try {
|
|
||||||
const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
|
|
||||||
if (messages[msgKey] && messages[msgKey].message) {
|
|
||||||
manifestName = messages[msgKey].message;
|
|
||||||
}
|
|
||||||
} catch (e) {
|
|
||||||
console.error(`[!] Failed to read messages.json: ${e.message}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);
|
|
||||||
|
|
||||||
// Find matching extension from page by exact name match first
|
|
||||||
let match = extensionsFromPage.find(e => e.name === manifestName);
|
|
||||||
|
|
||||||
// If no exact match, try case-insensitive exact match
|
|
||||||
if (!match) {
|
|
||||||
match = extensionsFromPage.find(e =>
|
|
||||||
e.name.toLowerCase() === manifestName.toLowerCase()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (match) {
|
|
||||||
ext.id = match.id;
|
|
||||||
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
|
|
||||||
} else {
|
|
||||||
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
await extPage.close();
|
|
||||||
} catch (e) {
|
|
||||||
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback: check browser targets
|
|
||||||
const targets = browser.targets();
|
|
||||||
const builtinIds = [
|
|
||||||
'nkeimhogjdpnpccoofpliimaahmaaome',
|
|
||||||
'fignfifoniblkonapihmkfakmlgkbkcf',
|
|
||||||
'ahfgeienlihckogmohjhadlkjgocpleb',
|
|
||||||
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
|
|
||||||
];
|
|
||||||
const customExtTargets = targets.filter(t => {
|
|
||||||
const url = t.url();
|
|
||||||
if (!url.startsWith('chrome-extension://')) return false;
|
|
||||||
const extId = url.split('://')[1].split('/')[0];
|
|
||||||
return !builtinIds.includes(extId);
|
|
||||||
});
|
|
||||||
|
|
||||||
console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`);
|
|
||||||
|
|
||||||
for (const target of customExtTargets) {
|
|
||||||
const url = target.url();
|
|
||||||
const extId = url.split('://')[1].split('/')[0];
|
|
||||||
console.error(`[+] Extension target: ${extId} (${target.type()})`);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
|
|
||||||
console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
|
|
||||||
console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write extensions metadata with actual IDs
|
|
||||||
if (installedExtensions.length > 0) {
|
|
||||||
fs.writeFileSync(
|
|
||||||
path.join(OUTPUT_DIR, 'extensions.json'),
|
|
||||||
JSON.stringify(installedExtensions, null, 2)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
console.error(`[+] Chromium session started for crawl ${crawlId}`);
|
|
||||||
console.error(`[+] CDP URL: ${cdpUrl}`);
|
|
||||||
console.error(`[+] PID: ${chromePid}`);
|
|
||||||
|
|
||||||
// Stay alive to handle cleanup on SIGTERM
|
|
||||||
console.log('[*] Chromium launch hook staying alive to handle cleanup...');
|
|
||||||
setInterval(() => {}, 1000000);
|
|
||||||
|
|
||||||
} catch (e) {
|
|
||||||
console.error(`ERROR: ${e.name}: ${e.message}`);
|
|
||||||
process.exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
main().catch((e) => {
|
|
||||||
console.error(`Fatal error: ${e.message}`);
|
|
||||||
process.exit(1);
|
|
||||||
});
|
|
||||||
@@ -1,59 +0,0 @@
|
|||||||
#!/usr/bin/env node
|
|
||||||
/**
|
|
||||||
* I Still Don't Care About Cookies Extension Plugin
|
|
||||||
*
|
|
||||||
* Installs and configures the "I still don't care about cookies" Chrome extension
|
|
||||||
* for automatic cookie consent banner dismissal during page archiving.
|
|
||||||
*
|
|
||||||
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
|
|
||||||
*
|
|
||||||
* Priority: 02 (early) - Must install before Chrome session starts at Crawl level
|
|
||||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
|
||||||
*
|
|
||||||
* This extension automatically:
|
|
||||||
* - Dismisses cookie consent popups
|
|
||||||
* - Removes cookie banners
|
|
||||||
* - Accepts necessary cookies to proceed with browsing
|
|
||||||
* - Works on thousands of websites out of the box
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Import extension utilities
|
|
||||||
const { installExtensionWithCache } = require('../chrome/chrome_utils.js');
|
|
||||||
|
|
||||||
// Extension metadata
|
|
||||||
const EXTENSION = {
|
|
||||||
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
|
|
||||||
name: 'istilldontcareaboutcookies',
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Main entry point - install extension before archiving
|
|
||||||
*
|
|
||||||
* Note: This extension works out of the box with no configuration needed.
|
|
||||||
* It automatically detects and dismisses cookie banners on page load.
|
|
||||||
*/
|
|
||||||
async function main() {
|
|
||||||
const extension = await installExtensionWithCache(EXTENSION);
|
|
||||||
|
|
||||||
if (extension) {
|
|
||||||
console.log('[+] Cookie banners will be automatically dismissed during archiving');
|
|
||||||
}
|
|
||||||
|
|
||||||
return extension;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Export functions for use by other plugins
|
|
||||||
module.exports = {
|
|
||||||
EXTENSION,
|
|
||||||
};
|
|
||||||
|
|
||||||
// Run if executed directly
|
|
||||||
if (require.main === module) {
|
|
||||||
main().then(() => {
|
|
||||||
console.log('[✓] I Still Don\'t Care About Cookies extension setup complete');
|
|
||||||
process.exit(0);
|
|
||||||
}).catch(err => {
|
|
||||||
console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err);
|
|
||||||
process.exit(1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
@@ -1,281 +0,0 @@
|
|||||||
#!/usr/bin/env node
|
|
||||||
/**
|
|
||||||
* SingleFile Extension Plugin
|
|
||||||
*
|
|
||||||
* DISABLED: Extension functionality commented out - using single-file-cli only
|
|
||||||
*
|
|
||||||
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
|
|
||||||
* Falls back to single-file-cli if the extension is not available.
|
|
||||||
*
|
|
||||||
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
|
|
||||||
*
|
|
||||||
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
|
|
||||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
|
||||||
*
|
|
||||||
* This extension automatically:
|
|
||||||
* - Saves complete web pages as single HTML files
|
|
||||||
* - Inlines all resources (CSS, JS, images, fonts)
|
|
||||||
* - Preserves page fidelity better than wget/curl
|
|
||||||
* - Works with SPAs and dynamically loaded content
|
|
||||||
*/
|
|
||||||
|
|
||||||
const path = require('path');
|
|
||||||
const fs = require('fs');
|
|
||||||
const { promisify } = require('util');
|
|
||||||
const { exec } = require('child_process');
|
|
||||||
|
|
||||||
const execAsync = promisify(exec);
|
|
||||||
|
|
||||||
// DISABLED: Extension functionality - using single-file-cli only
|
|
||||||
// // Import extension utilities
|
|
||||||
// const extensionUtils = require('../chrome/chrome_utils.js');
|
|
||||||
|
|
||||||
// // Extension metadata
|
|
||||||
// const EXTENSION = {
|
|
||||||
// webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
|
|
||||||
// name: 'singlefile',
|
|
||||||
// };
|
|
||||||
|
|
||||||
// // Get extensions directory from environment or use default
|
|
||||||
// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
|
||||||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
|
||||||
|
|
||||||
// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
|
|
||||||
// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
|
|
||||||
|
|
||||||
const OUTPUT_DIR = '.';
|
|
||||||
const OUTPUT_FILE = 'singlefile.html';
|
|
||||||
|
|
||||||
// DISABLED: Extension functionality - using single-file-cli only
|
|
||||||
// /**
|
|
||||||
// * Install the SingleFile extension
|
|
||||||
// */
|
|
||||||
// async function installSinglefileExtension() {
|
|
||||||
// console.log('[*] Installing SingleFile extension...');
|
|
||||||
|
|
||||||
// // Install the extension
|
|
||||||
// const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
|
||||||
|
|
||||||
// if (!extension) {
|
|
||||||
// console.error('[❌] Failed to install SingleFile extension');
|
|
||||||
// return null;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// console.log('[+] SingleFile extension installed');
|
|
||||||
// console.log('[+] Web pages will be saved as single HTML files');
|
|
||||||
|
|
||||||
// return extension;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// /**
|
|
||||||
// * Wait for a specified amount of time
|
|
||||||
// */
|
|
||||||
// function wait(ms) {
|
|
||||||
// return new Promise(resolve => setTimeout(resolve, ms));
|
|
||||||
// }
|
|
||||||
|
|
||||||
// /**
|
|
||||||
// * Save a page using the SingleFile extension
|
|
||||||
// *
|
|
||||||
// * @param {Object} page - Puppeteer page object
|
|
||||||
// * @param {Object} extension - Extension metadata with dispatchAction method
|
|
||||||
// * @param {Object} options - Additional options
|
|
||||||
// * @returns {Promise<string|null>} - Path to saved file or null on failure
|
|
||||||
// */
|
|
||||||
// async function saveSinglefileWithExtension(page, extension, options = {}) {
|
|
||||||
// if (!extension || !extension.version) {
|
|
||||||
// throw new Error('SingleFile extension not found or not loaded');
|
|
||||||
// }
|
|
||||||
|
|
||||||
// const url = await page.url();
|
|
||||||
|
|
||||||
// // Check for unsupported URL schemes
|
|
||||||
// const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
|
|
||||||
// const scheme = url.split(':')[0];
|
|
||||||
// if (URL_SCHEMES_IGNORED.includes(scheme)) {
|
|
||||||
// console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
|
|
||||||
// return null;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// // Ensure downloads directory exists
|
|
||||||
// await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
|
|
||||||
|
|
||||||
// // Get list of existing files to ignore
|
|
||||||
// const files_before = new Set(
|
|
||||||
// (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
|
||||||
// .filter(fn => fn.endsWith('.html'))
|
|
||||||
// );
|
|
||||||
|
|
||||||
// // Output directory is current directory (hook already runs in output dir)
|
|
||||||
// const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
|
||||||
|
|
||||||
// console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
|
|
||||||
|
|
||||||
// // Bring page to front (extension action button acts on foreground tab)
|
|
||||||
// await page.bringToFront();
|
|
||||||
|
|
||||||
// // Trigger the extension's action (toolbar button click)
|
|
||||||
// await extension.dispatchAction();
|
|
||||||
|
|
||||||
// // Wait for file to appear in downloads directory
|
|
||||||
// const check_delay = 3000; // 3 seconds
|
|
||||||
// const max_tries = 10;
|
|
||||||
// let files_new = [];
|
|
||||||
|
|
||||||
// for (let attempt = 0; attempt < max_tries; attempt++) {
|
|
||||||
// await wait(check_delay);
|
|
||||||
|
|
||||||
// const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
|
||||||
// .filter(fn => fn.endsWith('.html'));
|
|
||||||
|
|
||||||
// files_new = files_after.filter(file => !files_before.has(file));
|
|
||||||
|
|
||||||
// if (files_new.length === 0) {
|
|
||||||
// continue;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// // Find the matching file by checking if it contains the URL in the HTML header
|
|
||||||
// for (const file of files_new) {
|
|
||||||
// const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
|
|
||||||
// const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
|
||||||
// const dl_header = dl_text.split('meta charset')[0];
|
|
||||||
|
|
||||||
// if (dl_header.includes(`url: ${url}`)) {
|
|
||||||
// console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
|
|
||||||
// await fs.promises.rename(dl_path, out_path);
|
|
||||||
// return out_path;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
|
|
||||||
// console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
|
|
||||||
// return null;
|
|
||||||
// }
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Save a page using single-file-cli (fallback method)
|
|
||||||
*
|
|
||||||
* @param {string} url - URL to archive
|
|
||||||
* @param {Object} options - Additional options
|
|
||||||
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
|
||||||
*/
|
|
||||||
/**
 * Save a page using single-file-cli (fallback method)
 *
 * Locates the `single-file` binary on PATH, runs it against `url`, and
 * verifies a non-empty output file was produced.
 *
 * @param {string} url - URL to archive
 * @param {Object} options - Additional options
 * @param {string} [options.userAgent] - Browser user agent to send
 * @param {string} [options.cookiesFile] - Path to a cookies file to load
 * @param {boolean} [options.ignoreSSL] - Ignore invalid TLS certificates
 * @param {number} [options.timeout] - Max runtime in ms (default 120000)
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
  console.log('[*] Falling back to single-file-cli...');

  // POSIX single-quote each argument before joining into a shell string,
  // so URLs/user-agents containing spaces, '&', ';', or quotes cannot
  // break the command or inject extra shell commands.
  const shellQuote = (arg) => `'${String(arg).replace(/'/g, `'\\''`)}'`;

  // Find single-file binary
  let binary = null;
  try {
    const { stdout } = await execAsync('which single-file');
    binary = stdout.trim();
  } catch (err) {
    console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
    return null;
  }

  // Output directory is current directory (hook already runs in output dir)
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

  // Build command: [binary, flags..., url, out_path]
  const cmd = [
    binary,
    '--browser-headless',
    url,
    out_path,
  ];

  // Add optional args (inserted before the positional url/out_path pair)
  if (options.userAgent) {
    cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
  }
  if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
    cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
  }
  if (options.ignoreSSL) {
    cmd.splice(2, 0, '--browser-ignore-insecure-certs');
  }

  // Execute
  try {
    const timeout = options.timeout || 120000;
    await execAsync(cmd.map(shellQuote).join(' '), { timeout });

    // Treat a missing or zero-byte output file as a failure even if the
    // CLI exited 0.
    if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
      console.log(`[+] SingleFile saved via CLI: ${out_path}`);
      return out_path;
    }

    console.error('[❌] SingleFile CLI completed but no output file found');
    return null;
  } catch (err) {
    console.error(`[❌] SingleFile CLI error: ${err.message}`);
    return null;
  }
}
|
|
||||||
|
|
||||||
// DISABLED: Extension functionality - using single-file-cli only
|
|
||||||
// /**
|
|
||||||
// * Main entry point - install extension before archiving
|
|
||||||
// */
|
|
||||||
// async function main() {
|
|
||||||
// // Check if extension is already cached
|
|
||||||
// const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
|
|
||||||
|
|
||||||
// if (fs.existsSync(cacheFile)) {
|
|
||||||
// try {
|
|
||||||
// const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
|
||||||
// const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
|
||||||
|
|
||||||
// if (fs.existsSync(manifestPath)) {
|
|
||||||
// console.log('[*] SingleFile extension already installed (using cache)');
|
|
||||||
// return cached;
|
|
||||||
// }
|
|
||||||
// } catch (e) {
|
|
||||||
// // Cache file corrupted, re-install
|
|
||||||
// console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// // Install extension
|
|
||||||
// const extension = await installSinglefileExtension();
|
|
||||||
|
|
||||||
// // Export extension metadata for chrome plugin to load
|
|
||||||
// if (extension) {
|
|
||||||
// // Write extension info to a cache file that chrome plugin can read
|
|
||||||
// await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
|
||||||
// await fs.promises.writeFile(
|
|
||||||
// cacheFile,
|
|
||||||
// JSON.stringify(extension, null, 2)
|
|
||||||
// );
|
|
||||||
// console.log(`[+] Extension metadata written to ${cacheFile}`);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// return extension;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// Export functions for use by other plugins
|
|
||||||
// Export functions for use by other plugins
// (only the CLI-based saver is live; extension-based saving is disabled)
module.exports = {
  // DISABLED: Extension functionality - using single-file-cli only
  // EXTENSION,
  // installSinglefileExtension,
  // saveSinglefileWithExtension,
  saveSinglefileWithCLI,
};
|
|
||||||
|
|
||||||
// DISABLED: Extension functionality - using single-file-cli only
|
|
||||||
// // Run if executed directly
|
|
||||||
// if (require.main === module) {
|
|
||||||
// main().then(() => {
|
|
||||||
// console.log('[✓] SingleFile extension setup complete');
|
|
||||||
// process.exit(0);
|
|
||||||
// }).catch(err => {
|
|
||||||
// console.error('[❌] SingleFile extension setup failed:', err);
|
|
||||||
// process.exit(1);
|
|
||||||
// });
|
|
||||||
// }
|
|
||||||
|
|
||||||
// No-op when run directly (extension install disabled).
// Exits 0 so the hook runner treats this plugin as a successful no-op step.
if (require.main === module) {
  console.log('[*] SingleFile extension install disabled - using single-file-cli only');
  process.exit(0);
}
|
|
||||||
@@ -1,116 +0,0 @@
|
|||||||
#!/usr/bin/env node
|
|
||||||
/**
|
|
||||||
* uBlock Origin Extension Plugin
|
|
||||||
*
|
|
||||||
* Installs and configures the uBlock Origin Chrome extension for ad blocking
|
|
||||||
* and privacy protection during page archiving.
|
|
||||||
*
|
|
||||||
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
|
|
||||||
*
|
|
||||||
* Priority: 03 (early) - Must install before Chrome session starts at Crawl level
|
|
||||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
|
||||||
*
|
|
||||||
* This extension automatically:
|
|
||||||
* - Blocks ads, trackers, and malware domains
|
|
||||||
* - Reduces page load time and bandwidth usage
|
|
||||||
* - Improves privacy during archiving
|
|
||||||
* - Removes clutter from archived pages
|
|
||||||
* - Uses efficient blocking with filter lists
|
|
||||||
*/
|
|
||||||
|
|
||||||
const path = require('path');
|
|
||||||
const fs = require('fs');
|
|
||||||
|
|
||||||
// Import extension utilities
|
|
||||||
const extensionUtils = require('../chrome/chrome_utils.js');
|
|
||||||
|
|
||||||
// Extension metadata: Chrome Web Store ID plus the short name used for the
// on-disk install directory and the metadata cache file.
const EXTENSION = {
  webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
  name: 'ublock',
};

// Get extensions directory from environment or use default
// (falls back to <DATA_DIR>/personas/<ACTIVE_PERSONA>/chrome_extensions)
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
|
||||||
|
|
||||||
/**
 * Install the uBlock Origin extension
 *
 * Delegates the download/unpack work to chrome_utils'
 * loadOrInstallExtension and logs the outcome.
 *
 * @returns {Promise<Object|null>} extension metadata, or null on failure
 */
async function installUblockExtension() {
  console.log('[*] Installing uBlock Origin extension...');

  const result = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);

  if (result) {
    console.log('[+] uBlock Origin extension installed');
    console.log('[+] Ads and trackers will be blocked during archiving');
    return result;
  }

  console.error('[❌] Failed to install uBlock Origin extension');
  return null;
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Note: uBlock Origin works automatically with default filter lists.
|
|
||||||
* No configuration needed - blocks ads, trackers, and malware domains out of the box.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/**
 * Main entry point - install extension before archiving
 *
 * Reuses a previously cached install when its metadata cache and unpacked
 * manifest are both still present; otherwise installs fresh and persists
 * the metadata for the chrome plugin to load at browser launch.
 *
 * @returns {Promise<Object|null>} extension metadata, or null on failure
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');

  // Fast path: reuse the cached install if it still looks intact.
  if (fs.existsSync(cacheFile)) {
    try {
      const cachedMeta = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      const manifest = path.join(cachedMeta.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifest)) {
        console.log('[*] uBlock Origin extension already installed (using cache)');
        return cachedMeta;
      }
    } catch (err) {
      // Unreadable/corrupt cache: fall through and install from scratch.
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }

  // Slow path: install, then write metadata so the chrome plugin can
  // load the extension when it launches the browser.
  const installed = await installUblockExtension();

  if (installed) {
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
      cacheFile,
      JSON.stringify(installed, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }

  return installed;
}
|
|
||||||
|
|
||||||
// Export functions for use by other plugins
// (e.g. the chrome plugin reads EXTENSION metadata at launch time)
module.exports = {
  EXTENSION,
  installUblockExtension,
};
|
|
||||||
|
|
||||||
// Run if executed directly: install the extension, exiting 0 on success
// and 1 on failure so the hook runner can detect setup errors.
if (require.main === module) {
  (async () => {
    try {
      await main();
      console.log('[✓] uBlock Origin extension setup complete');
      process.exit(0);
    } catch (err) {
      console.error('[❌] uBlock Origin extension setup failed:', err);
      process.exit(1);
    }
  })();
}
|
|
||||||
@@ -1,130 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Validate and compute derived wget config values.
|
|
||||||
|
|
||||||
This hook runs early in the Crawl lifecycle to:
|
|
||||||
1. Validate config values with warnings (not hard errors)
|
|
||||||
2. Compute derived values (USE_WGET from WGET_ENABLED)
|
|
||||||
3. Check binary availability and version
|
|
||||||
|
|
||||||
Output:
|
|
||||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
|
||||||
- Binary JSONL records to stdout when binaries are found
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from abx_pkg import Binary, EnvProvider
|
|
||||||
|
|
||||||
|
|
||||||
# Read config from environment (already validated by JSONSchema)
def get_env(name: str, default: str = '') -> str:
    """Return env var ``name`` stripped of surrounding whitespace, or ``default``."""
    value = os.environ.get(name, default)
    return value.strip()

def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized values fall back to ``default``."""
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')
    value = get_env(name, '').lower()
    if value in truthy:
        return True
    return False if value in falsy else default

def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var; non-numeric values fall back to ``default``."""
    try:
        return int(get_env(name, str(default)))
    except ValueError:
        return default
|
|
||||||
|
|
||||||
|
|
||||||
def output_binary(binary: Binary, name: str):
    """Output Binary JSONL record to stdout.

    Emits a single JSON line describing the resolved binary so the hook
    runner can record it (abspath/version/sha256 plus the machine id from
    the MACHINE_ID env var, if set).
    """
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Validate wget config, compute derived values, and probe the binary.

    Prints COMPUTED:KEY=VALUE lines to stdout (parsed by hooks.py into env),
    WARNING:/ERROR: lines to stderr, and exits non-zero on hard errors.
    """
    warnings = []
    errors = []
    computed = {}

    # Get config values
    wget_enabled = get_env_bool('WGET_ENABLED', True)
    # NOTE(review): wget_save_warc is read but never used below — confirm
    # whether a WARC-related computed value was intended here.
    wget_save_warc = get_env_bool('WGET_SAVE_WARC', True)
    # WGET_TIMEOUT=0/unset falls back to the generic TIMEOUT (default 60).
    wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    wget_binary = get_env('WGET_BINARY', 'wget')

    # Compute derived values (USE_WGET for backward compatibility)
    use_wget = wget_enabled
    computed['USE_WGET'] = str(use_wget).lower()

    # Validate timeout with warning (not error)
    if use_wget and wget_timeout < 20:
        warnings.append(
            f"WGET_TIMEOUT={wget_timeout} is very low. "
            "wget may fail to archive sites if set to less than ~20 seconds. "
            "Consider setting WGET_TIMEOUT=60 or higher."
        )

    # Check binary availability using abx-pkg; any lookup failure is
    # treated the same as "binary not found".
    provider = EnvProvider()
    try:
        binary = Binary(name=wget_binary, binproviders=[provider]).load()
        binary_path = str(binary.abspath) if binary.abspath else ''
    except Exception:
        binary = None
        binary_path = ''

    if not binary_path:
        # Only a hard error when wget is actually enabled.
        if use_wget:
            errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.")
        computed['WGET_BINARY'] = ''
    else:
        computed['WGET_BINARY'] = binary_path
        wget_version = str(binary.version) if binary.version else 'unknown'
        computed['WGET_VERSION'] = wget_version

        # Output Binary JSONL record
        output_binary(binary, name='wget')

    # Check for compression support: --compression=auto exists only in
    # newer wget builds, so probe with --help and record the result.
    if computed.get('WGET_BINARY'):
        try:
            result = subprocess.run(
                [computed['WGET_BINARY'], '--compression=auto', '--help'],
                capture_output=True, timeout=5
            )
            computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
        except Exception:
            computed['WGET_AUTO_COMPRESSION'] = 'false'

    # Output results
    # Format: KEY=VALUE lines that hooks.py will parse and add to env
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")

    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)

    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)

    # Exit with error if any hard errors
    sys.exit(1 if errors else 0)
|
|
||||||
|
|
||||||
|
|
||||||
# Entry point when run directly as a hook script.
if __name__ == '__main__':
    main()
|
|
||||||
Reference in New Issue
Block a user