wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,80 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"CHROME_BINARY": {
"type": "string",
"default": "chromium",
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
"description": "Path to Chrome/Chromium binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"x-aliases": ["NODEJS_BINARY"],
"description": "Path to Node.js binary (for Puppeteer)"
},
"CHROME_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Chrome operations in seconds"
},
"CHROME_HEADLESS": {
"type": "boolean",
"default": true,
"description": "Run Chrome in headless mode"
},
"CHROME_SANDBOX": {
"type": "boolean",
"default": true,
"description": "Enable Chrome sandbox (disable in Docker with --no-sandbox)"
},
"CHROME_RESOLUTION": {
"type": "string",
"default": "1440,2000",
"pattern": "^\\d+,\\d+$",
"x-fallback": "RESOLUTION",
"description": "Browser viewport resolution (width,height)"
},
"CHROME_USER_DATA_DIR": {
"type": "string",
"default": "",
"description": "Path to Chrome user data directory for persistent sessions"
},
"CHROME_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string for Chrome"
},
"CHROME_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra command-line arguments for Chrome (space-separated)"
},
"CHROME_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"SAVE_SCREENSHOT": {
"type": "boolean",
"default": true,
"description": "Enable screenshot capture"
},
"SAVE_PDF": {
"type": "boolean",
"default": true,
"description": "Enable PDF generation"
},
"SAVE_DOM": {
"type": "boolean",
"default": true,
"description": "Enable DOM capture"
}
}
}

View File

@@ -0,0 +1,150 @@
#!/usr/bin/env python3
"""
Validation hook for Chrome/Chromium binary.
Runs at crawl start to verify Chrome is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
# Common Chrome/Chromium binary names to probe on $PATH, in preference order
# (Chromium variants first, then Google Chrome variants).
CHROME_NAMES = [
    'chromium',
    'chromium-browser',
    'google-chrome',
    'google-chrome-stable',
    'chrome',
]
# Well-known absolute install locations (macOS app bundles, Linux distro
# packages, snap, Google's /opt install) checked when nothing is on $PATH.
CHROME_PATHS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    '/snap/bin/chromium',
    '/opt/google/chrome/chrome',
]
def get_binary_version(abspath: str) -> str | None:
"""Get version string from Chrome binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout:
# Chrome version string: "Google Chrome 120.0.6099.109" or "Chromium 120.0.6099.109"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
# Find version number (looks like 120.0.6099.109)
for part in parts:
if '.' in part and part[0].isdigit():
return part
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def _chrome_record(abspath: str) -> dict:
    """Build the info dict describing a found Chrome binary at *abspath*."""
    return {
        'name': 'chrome',
        'abspath': abspath,
        'version': get_binary_version(abspath),
        'sha256': get_binary_hash(abspath),
        'binprovider': 'env',
    }


def find_chrome() -> dict | None:
    """Find a Chrome/Chromium binary and return an info dict, or None.

    Search order:
      1. explicit CHROME_BINARY env var (must point at an existing file),
      2. $PATH lookup of each name in CHROME_NAMES,
      3. well-known absolute install paths in CHROME_PATHS.

    The previous version built the identical result dict in three places;
    construction is now centralized in _chrome_record().
    """
    env_path = os.environ.get('CHROME_BINARY', '')
    if env_path and Path(env_path).is_file():
        return _chrome_record(env_path)
    for name in CHROME_NAMES:
        abspath = shutil.which(name)
        if abspath:
            return _chrome_record(abspath)
    for path in CHROME_PATHS:
        if Path(path).is_file():
            return _chrome_record(path)
    return None
def main():
    """Locate Chrome and emit JSONL records describing the outcome.

    Success: prints an InstalledBinary record plus Machine config updates
    for CHROME_BINARY (and CHROME_VERSION when known), then exits 0.
    Failure: prints a Dependency record asking for chrome to be installed
    via apt/brew/env, logs to stderr, and exits 1.
    """
    result = find_chrome()
    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/CHROME_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/CHROME_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Ask the orchestrator to install Chrome via one of these providers.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        # Fixed: was an f-string with no placeholders (ruff F541).
        print("Chrome/Chromium binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
Validate and compute derived Chrome config values.
This hook runs early in the Crawl lifecycle to:
1. Auto-detect Chrome binary location
2. Compute sandbox settings based on Docker detection
3. Validate binary availability and version
4. Set computed env vars for subsequent hooks
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- InstalledBinary JSONL records to stdout when binaries are found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
# Chrome binary search order: names probed one by one (via abx-pkg's
# EnvProvider) when the configured CHROME_BINARY does not resolve.
CHROME_BINARY_NAMES = [
    'chromium',
    'chromium-browser',
    'google-chrome',
    'google-chrome-stable',
    'chrome',
]
def get_env(name: str, default: str = '') -> str:
    """Return env var *name* with surrounding whitespace stripped, or *default*."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret env var *name* as a boolean.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive); any other
    value — including unset — yields *default*.
    """
    raw = get_env(name, '').lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    return False if raw in ('false', '0', 'no', 'off') else default
def detect_docker() -> bool:
    """Best-effort check for a containerized environment (Docker/Podman)."""
    if os.path.exists('/.dockerenv'):
        return True
    if os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes'):
        return True
    # Podman (and some other runtimes) create this marker instead.
    return os.path.exists('/run/.containerenv')
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
    """Resolve a usable Chrome/Chromium Binary via abx-pkg.

    The explicitly configured name/path is tried first, then each of the
    well-known names in CHROME_BINARY_NAMES. Returns None if nothing loads
    with a valid abspath.
    """
    candidates = ([configured] if configured else []) + CHROME_BINARY_NAMES
    for candidate in candidates:
        try:
            loaded = Binary(name=candidate, binproviders=[provider]).load()
        except Exception:
            continue
        if loaded.abspath:
            return loaded
    return None
def output_installed_binary(binary: Binary, name: str):
    """Print an InstalledBinary JSONL record for *binary* to stdout.

    Missing version/sha256 become empty strings; MACHINE_ID is read from the
    environment so the record can be attributed to this machine.
    """
    print(json.dumps({
        'type': 'InstalledBinary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }))
def main():
    """Validate Chrome config, auto-detect binaries, and emit derived values.

    Prints COMPUTED:KEY=VALUE lines on stdout (parsed by hooks.py into env),
    plus InstalledBinary JSONL records for binaries that were found.
    Warnings/errors go to stderr; exits 1 if any error was recorded.
    """
    warnings = []
    errors = []
    computed = {}  # COMPUTED:KEY=VALUE pairs emitted at the end
    # Get config values
    chrome_binary = get_env('CHROME_BINARY', 'chromium')
    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
    save_screenshot = get_env_bool('SAVE_SCREENSHOT', True)
    save_pdf = get_env_bool('SAVE_PDF', True)
    save_dom = get_env_bool('SAVE_DOM', True)
    # Compute USE_CHROME (derived from SAVE_* flags): Chrome is needed iff
    # at least one Chrome-based artifact is enabled.
    use_chrome = save_screenshot or save_pdf or save_dom
    computed['USE_CHROME'] = str(use_chrome).lower()
    # Detect Docker and adjust sandbox
    in_docker = detect_docker()
    computed['IN_DOCKER'] = str(in_docker).lower()
    if in_docker and chrome_sandbox:
        warnings.append(
            "Running in Docker with CHROME_SANDBOX=true. "
            "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
        )
        # Auto-disable sandbox in Docker unless explicitly set
        # (get_env returns '' when the var is unset, so the default-True
        # value above came from the fallback, not from the user).
        if not get_env('CHROME_SANDBOX'):
            computed['CHROME_SANDBOX'] = 'false'
    # Find Chrome binary using abx-pkg
    provider = EnvProvider()
    if use_chrome:
        chrome = find_chrome_binary(chrome_binary, provider)
        if not chrome or not chrome.abspath:
            errors.append(
                f"Chrome binary not found (tried: {chrome_binary}). "
                "Install Chrome/Chromium or set CHROME_BINARY path."
            )
            computed['CHROME_BINARY'] = ''
        else:
            computed['CHROME_BINARY'] = str(chrome.abspath)
            computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'
            # Output InstalledBinary JSONL record for Chrome
            output_installed_binary(chrome, name='chrome')
    # Check Node.js for Puppeteer (needed by the Chrome-driving extractors)
    node_binary_name = get_env('NODE_BINARY', 'node')
    try:
        node = Binary(name=node_binary_name, binproviders=[provider]).load()
        node_path = str(node.abspath) if node.abspath else ''
    except Exception:
        node = None
        node_path = ''
    # Missing Node is only an error when Chrome features are enabled;
    # otherwise we still record whatever path (possibly '') we found.
    if use_chrome and not node_path:
        errors.append(
            f"Node.js not found (tried: {node_binary_name}). "
            "Install Node.js or set NODE_BINARY path for Puppeteer."
        )
    else:
        computed['NODE_BINARY'] = node_path
        if node and node.abspath:
            # Output InstalledBinary JSONL record for Node
            output_installed_binary(node, name='node')
    # Output computed values (stdout) and diagnostics (stderr)
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")
    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)
    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)
    sys.exit(1 if errors else 0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,350 @@
#!/usr/bin/env node
/**
* Start a Chrome browser session for use by other extractors.
*
* This extractor ONLY launches Chrome and creates a blank page - it does NOT navigate.
* Pre-load extractors (21-29) can connect via CDP to register listeners before navigation.
* The chrome_navigate extractor (30) performs the actual page load.
*
* Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>
* Output: Creates chrome_session/ with:
* - cdp_url.txt: WebSocket URL for CDP connection
* - pid.txt: Chrome process ID (for cleanup)
* - page_id.txt: Target ID of the page for other extractors to use
* - url.txt: The URL to be navigated to (for chrome_navigate)
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_session';
const OUTPUT_DIR = 'chrome_session';
// Directory holding unpacked Chrome extensions and their *.extension.json
// cache files; defaults to the active persona's dir under DATA_DIR.
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
// Parse --key=value CLI flags into an object.
// Dashes in flag names become underscores; a bare --flag maps to `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const body = token.slice(2);
    const eq = body.indexOf('=');
    const key = (eq === -1 ? body : body.slice(0, eq)).replace(/-/g, '_');
    const value = eq === -1 ? '' : body.slice(eq + 1);
    parsed[key] = value || true;
  }
  return parsed;
}
// Read an env var (falling back to defaultValue for unset/empty), trimmed.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Interpret an env var as a boolean; unrecognized or missing values
// fall through to defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Interpret an env var as a base-10 integer, using defaultValue when unparsable.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Locate a Chrome/Chromium executable.
// Order: the CHROME_BINARY env var (if it points at an existing file),
// then a list of well-known install paths. Returns null if none exist.
function findChrome() {
  const configured = getEnv('CHROME_BINARY');
  if (configured && fs.existsSync(configured)) {
    return configured;
  }
  const knownPaths = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];
  return knownPaths.find((p) => fs.existsSync(p)) || null;
}
// Parse a "WIDTH,HEIGHT" string into { width, height }.
// Missing or unparsable components default to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',');
  const width = parseInt((parts[0] || '').trim(), 10) || 1440;
  const height = parseInt((parts[1] || '').trim(), 10) || 2000;
  return { width, height };
}
// Discover extensions previously installed by extension plugins.
// Reads each *.extension.json cache file in EXTENSIONS_DIR and keeps only
// entries whose unpacked manifest.json still exists on disk.
function loadInstalledExtensions() {
  const found = [];
  if (!fs.existsSync(EXTENSIONS_DIR)) {
    return found;
  }
  const cacheFiles = fs.readdirSync(EXTENSIONS_DIR)
    .filter((name) => name.endsWith('.extension.json'));
  for (const cacheFile of cacheFiles) {
    try {
      const raw = fs.readFileSync(path.join(EXTENSIONS_DIR, cacheFile), 'utf-8');
      const extension = JSON.parse(raw);
      // Verify the extension is actually unpacked before advertising it.
      if (fs.existsSync(path.join(extension.unpacked_path, 'manifest.json'))) {
        found.push(extension);
        console.log(`[+] Loaded extension: ${extension.name} (${extension.webstore_id})`);
      }
    } catch (e) {
      console.warn(`[⚠️] Failed to load extension from ${cacheFile}: ${e.message}`);
    }
  }
  return found;
}
// Launch Chrome with a blank page and leave it running for later extractors.
// Writes CDP URL, browser PID, page target ID, and the target URL into
// OUTPUT_DIR so pre-load extractors can attach via CDP before navigation.
// Returns { success, output, cdpUrl, targetId } or { success: false, error }.
// NOTE(review): --no-sandbox is always passed regardless of CHROME_SANDBOX —
// confirm this is intentional.
async function startChromeSession(url, binary) {
  // Config, with legacy fallbacks (RESOLUTION / USER_AGENT / CHECK_SSL_VALIDITY).
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Load installed extensions and translate them into launch args
  const extensions = loadInstalledExtensions();
  const extensionArgs = extensionUtils.getExtensionLaunchArgs(extensions);
  if (extensions.length > 0) {
    console.log(`[*] Loading ${extensions.length} Chrome extensions...`);
  }
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  let browser = null;
  try {
    // Launch browser with Puppeteer
    browser = await puppeteer.launch({
      executablePath: binary,
      headless: headless ? 'new' : false,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-sync',
        '--no-first-run',
        '--no-default-browser-check',
        '--disable-default-apps',
        '--disable-infobars',
        // Hide the automation fingerprint from pages
        '--disable-blink-features=AutomationControlled',
        '--disable-component-update',
        '--disable-domain-reliability',
        '--disable-breakpad',
        '--disable-background-networking',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-ipc-flooding-protection',
        '--password-store=basic',
        '--use-mock-keychain',
        // Deterministic rendering for reproducible screenshots/PDFs
        '--font-render-hinting=none',
        '--force-color-profile=srgb',
        `--window-size=${width},${height}`,
        ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ...extensionArgs,
      ],
      defaultViewport: { width, height },
    });
    // Get the WebSocket endpoint URL so other extractors can connect via CDP
    const cdpUrl = browser.wsEndpoint();
    fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
    // Write PID for cleanup by whatever tears the session down later
    const browserProcess = browser.process();
    if (browserProcess) {
      fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(browserProcess.pid));
    }
    // Create a new page (but DON'T navigate yet — chrome_navigate does that)
    const page = await browser.newPage();
    // Set user agent if specified
    if (userAgent) {
      await page.setUserAgent(userAgent);
    }
    // Write the page target ID so other extractors can find this specific page.
    // NOTE(review): _targetId is a private puppeteer field — confirm no public
    // accessor exists in the pinned puppeteer-core version.
    const target = page.target();
    const targetId = target._targetId;
    fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId);
    // Write the URL for chrome_navigate to use
    fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
    // Connect to loaded extensions at runtime (only if not already done);
    // extensions.json acts as the "already initialized" marker.
    const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
    if (extensions.length > 0 && !fs.existsSync(extensionsFile)) {
      console.log('[*] Connecting to loaded extensions (first time setup)...');
      try {
        const loadedExtensions = await extensionUtils.loadAllExtensionsFromBrowser(browser, extensions);
        // Write loaded extensions metadata for other extractors to use
        fs.writeFileSync(extensionsFile, JSON.stringify(loadedExtensions, null, 2));
        console.log(`[+] Extensions loaded and available at ${extensionsFile}`);
        console.log(`[+] ${loadedExtensions.length} extensions ready for configuration by subsequent plugins`);
      } catch (e) {
        console.warn(`[⚠️] Failed to load extensions from browser: ${e.message}`);
      }
    } else if (extensions.length > 0) {
      console.log('[*] Extensions already loaded from previous snapshot');
    }
    // Don't close browser - leave it running for other extractors.
    // Detach puppeteer from browser so it stays running after we exit.
    browser.disconnect();
    return { success: true, output: OUTPUT_DIR, cdpUrl, targetId };
  } catch (e) {
    // Kill browser if startup failed so we don't leak a Chrome process
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        // Ignore
      }
    }
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
// Entry point: parse args, locate Chrome, start the session, and report
// results in the extractor's START_TS/END_TS/STATUS/RESULT_JSON protocol.
// Exits 0 on success, 1 on any failure.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let version = '';
  try {
    // chrome_session launches Chrome and creates a blank page
    // Pre-load extractors (21-29) register CDP listeners
    // chrome_navigate (30) performs actual navigation
    const binary = findChrome();
    if (!binary) {
      console.error('ERROR: Chrome/Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chrome');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
      process.exit(1);
    }
    // Get Chrome version. Use execFileSync with an argv array (no shell)
    // instead of execSync with an interpolated string, so binary paths
    // containing quotes/spaces/metacharacters can't break the command.
    try {
      const { execFileSync } = require('child_process');
      version = execFileSync(binary, ['--version'], { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
    } catch (e) {
      version = '';
    }
    const result = await startChromeSession(url, binary);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Chrome session started (no navigation yet): ${result.cdpUrl}`);
      console.log(`Page target ID: ${result.targetId}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results in the line-oriented KEY=VALUE protocol
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (version) {
    console.log(`VERSION=${version}`);
  }
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print machine-readable JSON summary
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    cmd_version: version,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level entry: surface any unhandled rejection as a fatal error (exit 1).
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});