#!/usr/bin/env node
/**
 * Create a Chrome tab for this snapshot in the shared crawl Chrome session.
 *
 * If a crawl-level Chrome session exists (from on_Crawl__10_chrome_session.js),
 * this connects to it and creates a new tab. Otherwise, falls back to launching
 * its own Chrome instance.
 *
 * Usage: on_Snapshot__20_chrome_session.js --url=URL --snapshot-id=ID [--crawl-id=ID]
 * Output: Writes into the current directory (the hook's output directory):
 *   - cdp_url.txt: WebSocket URL for CDP connection (copied or new)
 *   - pid.txt: Chrome process ID (from crawl or new)
 *   - page_id.txt: Target ID of this snapshot's tab
 *   - url.txt: The URL to be navigated to
 *   - shared_session.txt: 'true' when the crawl's Chrome is reused, 'false' otherwise
 *
 * Environment variables:
 *   DATA_DIR: Data directory (to find crawl's Chrome session)
 *   CHROME_BINARY: Path to Chrome/Chromium binary (for fallback)
 *   CHROME_RESOLUTION: Page resolution (default: 1440,2000; falls back to RESOLUTION)
 *   CHROME_USER_AGENT: User agent string (optional; falls back to USER_AGENT)
 *   CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates
 *                              (default: true; falls back to CHECK_SSL_VALIDITY)
 *   CHROME_HEADLESS: Run in headless mode (default: true)
 */

const fs = require('fs');
const path = require('path');
const net = require('net');
const { spawn, execFileSync } = require('child_process');
const http = require('http');
const puppeteer = require('puppeteer-core');

// Extractor metadata
const EXTRACTOR_NAME = 'chrome_session';
const OUTPUT_DIR = '.'; // Hook already runs in the output directory

/**
 * Parse `--key=value` command line arguments.
 *
 * Dashes in keys become underscores (`--snapshot-id` -> `snapshot_id`).
 * A flag without a value (or with an empty value) is stored as `true`.
 *
 * @returns {Object<string, string|boolean>} Parsed arguments.
 */
function parseArgs() {
  const args = {};
  process.argv.slice(2).forEach((arg) => {
    if (arg.startsWith('--')) {
      // Only the first '=' separates key from value; the value may contain '='.
      const [key, ...valueParts] = arg.slice(2).split('=');
      args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
    }
  });
  return args;
}

/**
 * Read an environment variable, trimmed, with a default for unset/empty values.
 *
 * @param {string} name - Environment variable name.
 * @param {string} [defaultValue=''] - Value used when the variable is unset or empty.
 * @returns {string} The trimmed value.
 */
function getEnv(name, defaultValue = '') {
  return (process.env[name] || defaultValue).trim();
}

/**
 * Read a boolean environment variable.
 *
 * Accepts true/1/yes/on and false/0/no/off (case-insensitive); anything else
 * yields `defaultValue`.
 *
 * @param {string} name - Environment variable name.
 * @param {boolean} [defaultValue=false] - Value used when unset or unrecognized.
 * @returns {boolean}
 */
function getEnvBool(name, defaultValue = false) {
  const val = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(val)) return true;
  if (['false', '0', 'no', 'off'].includes(val)) return false;
  return defaultValue;
}

/**
 * Find a Chrome binary (for fallback).
 *
 * Prefers CHROME_BINARY when it points at an existing file, otherwise probes a
 * list of well-known install locations.
 *
 * @returns {string|null} Path to the binary, or null when none exists.
 */
function findChrome() {
  const chromeBinary = getEnv('CHROME_BINARY');
  if (chromeBinary && fs.existsSync(chromeBinary)) {
    return chromeBinary;
  }

  const candidates = [
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];

  for (const candidate of candidates) {
    if (fs.existsSync(candidate)) {
      return candidate;
    }
  }

  return null;
}

/**
 * Parse a "width,height" resolution string.
 *
 * @param {string} resolution - e.g. "1440,2000".
 * @returns {{width: number, height: number}} Missing or unparsable (or zero)
 *   components fall back to 1440 and 2000 respectively.
 */
function parseResolution(resolution) {
  const [width, height] = resolution.split(',').map((x) => parseInt(x.trim(), 10));
  return { width: width || 1440, height: height || 2000 };
}

/**
 * Ask the OS for a currently-free TCP port by binding port 0 and closing again.
 *
 * @returns {Promise<number>} The port number that was assigned.
 */
function findFreePort() {
  return new Promise((resolve, reject) => {
    const server = net.createServer();
    server.unref();
    server.on('error', reject);
    server.listen(0, () => {
      const port = server.address().port;
      server.close(() => resolve(port));
    });
  });
}

/**
 * Wait for Chrome's DevTools port to be ready.
 *
 * Polls http://127.0.0.1:PORT/json/version until it returns parsable JSON.
 *
 * @param {number} port - DevTools debugging port.
 * @param {number} [timeout=30000] - Overall deadline in milliseconds.
 * @returns {Promise<Object>} The parsed /json/version payload.
 */
function waitForDebugPort(port, timeout = 30000) {
  const startTime = Date.now();

  return new Promise((resolve, reject) => {
    const tryConnect = () => {
      if (Date.now() - startTime > timeout) {
        reject(new Error(`Timeout waiting for Chrome debug port ${port}`));
        return;
      }

      // Each attempt may finish exactly once. req.destroy() in the timeout
      // handler also fires the 'error' handler, so without this guard a single
      // timed-out attempt would schedule two retries and the number of
      // concurrent polling chains would keep doubling.
      let finished = false;
      const retry = () => {
        if (finished) return;
        finished = true;
        setTimeout(tryConnect, 100);
      };

      const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => {
        let data = '';
        res.on('data', (chunk) => {
          data += chunk;
        });
        res.on('error', retry);
        res.on('end', () => {
          if (finished) return;
          let info;
          try {
            info = JSON.parse(data);
          } catch (e) {
            // Not valid JSON (yet): poll again.
            retry();
            return;
          }
          finished = true;
          resolve(info);
        });
      });

      req.on('error', retry);

      req.setTimeout(1000, () => {
        req.destroy();
        retry();
      });
    };

    tryConnect();
  });
}

/**
 * Try to find the crawl's Chrome session.
 *
 * Looks for cdp_url.txt and pid.txt under
 * DATA_DIR/tmp/crawl_CRAWLID/chrome_session and checks that the recorded
 * process still exists.
 *
 * @param {string} crawlId - Crawl identifier; falsy means "no crawl".
 * @returns {{cdpUrl: string, pid: number}|null} Session info, or null when
 *   there is no usable session.
 */
function findCrawlChromeSession(crawlId) {
  if (!crawlId) return null;

  const dataDir = getEnv('DATA_DIR', '.');
  const crawlChromeDir = path.join(dataDir, 'tmp', `crawl_${crawlId}`, 'chrome_session');
  const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt');
  const pidFile = path.join(crawlChromeDir, 'pid.txt');

  if (!fs.existsSync(cdpFile) || !fs.existsSync(pidFile)) {
    return null;
  }

  let cdpUrl;
  let pid;
  try {
    cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
    pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
  } catch (e) {
    return null;
  }

  // Reject empty/garbage files. pid <= 0 must never reach process.kill():
  // pid 0 addresses the caller's own process group and would always "exist".
  if (!cdpUrl || !Number.isInteger(pid) || pid <= 0) {
    return null;
  }

  // Verify the process is still running
  try {
    process.kill(pid, 0); // Signal 0 = check if process exists
    return { cdpUrl, pid };
  } catch (e) {
    // EPERM means the process exists but belongs to another user.
    if (e.code === 'EPERM') {
      return { cdpUrl, pid };
    }
    // Process not running
    return null;
  }
}

/**
 * Read viewport size and user agent from the environment.
 *
 * @returns {{width: number, height: number, userAgent: string}}
 */
function readPageSettings() {
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const { width, height } = parseResolution(resolution);
  return { width, height, userAgent };
}

/**
 * Apply viewport and user agent to a page and return its target ID.
 *
 * @param {Object} page - Puppeteer page.
 * @param {{width: number, height: number, userAgent: string}} settings
 * @returns {Promise<string>} The page's target ID.
 * @throws {Error} When the target ID cannot be read from the page.
 */
async function configurePage(page, { width, height, userAgent }) {
  await page.setViewport({ width, height });

  if (userAgent) {
    await page.setUserAgent(userAgent);
  }

  // NOTE(review): `_targetId` is an underscore-prefixed Puppeteer field;
  // confirm it is available in the installed puppeteer-core version.
  const targetId = page.target()._targetId;
  if (!targetId) {
    throw new Error('Unable to determine the target ID of the page');
  }
  return targetId;
}

/**
 * Write one session info file into the output directory.
 *
 * @param {string} name - File name, e.g. 'pid.txt'.
 * @param {string|number} contents - Value to store (stringified).
 */
function writeSessionFile(name, contents) {
  fs.writeFileSync(path.join(OUTPUT_DIR, name), String(contents));
}

/**
 * Create a new tab in an existing Chrome session.
 *
 * @param {string} cdpUrl - Browser WebSocket endpoint of the running Chrome.
 * @param {string} url - URL this snapshot will navigate to (recorded only).
 * @param {number} pid - PID of the running Chrome (recorded only).
 * @returns {Promise<Object>} Result with success, output, cdpUrl, targetId,
 *   pid and shared=true.
 */
async function createTabInExistingChrome(cdpUrl, url, pid) {
  const settings = readPageSettings();

  console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`);

  // Connect Puppeteer to the running Chrome
  const browser = await puppeteer.connect({
    browserWSEndpoint: cdpUrl,
    defaultViewport: { width: settings.width, height: settings.height },
  });

  try {
    // Create a new tab for this snapshot
    const page = await browser.newPage();
    const targetId = await configurePage(page, settings);

    // Write session info
    writeSessionFile('cdp_url.txt', cdpUrl);
    writeSessionFile('pid.txt', pid);
    writeSessionFile('page_id.txt', targetId);
    writeSessionFile('url.txt', url);
    writeSessionFile('shared_session.txt', 'true');

    return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid, shared: true };
  } finally {
    // Disconnect Puppeteer (Chrome and tab stay alive), also on failure.
    await browser.disconnect();
  }
}

/**
 * Fallback: Launch a new Chrome instance for this snapshot.
 *
 * @param {string} url - URL this snapshot will navigate to (recorded only).
 * @param {string} binary - Path to the Chrome/Chromium executable.
 * @returns {Promise<Object>} On success: success=true plus output, cdpUrl,
 *   targetId, pid and shared=false. On failure: success=false and error.
 */
async function launchNewChrome(url, binary) {
  const settings = readPageSettings();
  const { width, height } = settings;
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);

  // Find a free port for Chrome DevTools
  const debugPort = await findFreePort();
  console.log(`[*] Launching new Chrome on port: ${debugPort}`);

  // Build Chrome arguments
  const chromeArgs = [
    `--remote-debugging-port=${debugPort}`,
    '--remote-debugging-address=127.0.0.1',
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--disable-sync',
    '--no-first-run',
    '--no-default-browser-check',
    '--disable-default-apps',
    '--disable-infobars',
    '--disable-blink-features=AutomationControlled',
    '--disable-component-update',
    '--disable-domain-reliability',
    '--disable-breakpad',
    '--disable-background-networking',
    '--disable-background-timer-throttling',
    '--disable-backgrounding-occluded-windows',
    '--disable-renderer-backgrounding',
    '--disable-ipc-flooding-protection',
    '--password-store=basic',
    '--use-mock-keychain',
    '--font-render-hinting=none',
    '--force-color-profile=srgb',
    `--window-size=${width},${height}`,
    ...(headless ? ['--headless=new'] : []),
    ...(checkSsl ? [] : ['--ignore-certificate-errors']),
    'about:blank',
  ];

  // Launch Chrome as a detached process (since no crawl-level Chrome exists)
  const chromeProcess = spawn(binary, chromeArgs, {
    detached: true,
    stdio: ['ignore', 'ignore', 'ignore'],
  });

  // A failed spawn is reported through an 'error' event; without a listener
  // that event would crash the script as an uncaught exception.
  let spawnError = null;
  chromeProcess.on('error', (err) => {
    spawnError = err;
  });
  chromeProcess.unref();

  const chromePid = chromeProcess.pid;
  if (chromePid === undefined) {
    // Let the pending 'error' event fire so its message can be reported.
    await new Promise((resolve) => setImmediate(resolve));
    const reason = spawnError ? spawnError.message : 'unknown error';
    return { success: false, error: `Error: Failed to spawn Chrome (${binary}): ${reason}` };
  }

  console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);

  // Write PID immediately for cleanup
  writeSessionFile('pid.txt', chromePid);

  let browser = null;
  try {
    // Wait for Chrome to be ready
    const versionInfo = await waitForDebugPort(debugPort, 30000);
    console.log(`[+] Chrome ready: ${versionInfo.Browser}`);

    const wsUrl = versionInfo.webSocketDebuggerUrl;
    fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), wsUrl);

    // Connect Puppeteer to get page info
    browser = await puppeteer.connect({
      browserWSEndpoint: wsUrl,
      defaultViewport: { width, height },
    });

    // Reuse the initial about:blank tab when there is one.
    const pages = await browser.pages();
    const page = pages[0] || (await browser.newPage());
    const targetId = await configurePage(page, settings);

    writeSessionFile('page_id.txt', targetId);
    writeSessionFile('url.txt', url);
    writeSessionFile('shared_session.txt', 'false');

    await browser.disconnect();

    return { success: true, output: OUTPUT_DIR, cdpUrl: wsUrl, targetId, pid: chromePid, shared: false };
  } catch (e) {
    if (browser) {
      try {
        await browser.disconnect();
      } catch (disconnectErr) {
        // Best effort: Chrome is about to be terminated anyway.
      }
    }
    try {
      process.kill(chromePid, 'SIGTERM');
    } catch (killErr) {
      // Ignore: the process may already be gone.
    }
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}

/**
 * Entry point: obtain a Chrome tab for the snapshot and print the result.
 *
 * Prints START_TS/END_TS/DURATION/VERSION/OUTPUT/STATUS lines and a
 * RESULT_JSON line, then exits with 0 on success and 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  const crawlId = args.crawl_id;

  // A bare `--url` parses to boolean true; reject it here instead of failing
  // later, after a tab or a whole browser has already been created.
  if (!url || typeof url !== 'string' || !snapshotId) {
    console.error('Usage: on_Snapshot__20_chrome_session.js --url= --snapshot-id= [--crawl-id=]');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let version = '';

  try {
    const binary = findChrome();
    if (!binary) {
      console.error('ERROR: Chrome/Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chrome');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
      process.exit(1);
    }

    // Get Chrome version. The binary is executed directly (no shell), so
    // spaces or quotes in the path cannot break or alter the command.
    try {
      version = execFileSync(binary, ['--version'], { encoding: 'utf8', timeout: 5000 })
        .trim()
        .slice(0, 64);
    } catch (e) {
      version = '';
    }

    // Try to use existing crawl Chrome session
    const crawlSession = findCrawlChromeSession(crawlId);
    let result;

    if (crawlSession) {
      console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
      result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
    } else {
      console.log(`[*] No crawl Chrome session found, launching new Chrome`);
      result = await launchNewChrome(url, binary);
    }

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`[+] Chrome session ready (shared: ${result.shared})`);
      console.log(`[+] CDP URL: ${result.cdpUrl}`);
      console.log(`[+] Page target ID: ${result.targetId}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (version) {
    console.log(`VERSION=${version}`);
  }
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    crawl_id: crawlId || null,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    cmd_version: version,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}

main().catch((e) => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});