diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js
index 02288067..13d49b73 100755
--- a/archivebox/plugins/chrome/chrome_utils.js
+++ b/archivebox/plugins/chrome/chrome_utils.js
@@ -1512,6 +1512,173 @@ async function installExtensionWithCache(extension, options = {}) {
     return installedExt;
 }
 
+// ============================================================================
+// Snapshot Hook Utilities (for CDP-based plugins like ssl, responses, dns)
+// ============================================================================
+
+/**
+ * Parse command line arguments (process.argv.slice(2)) into an object.
+ * Handles --key=value, --key= (kept as an empty string) and bare --flag (true); dashes in keys become underscores.
+ *
+ * @returns {Object} - Parsed arguments object
+ */
+function parseArgs() {
+    const args = {};
+    process.argv.slice(2).forEach(arg => {
+        if (arg.startsWith('--')) {
+            const [key, ...valueParts] = arg.slice(2).split('=');
+            args[key.replace(/-/g, '_')] = valueParts.length > 0 ? valueParts.join('=') : true;
+        }
+    });
+    return args;
+}
+
+/**
+ * Wait for Chrome session files to be ready.
+ * Polls for cdp_url.txt and target_id.txt in the chrome session directory.
+ *
+ * @param {string} chromeSessionDir - Path to chrome session directory (e.g., '../chrome')
+ * @param {number} [timeoutMs=60000] - Timeout in milliseconds
+ * @returns {Promise<boolean>} - True if files are ready, false if timeout
+ */
+async function waitForChromeSession(chromeSessionDir, timeoutMs = 60000) {
+    const cdpFile = path.join(chromeSessionDir, 'cdp_url.txt');
+    const targetIdFile = path.join(chromeSessionDir, 'target_id.txt');
+    const startTime = Date.now();
+
+    while (Date.now() - startTime < timeoutMs) {
+        if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
+            return true;
+        }
+        await new Promise(resolve => setTimeout(resolve, 100));
+    }
+
+    return false;
+}
+
+/**
+ * Read CDP WebSocket URL from chrome session directory.
+ *
+ * @param {string} chromeSessionDir - Path to chrome session directory
+ * @returns {string|null} - CDP URL or null if not found
+ */
+function readCdpUrl(chromeSessionDir) {
+    const cdpFile = path.join(chromeSessionDir, 'cdp_url.txt');
+    if (fs.existsSync(cdpFile)) {
+        return fs.readFileSync(cdpFile, 'utf8').trim();
+    }
+    return null;
+}
+
+/**
+ * Read target ID from chrome session directory.
+ *
+ * @param {string} chromeSessionDir - Path to chrome session directory
+ * @returns {string|null} - Target ID or null if not found
+ */
+function readTargetId(chromeSessionDir) {
+    const targetIdFile = path.join(chromeSessionDir, 'target_id.txt');
+    if (fs.existsSync(targetIdFile)) {
+        return fs.readFileSync(targetIdFile, 'utf8').trim();
+    }
+    return null;
+}
+
+/**
+ * Connect to Chrome browser and find the target page.
+ * This is a high-level utility that handles all the connection logic:
+ * 1. Wait for chrome session files
+ * 2. Connect to browser via CDP
+ * 3. Find the target page by ID (falls back to the last open page)
+ *
+ * @param {Object} options - Connection options
+ * @param {string} [options.chromeSessionDir='../chrome'] - Path to chrome session directory
+ * @param {number} [options.timeoutMs=60000] - Timeout for waiting on the session files
+ * @param {Object} [options.puppeteer] - Puppeteer module (must be passed in)
+ * @returns {Promise<Object>} - { browser, page, targetId, cdpUrl }
+ * @throws {Error} - If puppeteer is missing, the session is not ready, or no page is found
+ */
+async function connectToPage(options = {}) {
+    const {
+        chromeSessionDir = '../chrome',
+        timeoutMs = 60000,
+        puppeteer,
+    } = options;
+
+    if (!puppeteer) {
+        throw new Error('puppeteer module must be passed to connectToPage()');
+    }
+
+    // Wait for chrome session to be ready
+    const sessionReady = await waitForChromeSession(chromeSessionDir, timeoutMs);
+    if (!sessionReady) {
+        throw new Error(`Chrome session not ready after ${timeoutMs/1000}s (chrome plugin must run first)`);
+    }
+
+    // Read session files
+    const cdpUrl = readCdpUrl(chromeSessionDir);
+    if (!cdpUrl) {
+        throw new Error('No Chrome session found (cdp_url.txt missing)');
+    }
+
+    const targetId = readTargetId(chromeSessionDir);
+
+    // Connect to browser
+    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
+
+    // Find the target page (NOTE: matches on the underscore-prefixed target._targetId field)
+    const pages = await browser.pages();
+    let page = null;
+
+    if (targetId) {
+        page = pages.find(p => {
+            const target = p.target();
+            return target && target._targetId === targetId;
+        });
+    }
+
+    // Fallback to last page if target not found
+    if (!page) {
+        page = pages[pages.length - 1];
+    }
+
+    if (!page) {
+        await browser.disconnect();  // release the CDP connection instead of leaking it on failure
+        throw new Error('No page found in browser');
+    }
+
+    return { browser, page, targetId, cdpUrl };
+}
+
+/**
+ * Wait for page navigation to complete.
+ * Polls for page_loaded.txt marker file written by chrome_navigate.
+ *
+ * @param {string} chromeSessionDir - Path to chrome session directory
+ * @param {number} [timeoutMs=120000] - Timeout in milliseconds (wall-clock)
+ * @param {number} [postLoadDelayMs=0] - Additional delay after page load marker
+ * @returns {Promise<void>}
+ * @throws {Error} - If timeout waiting for navigation
+ */
+async function waitForPageLoaded(chromeSessionDir, timeoutMs = 120000, postLoadDelayMs = 0) {
+    const pageLoadedMarker = path.join(chromeSessionDir, 'page_loaded.txt');
+    const pollInterval = 100;
+    const deadline = Date.now() + timeoutMs;  // wall-clock deadline, same basis as waitForChromeSession
+
+    while (!fs.existsSync(pageLoadedMarker) && Date.now() < deadline) {
+        await new Promise(resolve => setTimeout(resolve, pollInterval));
+    }
+
+    if (!fs.existsSync(pageLoadedMarker)) {
+        throw new Error('Timeout waiting for navigation (chrome_navigate did not complete)');
+    }
+
+    // Optional post-load delay for late responses
+    if (postLoadDelayMs > 0) {
+        await new Promise(resolve => setTimeout(resolve, postLoadDelayMs));
+    }
+}
+
 // Export all functions
 module.exports = {
     // Environment helpers
@@ -1559,6 +1726,13 @@ module.exports = {
     installExtensionWithCache,
     // Deprecated - use enableExtensions option instead
     getExtensionLaunchArgs,
+    // Snapshot hook utilities (shared by the CDP-based dns, ssl and responses plugins)
+    parseArgs,
+    waitForChromeSession,
+    readCdpUrl,
+    readTargetId,
+    connectToPage,
+    waitForPageLoaded,
 };
 
 // CLI usage
diff --git a/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js b/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js
index 6ee13242..cc977fb7 100755
--- a/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js
+++ b/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js
@@ -19,57 +19,19 @@ if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_
 const puppeteer = require('puppeteer-core');
 
 // Import shared utilities from chrome_utils.js
-const chromeUtils = require('../chrome/chrome_utils.js');
-const { getEnv, getEnvBool, getEnvInt } = chromeUtils;
+const {
+    getEnvBool,
+    getEnvInt,
+    parseArgs,
+    connectToPage,
+    waitForPageLoaded,
+} = require('../chrome/chrome_utils.js');
 
 const PLUGIN_NAME = 'dns';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'dns.jsonl';
 const CHROME_SESSION_DIR = '../chrome';
 
-function parseArgs() {
-    const args = {};
-    process.argv.slice(2).forEach(arg => {
-        if (arg.startsWith('--')) {
-            const [key, ...valueParts] = arg.slice(2).split('=');
-            args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
-        }
-    });
-    return args;
-}
-
-// Chrome session file helpers (these are local to each plugin's working directory)
-async function waitForChromeTabOpen(timeoutMs = 60000) {
-    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
-    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
-    const startTime = Date.now();
-
-    while (Date.now() - startTime < timeoutMs) {
-        if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
-            return true;
-        }
-        await new Promise(resolve => setTimeout(resolve, 100));
-    }
-
-    return false;
-}
-
-function getCdpUrl() {
-    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
-    if (fs.existsSync(cdpFile)) {
-        return fs.readFileSync(cdpFile, 'utf8').trim();
-    }
-    return null;
-}
-
-function getPageId() {
-    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
-    if (fs.existsSync(targetIdFile)) {
-        return fs.readFileSync(targetIdFile, 'utf8').trim();
-    }
-    return null;
-}
-
 function extractHostname(url) {
     try {
         const urlObj = new URL(url);
@@ -91,37 +53,12 @@ async function setupListener(targetUrl) {
     // Track request IDs to their URLs for correlation
     const requestUrls = new Map();
 
-    // Wait for chrome tab to be open
-    const tabOpen = await waitForChromeTabOpen(timeout);
-    if (!tabOpen) {
-        throw new Error(`Chrome tab not open after ${timeout/1000}s (chrome plugin must run first)`);
-    }
-
-    const cdpUrl = getCdpUrl();
-    if (!cdpUrl) {
-        throw new Error('No Chrome session found');
-    }
-
-    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
-
-    // Find our page
-    const pages = await browser.pages();
-    const targetId = getPageId();
-    let page = null;
-
-    if (targetId) {
-        page = pages.find(p => {
-            const target = p.target();
-            return target && target._targetId === targetId;
-        });
-    }
-    if (!page) {
-        page = pages[pages.length - 1];
-    }
-
-    if (!page) {
-        throw new Error('No page found');
-    }
+    // Connect to the Chrome tab via the shared connectToPage() helper (waits up to `timeout` ms for the session files)
+    const { browser, page } = await connectToPage({
+        chromeSessionDir: CHROME_SESSION_DIR,
+        timeoutMs: timeout,
+        puppeteer,
+    });
 
     // Get CDP session for low-level network events
     const client = await page.target().createCDPSession();
@@ -233,26 +170,6 @@ async function setupListener(targetUrl) {
     return { browser, page, client };
 }
 
-async function waitForNavigation() {
-    // Wait for chrome_navigate to complete (it writes page_loaded.txt)
-    const pageLoadedMarker = path.join(CHROME_SESSION_DIR, 'page_loaded.txt');
-    const maxWait = getEnvInt('DNS_TIMEOUT', 30) * 1000 * 4;  // 4x timeout for navigation
-    const pollInterval = 100;
-    let waitTime = 0;
-
-    while (!fs.existsSync(pageLoadedMarker) && waitTime < maxWait) {
-        await new Promise(resolve => setTimeout(resolve, pollInterval));
-        waitTime += pollInterval;
-    }
-
-    if (!fs.existsSync(pageLoadedMarker)) {
-        throw new Error('Timeout waiting for navigation (chrome_navigate did not complete)');
-    }
-
-    // Wait a bit longer for any post-load DNS resolutions
-    await new Promise(resolve => setTimeout(resolve, 500));
-}
-
 async function main() {
     const args = parseArgs();
     const url = args.url;
@@ -269,17 +186,14 @@ async function main() {
         process.exit(0);
     }
 
-    const startTs = new Date();
+    const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;  // seconds -> milliseconds
 
     try {
         // Set up listener BEFORE navigation
         await setupListener(url);
 
-        // Note: PID file is written by run_hook() with hook-specific name
-        // Snapshot.cleanup() kills all *.pid processes when done
-
         // Wait for chrome_navigate to complete (BLOCKING)
-        await waitForNavigation();
+        await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);  // 4x timeout for navigation, +500ms for post-load DNS resolutions
 
         // Count DNS records
         const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
@@ -289,10 +203,7 @@ async function main() {
             recordCount = content.split('\n').filter(line => line.trim()).length;
         }
 
-        // Report success
-        const endTs = new Date();
-
-        // Output clean JSONL (no RESULT_JSON= prefix)
+        // Output clean JSONL (one ArchiveResult record on stdout)
         console.log(JSON.stringify({
             type: 'ArchiveResult',
             status: 'succeeded',
@@ -305,7 +216,6 @@ async function main() {
         const error = `${e.name}: ${e.message}`;
         console.error(`ERROR: ${error}`);
 
-        // Output clean JSONL (no RESULT_JSON= prefix)
         console.log(JSON.stringify({
             type: 'ArchiveResult',
             status: 'failed',
diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
index 15785a7a..9d8f16ed 100755
--- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
+++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
@@ -7,83 +7,35 @@
  * responses during the navigation.
  *
  * Usage: on_Snapshot__24_responses.js --url=<url> --snapshot-id=<uuid>
- * Output: Creates responses/ directory with index.jsonl + listener.pid
+ * Output: Creates responses/ directory with index.jsonl
  */
 
 const fs = require('fs');
 const path = require('path');
 const crypto = require('crypto');
+
 // Add NODE_MODULES_DIR to module resolution paths if set
 if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
+
 const puppeteer = require('puppeteer-core');
 
+// Import shared utilities from chrome_utils.js (replaces the per-plugin copies removed below)
+const {
+    getEnv,
+    getEnvBool,
+    getEnvInt,
+    parseArgs,
+    connectToPage,
+    waitForPageLoaded,
+} = require('../chrome/chrome_utils.js');
+
 const PLUGIN_NAME = 'responses';
 const OUTPUT_DIR = '.';
-// PID file is now written by run_hook() with hook-specific name
 const CHROME_SESSION_DIR = '../chrome';
 
 // Resource types to capture (by default, capture everything)
 const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
 
-function parseArgs() {
-    const args = {};
-    process.argv.slice(2).forEach(arg => {
-        if (arg.startsWith('--')) {
-            const [key, ...valueParts] = arg.slice(2).split('=');
-            args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
-        }
-    });
-    return args;
-}
-
-function getEnv(name, defaultValue = '') {
-    return (process.env[name] || defaultValue).trim();
-}
-
-function getEnvBool(name, defaultValue = false) {
-    const val = getEnv(name, '').toLowerCase();
-    if (['true', '1', 'yes', 'on'].includes(val)) return true;
-    if (['false', '0', 'no', 'off'].includes(val)) return false;
-    return defaultValue;
-}
-
-function getEnvInt(name, defaultValue = 0) {
-    const val = parseInt(getEnv(name, String(defaultValue)), 10);
-    return isNaN(val) ? defaultValue : val;
-}
-
-async function waitForChromeTabOpen(timeoutMs = 60000) {
-    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
-    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
-    const startTime = Date.now();
-
-    while (Date.now() - startTime < timeoutMs) {
-        if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
-            return true;
-        }
-        // Wait 100ms before checking again
-        await new Promise(resolve => setTimeout(resolve, 100));
-    }
-
-    return false;
-}
-
-function getCdpUrl() {
-    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
-    if (fs.existsSync(cdpFile)) {
-        return fs.readFileSync(cdpFile, 'utf8').trim();
-    }
-    return null;
-}
-
-function getPageId() {
-    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
-    if (fs.existsSync(targetIdFile)) {
-        return fs.readFileSync(targetIdFile, 'utf8').trim();
-    }
-    return null;
-}
-
 function getExtensionFromMimeType(mimeType) {
     const mimeMap = {
         'text/html': 'html',
@@ -150,6 +102,7 @@ async function createSymlink(target, linkPath) {
 }
 
 async function setupListener() {
+    const timeout = getEnvInt('RESPONSES_TIMEOUT', 30) * 1000;  // seconds -> milliseconds
     const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
     const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
 
@@ -162,37 +115,12 @@ async function setupListener() {
     const indexPath = path.join(OUTPUT_DIR, 'index.jsonl');
     fs.writeFileSync(indexPath, '');
 
-    // Wait for chrome tab to be open (up to 60s)
-    const tabOpen = await waitForChromeTabOpen(60000);
-    if (!tabOpen) {
-        throw new Error('Chrome tab not open after 60s (chrome plugin must run first)');
-    }
-
-    const cdpUrl = getCdpUrl();
-    if (!cdpUrl) {
-        throw new Error('No Chrome session found');
-    }
-
-    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
-
-    // Find our page
-    const pages = await browser.pages();
-    const targetId = getPageId();
-    let page = null;
-
-    if (targetId) {
-        page = pages.find(p => {
-            const target = p.target();
-            return target && target._targetId === targetId;
-        });
-    }
-    if (!page) {
-        page = pages[pages.length - 1];
-    }
-
-    if (!page) {
-        throw new Error('No page found');
-    }
+    // Connect to Chrome page using shared utility (NOTE: session wait is now RESPONSES_TIMEOUT, was a fixed 60s)
+    const { browser, page } = await connectToPage({
+        chromeSessionDir: CHROME_SESSION_DIR,
+        timeoutMs: timeout,
+        puppeteer,
+    });
 
     // Set up response listener
     page.on('response', async (response) => {
@@ -280,27 +208,6 @@ async function setupListener() {
     return { browser, page };
 }
 
-async function waitForNavigation() {
-    // Wait for chrome_navigate to complete
-    const navDir = '../chrome';
-    const pageLoadedMarker = path.join(navDir, 'page_loaded.txt');
-    const maxWait = 120000;  // 2 minutes
-    const pollInterval = 100;
-    let waitTime = 0;
-
-    while (!fs.existsSync(pageLoadedMarker) && waitTime < maxWait) {
-        await new Promise(resolve => setTimeout(resolve, pollInterval));
-        waitTime += pollInterval;
-    }
-
-    if (!fs.existsSync(pageLoadedMarker)) {
-        throw new Error('Timeout waiting for navigation (chrome_navigate did not complete)');
-    }
-
-    // Wait a bit longer for any post-load responses
-    await new Promise(resolve => setTimeout(resolve, 1000));
-}
-
 async function main() {
     const args = parseArgs();
     const url = args.url;
@@ -317,22 +224,17 @@ async function main() {
         process.exit(0);
     }
 
-    const startTs = new Date();
+    const timeout = getEnvInt('RESPONSES_TIMEOUT', 30) * 1000;  // seconds -> milliseconds
 
     try {
         // Set up listener BEFORE navigation
         await setupListener();
 
-        // Note: PID file is written by run_hook() with hook-specific name
-        // Snapshot.cleanup() kills all *.pid processes when done
-
         // Wait for chrome_navigate to complete (BLOCKING)
-        await waitForNavigation();
+        // Extra 1s delay for late responses; navigation wait is 4x RESPONSES_TIMEOUT (120s at the default, same as before)
+        await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 1000);
 
-        // Report success
-        const endTs = new Date();
-
-        // Output clean JSONL (no RESULT_JSON= prefix)
+        // Output clean JSONL (one ArchiveResult record on stdout)
         console.log(JSON.stringify({
             type: 'ArchiveResult',
             status: 'succeeded',
@@ -345,7 +247,6 @@ async function main() {
         const error = `${e.name}: ${e.message}`;
         console.error(`ERROR: ${error}`);
 
-        // Output clean JSONL (no RESULT_JSON= prefix)
         console.log(JSON.stringify({
             type: 'ArchiveResult',
             status: 'failed',
diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
index 67bd3438..5b98801b 100755
--- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
+++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
@@ -7,114 +7,46 @@
  * during the navigation request.
  *
  * Usage: on_Snapshot__23_ssl.js --url=<url> --snapshot-id=<uuid>
- * Output: Writes ssl.json + listener.pid
+ * Output: Writes ssl.jsonl
  */
 
 const fs = require('fs');
 const path = require('path');
+
 // Add NODE_MODULES_DIR to module resolution paths if set
 if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
+
 const puppeteer = require('puppeteer-core');
 
+// Import shared utilities from chrome_utils.js (replaces the per-plugin copies removed below)
+const {
+    getEnvBool,
+    getEnvInt,
+    parseArgs,
+    connectToPage,
+    waitForPageLoaded,
+} = require('../chrome/chrome_utils.js');
+
 const PLUGIN_NAME = 'ssl';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'ssl.jsonl';
-// PID file is now written by run_hook() with hook-specific name
 const CHROME_SESSION_DIR = '../chrome';
 
-function parseArgs() {
-    const args = {};
-    process.argv.slice(2).forEach(arg => {
-        if (arg.startsWith('--')) {
-            const [key, ...valueParts] = arg.slice(2).split('=');
-            args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
-        }
-    });
-    return args;
-}
-
-function getEnv(name, defaultValue = '') {
-    return (process.env[name] || defaultValue).trim();
-}
-
-function getEnvBool(name, defaultValue = false) {
-    const val = getEnv(name, '').toLowerCase();
-    if (['true', '1', 'yes', 'on'].includes(val)) return true;
-    if (['false', '0', 'no', 'off'].includes(val)) return false;
-    return defaultValue;
-}
-
-async function waitForChromeTabOpen(timeoutMs = 60000) {
-    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
-    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
-    const startTime = Date.now();
-
-    while (Date.now() - startTime < timeoutMs) {
-        if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
-            return true;
-        }
-        // Wait 100ms before checking again
-        await new Promise(resolve => setTimeout(resolve, 100));
-    }
-
-    return false;
-}
-
-function getCdpUrl() {
-    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
-    if (fs.existsSync(cdpFile)) {
-        return fs.readFileSync(cdpFile, 'utf8').trim();
-    }
-    return null;
-}
-
-function getPageId() {
-    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
-    if (fs.existsSync(targetIdFile)) {
-        return fs.readFileSync(targetIdFile, 'utf8').trim();
-    }
-    return null;
-}
-
 async function setupListener(url) {
     const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
+    const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000;  // seconds -> milliseconds
 
     // Only extract SSL for HTTPS URLs
     if (!url.startsWith('https://')) {
         throw new Error('URL is not HTTPS');
     }
 
-    // Wait for chrome tab to be open (up to 60s)
-    const tabOpen = await waitForChromeTabOpen(60000);
-    if (!tabOpen) {
-        throw new Error('Chrome tab not open after 60s (chrome plugin must run first)');
-    }
-
-    const cdpUrl = getCdpUrl();
-    if (!cdpUrl) {
-        throw new Error('No Chrome session found');
-    }
-
-    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
-
-    // Find our page
-    const pages = await browser.pages();
-    const targetId = getPageId();
-    let page = null;
-
-    if (targetId) {
-        page = pages.find(p => {
-            const target = p.target();
-            return target && target._targetId === targetId;
-        });
-    }
-    if (!page) {
-        page = pages[pages.length - 1];
-    }
-
-    if (!page) {
-        throw new Error('No page found');
-    }
+    // Connect to Chrome page using shared utility (NOTE: session wait is now SSL_TIMEOUT, was a fixed 60s)
+    const { browser, page } = await connectToPage({
+        chromeSessionDir: CHROME_SESSION_DIR,
+        timeoutMs: timeout,
+        puppeteer,
+    });
 
     // Set up listener to capture SSL details during navigation
     page.on('response', async (response) => {
@@ -171,24 +103,6 @@ async function setupListener(url) {
     return { browser, page };
 }
 
-async function waitForNavigation() {
-    // Wait for chrome_navigate to complete (it writes page_loaded.txt)
-    const navDir = '../chrome';
-    const pageLoadedMarker = path.join(navDir, 'page_loaded.txt');
-    const maxWait = 120000;  // 2 minutes
-    const pollInterval = 100;
-    let waitTime = 0;
-
-    while (!fs.existsSync(pageLoadedMarker) && waitTime < maxWait) {
-        await new Promise(resolve => setTimeout(resolve, pollInterval));
-        waitTime += pollInterval;
-    }
-
-    if (!fs.existsSync(pageLoadedMarker)) {
-        throw new Error('Timeout waiting for navigation (chrome_navigate did not complete)');
-    }
-}
-
 async function main() {
     const args = parseArgs();
     const url = args.url;
@@ -205,22 +119,16 @@ async function main() {
         process.exit(0);
     }
 
-    const startTs = new Date();
+    const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000;  // seconds -> milliseconds
 
     try {
         // Set up listener BEFORE navigation
         await setupListener(url);
 
-        // Note: PID file is written by run_hook() with hook-specific name
-        // Snapshot.cleanup() kills all *.pid processes when done
-
         // Wait for chrome_navigate to complete (BLOCKING)
-        await waitForNavigation();
+        await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4);  // 4x SSL_TIMEOUT (120s at the default, same as before); no post-load delay
 
-        // Report success
-        const endTs = new Date();
-
-        // Output clean JSONL (no RESULT_JSON= prefix)
+        // Output clean JSONL (one ArchiveResult record on stdout)
         console.log(JSON.stringify({
             type: 'ArchiveResult',
             status: 'succeeded',
@@ -233,7 +141,6 @@ async function main() {
         const error = `${e.name}: ${e.message}`;
         console.error(`ERROR: ${error}`);
 
-        // Output clean JSONL (no RESULT_JSON= prefix)
         console.log(JSON.stringify({
             type: 'ArchiveResult',
             status: 'failed',