Files
ArchiveBox/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js
claude[bot] 483929391d Fix test assertions to fail properly and add NXDOMAIN deduplication
- test_seo.py: Add assertIsNotNone before conditional to catch SEO extraction failures
- test_ssl.py: Add assertIsNotNone to ensure SSL data is captured from HTTPS URLs
- test_pip_provider.py: Assert jsonl_found variable to verify binary discovery
- dns plugin: Deduplicate NXDOMAIN records using seenResolutions map

Tests now fail when functionality doesn't work (no cheating).

Co-authored-by: Nick Sweeting <pirate@users.noreply.github.com>
2025-12-31 19:00:28 +00:00

241 lines
7.3 KiB
JavaScript
Executable File

#!/usr/bin/env node
/**
* Record all DNS traffic (hostname -> IP resolutions) during page load.
*
* This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
* then waits for navigation to complete. The listeners capture all DNS
* resolutions by extracting hostname/IP pairs from network responses.
*
* Usage: on_Snapshot__22_dns.js --url=<url> --snapshot-id=<uuid>
* Output: Writes dns.jsonl with one line per DNS resolution record
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Import shared utilities from chrome_utils.js
const {
getEnvBool,
getEnvInt,
parseArgs,
connectToPage,
waitForPageLoaded,
} = require('../chrome/chrome_utils.js');
// Name of this plugin.
const PLUGIN_NAME = 'dns';
// Directory the output file is created in; '.' resolves against the
// process's current working directory.
const OUTPUT_DIR = '.';
// File that receives the DNS records, one JSON object per line.
const OUTPUT_FILE = 'dns.jsonl';
// Session directory passed to connectToPage() and waitForPageLoaded().
// NOTE(review): presumably populated by the sibling chrome plugin — confirm there.
const CHROME_SESSION_DIR = '../chrome';
/**
 * Pull the hostname component out of a URL string.
 *
 * @param {string} url - Absolute URL to parse.
 * @returns {string|null} The parsed hostname, or null when `url` is not a
 *   valid absolute URL.
 */
function extractHostname(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch (parseError) {
    // Unparseable input is reported as null rather than thrown.
    return null;
  }
  const { hostname } = parsed;
  return hostname;
}
/**
 * Attach CDP network listeners to the shared Chrome page and stream every
 * observed hostname -> IP resolution into the output JSONL file.
 *
 * Successful resolutions are derived from `Network.responseReceived` events
 * (the response's `remoteIPAddress`). Failed lookups are derived from
 * `Network.loadingFailed` events whose error text names a name-resolution
 * failure, and are recorded with type `NXDOMAIN`. Each hostname/IP pair (and
 * each hostname/NXDOMAIN pair) is written at most once for the lifetime of
 * this listener.
 *
 * The output file is truncated when this function is called.
 *
 * @param {string} targetUrl - URL being archived. Currently unused by this
 *   function; kept so the call signature stays stable.
 * @returns {Promise<{browser: object, page: object, client: object}>} The
 *   browser and page returned by `connectToPage`, plus the CDP session the
 *   listeners are attached to.
 * @throws Propagates any error from `connectToPage`, from creating the CDP
 *   session, or from the initial file write. Errors raised inside the event
 *   handlers are logged to stderr and never thrown.
 */
async function setupListener(targetUrl) {
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;

  // Start from an empty file so records from a previous run are not mixed in.
  fs.writeFileSync(outputPath, '');

  // Keys already written: `${hostname}:${ip}` or `${hostname}:NXDOMAIN`.
  // Deduplication is per hostname/IP pair across the whole page load.
  const seenResolutions = new Set();
  // requestId -> URL for in-flight requests; loadingFailed events carry only
  // the requestId, so the URL has to be looked up here.
  const requestUrls = new Map();

  const appendRecord = (record) => {
    fs.appendFileSync(outputPath, JSON.stringify(record) + '\n');
  };
  const protocolOf = (url) => (url.startsWith('https://') ? 'https' : 'http');
  // Handlers are best-effort: report the problem on stderr (stdout is
  // reserved for the JSONL result) but never let it escape the listener.
  const reportHandlerError = (eventName, err) => {
    const detail = err && err.message ? err.message : String(err);
    console.error(`[${PLUGIN_NAME}] ${eventName} handler error: ${detail}`);
  };

  // Connect to the Chrome page using the shared utility.
  const { browser, page } = await connectToPage({
    chromeSessionDir: CHROME_SESSION_DIR,
    timeoutMs: timeout,
    puppeteer,
  });

  // A raw CDP session is needed for the low-level Network.* events.
  const client = await page.target().createCDPSession();
  await client.send('Network.enable');

  // Remember each request's URL so a later failure can be attributed to it.
  client.on('Network.requestWillBeSent', (params) => {
    requestUrls.set(params.requestId, params.request.url);
  });

  // A finished request can no longer fail; drop its entry so the map does
  // not grow for the whole lifetime of the page.
  client.on('Network.loadingFinished', (params) => {
    requestUrls.delete(params.requestId);
  });

  // Responses carry remoteIPAddress, i.e. the address the hostname resolved to.
  client.on('Network.responseReceived', (params) => {
    try {
      const { url, remoteIPAddress, remotePort } = params.response;
      if (!url || !remoteIPAddress) {
        return;
      }

      const hostname = extractHostname(url);
      if (!hostname) {
        return;
      }

      // The URL already used a literal IP: no DNS lookup was involved.
      if (hostname === remoteIPAddress) {
        return;
      }

      const resolutionKey = `${hostname}:${remoteIPAddress}`;
      if (seenResolutions.has(resolutionKey)) {
        return;
      }
      seenResolutions.add(resolutionKey);

      appendRecord({
        ts: new Date().toISOString(),
        hostname: hostname,
        ip: remoteIPAddress,
        port: remotePort || null,
        // A colon in the address means IPv6 (AAAA), otherwise IPv4 (A).
        type: remoteIPAddress.includes(':') ? 'AAAA' : 'A',
        protocol: protocolOf(url),
        url: url,
        requestId: params.requestId,
      });
    } catch (e) {
      reportHandlerError('Network.responseReceived', e);
    }
  });

  // Failed requests still involve DNS; record name-resolution failures.
  client.on('Network.loadingFailed', (params) => {
    try {
      const requestId = params.requestId;
      const url = requestUrls.get(requestId);
      // The request is finished either way; release its map entry.
      requestUrls.delete(requestId);
      if (!url) {
        return;
      }

      const hostname = extractHostname(url);
      if (!hostname) {
        return;
      }

      const errorText = params.errorText || '';
      const isDnsFailure =
        errorText.includes('net::ERR_NAME_NOT_RESOLVED') ||
        errorText.includes('net::ERR_NAME_RESOLUTION_FAILED');
      if (!isDnsFailure) {
        return;
      }

      const resolutionKey = `${hostname}:NXDOMAIN`;
      if (seenResolutions.has(resolutionKey)) {
        return;
      }
      seenResolutions.add(resolutionKey);

      appendRecord({
        ts: new Date().toISOString(),
        hostname: hostname,
        ip: null,
        port: null,
        type: 'NXDOMAIN',
        protocol: protocolOf(url),
        url: url,
        requestId: requestId,
        error: errorText,
      });
    } catch (e) {
      reportHandlerError('Network.loadingFailed', e);
    }
  });

  return { browser, page, client };
}
/**
 * Command-line entry point for the hook.
 *
 * Validates the parsed arguments, honours the DNS_ENABLED switch, installs
 * the DNS listeners, blocks until waitForPageLoaded() resolves, then prints
 * a single ArchiveResult JSON line on stdout summarising how many records
 * were written. Always terminates the process via process.exit().
 *
 * Exit status: 0 on success or when skipped, 1 on bad usage or failure.
 */
async function main() {
  const { url, snapshot_id: snapshotId } = parseArgs();

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__22_dns.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  // Print one ArchiveResult line on stdout.
  const emitResult = (status, outputStr) => {
    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status: status,
      output_str: outputStr,
    }));
  };

  const isEnabled = getEnvBool('DNS_ENABLED', true);
  if (!isEnabled) {
    console.error('Skipping (DNS_ENABLED=False)');
    emitResult('skipped', 'DNS_ENABLED=False');
    process.exit(0);
  }

  const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;

  try {
    // Listeners must be in place before navigation starts.
    await setupListener(url);

    // Block until the page has loaded, waiting up to four times the DNS
    // timeout. The final argument (500) is passed through unchanged.
    const navigationWaitMs = timeout * 4;
    await waitForPageLoaded(CHROME_SESSION_DIR, navigationWaitMs, 500);

    // Count the non-blank lines the listeners wrote.
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
    let recordCount = 0;
    if (fs.existsSync(outputPath)) {
      const lines = fs.readFileSync(outputPath, 'utf8').split('\n');
      for (const line of lines) {
        if (line.trim()) {
          recordCount += 1;
        }
      }
    }

    emitResult('succeeded', `${OUTPUT_FILE} (${recordCount} DNS records)`);
    process.exit(0);
  } catch (e) {
    const error = `${e.name}: ${e.message}`;
    console.error(`ERROR: ${error}`);
    emitResult('failed', error);
    process.exit(1);
  }
}
// Script entry point: run main(). If its promise rejects (i.e. an error
// escaped main's own try/catch), log the message to stderr and exit with
// status 1.
main().catch(e => {
console.error(`Fatal error: ${e.message}`);
process.exit(1);
});