rename extractor to plugin everywhere

This commit is contained in:
Nick Sweeting
2025-12-28 04:43:15 -08:00
parent 50e527ec65
commit bd265c0083
19 changed files with 766 additions and 160 deletions

View File

@@ -29,6 +29,98 @@ const http = require('http');
const EXTRACTOR_NAME = 'chrome_launch';
const OUTPUT_DIR = 'chrome';
// Helper: Write PID file with mtime set to process start time
function writePidWithMtime(filePath, pid, startTimeSeconds) {
fs.writeFileSync(filePath, String(pid));
// Set both atime and mtime to process start time for validation
const startTimeMs = startTimeSeconds * 1000;
fs.utimesSync(filePath, new Date(startTimeMs), new Date(startTimeMs));
}
// Helper: Write command script for validation
function writeCmdScript(filePath, binary, args) {
// Shell escape arguments containing spaces or special characters
const escapedArgs = args.map(arg => {
if (arg.includes(' ') || arg.includes('"') || arg.includes('$')) {
return `"${arg.replace(/"/g, '\\"')}"`;
}
return arg;
});
const script = `#!/bin/bash\n${binary} ${escapedArgs.join(' ')}\n`;
fs.writeFileSync(filePath, script);
fs.chmodSync(filePath, 0o755);
}
// Helper: Get process start time (cross-platform)
function getProcessStartTime(pid) {
try {
const { execSync } = require('child_process');
if (process.platform === 'darwin') {
// macOS: ps -p PID -o lstart= gives start time
const output = execSync(`ps -p ${pid} -o lstart=`, { encoding: 'utf8', timeout: 1000 });
return Date.parse(output.trim()) / 1000; // Convert to epoch seconds
} else {
// Linux: read /proc/PID/stat field 22 (starttime in clock ticks)
const stat = fs.readFileSync(`/proc/${pid}/stat`, 'utf8');
const match = stat.match(/\) \w+ (\d+)/);
if (match) {
const startTicks = parseInt(match[1], 10);
// Convert clock ticks to seconds (assuming 100 ticks/sec)
const uptimeSeconds = parseFloat(fs.readFileSync('/proc/uptime', 'utf8').split(' ')[0]);
const bootTime = Date.now() / 1000 - uptimeSeconds;
return bootTime + (startTicks / 100);
}
}
} catch (e) {
// Can't get start time
return null;
}
return null;
}
// Helper: Validate PID using mtime and command
function validatePid(pid, pidFile, cmdFile) {
try {
// Check process exists
try {
process.kill(pid, 0); // Signal 0 = check existence
} catch (e) {
return false; // Process doesn't exist
}
// Check mtime matches process start time (within 5 sec tolerance)
const fileStat = fs.statSync(pidFile);
const fileMtime = fileStat.mtimeMs / 1000; // Convert to seconds
const procStartTime = getProcessStartTime(pid);
if (procStartTime === null) {
// Can't validate - fall back to basic existence check
return true;
}
if (Math.abs(fileMtime - procStartTime) > 5) {
// PID was reused by different process
return false;
}
// Validate command if available
if (fs.existsSync(cmdFile)) {
const cmd = fs.readFileSync(cmdFile, 'utf8');
// Check for Chrome/Chromium and debug port
if (!cmd.includes('chrome') && !cmd.includes('chromium')) {
return false;
}
if (!cmd.includes('--remote-debugging-port')) {
return false;
}
}
return true;
} catch (e) {
return false;
}
}
// Global state for cleanup
let chromePid = null;
@@ -240,17 +332,17 @@ function killZombieChrome() {
const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
if (isNaN(pid) || pid <= 0) continue;
// Check if process exists
try {
process.kill(pid, 0);
} catch (e) {
// Process dead, remove stale PID file
// Validate PID before killing
const cmdFile = path.join(chromeDir, 'cmd.sh');
if (!validatePid(pid, pidFile, cmdFile)) {
// PID reused or validation failed
console.error(`[!] PID ${pid} failed validation (reused or wrong process) - cleaning up`);
try { fs.unlinkSync(pidFile); } catch (e) {}
continue;
}
// Process alive but crawl is stale - zombie!
console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
// Process alive, validated, and crawl is stale - zombie!
console.error(`[!] Found validated zombie (PID ${pid}) from stale crawl ${crawl.name}`);
try {
// Kill process group first
@@ -386,15 +478,20 @@ async function launchChrome(binary) {
chromeProcess.unref(); // Don't keep Node.js process running
chromePid = chromeProcess.pid;
const chromeStartTime = Date.now() / 1000; // Unix epoch seconds
console.error(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
// Write Chrome PID for backup cleanup (named .pid so Crawl.cleanup() finds it)
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
// Write Chrome PID with mtime set to start time for validation
writePidWithMtime(path.join(OUTPUT_DIR, 'chrome.pid'), chromePid, chromeStartTime);
// Write command script for validation
writeCmdScript(path.join(OUTPUT_DIR, 'cmd.sh'), binary, chromeArgs);
fs.writeFileSync(path.join(OUTPUT_DIR, 'port.txt'), String(debugPort));
// Write hook's own PID so Crawl.cleanup() can kill this hook process
// (which will trigger our SIGTERM handler to kill Chrome)
fs.writeFileSync(path.join(OUTPUT_DIR, 'hook.pid'), String(process.pid));
// Write hook's own PID with mtime for validation
const hookStartTime = Date.now() / 1000;
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
try {
// Wait for Chrome to be ready