diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 1dca0810..bdf6cf2d 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1435,10 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if not self.OUTPUT_DIR.exists(): return False - for plugin_dir in self.OUTPUT_DIR.iterdir(): - if not plugin_dir.is_dir(): - continue - pid_file = plugin_dir / 'hook.pid' + # Check all .pid files in the snapshot directory (hook-specific names) + for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): if process_is_alive(pid_file): return True @@ -2702,8 +2700,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self.save() return - # Read and parse JSONL output from stdout.log - stdout_file = plugin_dir / 'stdout.log' + # Derive hook basename for hook-specific filenames + # e.g., "on_Snapshot__50_wget.py" -> "on_Snapshot__50_wget" + hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook' + + # Read and parse JSONL output from hook-specific stdout log + stdout_file = plugin_dir / f'{hook_basename}.stdout.log' stdout = stdout_file.read_text() if stdout_file.exists() else '' records = [] @@ -2744,7 +2746,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self.output_str = 'Hook did not output ArchiveResult record' # Walk filesystem and populate output_files, output_size, output_mimetypes - exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'} + # Exclude hook output files (hook-specific names like on_Snapshot__50_wget.stdout.log) + def is_hook_output_file(name: str) -> bool: + """Check if a file is a hook output file that should be excluded.""" + return ( + name.endswith('.stdout.log') or + name.endswith('.stderr.log') or + name.endswith('.pid') or + (name.endswith('.sh') and name.startswith('on_')) + ) + mime_sizes = defaultdict(int) total_size = 0 output_files = {} @@ -2752,7 +2763,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi for file_path in plugin_dir.rglob('*'): if not file_path.is_file(): continue - if file_path.name in exclude_names: + if is_hook_output_file(file_path.name): continue try: @@ -2810,10 +2821,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi } process_hook_records(filtered_records, overrides=overrides) - # Cleanup PID files and empty logs - pid_file = plugin_dir / 'hook.pid' + # Cleanup PID files and empty logs (hook-specific names) + pid_file = plugin_dir / f'{hook_basename}.pid' pid_file.unlink(missing_ok=True) - stderr_file = plugin_dir / 'stderr.log' + stderr_file = plugin_dir / f'{hook_basename}.stderr.log' if stdout_file.exists() and stdout_file.stat().st_size == 0: stdout_file.unlink() if stderr_file.exists() and stderr_file.stat().st_size == 0: @@ -2919,7 +2930,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi plugin_dir = Path(self.pwd) if self.pwd else None if not plugin_dir: return False - pid_file = plugin_dir / 'hook.pid' + # Use hook-specific pid filename + hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook' + pid_file = plugin_dir / f'{hook_basename}.pid' return pid_file.exists() diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 2a506e9b..93dbb938 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -365,11 +365,14 @@ def run_hook( # Old convention: __background in stem (for backwards compatibility) is_background = '.bg.' in script.name or '__background' in script.stem - # Set up output files for ALL hooks (useful for debugging) - stdout_file = output_dir / 'stdout.log' - stderr_file = output_dir / 'stderr.log' - pid_file = output_dir / 'hook.pid' - cmd_file = output_dir / 'cmd.sh' + # Set up output files for ALL hooks - use hook-specific names to avoid conflicts + # when multiple hooks run in the same plugin directory + # e.g., on_Snapshot__20_chrome_tab.bg.js -> on_Snapshot__20_chrome_tab.bg.stdout.log + hook_basename = script.stem # e.g., "on_Snapshot__20_chrome_tab.bg" + stdout_file = output_dir / f'{hook_basename}.stdout.log' + stderr_file = output_dir / f'{hook_basename}.stderr.log' + pid_file = output_dir / f'{hook_basename}.pid' + cmd_file = output_dir / f'{hook_basename}.sh' try: # Write command script for validation @@ -421,8 +424,14 @@ def run_hook( # Detect new files created by the hook files_after = set(output_dir.rglob('*')) if output_dir.exists() else set() new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()] - # Exclude the log files themselves from new_files - new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')] + # Exclude the log/pid/sh files themselves from new_files (hook-specific names) + hook_output_files = { + f'{hook_basename}.stdout.log', + f'{hook_basename}.stderr.log', + f'{hook_basename}.pid', + f'{hook_basename}.sh', + } + new_files = [f for f in new_files if f not in hook_output_files] # Parse JSONL output from stdout # Each line starting with { that has 'type' field is a record @@ -1235,15 +1244,16 @@ def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = Tru Kill process in PID file with optional validation. Args: - pid_file: Path to hook.pid file + pid_file: Path to hook-specific .pid file (e.g., on_Snapshot__20_chrome_tab.bg.pid) sig: Signal to send (default SIGTERM) validate: If True, validate process identity before killing (default: True) """ from archivebox.misc.process_utils import safe_kill_process - + if validate: # Use safe kill with validation - cmd_file = pid_file.parent / 'cmd.sh' + # Derive cmd file from pid file: on_Snapshot__20_chrome_tab.bg.pid -> on_Snapshot__20_chrome_tab.bg.sh + cmd_file = pid_file.with_suffix('.sh') safe_kill_process(pid_file, cmd_file, signal_num=sig) else: # Legacy behavior - kill without validation diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index d448923b..7faa92ea 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -533,9 +533,9 @@ async function killChrome(pid, outputDir = null) { } // Step 8: Clean up PID files + // Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup() if (outputDir) { try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {} - try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {} } console.error('[*] Chrome cleanup completed'); diff --git a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js index d025be81..643ba284 100644 --- a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js @@ -143,12 +143,11 @@ async function main() { console.error(`[+] Found ${installedExtensions.length} extension(s) to load`); } - // Write hook's own PID - const hookStartTime = Date.now() / 1000; + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done if (!fs.existsSync(OUTPUT_DIR)) { fs.mkdirSync(OUTPUT_DIR, { recursive: true }); } - writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime); // Launch Chromium using consolidated function const result = await launchChromium({ diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js index b4e4fa63..59b7ea25 100755 --- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js +++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js @@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'consolelog'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'console.jsonl'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; function parseArgs() { @@ -221,8 +221,8 @@ async function main() { // Set up listeners BEFORE navigation await setupListeners(); - // Write PID file so chrome_cleanup can kill any remaining processes - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation(); diff --git a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js index d6c2497f..a3cfcbc8 100755 --- a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js +++ b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js @@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'redirects'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'redirects.jsonl'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; // Global state @@ -274,8 +274,8 @@ async function main() { // Set up redirect listener BEFORE navigation await setupRedirectListener(); - // Write PID file - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation(); diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js index 33697f55..15785a7a 100755 --- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js +++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js @@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'responses'; const OUTPUT_DIR = '.'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; // Resource types to capture (by default, capture everything) @@ -323,8 +323,8 @@ async function main() { // Set up listener BEFORE navigation await setupListener(); - // Write PID file - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation(); diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js index 83ff4d61..67bd3438 100755 --- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js +++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js @@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'ssl'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'ssl.jsonl'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; function parseArgs() { @@ -211,8 +211,8 @@ async function main() { // Set up listener BEFORE navigation await setupListener(url); - // Write PID file so chrome_cleanup can kill any remaining processes - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation(); diff --git a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js index 5a501694..0735e764 100644 --- a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js +++ b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js @@ -18,7 +18,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'staticfile'; const OUTPUT_DIR = '.'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; // Content-Types that indicate static files @@ -398,8 +398,8 @@ async function main() { // Set up static file listener BEFORE navigation await setupStaticFileListener(); - // Write PID file - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation();