From 754b09619329c0b0b7aa3f32227d14d681e3e6f0 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 31 Dec 2025 02:00:15 +0000 Subject: [PATCH] use hook-specific filenames to avoid overwrites Multiple hooks in the same plugin directory were overwriting each other's stdout.log, stderr.log, hook.pid, and cmd.sh files. Now each hook uses filenames prefixed with its hook name: - on_Snapshot__20_chrome_tab.bg.stdout.log - on_Snapshot__20_chrome_tab.bg.stderr.log - on_Snapshot__20_chrome_tab.bg.pid - on_Snapshot__20_chrome_tab.bg.sh Updated: - hooks.py run_hook() to use hook-specific names - core/models.py cleanup and update_from_output methods - Plugin scripts to no longer write redundant hook.pid files --- archivebox/core/models.py | 37 +++++++++++++------ archivebox/hooks.py | 30 ++++++++++----- archivebox/plugins/chrome/chrome_utils.js | 2 +- .../chrome/on_Crawl__30_chrome_launch.bg.js | 5 +-- .../on_Snapshot__21_consolelog.bg.js | 6 +-- .../redirects/on_Snapshot__31_redirects.bg.js | 6 +-- .../responses/on_Snapshot__24_responses.bg.js | 6 +-- .../plugins/ssl/on_Snapshot__23_ssl.bg.js | 6 +-- .../on_Snapshot__31_staticfile.bg.js | 6 +-- 9 files changed, 63 insertions(+), 41 deletions(-) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 1dca0810..bdf6cf2d 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1435,10 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if not self.OUTPUT_DIR.exists(): return False - for plugin_dir in self.OUTPUT_DIR.iterdir(): - if not plugin_dir.is_dir(): - continue - pid_file = plugin_dir / 'hook.pid' + # Check all .pid files in the snapshot directory (hook-specific names) + for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): if process_is_alive(pid_file): return True @@ -2702,8 +2700,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self.save() return - # Read and parse JSONL output from stdout.log - stdout_file = plugin_dir / 'stdout.log' + # Derive hook basename for hook-specific filenames + # e.g., "on_Snapshot__50_wget.py" -> "on_Snapshot__50_wget" + hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook' + + # Read and parse JSONL output from hook-specific stdout log + stdout_file = plugin_dir / f'{hook_basename}.stdout.log' stdout = stdout_file.read_text() if stdout_file.exists() else '' records = [] @@ -2744,7 +2746,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self.output_str = 'Hook did not output ArchiveResult record' # Walk filesystem and populate output_files, output_size, output_mimetypes - exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'} + # Exclude hook output files (hook-specific names like on_Snapshot__50_wget.stdout.log) + def is_hook_output_file(name: str) -> bool: + """Check if a file is a hook output file that should be excluded.""" + return ( + name.endswith('.stdout.log') or + name.endswith('.stderr.log') or + name.endswith('.pid') or + (name.endswith('.sh') and name.startswith('on_')) + ) + mime_sizes = defaultdict(int) total_size = 0 output_files = {} @@ -2752,7 +2763,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi for file_path in plugin_dir.rglob('*'): if not file_path.is_file(): continue - if file_path.name in exclude_names: + if is_hook_output_file(file_path.name): continue try: @@ -2810,10 +2821,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi } process_hook_records(filtered_records, overrides=overrides) - # Cleanup PID files and empty logs - pid_file = plugin_dir / 'hook.pid' + # Cleanup PID files and empty logs (hook-specific names) + pid_file = plugin_dir / f'{hook_basename}.pid' pid_file.unlink(missing_ok=True) - stderr_file = plugin_dir / 'stderr.log' + stderr_file = plugin_dir / f'{hook_basename}.stderr.log' if stdout_file.exists() and stdout_file.stat().st_size == 0: stdout_file.unlink() if stderr_file.exists() and stderr_file.stat().st_size == 0: @@ -2919,7 +2930,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi plugin_dir = Path(self.pwd) if self.pwd else None if not plugin_dir: return False - pid_file = plugin_dir / 'hook.pid' + # Use hook-specific pid filename + hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook' + pid_file = plugin_dir / f'{hook_basename}.pid' return pid_file.exists() diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 2a506e9b..93dbb938 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -365,11 +365,14 @@ def run_hook( # Old convention: __background in stem (for backwards compatibility) is_background = '.bg.' in script.name or '__background' in script.stem - # Set up output files for ALL hooks (useful for debugging) - stdout_file = output_dir / 'stdout.log' - stderr_file = output_dir / 'stderr.log' - pid_file = output_dir / 'hook.pid' - cmd_file = output_dir / 'cmd.sh' + # Set up output files for ALL hooks - use hook-specific names to avoid conflicts + # when multiple hooks run in the same plugin directory + # e.g., on_Snapshot__20_chrome_tab.bg.js -> on_Snapshot__20_chrome_tab.bg.stdout.log + hook_basename = script.stem # e.g., "on_Snapshot__20_chrome_tab.bg" + stdout_file = output_dir / f'{hook_basename}.stdout.log' + stderr_file = output_dir / f'{hook_basename}.stderr.log' + pid_file = output_dir / f'{hook_basename}.pid' + cmd_file = output_dir / f'{hook_basename}.sh' try: # Write command script for validation @@ -421,8 +424,14 @@ def run_hook( # Detect new files created by the hook files_after = set(output_dir.rglob('*')) if output_dir.exists() else set() new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()] - # Exclude the log files themselves from new_files - new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')] + # Exclude the log/pid/sh files themselves from new_files (hook-specific names) + hook_output_files = { + f'{hook_basename}.stdout.log', + f'{hook_basename}.stderr.log', + f'{hook_basename}.pid', + f'{hook_basename}.sh', + } + new_files = [f for f in new_files if f not in hook_output_files] # Parse JSONL output from stdout # Each line starting with { that has 'type' field is a record @@ -1235,15 +1244,16 @@ def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = Tru Kill process in PID file with optional validation. Args: - pid_file: Path to hook.pid file + pid_file: Path to hook-specific .pid file (e.g., on_Snapshot__20_chrome_tab.bg.pid) sig: Signal to send (default SIGTERM) validate: If True, validate process identity before killing (default: True) """ from archivebox.misc.process_utils import safe_kill_process - + if validate: # Use safe kill with validation - cmd_file = pid_file.parent / 'cmd.sh' + # Derive cmd file from pid file: on_Snapshot__20_chrome_tab.bg.pid -> on_Snapshot__20_chrome_tab.bg.sh + cmd_file = pid_file.with_suffix('.sh') safe_kill_process(pid_file, cmd_file, signal_num=sig) else: # Legacy behavior - kill without validation diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index d448923b..7faa92ea 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -533,9 +533,9 @@ async function killChrome(pid, outputDir = null) { } // Step 8: Clean up PID files + // Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup() if (outputDir) { try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {} - try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {} } console.error('[*] Chrome cleanup completed'); diff --git a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js index d025be81..643ba284 100644 --- a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js @@ -143,12 +143,11 @@ async function main() { console.error(`[+] Found ${installedExtensions.length} extension(s) to load`); } - // Write hook's own PID - const hookStartTime = Date.now() / 1000; + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done if (!fs.existsSync(OUTPUT_DIR)) { fs.mkdirSync(OUTPUT_DIR, { recursive: true }); } - writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime); // Launch Chromium using consolidated function const result = await launchChromium({ diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js index b4e4fa63..59b7ea25 100755 --- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js +++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js @@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'consolelog'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'console.jsonl'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; function parseArgs() { @@ -221,8 +221,8 @@ async function main() { // Set up listeners BEFORE navigation await setupListeners(); - // Write PID file so chrome_cleanup can kill any remaining processes - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation(); diff --git a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js index d6c2497f..a3cfcbc8 100755 --- a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js +++ b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js @@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'redirects'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'redirects.jsonl'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; // Global state @@ -274,8 +274,8 @@ async function main() { // Set up redirect listener BEFORE navigation await setupRedirectListener(); - // Write PID file - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation(); diff --git a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js index 33697f55..15785a7a 100755 --- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js +++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js @@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'responses'; const OUTPUT_DIR = '.'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; // Resource types to capture (by default, capture everything) @@ -323,8 +323,8 @@ async function main() { // Set up listener BEFORE navigation await setupListener(); - // Write PID file - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation(); diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js index 83ff4d61..67bd3438 100755 --- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js +++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js @@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'ssl'; const OUTPUT_DIR = '.'; const OUTPUT_FILE = 'ssl.jsonl'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; function parseArgs() { @@ -211,8 +211,8 @@ async function main() { // Set up listener BEFORE navigation await setupListener(url); - // Write PID file so chrome_cleanup can kill any remaining processes - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation(); diff --git a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js index 5a501694..0735e764 100644 --- a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js +++ b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js @@ -18,7 +18,7 @@ const puppeteer = require('puppeteer-core'); const PLUGIN_NAME = 'staticfile'; const OUTPUT_DIR = '.'; -const PID_FILE = 'hook.pid'; +// PID file is now written by run_hook() with hook-specific name const CHROME_SESSION_DIR = '../chrome'; // Content-Types that indicate static files @@ -398,8 +398,8 @@ async function main() { // Set up static file listener BEFORE navigation await setupStaticFileListener(); - // Write PID file - fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid)); + // Note: PID file is written by run_hook() with hook-specific name + // Snapshot.cleanup() kills all *.pid processes when done // Wait for chrome_navigate to complete (BLOCKING) await waitForNavigation();