Fix hook file overwrites in plugin directory (#1732)

Multiple hooks in the same plugin directory were overwriting each other's stdout.log, stderr.log, hook.pid, and cmd.sh files. Now each hook uses filenames prefixed with its hook name:

- on_Snapshot__20_chrome_tab.bg.stdout.log
- on_Snapshot__20_chrome_tab.bg.stderr.log
- on_Snapshot__20_chrome_tab.bg.pid
- on_Snapshot__20_chrome_tab.bg.sh

Updated:

- hooks.py run_hook() to use hook-specific names
- core/models.py cleanup and update_from_output methods
- Plugin scripts to no longer write redundant hook.pid files

# Summary

# Related issues

# Changes these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk

---

## Summary by cubic

Prevented hook file collisions by giving each hook its own stdout, stderr, pid, and cmd filenames. This fixes mixed logs and ensures correct cleanup and status checks when multiple hooks run in the same plugin directory.

- **Bug Fixes**
  - hooks.py: write hook-specific stdout/stderr/pid/cmd files and exclude them from new_files; kill_process() now derives the hook-specific .sh command file from the .pid file path (replacing the shared cmd.sh) for safe kill.
  - core/models.py: read hook-specific logs; exclude hook output files when computing outputs; cleanup and background detection use *.pid.
  - Plugins: stop writing redundant hook.pid files; minor chrome utils cleanup.

<sup>Written for commit 754b096193. Summary will update on new commits.</sup>
2026-04-06 07:47:53 +10:00 · 2025-12-30 23:36:09 -08:00
parent dac6c63bba 754b096193
commit e26a0f6fc0
9 changed files with 63 additions and 41 deletions
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1435,10 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        if not self.OUTPUT_DIR.exists():
            return False

-        for plugin_dir in self.OUTPUT_DIR.iterdir():
-            if not plugin_dir.is_dir():
-                continue
-            pid_file = plugin_dir / 'hook.pid'
+        # Check all .pid files in the snapshot directory (hook-specific names)
+        for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
            if process_is_alive(pid_file):
                return True

@@ -2702,8 +2700,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
            self.save()
            return

-        # Read and parse JSONL output from stdout.log
-        stdout_file = plugin_dir / 'stdout.log'
+        # Derive hook basename for hook-specific filenames
+        # e.g., "on_Snapshot__50_wget.py" -> "on_Snapshot__50_wget"
+        hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
+
+        # Read and parse JSONL output from hook-specific stdout log
+        stdout_file = plugin_dir / f'{hook_basename}.stdout.log'
        stdout = stdout_file.read_text() if stdout_file.exists() else ''

        records = []
@@ -2744,7 +2746,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
            self.output_str = 'Hook did not output ArchiveResult record'

        # Walk filesystem and populate output_files, output_size, output_mimetypes
-        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
+        # Exclude hook output files (hook-specific names like on_Snapshot__50_wget.stdout.log)
+        def is_hook_output_file(name: str) -> bool:
+            """Check if a file is a hook output file that should be excluded."""
+            return (
+                name.endswith('.stdout.log') or
+                name.endswith('.stderr.log') or
+                name.endswith('.pid') or
+                (name.endswith('.sh') and name.startswith('on_'))
+            )
+
        mime_sizes = defaultdict(int)
        total_size = 0
        output_files = {}
@@ -2752,7 +2763,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
        for file_path in plugin_dir.rglob('*'):
            if not file_path.is_file():
                continue
-            if file_path.name in exclude_names:
+            if is_hook_output_file(file_path.name):
                continue

            try:
@@ -2810,10 +2821,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
        }
        process_hook_records(filtered_records, overrides=overrides)

-        # Cleanup PID files and empty logs
-        pid_file = plugin_dir / 'hook.pid'
+        # Cleanup PID files and empty logs (hook-specific names)
+        pid_file = plugin_dir / f'{hook_basename}.pid'
        pid_file.unlink(missing_ok=True)
-        stderr_file = plugin_dir / 'stderr.log'
+        stderr_file = plugin_dir / f'{hook_basename}.stderr.log'
        if stdout_file.exists() and stdout_file.stat().st_size == 0:
            stdout_file.unlink()
        if stderr_file.exists() and stderr_file.stat().st_size == 0:
@@ -2919,7 +2930,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
        plugin_dir = Path(self.pwd) if self.pwd else None
        if not plugin_dir:
            return False
-        pid_file = plugin_dir / 'hook.pid'
+        # Use hook-specific pid filename
+        hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
+        pid_file = plugin_dir / f'{hook_basename}.pid'
        return pid_file.exists()


--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -365,11 +365,14 @@ def run_hook(
    # Old convention: __background in stem (for backwards compatibility)
    is_background = '.bg.' in script.name or '__background' in script.stem

-    # Set up output files for ALL hooks (useful for debugging)
-    stdout_file = output_dir / 'stdout.log'
-    stderr_file = output_dir / 'stderr.log'
-    pid_file = output_dir / 'hook.pid'
-    cmd_file = output_dir / 'cmd.sh'
+    # Set up output files for ALL hooks - use hook-specific names to avoid conflicts
+    # when multiple hooks run in the same plugin directory
+    # e.g., on_Snapshot__20_chrome_tab.bg.js -> on_Snapshot__20_chrome_tab.bg.stdout.log
+    hook_basename = script.stem  # e.g., "on_Snapshot__20_chrome_tab.bg"
+    stdout_file = output_dir / f'{hook_basename}.stdout.log'
+    stderr_file = output_dir / f'{hook_basename}.stderr.log'
+    pid_file = output_dir / f'{hook_basename}.pid'
+    cmd_file = output_dir / f'{hook_basename}.sh'

    try:
        # Write command script for validation
@@ -421,8 +424,14 @@ def run_hook(
        # Detect new files created by the hook
        files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
        new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
-        # Exclude the log files themselves from new_files
-        new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
+        # Exclude the log/pid/sh files themselves from new_files (hook-specific names)
+        hook_output_files = {
+            f'{hook_basename}.stdout.log',
+            f'{hook_basename}.stderr.log',
+            f'{hook_basename}.pid',
+            f'{hook_basename}.sh',
+        }
+        new_files = [f for f in new_files if f not in hook_output_files]

        # Parse JSONL output from stdout
        # Each line starting with { that has 'type' field is a record
@@ -1235,15 +1244,16 @@ def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = Tru
    Kill process in PID file with optional validation.

    Args:
-        pid_file: Path to hook.pid file
+        pid_file: Path to hook-specific .pid file (e.g., on_Snapshot__20_chrome_tab.bg.pid)
        sig: Signal to send (default SIGTERM)
        validate: If True, validate process identity before killing (default: True)
    """
    from archivebox.misc.process_utils import safe_kill_process
-    
+
    if validate:
        # Use safe kill with validation
-        cmd_file = pid_file.parent / 'cmd.sh'
+        # Derive cmd file from pid file: on_Snapshot__20_chrome_tab.bg.pid -> on_Snapshot__20_chrome_tab.bg.sh
+        cmd_file = pid_file.with_suffix('.sh')
        safe_kill_process(pid_file, cmd_file, signal_num=sig)
    else:
        # Legacy behavior - kill without validation
--- a/archivebox/plugins/chrome/chrome_utils.js
+++ b/archivebox/plugins/chrome/chrome_utils.js
@@ -533,9 +533,9 @@ async function killChrome(pid, outputDir = null) {
    }

    // Step 8: Clean up PID files
+    // Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup()
    if (outputDir) {
        try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
-        try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {}
    }

    console.error('[*] Chrome cleanup completed');
--- a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js
+++ b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js
@@ -143,12 +143,11 @@ async function main() {
            console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
        }

-        // Write hook's own PID
-        const hookStartTime = Date.now() / 1000;
+        // Note: PID file is written by run_hook() with hook-specific name
+        // Snapshot.cleanup() kills all *.pid processes when done
        if (!fs.existsSync(OUTPUT_DIR)) {
            fs.mkdirSync(OUTPUT_DIR, { recursive: true });
        }
-        writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);

        // Launch Chromium using consolidated function
        const result = await launchChromium({
--- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js
+++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
 const PLUGIN_NAME = 'consolelog';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'console.jsonl';
-const PID_FILE = 'hook.pid';
+// PID file is now written by run_hook() with hook-specific name
 const CHROME_SESSION_DIR = '../chrome';

 function parseArgs() {
@@ -221,8 +221,8 @@ async function main() {
        // Set up listeners BEFORE navigation
        await setupListeners();

-        // Write PID file so chrome_cleanup can kill any remaining processes
-        fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
+        // Note: PID file is written by run_hook() with hook-specific name
+        // Snapshot.cleanup() kills all *.pid processes when done

        // Wait for chrome_navigate to complete (BLOCKING)
        await waitForNavigation();
--- a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js
+++ b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
 const PLUGIN_NAME = 'redirects';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'redirects.jsonl';
-const PID_FILE = 'hook.pid';
+// PID file is now written by run_hook() with hook-specific name
 const CHROME_SESSION_DIR = '../chrome';

 // Global state
@@ -274,8 +274,8 @@ async function main() {
        // Set up redirect listener BEFORE navigation
        await setupRedirectListener();

-        // Write PID file
-        fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
+        // Note: PID file is written by run_hook() with hook-specific name
+        // Snapshot.cleanup() kills all *.pid processes when done

        // Wait for chrome_navigate to complete (BLOCKING)
        await waitForNavigation();
--- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
+++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');

 const PLUGIN_NAME = 'responses';
 const OUTPUT_DIR = '.';
-const PID_FILE = 'hook.pid';
+// PID file is now written by run_hook() with hook-specific name
 const CHROME_SESSION_DIR = '../chrome';

 // Resource types to capture (by default, capture everything)
@@ -323,8 +323,8 @@ async function main() {
        // Set up listener BEFORE navigation
        await setupListener();

-        // Write PID file
-        fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
+        // Note: PID file is written by run_hook() with hook-specific name
+        // Snapshot.cleanup() kills all *.pid processes when done

        // Wait for chrome_navigate to complete (BLOCKING)
        await waitForNavigation();
--- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
+++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
 const PLUGIN_NAME = 'ssl';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'ssl.jsonl';
-const PID_FILE = 'hook.pid';
+// PID file is now written by run_hook() with hook-specific name
 const CHROME_SESSION_DIR = '../chrome';

 function parseArgs() {
@@ -211,8 +211,8 @@ async function main() {
        // Set up listener BEFORE navigation
        await setupListener(url);

-        // Write PID file so chrome_cleanup can kill any remaining processes
-        fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
+        // Note: PID file is written by run_hook() with hook-specific name
+        // Snapshot.cleanup() kills all *.pid processes when done

        // Wait for chrome_navigate to complete (BLOCKING)
        await waitForNavigation();
--- a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js
+++ b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js
@@ -18,7 +18,7 @@ const puppeteer = require('puppeteer-core');

 const PLUGIN_NAME = 'staticfile';
 const OUTPUT_DIR = '.';
-const PID_FILE = 'hook.pid';
+// PID file is now written by run_hook() with hook-specific name
 const CHROME_SESSION_DIR = '../chrome';

 // Content-Types that indicate static files
@@ -398,8 +398,8 @@ async function main() {
        // Set up static file listener BEFORE navigation
        await setupStaticFileListener();

-        // Write PID file
-        fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
+        // Note: PID file is written by run_hook() with hook-specific name
+        // Snapshot.cleanup() kills all *.pid processes when done

        // Wait for chrome_navigate to complete (BLOCKING)
        await waitForNavigation();