Fix hook file overwrites in plugin directory (#1732)

Multiple hooks in the same plugin directory were overwriting each
other's stdout.log, stderr.log, hook.pid, and cmd.sh files. Now each
hook uses filenames prefixed with its hook name:
- on_Snapshot__20_chrome_tab.bg.stdout.log
- on_Snapshot__20_chrome_tab.bg.stderr.log
- on_Snapshot__20_chrome_tab.bg.pid
- on_Snapshot__20_chrome_tab.bg.sh

Updated:
- hooks.py run_hook() to use hook-specific names
- core/models.py cleanup and update_from_output methods
- Plugin scripts to no longer write redundant hook.pid files

<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->

# Summary

<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->

# Related issues

<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk


<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Prevented hook file collisions by giving each hook its own stdout,
stderr, pid, and cmd filenames. This fixes mixed logs and ensures
correct cleanup and status checks when multiple hooks run in the same
plugin directory.

- **Bug Fixes**
- hooks.py: write hook-specific stdout/stderr/pid/cmd files and exclude
them from new_files; derive cmd.sh from pid for safe kill.
- core/models.py: read hook-specific logs; exclude hook output files
when computing outputs; cleanup and background detection use *.pid.
- Plugins: stop writing redundant hook.pid files; minor chrome utils
cleanup.

<sup>Written for commit 754b096193.
Summary will update on new commits.</sup>

<!-- End of auto-generated description by cubic. -->
This commit is contained in:
Nick Sweeting
2025-12-30 23:36:09 -08:00
committed by GitHub
9 changed files with 63 additions and 41 deletions

View File

@@ -1435,10 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not self.OUTPUT_DIR.exists():
return False
for plugin_dir in self.OUTPUT_DIR.iterdir():
if not plugin_dir.is_dir():
continue
pid_file = plugin_dir / 'hook.pid'
# Check all .pid files in the snapshot directory (hook-specific names)
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
if process_is_alive(pid_file):
return True
@@ -2702,8 +2700,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.save()
return
# Read and parse JSONL output from stdout.log
stdout_file = plugin_dir / 'stdout.log'
# Derive hook basename for hook-specific filenames
# e.g., "on_Snapshot__50_wget.py" -> "on_Snapshot__50_wget"
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
# Read and parse JSONL output from hook-specific stdout log
stdout_file = plugin_dir / f'{hook_basename}.stdout.log'
stdout = stdout_file.read_text() if stdout_file.exists() else ''
records = []
@@ -2744,7 +2746,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.output_str = 'Hook did not output ArchiveResult record'
# Walk filesystem and populate output_files, output_size, output_mimetypes
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
# Exclude hook output files (hook-specific names like on_Snapshot__50_wget.stdout.log)
def is_hook_output_file(name: str) -> bool:
"""Check if a file is a hook output file that should be excluded."""
return (
name.endswith('.stdout.log') or
name.endswith('.stderr.log') or
name.endswith('.pid') or
(name.endswith('.sh') and name.startswith('on_'))
)
mime_sizes = defaultdict(int)
total_size = 0
output_files = {}
@@ -2752,7 +2763,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
for file_path in plugin_dir.rglob('*'):
if not file_path.is_file():
continue
if file_path.name in exclude_names:
if is_hook_output_file(file_path.name):
continue
try:
@@ -2810,10 +2821,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
}
process_hook_records(filtered_records, overrides=overrides)
# Cleanup PID files and empty logs
pid_file = plugin_dir / 'hook.pid'
# Cleanup PID files and empty logs (hook-specific names)
pid_file = plugin_dir / f'{hook_basename}.pid'
pid_file.unlink(missing_ok=True)
stderr_file = plugin_dir / 'stderr.log'
stderr_file = plugin_dir / f'{hook_basename}.stderr.log'
if stdout_file.exists() and stdout_file.stat().st_size == 0:
stdout_file.unlink()
if stderr_file.exists() and stderr_file.stat().st_size == 0:
@@ -2919,7 +2930,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugin_dir = Path(self.pwd) if self.pwd else None
if not plugin_dir:
return False
pid_file = plugin_dir / 'hook.pid'
# Use hook-specific pid filename
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
pid_file = plugin_dir / f'{hook_basename}.pid'
return pid_file.exists()

View File

@@ -365,11 +365,14 @@ def run_hook(
# Old convention: __background in stem (for backwards compatibility)
is_background = '.bg.' in script.name or '__background' in script.stem
# Set up output files for ALL hooks (useful for debugging)
stdout_file = output_dir / 'stdout.log'
stderr_file = output_dir / 'stderr.log'
pid_file = output_dir / 'hook.pid'
cmd_file = output_dir / 'cmd.sh'
# Set up output files for ALL hooks - use hook-specific names to avoid conflicts
# when multiple hooks run in the same plugin directory
# e.g., on_Snapshot__20_chrome_tab.bg.js -> on_Snapshot__20_chrome_tab.bg.stdout.log
hook_basename = script.stem # e.g., "on_Snapshot__20_chrome_tab.bg"
stdout_file = output_dir / f'{hook_basename}.stdout.log'
stderr_file = output_dir / f'{hook_basename}.stderr.log'
pid_file = output_dir / f'{hook_basename}.pid'
cmd_file = output_dir / f'{hook_basename}.sh'
try:
# Write command script for validation
@@ -421,8 +424,14 @@ def run_hook(
# Detect new files created by the hook
files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
# Exclude the log files themselves from new_files
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
# Exclude the log/pid/sh files themselves from new_files (hook-specific names)
hook_output_files = {
f'{hook_basename}.stdout.log',
f'{hook_basename}.stderr.log',
f'{hook_basename}.pid',
f'{hook_basename}.sh',
}
new_files = [f for f in new_files if f not in hook_output_files]
# Parse JSONL output from stdout
# Each line starting with { that has 'type' field is a record
@@ -1235,15 +1244,16 @@ def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = Tru
Kill process in PID file with optional validation.
Args:
pid_file: Path to hook.pid file
pid_file: Path to hook-specific .pid file (e.g., on_Snapshot__20_chrome_tab.bg.pid)
sig: Signal to send (default SIGTERM)
validate: If True, validate process identity before killing (default: True)
"""
from archivebox.misc.process_utils import safe_kill_process
if validate:
# Use safe kill with validation
cmd_file = pid_file.parent / 'cmd.sh'
# Derive cmd file from pid file: on_Snapshot__20_chrome_tab.bg.pid -> on_Snapshot__20_chrome_tab.bg.sh
cmd_file = pid_file.with_suffix('.sh')
safe_kill_process(pid_file, cmd_file, signal_num=sig)
else:
# Legacy behavior - kill without validation

View File

@@ -533,9 +533,9 @@ async function killChrome(pid, outputDir = null) {
}
// Step 8: Clean up PID files
// Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup()
if (outputDir) {
try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {}
}
console.error('[*] Chrome cleanup completed');

View File

@@ -143,12 +143,11 @@ async function main() {
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
}
// Write hook's own PID
const hookStartTime = Date.now() / 1000;
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
// Launch Chromium using consolidated function
const result = await launchChromium({

View File

@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
function parseArgs() {
@@ -221,8 +221,8 @@ async function main() {
// Set up listeners BEFORE navigation
await setupListeners();
// Write PID file so chrome_cleanup can kill any remaining processes
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();

View File

@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'redirects';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'redirects.jsonl';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
// Global state
@@ -274,8 +274,8 @@ async function main() {
// Set up redirect listener BEFORE navigation
await setupRedirectListener();
// Write PID file
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();

View File

@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'responses';
const OUTPUT_DIR = '.';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
// Resource types to capture (by default, capture everything)
@@ -323,8 +323,8 @@ async function main() {
// Set up listener BEFORE navigation
await setupListener();
// Write PID file
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();

View File

@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'ssl';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.jsonl';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
function parseArgs() {
@@ -211,8 +211,8 @@ async function main() {
// Set up listener BEFORE navigation
await setupListener(url);
// Write PID file so chrome_cleanup can kill any remaining processes
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();

View File

@@ -18,7 +18,7 @@ const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'staticfile';
const OUTPUT_DIR = '.';
const PID_FILE = 'hook.pid';
// PID file is now written by run_hook() with hook-specific name
const CHROME_SESSION_DIR = '../chrome';
// Content-Types that indicate static files
@@ -398,8 +398,8 @@ async function main() {
// Set up static file listener BEFORE navigation
await setupStaticFileListener();
// Write PID file
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
// Note: PID file is written by run_hook() with hook-specific name
// Snapshot.cleanup() kills all *.pid processes when done
// Wait for chrome_navigate to complete (BLOCKING)
await waitForNavigation();