mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Fix hook file overwrites in plugin directory (#1732)
Multiple hooks in the same plugin directory were overwriting each
other's stdout.log, stderr.log, hook.pid, and cmd.sh files. Now each
hook uses filenames prefixed with its hook name:
- on_Snapshot__20_chrome_tab.bg.stdout.log
- on_Snapshot__20_chrome_tab.bg.stderr.log
- on_Snapshot__20_chrome_tab.bg.pid
- on_Snapshot__20_chrome_tab.bg.sh
Updated:
- hooks.py run_hook() to use hook-specific names
- core/models.py cleanup and update_from_output methods
- Plugin scripts to no longer write redundant hook.pid files
<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->
# Summary
<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->
# Related issues
<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->
# Changes these areas
- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Prevented hook file collisions by giving each hook its own stdout,
stderr, pid, and cmd filenames. This fixes mixed logs and ensures
correct cleanup and status checks when multiple hooks run in the same
plugin directory.
- **Bug Fixes**
- hooks.py: write hook-specific stdout/stderr/pid/cmd files and exclude
them from new_files; derive cmd.sh from pid for safe kill.
- core/models.py: read hook-specific logs; exclude hook output files
when computing outputs; cleanup and background detection use *.pid.
- Plugins: stop writing redundant hook.pid files; minor chrome utils
cleanup.
<sup>Written for commit 754b096193.
Summary will update on new commits.</sup>
<!-- End of auto-generated description by cubic. -->
This commit is contained in:
@@ -1435,10 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
if not self.OUTPUT_DIR.exists():
|
||||
return False
|
||||
|
||||
for plugin_dir in self.OUTPUT_DIR.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
continue
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
# Check all .pid files in the snapshot directory (hook-specific names)
|
||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||
if process_is_alive(pid_file):
|
||||
return True
|
||||
|
||||
@@ -2702,8 +2700,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
self.save()
|
||||
return
|
||||
|
||||
# Read and parse JSONL output from stdout.log
|
||||
stdout_file = plugin_dir / 'stdout.log'
|
||||
# Derive hook basename for hook-specific filenames
|
||||
# e.g., "on_Snapshot__50_wget.py" -> "on_Snapshot__50_wget"
|
||||
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
|
||||
|
||||
# Read and parse JSONL output from hook-specific stdout log
|
||||
stdout_file = plugin_dir / f'{hook_basename}.stdout.log'
|
||||
stdout = stdout_file.read_text() if stdout_file.exists() else ''
|
||||
|
||||
records = []
|
||||
@@ -2744,7 +2746,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
self.output_str = 'Hook did not output ArchiveResult record'
|
||||
|
||||
# Walk filesystem and populate output_files, output_size, output_mimetypes
|
||||
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
|
||||
# Exclude hook output files (hook-specific names like on_Snapshot__50_wget.stdout.log)
|
||||
def is_hook_output_file(name: str) -> bool:
|
||||
"""Check if a file is a hook output file that should be excluded."""
|
||||
return (
|
||||
name.endswith('.stdout.log') or
|
||||
name.endswith('.stderr.log') or
|
||||
name.endswith('.pid') or
|
||||
(name.endswith('.sh') and name.startswith('on_'))
|
||||
)
|
||||
|
||||
mime_sizes = defaultdict(int)
|
||||
total_size = 0
|
||||
output_files = {}
|
||||
@@ -2752,7 +2763,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
for file_path in plugin_dir.rglob('*'):
|
||||
if not file_path.is_file():
|
||||
continue
|
||||
if file_path.name in exclude_names:
|
||||
if is_hook_output_file(file_path.name):
|
||||
continue
|
||||
|
||||
try:
|
||||
@@ -2810,10 +2821,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
}
|
||||
process_hook_records(filtered_records, overrides=overrides)
|
||||
|
||||
# Cleanup PID files and empty logs
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
# Cleanup PID files and empty logs (hook-specific names)
|
||||
pid_file = plugin_dir / f'{hook_basename}.pid'
|
||||
pid_file.unlink(missing_ok=True)
|
||||
stderr_file = plugin_dir / 'stderr.log'
|
||||
stderr_file = plugin_dir / f'{hook_basename}.stderr.log'
|
||||
if stdout_file.exists() and stdout_file.stat().st_size == 0:
|
||||
stdout_file.unlink()
|
||||
if stderr_file.exists() and stderr_file.stat().st_size == 0:
|
||||
@@ -2919,7 +2930,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
plugin_dir = Path(self.pwd) if self.pwd else None
|
||||
if not plugin_dir:
|
||||
return False
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
# Use hook-specific pid filename
|
||||
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
|
||||
pid_file = plugin_dir / f'{hook_basename}.pid'
|
||||
return pid_file.exists()
|
||||
|
||||
|
||||
|
||||
@@ -365,11 +365,14 @@ def run_hook(
|
||||
# Old convention: __background in stem (for backwards compatibility)
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
|
||||
# Set up output files for ALL hooks (useful for debugging)
|
||||
stdout_file = output_dir / 'stdout.log'
|
||||
stderr_file = output_dir / 'stderr.log'
|
||||
pid_file = output_dir / 'hook.pid'
|
||||
cmd_file = output_dir / 'cmd.sh'
|
||||
# Set up output files for ALL hooks - use hook-specific names to avoid conflicts
|
||||
# when multiple hooks run in the same plugin directory
|
||||
# e.g., on_Snapshot__20_chrome_tab.bg.js -> on_Snapshot__20_chrome_tab.bg.stdout.log
|
||||
hook_basename = script.stem # e.g., "on_Snapshot__20_chrome_tab.bg"
|
||||
stdout_file = output_dir / f'{hook_basename}.stdout.log'
|
||||
stderr_file = output_dir / f'{hook_basename}.stderr.log'
|
||||
pid_file = output_dir / f'{hook_basename}.pid'
|
||||
cmd_file = output_dir / f'{hook_basename}.sh'
|
||||
|
||||
try:
|
||||
# Write command script for validation
|
||||
@@ -421,8 +424,14 @@ def run_hook(
|
||||
# Detect new files created by the hook
|
||||
files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
|
||||
new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
|
||||
# Exclude the log files themselves from new_files
|
||||
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
|
||||
# Exclude the log/pid/sh files themselves from new_files (hook-specific names)
|
||||
hook_output_files = {
|
||||
f'{hook_basename}.stdout.log',
|
||||
f'{hook_basename}.stderr.log',
|
||||
f'{hook_basename}.pid',
|
||||
f'{hook_basename}.sh',
|
||||
}
|
||||
new_files = [f for f in new_files if f not in hook_output_files]
|
||||
|
||||
# Parse JSONL output from stdout
|
||||
# Each line starting with { that has 'type' field is a record
|
||||
@@ -1235,15 +1244,16 @@ def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = Tru
|
||||
Kill process in PID file with optional validation.
|
||||
|
||||
Args:
|
||||
pid_file: Path to hook.pid file
|
||||
pid_file: Path to hook-specific .pid file (e.g., on_Snapshot__20_chrome_tab.bg.pid)
|
||||
sig: Signal to send (default SIGTERM)
|
||||
validate: If True, validate process identity before killing (default: True)
|
||||
"""
|
||||
from archivebox.misc.process_utils import safe_kill_process
|
||||
|
||||
|
||||
if validate:
|
||||
# Use safe kill with validation
|
||||
cmd_file = pid_file.parent / 'cmd.sh'
|
||||
# Derive cmd file from pid file: on_Snapshot__20_chrome_tab.bg.pid -> on_Snapshot__20_chrome_tab.bg.sh
|
||||
cmd_file = pid_file.with_suffix('.sh')
|
||||
safe_kill_process(pid_file, cmd_file, signal_num=sig)
|
||||
else:
|
||||
# Legacy behavior - kill without validation
|
||||
|
||||
@@ -533,9 +533,9 @@ async function killChrome(pid, outputDir = null) {
|
||||
}
|
||||
|
||||
// Step 8: Clean up PID files
|
||||
// Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup()
|
||||
if (outputDir) {
|
||||
try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
|
||||
try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {}
|
||||
}
|
||||
|
||||
console.error('[*] Chrome cleanup completed');
|
||||
|
||||
@@ -143,12 +143,11 @@ async function main() {
|
||||
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
|
||||
}
|
||||
|
||||
// Write hook's own PID
|
||||
const hookStartTime = Date.now() / 1000;
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
||||
|
||||
// Launch Chromium using consolidated function
|
||||
const result = await launchChromium({
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const PLUGIN_NAME = 'consolelog';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'console.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
function parseArgs() {
|
||||
@@ -221,8 +221,8 @@ async function main() {
|
||||
// Set up listeners BEFORE navigation
|
||||
await setupListeners();
|
||||
|
||||
// Write PID file so chrome_cleanup can kill any remaining processes
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const PLUGIN_NAME = 'redirects';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'redirects.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Global state
|
||||
@@ -274,8 +274,8 @@ async function main() {
|
||||
// Set up redirect listener BEFORE navigation
|
||||
await setupRedirectListener();
|
||||
|
||||
// Write PID file
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'responses';
|
||||
const OUTPUT_DIR = '.';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Resource types to capture (by default, capture everything)
|
||||
@@ -323,8 +323,8 @@ async function main() {
|
||||
// Set up listener BEFORE navigation
|
||||
await setupListener();
|
||||
|
||||
// Write PID file
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const PLUGIN_NAME = 'ssl';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'ssl.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
function parseArgs() {
|
||||
@@ -211,8 +211,8 @@ async function main() {
|
||||
// Set up listener BEFORE navigation
|
||||
await setupListener(url);
|
||||
|
||||
// Write PID file so chrome_cleanup can kill any remaining processes
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -18,7 +18,7 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'staticfile';
|
||||
const OUTPUT_DIR = '.';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Content-Types that indicate static files
|
||||
@@ -398,8 +398,8 @@ async function main() {
|
||||
// Set up static file listener BEFORE navigation
|
||||
await setupStaticFileListener();
|
||||
|
||||
// Write PID file
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
Reference in New Issue
Block a user