mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
use hook-specific filenames to avoid overwrites
Multiple hooks in the same plugin directory were overwriting each other's stdout.log, stderr.log, hook.pid, and cmd.sh files. Now each hook uses filenames prefixed with its hook name: - on_Snapshot__20_chrome_tab.bg.stdout.log - on_Snapshot__20_chrome_tab.bg.stderr.log - on_Snapshot__20_chrome_tab.bg.pid - on_Snapshot__20_chrome_tab.bg.sh Updated: - hooks.py run_hook() to use hook-specific names - core/models.py cleanup and update_from_output methods - Plugin scripts to no longer write redundant hook.pid files
This commit is contained in:
@@ -1435,10 +1435,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
if not self.OUTPUT_DIR.exists():
|
||||
return False
|
||||
|
||||
for plugin_dir in self.OUTPUT_DIR.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
continue
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
# Check all .pid files in the snapshot directory (hook-specific names)
|
||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||
if process_is_alive(pid_file):
|
||||
return True
|
||||
|
||||
@@ -2702,8 +2700,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
self.save()
|
||||
return
|
||||
|
||||
# Read and parse JSONL output from stdout.log
|
||||
stdout_file = plugin_dir / 'stdout.log'
|
||||
# Derive hook basename for hook-specific filenames
|
||||
# e.g., "on_Snapshot__50_wget.py" -> "on_Snapshot__50_wget"
|
||||
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
|
||||
|
||||
# Read and parse JSONL output from hook-specific stdout log
|
||||
stdout_file = plugin_dir / f'{hook_basename}.stdout.log'
|
||||
stdout = stdout_file.read_text() if stdout_file.exists() else ''
|
||||
|
||||
records = []
|
||||
@@ -2744,7 +2746,16 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
self.output_str = 'Hook did not output ArchiveResult record'
|
||||
|
||||
# Walk filesystem and populate output_files, output_size, output_mimetypes
|
||||
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
|
||||
# Exclude hook output files (hook-specific names like on_Snapshot__50_wget.stdout.log)
|
||||
def is_hook_output_file(name: str) -> bool:
|
||||
"""Check if a file is a hook output file that should be excluded."""
|
||||
return (
|
||||
name.endswith('.stdout.log') or
|
||||
name.endswith('.stderr.log') or
|
||||
name.endswith('.pid') or
|
||||
(name.endswith('.sh') and name.startswith('on_'))
|
||||
)
|
||||
|
||||
mime_sizes = defaultdict(int)
|
||||
total_size = 0
|
||||
output_files = {}
|
||||
@@ -2752,7 +2763,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
for file_path in plugin_dir.rglob('*'):
|
||||
if not file_path.is_file():
|
||||
continue
|
||||
if file_path.name in exclude_names:
|
||||
if is_hook_output_file(file_path.name):
|
||||
continue
|
||||
|
||||
try:
|
||||
@@ -2810,10 +2821,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
}
|
||||
process_hook_records(filtered_records, overrides=overrides)
|
||||
|
||||
# Cleanup PID files and empty logs
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
# Cleanup PID files and empty logs (hook-specific names)
|
||||
pid_file = plugin_dir / f'{hook_basename}.pid'
|
||||
pid_file.unlink(missing_ok=True)
|
||||
stderr_file = plugin_dir / 'stderr.log'
|
||||
stderr_file = plugin_dir / f'{hook_basename}.stderr.log'
|
||||
if stdout_file.exists() and stdout_file.stat().st_size == 0:
|
||||
stdout_file.unlink()
|
||||
if stderr_file.exists() and stderr_file.stat().st_size == 0:
|
||||
@@ -2919,7 +2930,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
plugin_dir = Path(self.pwd) if self.pwd else None
|
||||
if not plugin_dir:
|
||||
return False
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
# Use hook-specific pid filename
|
||||
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
|
||||
pid_file = plugin_dir / f'{hook_basename}.pid'
|
||||
return pid_file.exists()
|
||||
|
||||
|
||||
|
||||
@@ -365,11 +365,14 @@ def run_hook(
|
||||
# Old convention: __background in stem (for backwards compatibility)
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
|
||||
# Set up output files for ALL hooks (useful for debugging)
|
||||
stdout_file = output_dir / 'stdout.log'
|
||||
stderr_file = output_dir / 'stderr.log'
|
||||
pid_file = output_dir / 'hook.pid'
|
||||
cmd_file = output_dir / 'cmd.sh'
|
||||
# Set up output files for ALL hooks - use hook-specific names to avoid conflicts
|
||||
# when multiple hooks run in the same plugin directory
|
||||
# e.g., on_Snapshot__20_chrome_tab.bg.js -> on_Snapshot__20_chrome_tab.bg.stdout.log
|
||||
hook_basename = script.stem # e.g., "on_Snapshot__20_chrome_tab.bg"
|
||||
stdout_file = output_dir / f'{hook_basename}.stdout.log'
|
||||
stderr_file = output_dir / f'{hook_basename}.stderr.log'
|
||||
pid_file = output_dir / f'{hook_basename}.pid'
|
||||
cmd_file = output_dir / f'{hook_basename}.sh'
|
||||
|
||||
try:
|
||||
# Write command script for validation
|
||||
@@ -421,8 +424,14 @@ def run_hook(
|
||||
# Detect new files created by the hook
|
||||
files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
|
||||
new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
|
||||
# Exclude the log files themselves from new_files
|
||||
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
|
||||
# Exclude the log/pid/sh files themselves from new_files (hook-specific names)
|
||||
hook_output_files = {
|
||||
f'{hook_basename}.stdout.log',
|
||||
f'{hook_basename}.stderr.log',
|
||||
f'{hook_basename}.pid',
|
||||
f'{hook_basename}.sh',
|
||||
}
|
||||
new_files = [f for f in new_files if f not in hook_output_files]
|
||||
|
||||
# Parse JSONL output from stdout
|
||||
# Each line starting with { that has 'type' field is a record
|
||||
@@ -1235,15 +1244,16 @@ def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = Tru
|
||||
Kill process in PID file with optional validation.
|
||||
|
||||
Args:
|
||||
pid_file: Path to hook.pid file
|
||||
pid_file: Path to hook-specific .pid file (e.g., on_Snapshot__20_chrome_tab.bg.pid)
|
||||
sig: Signal to send (default SIGTERM)
|
||||
validate: If True, validate process identity before killing (default: True)
|
||||
"""
|
||||
from archivebox.misc.process_utils import safe_kill_process
|
||||
|
||||
|
||||
if validate:
|
||||
# Use safe kill with validation
|
||||
cmd_file = pid_file.parent / 'cmd.sh'
|
||||
# Derive cmd file from pid file: on_Snapshot__20_chrome_tab.bg.pid -> on_Snapshot__20_chrome_tab.bg.sh
|
||||
cmd_file = pid_file.with_suffix('.sh')
|
||||
safe_kill_process(pid_file, cmd_file, signal_num=sig)
|
||||
else:
|
||||
# Legacy behavior - kill without validation
|
||||
|
||||
@@ -533,9 +533,9 @@ async function killChrome(pid, outputDir = null) {
|
||||
}
|
||||
|
||||
// Step 8: Clean up PID files
|
||||
// Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup()
|
||||
if (outputDir) {
|
||||
try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
|
||||
try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {}
|
||||
}
|
||||
|
||||
console.error('[*] Chrome cleanup completed');
|
||||
|
||||
@@ -143,12 +143,11 @@ async function main() {
|
||||
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
|
||||
}
|
||||
|
||||
// Write hook's own PID
|
||||
const hookStartTime = Date.now() / 1000;
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
||||
|
||||
// Launch Chromium using consolidated function
|
||||
const result = await launchChromium({
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const PLUGIN_NAME = 'consolelog';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'console.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
function parseArgs() {
|
||||
@@ -221,8 +221,8 @@ async function main() {
|
||||
// Set up listeners BEFORE navigation
|
||||
await setupListeners();
|
||||
|
||||
// Write PID file so chrome_cleanup can kill any remaining processes
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const PLUGIN_NAME = 'redirects';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'redirects.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Global state
|
||||
@@ -274,8 +274,8 @@ async function main() {
|
||||
// Set up redirect listener BEFORE navigation
|
||||
await setupRedirectListener();
|
||||
|
||||
// Write PID file
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'responses';
|
||||
const OUTPUT_DIR = '.';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Resource types to capture (by default, capture everything)
|
||||
@@ -323,8 +323,8 @@ async function main() {
|
||||
// Set up listener BEFORE navigation
|
||||
await setupListener();
|
||||
|
||||
// Write PID file
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -19,7 +19,7 @@ const puppeteer = require('puppeteer-core');
|
||||
const PLUGIN_NAME = 'ssl';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'ssl.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
function parseArgs() {
|
||||
@@ -211,8 +211,8 @@ async function main() {
|
||||
// Set up listener BEFORE navigation
|
||||
await setupListener(url);
|
||||
|
||||
// Write PID file so chrome_cleanup can kill any remaining processes
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
@@ -18,7 +18,7 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'staticfile';
|
||||
const OUTPUT_DIR = '.';
|
||||
const PID_FILE = 'hook.pid';
|
||||
// PID file is now written by run_hook() with hook-specific name
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Content-Types that indicate static files
|
||||
@@ -398,8 +398,8 @@ async function main() {
|
||||
// Set up static file listener BEFORE navigation
|
||||
await setupStaticFileListener();
|
||||
|
||||
// Write PID file
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, PID_FILE), String(process.pid));
|
||||
// Note: PID file is written by run_hook() with hook-specific name
|
||||
// Snapshot.cleanup() kills all *.pid processes when done
|
||||
|
||||
// Wait for chrome_navigate to complete (BLOCKING)
|
||||
await waitForNavigation();
|
||||
|
||||
Reference in New Issue
Block a user