"""
|
|
Hook discovery and execution system for ArchiveBox plugins.
|
|
|
|
Hooks are standalone scripts that run as separate processes and communicate
|
|
with ArchiveBox via CLI arguments and stdout JSON output. This keeps the plugin
|
|
system simple and language-agnostic.
|
|
|
|
Directory structure:
|
|
archivebox/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in)
|
|
data/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (user)
|
|
|
|
Hook contract:
|
|
Input: --url=<url> (and other --key=value args)
|
|
Output: JSON to stdout, files to $PWD
|
|
Exit: 0 = success, non-zero = failure
|
|
|
|
Execution order:
|
|
- Extractors run sequentially within each Snapshot (ordered by numeric prefix)
|
|
- Multiple Snapshots can process in parallel
|
|
- Failed extractors don't block subsequent extractors
|
|
|
|
Dependency handling:
|
|
Extractor plugins that depend on other plugins' output should check at runtime:
|
|
|
|
```python
|
|
# Example: screenshot plugin depends on chrome plugin
|
|
chrome_session_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome_session'
|
|
if not (chrome_session_dir / 'session.json').exists():
|
|
print('{"status": "skipped", "output": "chrome_session not available"}')
|
|
sys.exit(1) # Exit non-zero so it gets retried later
|
|
```
|
|
|
|
On retry (Snapshot.retry_failed_archiveresults()):
|
|
- Only FAILED/SKIPPED plugins reset to queued (SUCCEEDED stays)
|
|
- Run in order again
|
|
- If dependencies now succeed, dependents can run
|
|
|
|
API (all hook logic lives here):
|
|
discover_hooks(event) -> List[Path] Find hook scripts
|
|
run_hook(script, ...) -> HookResult Execute a hook script
|
|
run_hooks(event, ...) -> List[HookResult] Run all hooks for an event
|
|
"""

__package__ = 'archivebox'

import os
import json
import signal
import time
import subprocess

from pathlib import Path
from typing import List, Dict, Any, Optional, TypedDict

from django.conf import settings


# Plugin directories
BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins'
USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'


class HookResult(TypedDict, total=False):
    """Raw result from run_hook()."""
    returncode: int
    stdout: str
    stderr: str
    output_json: Optional[Dict[str, Any]]
    output_files: List[str]
    duration_ms: int
    hook: str
    plugin: str       # Plugin name (directory name, e.g., 'wget', 'screenshot')
    hook_name: str    # Full hook filename (e.g., 'on_Snapshot__50_wget.py')
    # Fields for JSONL parsing
    records: List[Dict[str, Any]]  # Parsed JSONL records with a 'type' field


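# Example: shape of a successful HookResult (a sketch; the values are
# illustrative and assume a hypothetical 'wget' plugin hook):
#
#   {
#       'returncode': 0,
#       'stdout': '{"type": "ArchiveResult", "status": "succeeded"}\n',
#       'stderr': '',
#       'output_json': {'type': 'ArchiveResult', 'status': 'succeeded'},
#       'output_files': ['example.com/index.html'],
#       'duration_ms': 1234,
#       'hook': '.../plugins/wget/on_Snapshot__50_wget.py',
#       'plugin': 'wget',
#       'hook_name': 'on_Snapshot__50_wget.py',
#       'records': [{'type': 'ArchiveResult', 'status': 'succeeded', ...}],
#   }

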
def discover_hooks(event_name: str) -> List[Path]:
    """
    Find all hook scripts matching the on_{event_name}__*.{sh,py,js} pattern.

    Searches both built-in and user plugin directories.
    Returns scripts sorted alphabetically by filename for deterministic execution order.

    Hook naming convention uses numeric prefixes to control order:
        on_Snapshot__10_title.py          # runs first
        on_Snapshot__15_singlefile.py     # runs second
        on_Snapshot__26_readability.py    # runs later (depends on singlefile)

    Example:
        discover_hooks('Snapshot')
        # Returns: [Path('.../on_Snapshot__10_title.py'), Path('.../on_Snapshot__15_singlefile.py'), ...]
    """
    hooks = []

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        # Search for hook scripts in all plugin subdirectories
        for ext in ('sh', 'py', 'js'):
            pattern = f'*/on_{event_name}__*.{ext}'
            hooks.extend(base_dir.glob(pattern))

            # Also check for hooks directly in the plugins directory
            pattern_direct = f'on_{event_name}__*.{ext}'
            hooks.extend(base_dir.glob(pattern_direct))

    # Sort by filename (not full path) so numeric prefix ordering works across dirs,
    # e.g. on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
    return sorted(set(hooks), key=lambda p: p.name)


def discover_all_hooks() -> Dict[str, List[Path]]:
    """
    Discover all hooks, organized by event name.

    Returns a dict mapping event names to lists of hook script paths.
    """
    hooks_by_event: Dict[str, List[Path]] = {}

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for ext in ('sh', 'py', 'js'):
            for hook_path in base_dir.glob(f'*/on_*__*.{ext}'):
                # Extract event name from filename: on_EventName__hook_name.ext
                filename = hook_path.stem  # on_EventName__hook_name
                if filename.startswith('on_') and '__' in filename:
                    event_name = filename[3:].split('__')[0]  # EventName
                    hooks_by_event.setdefault(event_name, []).append(hook_path)

    # Sort hooks within each event by filename (numeric prefix ordering)
    for event_name in hooks_by_event:
        hooks_by_event[event_name] = sorted(set(hooks_by_event[event_name]), key=lambda p: p.name)

    return hooks_by_event


def run_hook(
    script: Path,
    output_dir: Path,
    timeout: int = 300,
    config_objects: Optional[List[Any]] = None,
    **kwargs: Any,
) -> Optional[HookResult]:
    """
    Execute a hook script with the given arguments.

    This is the low-level hook executor. For running extractors with proper
    metadata handling, use call_extractor() instead.

    Config is passed to hooks via environment variables with this priority
    (later sources override earlier ones):
        1. Plugin schema defaults (config.json)
        2. Config file (ArchiveBox.conf)
        3. Environment variables
        4. Machine.config (auto-included, lowest priority among config_objects)
        5. config_objects (in order - later objects override earlier ones)

    Args:
        script: Path to the hook script (.sh, .py, or .js)
        output_dir: Working directory for the script (where output files go)
        timeout: Maximum execution time in seconds
        config_objects: Optional list of objects with .config JSON fields
                        (e.g., [crawl, snapshot] - later items have higher priority)
        **kwargs: Arguments passed to the script as --key=value

    Returns:
        HookResult with 'returncode', 'stdout', 'stderr', 'output_json',
        'output_files', 'duration_ms', 'records', etc.
        Background hooks (.bg. suffix) return None immediately and keep
        running; their completion is polled later via their pid/log files.
    """
    start_time = time.time()

    # Auto-include Machine.config at the start (lowest priority among config_objects)
    from machine.models import Machine
    machine = Machine.current()
    all_config_objects = [machine] + list(config_objects or [])

    if not script.exists():
        return HookResult(
            returncode=1,
            stdout='',
            stderr=f'Hook script not found: {script}',
            output_json=None,
            output_files=[],
            duration_ms=0,
            hook=str(script),
            plugin=script.parent.name,
            hook_name=script.name,
            records=[],
        )

    # Determine the interpreter based on file extension
    ext = script.suffix.lower()
    if ext == '.sh':
        cmd = ['bash', str(script)]
    elif ext == '.py':
        cmd = ['python3', str(script)]
    elif ext == '.js':
        cmd = ['node', str(script)]
    else:
        # Try to execute directly (assumes shebang)
        cmd = [str(script)]

    # Build CLI arguments from kwargs
    for key, value in kwargs.items():
        # Skip keys that start with underscore (internal parameters)
        if key.startswith('_'):
            continue

        arg_key = f'--{key.replace("_", "-")}'
        if isinstance(value, bool):
            if value:
                cmd.append(arg_key)
        elif value is not None and value != '':
            # JSON-encode complex values, use str() for simple ones.
            # Skip empty strings to avoid --key= which breaks argument parsers.
            if isinstance(value, (dict, list)):
                cmd.append(f'{arg_key}={json.dumps(value)}')
            else:
                # Ensure value is converted to string and strip whitespace
                str_value = str(value).strip()
                if str_value:  # Only add if non-empty after stripping
                    cmd.append(f'{arg_key}={str_value}')

    # Set up environment with base paths
    env = os.environ.copy()
    env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
    env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
    env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))

    # If a Crawl is in config_objects, pass its OUTPUT_DIR for hooks that need to find crawl-level resources
    for obj in all_config_objects:
        if hasattr(obj, 'OUTPUT_DIR') and hasattr(obj, 'get_urls_list'):  # Duck-type check for Crawl
            env['CRAWL_OUTPUT_DIR'] = str(obj.OUTPUT_DIR)
            break

    # Build overrides from any objects with .config fields (in order, later overrides earlier).
    # all_config_objects includes Machine at the start, then any passed config_objects.
    overrides = {}
    for obj in all_config_objects:
        if obj and hasattr(obj, 'config') and obj.config:
            # Strip 'config/' prefix from Machine.config keys (e.g., 'config/CHROME_BINARY' -> 'CHROME_BINARY')
            for key, value in obj.config.items():
                clean_key = key.removeprefix('config/')
                overrides[clean_key] = value

    # Get plugin config from JSON schemas with hierarchy resolution.
    # This merges: schema defaults -> config file -> env vars -> object config overrides.
    plugin_config = get_flat_plugin_config(overrides=overrides or None)
    export_plugin_config_to_env(plugin_config, env)

    # Also pass core config values that aren't in plugin schemas yet.
    # These are legacy values that may still be needed.
    from archivebox import config
    env.setdefault('CHROME_BINARY', str(getattr(config, 'CHROME_BINARY', '')))
    env.setdefault('WGET_BINARY', str(getattr(config, 'WGET_BINARY', '')))
    env.setdefault('CURL_BINARY', str(getattr(config, 'CURL_BINARY', '')))
    env.setdefault('GIT_BINARY', str(getattr(config, 'GIT_BINARY', '')))
    env.setdefault('YOUTUBEDL_BINARY', str(getattr(config, 'YOUTUBEDL_BINARY', '')))
    env.setdefault('SINGLEFILE_BINARY', str(getattr(config, 'SINGLEFILE_BINARY', '')))
    env.setdefault('READABILITY_BINARY', str(getattr(config, 'READABILITY_BINARY', '')))
    env.setdefault('MERCURY_BINARY', str(getattr(config, 'MERCURY_BINARY', '')))
    env.setdefault('NODE_BINARY', str(getattr(config, 'NODE_BINARY', '')))
    env.setdefault('TIMEOUT', str(getattr(config, 'TIMEOUT', 60)))
    env.setdefault('CHECK_SSL_VALIDITY', str(getattr(config, 'CHECK_SSL_VALIDITY', True)))
    env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
    env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))

    # Pass SEARCH_BACKEND_ENGINE from new-style config
    try:
        from archivebox.config.configset import get_config
        search_config = get_config()
        env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
    except Exception:
        env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')

    # Create output directory if needed
    output_dir.mkdir(parents=True, exist_ok=True)

    # Capture files before execution to detect new output
    files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()

    # Detect if this is a background hook (long-running daemon).
    # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
    # Old convention: __background in stem (for backwards compatibility)
    is_background = '.bg.' in script.name or '__background' in script.stem

    # Set up output files for ALL hooks (useful for debugging)
    stdout_file = output_dir / 'stdout.log'
    stderr_file = output_dir / 'stderr.log'
    pid_file = output_dir / 'hook.pid'
    cmd_file = output_dir / 'cmd.sh'

    try:
        # Write command script for validation
        from archivebox.misc.process_utils import write_cmd_file
        write_cmd_file(cmd_file, cmd)

        # Open log files for writing
        with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err:
            process = subprocess.Popen(
                cmd,
                cwd=str(output_dir),
                stdout=out,
                stderr=err,
                env=env,
            )

        # Write PID with mtime set to process start time for validation
        from archivebox.misc.process_utils import write_pid_file_with_mtime
        process_start_time = time.time()
        write_pid_file_with_mtime(pid_file, process.pid, process_start_time)

        if is_background:
            # Background hook - return None immediately, don't wait.
            # The process continues running, writing to stdout.log;
            # ArchiveResult will poll for completion later.
            return None

        # Normal hook - wait for completion with timeout
        try:
            returncode = process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()  # Reap the killed process to avoid a zombie
            duration_ms = int((time.time() - start_time) * 1000)
            return HookResult(
                returncode=-1,
                stdout='',
                stderr=f'Hook timed out after {timeout} seconds',
                output_json=None,
                output_files=[],
                duration_ms=duration_ms,
                hook=str(script),
                plugin=script.parent.name,
                hook_name=script.name,
                records=[],
            )

        # Read output from files
        stdout = stdout_file.read_text() if stdout_file.exists() else ''
        stderr = stderr_file.read_text() if stderr_file.exists() else ''

        # Detect new files created by the hook
        files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
        new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
        # Exclude the runner's own bookkeeping files from new_files
        new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid', 'cmd.sh')]

        # Parse JSONL output from stdout:
        # each line starting with { that has a 'type' field is a record
        records = []
        output_json = None  # Last JSON object parsed from stdout (legacy single-object output)
        plugin_name = script.parent.name  # Plugin directory name (e.g., 'wget')
        hook_name = script.name           # Full hook filename (e.g., 'on_Snapshot__50_wget.py')

        for line in stdout.splitlines():
            line = line.strip()
            if not line or not line.startswith('{'):
                continue

            try:
                data = json.loads(line)
                output_json = data
                if 'type' in data:
                    # Add plugin metadata to every record
                    data['plugin'] = plugin_name
                    data['hook_name'] = hook_name
                    data['plugin_hook'] = str(script)
                    records.append(data)
            except json.JSONDecodeError:
                pass

        duration_ms = int((time.time() - start_time) * 1000)

        # Clean up log files on success (keep on failure for debugging)
        if returncode == 0:
            stdout_file.unlink(missing_ok=True)
            stderr_file.unlink(missing_ok=True)
            pid_file.unlink(missing_ok=True)

        return HookResult(
            returncode=returncode,
            stdout=stdout,
            stderr=stderr,
            output_json=output_json,
            output_files=new_files,
            duration_ms=duration_ms,
            hook=str(script),
            plugin=plugin_name,
            hook_name=hook_name,
            records=records,
        )

    except Exception as e:
        duration_ms = int((time.time() - start_time) * 1000)
        return HookResult(
            returncode=-1,
            stdout='',
            stderr=f'Failed to run hook: {type(e).__name__}: {e}',
            output_json=None,
            output_files=[],
            duration_ms=duration_ms,
            hook=str(script),
            plugin=script.parent.name,
            hook_name=script.name,
            records=[],
        )


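# Example: running a single hook synchronously (a sketch; the script path,
# output dir, and kwargs shown are illustrative):
#
#   result = run_hook(
#       Path('archivebox/plugins/wget/on_Snapshot__50_wget.py'),
#       output_dir=Path('/tmp/snapshot_output/wget'),
#       timeout=120,
#       url='https://example.com',   # becomes --url=https://example.com
#   )
#   if result is not None and result['returncode'] == 0:
#       print(result['output_files'], result['duration_ms'])

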
def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
    """
    Collect all urls.jsonl entries from parser plugin output subdirectories.

    Each parser plugin outputs urls.jsonl to its own subdir:
        snapshot_dir/parse_rss_urls/urls.jsonl
        snapshot_dir/parse_html_urls/urls.jsonl
        etc.

    This is not special handling - urls.jsonl is just a normal output file.
    This utility collects them all for the crawl system.
    """
    urls = []

    # Look in each immediate subdirectory for urls.jsonl
    if not snapshot_dir.exists():
        return urls

    for subdir in snapshot_dir.iterdir():
        if not subdir.is_dir():
            continue

        urls_file = subdir / 'urls.jsonl'
        if not urls_file.exists():
            continue

        try:
            with open(urls_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        entry = json.loads(line)
                        if entry.get('url'):
                            # Track which parser plugin found this URL
                            entry['plugin'] = subdir.name
                            urls.append(entry)
                    except json.JSONDecodeError:
                        continue
        except OSError:
            continue

    return urls


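# Example: what a parser plugin's urls.jsonl might contain (a sketch; fields
# beyond 'url' are illustrative - entries without a 'url' key are ignored):
#
#   {"url": "https://example.com/post/1", "title": "First post"}
#   {"url": "https://example.com/post/2", "depth": 1}
#
# collect_urls_from_plugins() tags each entry with the subdir that produced
# it, e.g. entry['plugin'] = 'parse_rss_urls'.

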
def run_hooks(
    event_name: str,
    output_dir: Path,
    timeout: int = 300,
    stop_on_failure: bool = False,
    config_objects: Optional[List[Any]] = None,
    **kwargs: Any,
) -> List[HookResult]:
    """
    Run all hooks for a given event.

    Args:
        event_name: The event name to trigger (e.g., 'Snapshot__wget')
        output_dir: Working directory for hook scripts
        timeout: Maximum execution time per hook
        stop_on_failure: If True, stop executing hooks after the first failure
        config_objects: Optional list of objects with .config JSON fields
                        (e.g., [crawl, snapshot] - later items have higher priority)
        **kwargs: Arguments passed to each hook script

    Returns:
        List of results from each hook execution (background hooks are
        excluded, since they return no result until they finish).
    """
    hooks = discover_hooks(event_name)
    results = []

    for hook in hooks:
        result = run_hook(hook, output_dir, timeout=timeout, config_objects=config_objects, **kwargs)

        # Background hooks return None - skip adding to results
        if result is None:
            continue

        result['hook'] = str(hook)
        results.append(result)

        if stop_on_failure and result['returncode'] != 0:
            break

    return results


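# Example: running every on_Snapshot hook for one URL (a sketch; the output
# dir and kwargs shown are illustrative):
#
#   results = run_hooks(
#       'Snapshot',
#       output_dir=Path('/data/archive/1234567890'),
#       url='https://example.com',
#       stop_on_failure=False,
#   )
#   failed = [r['plugin'] for r in results if r['returncode'] != 0]

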
def get_plugins() -> List[str]:
    """
    Get the list of available plugins by discovering Snapshot hooks.

    Returns plugin names (directory names) that contain on_Snapshot hooks.
    The plugin name is the plugin directory name, not the hook script name.

    Example:
        archivebox/plugins/chrome_session/on_Snapshot__20_chrome_tab.bg.js
        -> plugin = 'chrome_session'

    Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names).
    """
    plugins = []

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for ext in ('sh', 'py', 'js'):
            for hook_path in base_dir.glob(f'*/on_Snapshot__*.{ext}'):
                # Use the plugin directory name as the plugin name
                plugin_name = hook_path.parent.name
                plugins.append(plugin_name)

    return sorted(set(plugins))


def get_parser_plugins() -> List[str]:
    """
    Get the list of parser plugins by discovering parse_*_urls hooks.

    Parser plugins discover URLs from source files and output urls.jsonl.
    Returns plugin names like: ['50_parse_html_urls', '51_parse_rss_urls', ...]
    """
    return [p for p in get_plugins() if 'parse_' in p and '_urls' in p]


def get_plugin_name(plugin: str) -> str:
    """
    Get the base plugin name without its numeric prefix.

    Examples:
        '10_title'           -> 'title'
        '26_readability'     -> 'readability'
        '50_parse_html_urls' -> 'parse_html_urls'
    """
    # Split on the first underscore after any leading digits
    parts = plugin.split('_', 1)
    if len(parts) == 2 and parts[0].isdigit():
        return parts[1]
    return plugin


def is_parser_plugin(plugin: str) -> bool:
    """Check if a plugin is a parser plugin (one that discovers URLs)."""
    name = get_plugin_name(plugin)
    return name.startswith('parse_') and name.endswith('_urls')


# Precedence order for search indexing (lower number = higher priority).
# Used to select which plugin's output to use for full-text search.
# Plugin names here should match the part after the numeric prefix,
# e.g., '31_readability' -> 'readability'.
EXTRACTOR_INDEXING_PRECEDENCE = [
    ('readability', 1),
    ('mercury', 2),
    ('htmltotext', 3),
    ('singlefile', 4),
    ('dom', 5),
    ('wget', 6),
]


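# Example: choosing the best available extractor output for indexing
# (a sketch; `available` would be the plugin names that produced output):
#
#   def best_for_indexing(available: List[str]) -> Optional[str]:
#       ranked = dict(EXTRACTOR_INDEXING_PRECEDENCE)
#       candidates = [p for p in available if get_plugin_name(p) in ranked]
#       return min(candidates, key=lambda p: ranked[get_plugin_name(p)], default=None)
#
#   best_for_indexing(['50_wget', '31_readability'])  # -> '31_readability'

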
def get_enabled_plugins(config: Optional[Dict] = None) -> List[str]:
    """
    Get the list of enabled plugins based on config and available hooks.

    Checks for ENABLED_PLUGINS (or the legacy ENABLED_EXTRACTORS) in config,
    then falls back to discovering available hooks from the plugins directory.

    Returns plugin names sorted alphabetically (numeric prefix controls order).
    """
    if config:
        # Support both new and legacy config keys
        if 'ENABLED_PLUGINS' in config:
            return config['ENABLED_PLUGINS']
        if 'ENABLED_EXTRACTORS' in config:
            return config['ENABLED_EXTRACTORS']

    # Discover from hooks - this is the source of truth
    return get_plugins()


def discover_plugins_that_provide_interface(
    module_name: str,
    required_attrs: List[str],
    plugin_prefix: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Discover plugins that provide a specific Python module with a required interface.

    This enables dynamic plugin discovery for features like search backends,
    storage backends, etc. without hardcoding imports.

    Args:
        module_name: Name of the module to look for (e.g., 'search')
        required_attrs: Attributes the module must have (e.g., ['search', 'flush'])
        plugin_prefix: Optional prefix to filter plugins (e.g., 'search_backend_')

    Returns:
        Dict mapping backend names to imported modules.
        The backend name is the plugin directory name minus the prefix,
        e.g., search_backend_sqlite -> 'sqlite'

    Example:
        backends = discover_plugins_that_provide_interface(
            module_name='search',
            required_attrs=['search', 'flush'],
            plugin_prefix='search_backend_',
        )
        # Returns: {'sqlite': <module>, 'sonic': <module>, 'ripgrep': <module>}
    """
    import importlib.util

    backends = {}

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for plugin_dir in base_dir.iterdir():
            if not plugin_dir.is_dir():
                continue

            plugin_name = plugin_dir.name

            # Filter by prefix if specified
            if plugin_prefix and not plugin_name.startswith(plugin_prefix):
                continue

            # Look for the module file
            module_path = plugin_dir / f'{module_name}.py'
            if not module_path.exists():
                continue

            try:
                # Import the module dynamically
                spec = importlib.util.spec_from_file_location(
                    f'archivebox.plugins.{plugin_name}.{module_name}',
                    module_path,
                )
                if spec is None or spec.loader is None:
                    continue

                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)

                # Check for required attributes
                if not all(hasattr(module, attr) for attr in required_attrs):
                    continue

                # Derive the backend name from the plugin directory name
                if plugin_prefix:
                    backend_name = plugin_name[len(plugin_prefix):]
                else:
                    backend_name = plugin_name

                backends[backend_name] = module

            except Exception:
                # Skip plugins that fail to import
                continue

    return backends


def get_search_backends() -> Dict[str, Any]:
    """
    Discover all available search backend plugins.

    Search backends must provide a search.py module with:
        - search(query: str) -> List[str]   (returns snapshot IDs)
        - flush(snapshot_ids: Iterable[str]) -> None

    Returns:
        Dict mapping backend names to their modules,
        e.g., {'sqlite': <module>, 'sonic': <module>, 'ripgrep': <module>}
    """
    return discover_plugins_that_provide_interface(
        module_name='search',
        required_attrs=['search', 'flush'],
        plugin_prefix='search_backend_',
    )


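# Example: the skeleton of a search backend plugin's search.py (a sketch;
# 'search_backend_grep' is a hypothetical plugin directory name):
#
#   # plugins/search_backend_grep/search.py
#   from typing import Iterable, List
#
#   def search(query: str) -> List[str]:
#       """Return snapshot IDs whose archived text matches the query."""
#       return []
#
#   def flush(snapshot_ids: Iterable[str]) -> None:
#       """Remove the given snapshot IDs from the index."""
#       pass

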
def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
    """
    Discover all plugin config.json schemas.

    Each plugin can define a config.json file with a JSONSchema describing
    its configuration options. This function discovers and loads all such schemas.

    The config.json files use JSONSchema draft-07 with custom extensions:
        - x-fallback: Global config key to use as a fallback
        - x-aliases: List of old/alternative config key names

    Returns:
        Dict mapping plugin names to their parsed JSONSchema configs,
        e.g., {'wget': {...schema...}, 'chrome_session': {...schema...}}

    Example config.json:
        {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "SAVE_WGET": {"type": "boolean", "default": true},
                "WGET_TIMEOUT": {"type": "integer", "default": 60, "x-fallback": "TIMEOUT"}
            }
        }
    """
    configs = {}

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for plugin_dir in base_dir.iterdir():
            if not plugin_dir.is_dir():
                continue

            config_path = plugin_dir / 'config.json'
            if not config_path.exists():
                continue

            try:
                with open(config_path, 'r') as f:
                    schema = json.load(f)

                # Basic validation: must be an object schema with properties
                if not isinstance(schema, dict):
                    continue
                if schema.get('type') != 'object':
                    continue
                if 'properties' not in schema:
                    continue

                configs[plugin_dir.name] = schema

            except (json.JSONDecodeError, OSError) as e:
                # Log a warning but continue - a malformed config shouldn't break discovery
                import sys
                print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr)
                continue

    return configs


def get_merged_config_schema() -> Dict[str, Any]:
    """
    Get a merged JSONSchema combining all plugin config schemas.

    This creates a single schema that can validate all plugin config keys.
    Useful for validating the complete configuration at startup.

    Returns:
        Combined JSONSchema with all plugin properties merged.
    """
    plugin_configs = discover_plugin_configs()

    merged_properties = {}
    for plugin_name, schema in plugin_configs.items():
        properties = schema.get('properties', {})
        for key, prop_schema in properties.items():
            if key in merged_properties:
                # Key already exists from another plugin - log a warning but keep the first definition
                import sys
                print(f"Warning: Config key '{key}' defined in multiple plugins, using first definition", file=sys.stderr)
                continue
            merged_properties[key] = prop_schema

    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "additionalProperties": True,  # Allow unknown keys (core config, etc.)
        "properties": merged_properties,
    }


def get_config_defaults_from_plugins() -> Dict[str, Any]:
    """
    Get default values for all plugin config options.

    Returns:
        Dict mapping config keys to their default values,
        e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
    """
    plugin_configs = discover_plugin_configs()
    defaults = {}

    for plugin_name, schema in plugin_configs.items():
        properties = schema.get('properties', {})
        for key, prop_schema in properties.items():
            if 'default' in prop_schema:
                defaults[key] = prop_schema['default']

    return defaults


def resolve_config_value(
    key: str,
    prop_schema: Dict[str, Any],
    env_vars: Dict[str, str],
    config_file: Dict[str, str],
    overrides: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Resolve a single config value following the hierarchy and schema rules.

    Resolution order (later overrides earlier):
        1. Schema default
        2. x-fallback (global config key)
        3. Config file (ArchiveBox.conf)
        4. Environment variables (including x-aliases)
        5. Explicit overrides (User/Crawl/Snapshot config)

    Args:
        key: Config key name (e.g., 'WGET_TIMEOUT')
        prop_schema: JSONSchema property definition for this key
        env_vars: Environment variables dict
        config_file: Config file values dict
        overrides: Optional override values (from User/Crawl/Snapshot)

    Returns:
        Resolved value with appropriate type coercion.
    """
    value = None
    prop_type = prop_schema.get('type', 'string')

    # 1. Start with the schema default
    if 'default' in prop_schema:
        value = prop_schema['default']

    # 2. Check x-fallback (global config key); env vars beat the config file here
    fallback_key = prop_schema.get('x-fallback')
    if fallback_key:
        if fallback_key in env_vars:
            value = env_vars[fallback_key]
        elif fallback_key in config_file:
            value = config_file[fallback_key]

    # 3. Check the config file for the main key
    if key in config_file:
        value = config_file[key]

    # 4. Check environment variables (main key and any x-aliases)
    keys_to_check = [key] + prop_schema.get('x-aliases', [])
    for check_key in keys_to_check:
        if check_key in env_vars:
            value = env_vars[check_key]
            break

    # 5. Apply explicit overrides
    if overrides and key in overrides:
        value = overrides[key]

    # Type coercion for env var strings
    if value is not None and isinstance(value, str):
        value = coerce_config_value(value, prop_type, prop_schema)

    return value


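# Example: resolving WGET_TIMEOUT step by step (a sketch; the schema and
# values shown are illustrative):
#
#   prop_schema = {'type': 'integer', 'default': 60, 'x-fallback': 'TIMEOUT'}
#   resolve_config_value(
#       'WGET_TIMEOUT', prop_schema,
#       env_vars={'TIMEOUT': '120'},         # step 2: x-fallback -> '120'
#       config_file={'WGET_TIMEOUT': '90'},  # step 3: main key wins -> '90'
#   )
#   # -> 90 (coerced from '90' to int by coerce_config_value)

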
def coerce_config_value(value: str, prop_type: str, prop_schema: Dict[str, Any]) -> Any:
    """
    Coerce a string value to the appropriate type based on the schema.

    Args:
        value: String value to coerce
        prop_type: JSONSchema type ('boolean', 'integer', 'number', 'array', 'string')
        prop_schema: Full property schema (for array item types, defaults, etc.)

    Returns:
        Coerced value of the appropriate type.
    """
    if prop_type == 'boolean':
        return value.lower() in ('true', '1', 'yes', 'on')
    elif prop_type == 'integer':
        try:
            return int(value)
        except ValueError:
            return prop_schema.get('default', 0)
    elif prop_type == 'number':
        try:
            return float(value)
        except ValueError:
            return prop_schema.get('default', 0.0)
    elif prop_type == 'array':
        # Try JSON parse first, fall back to comma-separated
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return [v.strip() for v in value.split(',') if v.strip()]
    else:
        return value


def get_flat_plugin_config(
    env_vars: Optional[Dict[str, str]] = None,
    config_file: Optional[Dict[str, str]] = None,
    overrides: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Get all plugin config values resolved according to the hierarchy.

    This is the main entry point for getting plugin configuration:
    it discovers all plugin schemas and resolves each config key.

    Args:
        env_vars: Environment variables (defaults to os.environ)
        config_file: Config file values (from ArchiveBox.conf)
        overrides: Override values (from User/Crawl/Snapshot config fields)

    Returns:
        Flat dict of all resolved config values,
        e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
    """
    if env_vars is None:
        env_vars = dict(os.environ)
    if config_file is None:
        config_file = {}

    plugin_configs = discover_plugin_configs()
    flat_config = {}

    for plugin_name, schema in plugin_configs.items():
        properties = schema.get('properties', {})
        for key, prop_schema in properties.items():
            flat_config[key] = resolve_config_value(
                key, prop_schema, env_vars, config_file, overrides
            )

    return flat_config


def export_plugin_config_to_env(
    config: Dict[str, Any],
    env: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """
    Export plugin config values to environment variable format.

    Converts all values to strings suitable for a subprocess environment.
    Arrays and objects are JSON-encoded.

    Args:
        config: Flat config dict from get_flat_plugin_config()
        env: Optional existing env dict to update (creates a new one if None)

    Returns:
        Environment dict with config values as strings.
    """
    if env is None:
        env = {}

    for key, value in config.items():
        if value is None:
            continue
        elif isinstance(value, bool):
            env[key] = 'true' if value else 'false'
        elif isinstance(value, (list, dict)):
            env[key] = json.dumps(value)
        else:
            env[key] = str(value)

    return env


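# Example: the full config pipeline that run_hook() uses (a sketch; the
# override key shown is illustrative and depends on installed plugin schemas):
#
#   flat = get_flat_plugin_config(overrides={'WGET_TIMEOUT': 30})
#   env = export_plugin_config_to_env(flat, env=os.environ.copy())
#   # env now holds only string values, e.g. env['WGET_TIMEOUT'] == '30'

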
# =============================================================================
# Plugin Template Discovery
# =============================================================================
#
# Plugins can provide custom templates for rendering their output in the UI.
# Templates are discovered by filename convention inside each plugin's templates/ dir:
#
#   archivebox/plugins/<plugin_name>/
#       templates/
#           icon.html        # Icon for admin table view (small inline HTML)
#           thumbnail.html   # Preview thumbnail for snapshot cards
#           embed.html       # Iframe embed content for main preview
#           fullscreen.html  # Fullscreen view template
#
# Template context variables available:
#   {{ result }}       - ArchiveResult object
#   {{ snapshot }}     - Parent Snapshot object
#   {{ output_path }}  - Path to output file/dir relative to snapshot dir
#   {{ plugin }}       - Plugin name (e.g., 'screenshot', 'singlefile')
#

# Default templates used when a plugin doesn't provide its own
DEFAULT_TEMPLATES = {
    'icon': '''<span title="{{ plugin }}">{{ icon }}</span>''',
    'thumbnail': '''
        <img src="{{ output_path }}"
             alt="{{ plugin }} output"
             style="max-width: 100%; max-height: 100px; object-fit: cover;"
             onerror="this.style.display='none'">
    ''',
    'embed': '''
        <iframe src="{{ output_path }}"
                style="width: 100%; height: 100%; border: none;"
                sandbox="allow-same-origin allow-scripts">
        </iframe>
    ''',
    'fullscreen': '''
        <iframe src="{{ output_path }}"
                style="width: 100%; height: 100vh; border: none;"
                sandbox="allow-same-origin allow-scripts allow-forms">
        </iframe>
    ''',
}


# Default icons for known extractor plugins (emoji or short HTML)
DEFAULT_PLUGIN_ICONS = {
    'screenshot': '📷',
    'pdf': '📄',
    'singlefile': '📦',
    'dom': '🌐',
    'wget': '📥',
    'media': '🎬',
    'git': '📂',
    'readability': '📖',
    'mercury': '☿️',
    'favicon': '⭐',
    'title': '📝',
    'headers': '📋',
    'archive_org': '🏛️',
    'htmltotext': '📃',
    'warc': '🗄️',
}


def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> Optional[str]:
    """
    Get a plugin template by plugin name and template type.

    Args:
        plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
        template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
        fallback: If True, return the default template when no plugin template is found

    Returns:
        Template content as a string, or None if not found and fallback=False.
    """
    base_name = get_plugin_name(plugin)

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        # Look for a plugin directory matching the plugin name
        for plugin_dir in base_dir.iterdir():
            if not plugin_dir.is_dir():
                continue

            # Match by directory name (exact, or with a numeric prefix)
            if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'):
                template_path = plugin_dir / 'templates' / f'{template_name}.html'
                if template_path.exists():
                    return template_path.read_text()

    # Fall back to the default template if requested
    if fallback:
        return DEFAULT_TEMPLATES.get(template_name, '')

    return None


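# Example: rendering a plugin template with Django's template engine
# (a sketch; the context values shown are illustrative):
#
#   from django.template import Context, Template
#
#   html = Template(get_plugin_template('screenshot', 'thumbnail')).render(
#       Context({'output_path': 'screenshot/screenshot.png', 'plugin': 'screenshot'})
#   )

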
def get_plugin_icon(plugin: str) -> str:
    """
    Get the icon for a plugin.

    First checks for a plugin-provided icon.html template,
    then falls back to DEFAULT_PLUGIN_ICONS.

    Args:
        plugin: Plugin name (e.g., 'screenshot', '15_singlefile')

    Returns:
        Icon HTML/emoji string.
    """
    base_name = get_plugin_name(plugin)

    # Try the plugin-provided icon template
    icon_template = get_plugin_template(plugin, 'icon', fallback=False)
    if icon_template:
        return icon_template.strip()

    # Fall back to the default icon
    return DEFAULT_PLUGIN_ICONS.get(base_name, '📁')


def get_all_plugin_icons() -> Dict[str, str]:
    """
    Get icons for all discovered plugins.

    Returns:
        Dict mapping plugin base names to their icons.
    """
    icons = {}
    for plugin in get_plugins():
        base_name = get_plugin_name(plugin)
        icons[base_name] = get_plugin_icon(plugin)
    return icons


def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
    """
    Discover all plugin templates, organized by plugin.

    Returns:
        Dict mapping plugin names to dicts of template_name -> template_path,
        e.g., {'screenshot': {'icon': '/path/to/icon.html', 'thumbnail': '/path/to/thumbnail.html'}}
    """
    templates: Dict[str, Dict[str, str]] = {}

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for plugin_dir in base_dir.iterdir():
            if not plugin_dir.is_dir():
                continue

            templates_dir = plugin_dir / 'templates'
            if not templates_dir.exists():
                continue

            plugin_templates = {}
            for template_file in templates_dir.glob('*.html'):
                template_name = template_file.stem  # icon, thumbnail, embed, fullscreen
                plugin_templates[template_name] = str(template_file)

            if plugin_templates:
                templates[plugin_dir.name] = plugin_templates

    return templates


# =============================================================================
# Hook Result Processing Helpers
# =============================================================================


def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
    """
    Find the Binary record for a command, trying its abspath first, then its name.
    Only matches binaries on the current machine.

    Args:
        cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
        machine_id: Current machine ID

    Returns:
        Binary ID as a string if found, None otherwise.
    """
    if not cmd:
        return None

    from machine.models import Binary

    bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd

    # Try matching by absolute path first
    binary = Binary.objects.filter(
        abspath=bin_path_or_name,
        machine_id=machine_id,
    ).first()

    if binary:
        return str(binary.id)

    # Fallback: match by binary name
    bin_name = Path(bin_path_or_name).name
    binary = Binary.objects.filter(
        name=bin_name,
        machine_id=machine_id,
    ).first()

    return str(binary.id) if binary else None


def create_model_record(record: Dict[str, Any]) -> Any:
    """
    Generic helper to create/update model instances from hook JSONL output.

    Args:
        record: Dict with a 'type' field and model data

    Returns:
        Created/updated model instance, or None if the type is unknown.
    """
    from machine.models import Binary, Machine

    record_type = record.pop('type', None)
    if not record_type:
        return None

    # Remove plugin metadata added by run_hook() (not model fields)
    record.pop('plugin', None)
    record.pop('hook_name', None)
    record.pop('plugin_hook', None)

    if record_type == 'Binary':
        # Binary requires a machine FK
        machine = Machine.current()
        record.setdefault('machine', machine)

        # Required fields check
        name = record.get('name')
        abspath = record.get('abspath')
        if not name or not abspath:
            return None

        obj, created = Binary.objects.update_or_create(
            machine=machine,
            name=name,
            defaults={
                'abspath': abspath,
                'version': record.get('version', ''),
                'sha256': record.get('sha256', ''),
                'binprovider': record.get('binprovider', 'env'),
            },
        )
        return obj

    elif record_type == 'Machine':
        # Machine config update (special _method handling)
        method = record.pop('_method', None)
        if method == 'update':
            key = record.get('key')
            value = record.get('value')
            if key and value is not None:  # allow falsy-but-valid values like 0 or False
                machine = Machine.current()
                if not machine.config:
                    machine.config = {}
                machine.config[key] = value
                machine.save(update_fields=['config'])
                return machine
        return None

    # Add more types as needed (Dependency, Snapshot, etc.)
    else:
        # Unknown type - log a warning but don't fail
        import sys
        print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
        return None


def process_hook_records(records: List[Dict[str, Any]], overrides: Optional[Dict[str, Any]] = None) -> Dict[str, int]:
    """
    Process JSONL records from hook output.
    Dispatches to Model.from_jsonl() for each record type.

    Args:
        records: List of JSONL record dicts from result['records']
        overrides: Dict with 'snapshot', 'crawl', 'dependency', 'created_by_id', etc.

    Returns:
        Dict with counts by record type.
    """
    stats = {}
    overrides = overrides or {}

    for record in records:
        record_type = record.get('type')
        if not record_type:
            continue

        # Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones)
        if record_type == 'ArchiveResult':
            continue

        try:
            # Dispatch to the appropriate model's from_jsonl() method
            if record_type == 'Snapshot':
                from core.models import Snapshot
                obj = Snapshot.from_jsonl(record.copy(), overrides)
                if obj:
                    stats['Snapshot'] = stats.get('Snapshot', 0) + 1

            elif record_type == 'Tag':
                from core.models import Tag
                obj = Tag.from_jsonl(record.copy(), overrides)
                if obj:
                    stats['Tag'] = stats.get('Tag', 0) + 1

            elif record_type == 'Binary':
                from machine.models import Binary
                obj = Binary.from_jsonl(record.copy(), overrides)
                if obj:
                    stats['Binary'] = stats.get('Binary', 0) + 1

            elif record_type == 'Machine':
                from machine.models import Machine
                obj = Machine.from_jsonl(record.copy(), overrides)
                if obj:
                    stats['Machine'] = stats.get('Machine', 0) + 1

            else:
                import sys
                print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)

        except Exception as e:
            import sys
            print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr)
            continue

    return stats


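# Example: persisting a hook's parsed records (a sketch; assumes the hook
# emitted JSONL records and the models' from_jsonl() handlers exist as above):
#
#   result = run_hook(script, output_dir, url='https://example.com')
#   if result is not None and result['returncode'] == 0:
#       stats = process_hook_records(result['records'])
#       # e.g. stats == {'Snapshot': 2, 'Binary': 1}

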
def process_is_alive(pid_file: Path) -> bool:
    """
    Check whether the process recorded in a PID file is still running.

    Args:
        pid_file: Path to a hook.pid file

    Returns:
        True if the process is alive, False otherwise.
    """
    if not pid_file.exists():
        return False

    try:
        pid = int(pid_file.read_text().strip())
        os.kill(pid, 0)  # Signal 0 = existence check, doesn't actually kill
        return True
    except ProcessLookupError:
        return False
    except PermissionError:
        return True  # Process exists but is owned by another user
    except (OSError, ValueError):
        return False


def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = True) -> None:
    """
    Kill the process recorded in a PID file, with optional identity validation.

    Args:
        pid_file: Path to a hook.pid file
        sig: Signal to send (default SIGTERM)
        validate: If True, validate the process identity before killing (default: True)
    """
    from archivebox.misc.process_utils import safe_kill_process

    if validate:
        # Use safe kill with validation against the recorded cmd.sh
        cmd_file = pid_file.parent / 'cmd.sh'
        safe_kill_process(pid_file, cmd_file, signal_num=sig)
    else:
        # Legacy behavior - kill without validation
        if not pid_file.exists():
            return
        try:
            pid = int(pid_file.read_text().strip())
            os.kill(pid, sig)
        except (OSError, ValueError):
            pass


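# Example: supervising a background (.bg.) hook (a sketch; output_dir is
# whatever directory run_hook() was given for that hook):
#
#   pid_file = output_dir / 'hook.pid'
#   if process_is_alive(pid_file):
#       kill_process(pid_file)                      # graceful SIGTERM, validated
#   if process_is_alive(pid_file):
#       kill_process(pid_file, sig=signal.SIGKILL)  # force-kill if still running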