This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -146,11 +146,16 @@ class HookResult(TypedDict, total=False):
records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field
def discover_hooks(event_name: str) -> List[Path]:
def discover_hooks(
event_name: str,
filter_disabled: bool = True,
config: Optional[Dict[str, Any]] = None
) -> List[Path]:
"""
Find all hook scripts matching on_{event_name}__*.{sh,py,js} pattern.
Searches both built-in and user plugin directories.
Filters out hooks from disabled plugins by default (respects USE_/SAVE_ flags).
Returns scripts sorted alphabetically by filename for deterministic execution order.
Hook naming convention uses numeric prefixes to control order:
@@ -158,9 +163,29 @@ def discover_hooks(event_name: str) -> List[Path]:
on_Snapshot__15_singlefile.py # runs second
on_Snapshot__26_readability.py # runs later (depends on singlefile)
Example:
Args:
event_name: Event name (e.g., 'Snapshot', 'Binary', 'Crawl')
filter_disabled: If True, skip hooks from disabled plugins (default: True)
config: Optional config dict from get_config() (merges file, env, machine, crawl, snapshot)
If None, will call get_config() with global scope
Returns:
Sorted list of hook script paths from enabled plugins only.
Examples:
# With proper config context (recommended):
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
discover_hooks('Snapshot', config=config)
# Returns: [Path('.../on_Snapshot__10_title.py'), ...] (wget excluded if SAVE_WGET=False)
# Without config (uses global defaults):
discover_hooks('Snapshot')
# Returns: [Path('.../on_Snapshot__10_title.py'), Path('.../on_Snapshot__15_singlefile.py'), ...]
# Returns: [Path('.../on_Snapshot__10_title.py'), ...]
# Show all plugins regardless of enabled status:
discover_hooks('Snapshot', filter_disabled=False)
# Returns: [Path('.../on_Snapshot__10_title.py'), ..., Path('.../on_Snapshot__50_wget.py')]
"""
hooks = []
@@ -177,45 +202,44 @@ def discover_hooks(event_name: str) -> List[Path]:
pattern_direct = f'on_{event_name}__*.{ext}'
hooks.extend(base_dir.glob(pattern_direct))
# Filter by enabled plugins
if filter_disabled:
# Get merged config if not provided (lazy import to avoid circular dependency)
if config is None:
from archivebox.config.configset import get_config
config = get_config(scope='global')
enabled_hooks = []
for hook in hooks:
# Get plugin name from parent directory
# e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
plugin_name = hook.parent.name
# Check if this is a plugin directory (not the root plugins dir)
if plugin_name in ('plugins', '.'):
# Hook is in root plugins directory, not a plugin subdir
# Include it by default (no filtering for non-plugin hooks)
enabled_hooks.append(hook)
continue
# Check if plugin is enabled
plugin_config = get_plugin_special_config(plugin_name, config)
if plugin_config['enabled']:
enabled_hooks.append(hook)
hooks = enabled_hooks
# Sort by filename (not full path) to ensure numeric prefix ordering works
# e.g., on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
return sorted(set(hooks), key=lambda p: p.name)
def discover_all_hooks() -> Dict[str, List[Path]]:
    """
    Discover all hooks organized by event name.

    Scans every plugin subdirectory under the built-in and user plugin
    directories for scripts named ``on_{EventName}__{hook_name}.{sh,py,js}``.

    Returns:
        Dict mapping event names (e.g. 'Snapshot') to lists of hook script
        paths, deduplicated and sorted by filename so numeric prefixes
        control execution order.
    """
    by_event: Dict[str, List[Path]] = {}
    search_dirs = [d for d in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR) if d.exists()]
    for plugins_dir in search_dirs:
        for extension in ('sh', 'py', 'js'):
            for script in plugins_dir.glob(f'*/on_*__*.{extension}'):
                stem = script.stem  # e.g. on_Snapshot__10_title
                if not stem.startswith('on_') or '__' not in stem:
                    continue
                event = stem[3:].split('__')[0]  # strip 'on_' prefix, keep EventName
                by_event.setdefault(event, []).append(script)
    # Dedupe and sort within each event by filename (not full path) so the
    # numeric prefix ordering convention works across plugin directories.
    return {
        event: sorted(set(paths), key=lambda p: p.name)
        for event, paths in by_event.items()
    }
def run_hook(
script: Path,
output_dir: Path,
timeout: int = 300,
config_objects: Optional[List[Any]] = None,
config: Dict[str, Any],
timeout: Optional[int] = None,
**kwargs: Any
) -> HookResult:
"""
@@ -224,31 +248,33 @@ def run_hook(
This is the low-level hook executor. For running extractors with proper
metadata handling, use call_extractor() instead.
Config is passed to hooks via environment variables with this priority:
1. Plugin schema defaults (config.json)
2. Config file (ArchiveBox.conf)
3. Environment variables
4. Machine.config (auto-included, lowest override priority)
5. config_objects (in order - later objects override earlier ones)
Config is passed to hooks via environment variables. Caller MUST use
get_config() to merge all sources (file, env, machine, crawl, snapshot).
Args:
script: Path to the hook script (.sh, .py, or .js)
output_dir: Working directory for the script (where output files go)
config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
timeout: Maximum execution time in seconds
config_objects: Optional list of objects with .config JSON fields
(e.g., [crawl, snapshot] - later items have higher priority)
If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
**kwargs: Arguments passed to the script as --key=value
Returns:
HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms'
Example:
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
"""
import time
start_time = time.time()
# Auto-include Machine.config at the start (lowest priority among config_objects)
from machine.models import Machine
machine = Machine.current()
all_config_objects = [machine] + list(config_objects or [])
# Auto-detect timeout from plugin config if not explicitly provided
if timeout is None:
plugin_name = script.parent.name
plugin_config = get_plugin_special_config(plugin_name, config)
timeout = plugin_config['timeout']
if not script.exists():
return HookResult(
@@ -302,51 +328,16 @@ def run_hook(
env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
# If a Crawl is in config_objects, pass its OUTPUT_DIR for hooks that need to find crawl-level resources
for obj in all_config_objects:
if hasattr(obj, 'OUTPUT_DIR') and hasattr(obj, 'get_urls_list'): # Duck-type check for Crawl
env['CRAWL_OUTPUT_DIR'] = str(obj.OUTPUT_DIR)
break
# Build overrides from any objects with .config fields (in order, later overrides earlier)
# all_config_objects includes Machine at the start, then any passed config_objects
overrides = {}
for obj in all_config_objects:
if obj and hasattr(obj, 'config') and obj.config:
# Strip 'config/' prefix from Machine.config keys (e.g., 'config/CHROME_BINARY' -> 'CHROME_BINARY')
for key, value in obj.config.items():
clean_key = key.removeprefix('config/')
overrides[clean_key] = value
# Get plugin config from JSON schemas with hierarchy resolution
# This merges: schema defaults -> config file -> env vars -> object config overrides
plugin_config = get_flat_plugin_config(overrides=overrides if overrides else None)
export_plugin_config_to_env(plugin_config, env)
# Also pass core config values that aren't in plugin schemas yet
# These are legacy values that may still be needed
from archivebox import config
env.setdefault('CHROME_BINARY', str(getattr(config, 'CHROME_BINARY', '')))
env.setdefault('WGET_BINARY', str(getattr(config, 'WGET_BINARY', '')))
env.setdefault('CURL_BINARY', str(getattr(config, 'CURL_BINARY', '')))
env.setdefault('GIT_BINARY', str(getattr(config, 'GIT_BINARY', '')))
env.setdefault('YOUTUBEDL_BINARY', str(getattr(config, 'YOUTUBEDL_BINARY', '')))
env.setdefault('SINGLEFILE_BINARY', str(getattr(config, 'SINGLEFILE_BINARY', '')))
env.setdefault('READABILITY_BINARY', str(getattr(config, 'READABILITY_BINARY', '')))
env.setdefault('MERCURY_BINARY', str(getattr(config, 'MERCURY_BINARY', '')))
env.setdefault('NODE_BINARY', str(getattr(config, 'NODE_BINARY', '')))
env.setdefault('TIMEOUT', str(getattr(config, 'TIMEOUT', 60)))
env.setdefault('CHECK_SSL_VALIDITY', str(getattr(config, 'CHECK_SSL_VALIDITY', True)))
env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))
# Pass SEARCH_BACKEND_ENGINE from new-style config
try:
from archivebox.config.configset import get_config
search_config = get_config()
env.setdefault('SEARCH_BACKEND_ENGINE', str(search_config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')))
except Exception:
env.setdefault('SEARCH_BACKEND_ENGINE', 'ripgrep')
# Export all config values to environment (already merged by get_config())
for key, value in config.items():
if value is None:
continue
elif isinstance(value, bool):
env[key] = 'true' if value else 'false'
elif isinstance(value, (list, dict)):
env[key] = json.dumps(value)
else:
env[key] = str(value)
# Create output directory if needed
output_dir.mkdir(parents=True, exist_ok=True)
@@ -525,31 +516,35 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
def run_hooks(
event_name: str,
output_dir: Path,
timeout: int = 300,
config: Dict[str, Any],
timeout: Optional[int] = None,
stop_on_failure: bool = False,
config_objects: Optional[List[Any]] = None,
**kwargs: Any
) -> List[HookResult]:
"""
Run all hooks for a given event.
Args:
event_name: The event name to trigger (e.g., 'Snapshot__wget')
event_name: The event name to trigger (e.g., 'Snapshot', 'Crawl', 'Binary')
output_dir: Working directory for hook scripts
timeout: Maximum execution time per hook
config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
timeout: Maximum execution time per hook (None = auto-detect from plugin config)
stop_on_failure: If True, stop executing hooks after first failure
config_objects: Optional list of objects with .config JSON fields
(e.g., [crawl, snapshot] - later items have higher priority)
**kwargs: Arguments passed to each hook script
Returns:
List of results from each hook execution
Example:
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
results = run_hooks('Snapshot', output_dir, config=config, url=url, snapshot_id=id)
"""
hooks = discover_hooks(event_name)
hooks = discover_hooks(event_name, config=config)
results = []
for hook in hooks:
result = run_hook(hook, output_dir, timeout=timeout, config_objects=config_objects, **kwargs)
result = run_hook(hook, output_dir, config=config, timeout=timeout, **kwargs)
# Background hooks return None - skip adding to results
if result is None:
@@ -638,24 +633,44 @@ EXTRACTOR_INDEXING_PRECEDENCE = [
]
def get_enabled_plugins(config: Optional[Dict] = None) -> List[str]:
def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
"""
Get the list of enabled plugins based on config and available hooks.
Checks for ENABLED_PLUGINS (or legacy ENABLED_EXTRACTORS) in config,
falls back to discovering available hooks from the plugins directory.
Filters plugins by USE_/SAVE_ flags. Only returns plugins that are enabled.
Returns plugin names sorted alphabetically (numeric prefix controls order).
Args:
config: Merged config dict from get_config() - if None, uses global config
Returns:
Plugin names sorted alphabetically (numeric prefix controls order).
Example:
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
enabled = get_enabled_plugins(config) # ['wget', 'media', 'chrome', ...]
"""
if config:
# Support both new and legacy config keys
if 'ENABLED_PLUGINS' in config:
return config['ENABLED_PLUGINS']
if 'ENABLED_EXTRACTORS' in config:
return config['ENABLED_EXTRACTORS']
# Get merged config if not provided
if config is None:
from archivebox.config.configset import get_config
config = get_config(scope='global')
# Discover from hooks - this is the source of truth
return get_plugins()
# Support explicit ENABLED_PLUGINS override (legacy)
if 'ENABLED_PLUGINS' in config:
return config['ENABLED_PLUGINS']
if 'ENABLED_EXTRACTORS' in config:
return config['ENABLED_EXTRACTORS']
# Filter all plugins by enabled status
all_plugins = get_plugins()
enabled = []
for plugin in all_plugins:
plugin_config = get_plugin_special_config(plugin, config)
if plugin_config['enabled']:
enabled.append(plugin)
return enabled
def discover_plugins_that_provide_interface(
@@ -822,37 +837,6 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
return configs
def get_merged_config_schema() -> Dict[str, Any]:
    """
    Get a merged JSONSchema combining all plugin config schemas.

    This creates a single schema that can validate all plugin config keys.
    Useful for validating the complete configuration at startup.

    When the same key is defined by multiple plugins, the first definition
    encountered wins and a warning is printed to stderr for each collision.

    Returns:
        Combined JSONSchema with all plugin properties merged.
    """
    import sys  # hoisted: previously re-imported inside the loop on every collision

    plugin_configs = discover_plugin_configs()
    merged_properties: Dict[str, Any] = {}
    for plugin_name, schema in plugin_configs.items():
        properties = schema.get('properties', {})
        for key, prop_schema in properties.items():
            if key in merged_properties:
                # Key already exists from another plugin - keep first definition
                print(
                    f"Warning: Config key '{key}' defined in multiple plugins, using first definition",
                    file=sys.stderr,
                )
                continue
            merged_properties[key] = prop_schema
    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "additionalProperties": True,  # Allow unknown keys (core config, etc.)
        "properties": merged_properties,
    }
def get_config_defaults_from_plugins() -> Dict[str, Any]:
"""
Get default values for all plugin config options.
@@ -873,173 +857,63 @@ def get_config_defaults_from_plugins() -> Dict[str, Any]:
return defaults
def resolve_config_value(
key: str,
prop_schema: Dict[str, Any],
env_vars: Dict[str, str],
config_file: Dict[str, str],
overrides: Optional[Dict[str, Any]] = None,
) -> Any:
def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
"""
Resolve a single config value following the hierarchy and schema rules.
Extract special config keys for a plugin following naming conventions.
Resolution order (later overrides earlier):
1. Schema default
2. x-fallback (global config key)
3. Config file (ArchiveBox.conf)
4. Environment variables (including x-aliases)
5. Explicit overrides (User/Crawl/Snapshot config)
ArchiveBox recognizes 3 special config key patterns per plugin:
- {PLUGIN}_ENABLED: Enable/disable toggle (default True)
- {PLUGIN}_TIMEOUT: Plugin-specific timeout (fallback to TIMEOUT, default 300)
- {PLUGIN}_BINARY: Primary binary path (default to plugin_name)
These allow ArchiveBox to:
- Skip disabled plugins (optimization)
- Enforce plugin-specific timeouts automatically
- Discover plugin binaries for validation
Args:
key: Config key name (e.g., 'WGET_TIMEOUT')
prop_schema: JSONSchema property definition for this key
env_vars: Environment variables dict
config_file: Config file values dict
overrides: Optional override values (from User/Crawl/Snapshot)
plugin_name: Plugin name (e.g., 'wget', 'media', 'chrome')
config: Merged config dict from get_config() (properly merges file, env, machine, crawl, snapshot)
Returns:
Resolved value with appropriate type coercion.
Dict with standardized keys:
{
'enabled': True, # bool
'timeout': 60, # int, seconds
'binary': 'wget', # str, path or name
}
Examples:
>>> from archivebox.config.configset import get_config
>>> config = get_config(crawl=my_crawl, snapshot=my_snapshot)
>>> get_plugin_special_config('wget', config)
{'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
"""
value = None
prop_type = prop_schema.get('type', 'string')
plugin_upper = plugin_name.upper()
# 1. Start with schema default
if 'default' in prop_schema:
value = prop_schema['default']
# 1. Enabled: PLUGINNAME_ENABLED (default True)
# Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:
enabled = True
elif isinstance(enabled, str):
# Handle string values from config file ("true"/"false")
enabled = enabled.lower() not in ('false', '0', 'no', '')
# 2. Check x-fallback (global config key)
fallback_key = prop_schema.get('x-fallback')
if fallback_key:
if fallback_key in env_vars:
value = env_vars[fallback_key]
elif fallback_key in config_file:
value = config_file[fallback_key]
# 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
timeout_key = f'{plugin_upper}_TIMEOUT'
timeout = config.get(timeout_key) or config.get('TIMEOUT', 300)
# 3. Check config file for main key
if key in config_file:
value = config_file[key]
# 3. Binary: PLUGINNAME_BINARY (default to plugin_name)
binary_key = f'{plugin_upper}_BINARY'
binary = config.get(binary_key, plugin_name)
# 4. Check environment variables (main key and aliases)
keys_to_check = [key] + prop_schema.get('x-aliases', [])
for check_key in keys_to_check:
if check_key in env_vars:
value = env_vars[check_key]
break
# 5. Apply explicit overrides
if overrides and key in overrides:
value = overrides[key]
# Type coercion for env var strings
if value is not None and isinstance(value, str):
value = coerce_config_value(value, prop_type, prop_schema)
return value
def coerce_config_value(value: str, prop_type: str, prop_schema: Dict[str, Any]) -> Any:
    """
    Coerce a string value to the appropriate type based on schema.

    Args:
        value: String value to coerce
        prop_type: JSONSchema type ('boolean', 'integer', 'number', 'array', 'string')
        prop_schema: Full property schema (used for fallback defaults on parse failure)

    Returns:
        Coerced value of appropriate type; unrecognized types pass through unchanged.
    """
    if prop_type == 'boolean':
        # Accept the common truthy spellings case-insensitively
        return value.lower() in ('true', '1', 'yes', 'on')

    if prop_type in ('integer', 'number'):
        caster = int if prop_type == 'integer' else float
        zero = 0 if prop_type == 'integer' else 0.0
        try:
            return caster(value)
        except ValueError:
            # Unparseable numeric - fall back to the schema default
            return prop_schema.get('default', zero)

    if prop_type == 'array':
        # Prefer JSON syntax; otherwise treat as a comma-separated list
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return [item.strip() for item in value.split(',') if item.strip()]

    return value
def get_flat_plugin_config(
    env_vars: Optional[Dict[str, str]] = None,
    config_file: Optional[Dict[str, str]] = None,
    overrides: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Get all plugin config values resolved according to hierarchy.

    This is the main function for getting plugin configuration: it discovers
    all plugin schemas and resolves each config key via resolve_config_value().

    Args:
        env_vars: Environment variables (defaults to a snapshot of os.environ)
        config_file: Config file values (from ArchiveBox.conf)
        overrides: Override values (from User/Crawl/Snapshot config fields)

    Returns:
        Flat dict of all resolved config values,
        e.g. {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
    """
    effective_env = dict(os.environ) if env_vars is None else env_vars
    effective_file = {} if config_file is None else config_file

    resolved: Dict[str, Any] = {}
    for _plugin_name, schema in discover_plugin_configs().items():
        for key, prop_schema in schema.get('properties', {}).items():
            resolved[key] = resolve_config_value(
                key, prop_schema, effective_env, effective_file, overrides
            )
    return resolved
def export_plugin_config_to_env(
    config: Dict[str, Any],
    env: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """
    Export plugin config values to environment variable format.

    Converts all values to strings suitable for a subprocess environment:
    booleans become 'true'/'false', lists/dicts are JSON-encoded, None
    values are skipped entirely, everything else is str()-ified.

    Args:
        config: Flat config dict from get_flat_plugin_config()
        env: Optional existing env dict to update in place (creates new if None)

    Returns:
        Environment dict with config values as strings.
    """
    target = {} if env is None else env

    def _stringify(value: Any) -> str:
        # bool must be checked before the generic str() branch
        if isinstance(value, bool):
            return 'true' if value else 'false'
        if isinstance(value, (list, dict)):
            return json.dumps(value)
        return str(value)

    target.update({
        key: _stringify(value)
        for key, value in config.items()
        if value is not None
    })
    return target
return {
'enabled': bool(enabled),
'timeout': int(timeout),
'binary': str(binary),
}
# =============================================================================
@@ -1233,7 +1107,7 @@ def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
if not cmd:
return None
from machine.models import Binary
from archivebox.machine.models import Binary
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
@@ -1266,7 +1140,7 @@ def create_model_record(record: Dict[str, Any]) -> Any:
Returns:
Created/updated model instance, or None if type unknown
"""
from machine.models import Binary, Machine
from archivebox.machine.models import Binary, Machine
record_type = record.pop('type', None)
if not record_type:
@@ -1349,25 +1223,25 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
try:
# Dispatch to appropriate model's from_jsonl() method
if record_type == 'Snapshot':
from core.models import Snapshot
from archivebox.core.models import Snapshot
obj = Snapshot.from_jsonl(record.copy(), overrides)
if obj:
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
elif record_type == 'Tag':
from core.models import Tag
from archivebox.core.models import Tag
obj = Tag.from_jsonl(record.copy(), overrides)
if obj:
stats['Tag'] = stats.get('Tag', 0) + 1
elif record_type == 'Binary':
from machine.models import Binary
from archivebox.machine.models import Binary
obj = Binary.from_jsonl(record.copy(), overrides)
if obj:
stats['Binary'] = stats.get('Binary', 0) + 1
elif record_type == 'Machine':
from machine.models import Machine
from archivebox.machine.models import Machine
obj = Machine.from_jsonl(record.copy(), overrides)
if obj:
stats['Machine'] = stats.get('Machine', 0) + 1