Files
ArchiveBox/archivebox/hooks.py
2026-03-25 05:36:07 -07:00

1140 lines
41 KiB
Python

"""
Hook discovery and execution helpers for ArchiveBox plugins.
ArchiveBox no longer drives plugin execution itself during normal crawls.
`abx-dl` owns the live runtime and emits typed bus events; ArchiveBox mainly:
- discovers hook files for inspection / docs / legacy direct execution helpers
- executes individual hook scripts when explicitly requested
- parses hook stdout JSONL records into ArchiveBox models when needed
Hook-backed event families are discovered from filenames like:
on_BinaryRequest__*
on_CrawlSetup__*
on_Snapshot__*
Internal bus event names are normalized to the corresponding
`on_{EventFamily}__*` prefix by a simple string transform. If no scripts exist
for that prefix, discovery returns `[]`.
Directory structure:
abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package)
data/custom_plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (user)
Hook contract:
Input: --url=<url> (and other --key=value args)
Output: JSONL records to stdout, files to $PWD
Exit: 0 = success, non-zero = failure
Execution order:
- Hooks are named with two-digit prefixes (00-99) and sorted lexicographically by filename
- Foreground hooks run sequentially in that order
- Background hooks (.bg suffix) run concurrently and do not block foreground progress
- After all foreground hooks complete, background hooks receive SIGTERM and must finalize
Hook naming convention:
on_{EventFamily}__{run_order}_{description}[.finite.bg|.daemon.bg].{ext}
API:
discover_hooks(event) -> List[Path] Find hook scripts for a hook-backed event family
run_hook(script, ...) -> Process Execute a hook script directly
is_background_hook(name) -> bool Check if hook is background (.bg suffix)
"""
__package__ = "archivebox"
import os
import json
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, TypedDict
from abx_plugins import get_plugins_dir
from django.conf import settings
from django.utils.safestring import mark_safe
from archivebox.config.constants import CONSTANTS
from archivebox.misc.util import fix_url_from_markdown, sanitize_extracted_url
if TYPE_CHECKING:
from archivebox.machine.models import Process
# Plugin directories
# BUILTIN_PLUGINS_DIR: the directory reported by abx_plugins.get_plugins_dir(),
# resolved to an absolute path at import time.
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
# USER_PLUGINS_DIR: first non-empty value among the ARCHIVEBOX_USER_PLUGINS_DIR
# environment variable, settings.USER_PLUGINS_DIR, and CONSTANTS.USER_PLUGINS_DIR.
# Only "~" is expanded here; the path is not resolved.
USER_PLUGINS_DIR = Path(
    os.environ.get("ARCHIVEBOX_USER_PLUGINS_DIR") or getattr(settings, "USER_PLUGINS_DIR", "") or str(CONSTANTS.USER_PLUGINS_DIR),
).expanduser()
# =============================================================================
# Hook Step Extraction
# =============================================================================
def is_background_hook(hook_name: str) -> bool:
    """
    Tell whether a hook filename marks the hook as a background hook.

    A hook counts as background when its filename contains the '.bg.' marker
    (as in '.daemon.bg.' / '.finite.bg.') or the older '__background' marker.
    Everything else is treated as a foreground hook.

    Args:
        hook_name: Hook filename (e.g., 'on_Snapshot__10_chrome_tab.daemon.bg.js')

    Returns:
        True if either background marker occurs in the name, False otherwise.

    Examples:
        is_background_hook('on_Snapshot__10_chrome_tab.daemon.bg.js') -> True
        is_background_hook('on_Snapshot__50_wget.py') -> False
        is_background_hook('on_Snapshot__63_media.finite.bg.py') -> True
    """
    background_markers = (".bg.", "__background")
    return any(marker in hook_name for marker in background_markers)
def is_finite_background_hook(hook_name: str) -> bool:
    """
    Tell whether a hook filename carries the '.finite.bg.' marker.

    Such hooks are background hooks that are finite-lived and should be awaited.
    """
    finite_marker = ".finite.bg."
    return hook_name.find(finite_marker) != -1
def iter_plugin_dirs() -> list[Path]:
    """
    Collect the plugin directories under the built-in and user plugin roots.

    Roots that do not exist are skipped. Within each root, only immediate
    children that are directories and whose name does not start with '_' are
    returned. Built-in entries come before user entries; within a root the
    order is whatever Path.iterdir() yields.
    """
    found: list[Path] = []
    for root in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if root.exists():
            found.extend(
                child
                for child in root.iterdir()
                if child.is_dir() and not child.name.startswith("_")
            )
    return found
def normalize_hook_event_name(event_name: str) -> str | None:
"""
Normalize a hook event family or event class name to its on_* prefix.
Examples:
BinaryRequestEvent -> BinaryRequest
CrawlSetupEvent -> CrawlSetup
SnapshotEvent -> Snapshot
BinaryEvent -> Binary
CrawlCleanupEvent -> CrawlCleanup
"""
normalized = str(event_name or "").strip()
if not normalized:
return None
if normalized.endswith("Event"):
return normalized[:-5] or None
return normalized
class HookResult(TypedDict, total=False):
    """
    Raw result from run_hook().

    Declared with total=False, so every key is optional.

    NOTE(review): run_hook() in this module is annotated as returning a
    Process rather than this dict, so this type may only describe a legacy
    result shape - confirm before relying on it.
    """

    returncode: int
    stdout: str
    stderr: str
    output_json: dict[str, Any] | None
    output_files: list[dict[str, Any]]
    duration_ms: int
    hook: str
    plugin: str  # Plugin name (directory name, e.g., 'wget', 'screenshot')
    hook_name: str  # Full hook filename (e.g., 'on_Snapshot__50_wget.py')
    # New fields for JSONL parsing
    records: list[dict[str, Any]]  # Parsed JSONL records with 'type' field
def discover_hooks(
    event_name: str,
    filter_disabled: bool = True,
    config: dict[str, Any] | None = None,
) -> list[Path]:
    """
    Find all hook scripts for an event family.

    Searches both the built-in and user plugin directories for files named
    `on_{EventFamily}__*.{sh,py,js}`, either inside a plugin subdirectory or
    directly in the plugins root. Hooks from disabled plugins are filtered out
    by default; hooks that live directly in a plugins root are never filtered.

    Hook naming convention uses numeric prefixes to control order:
        on_Snapshot__10_title.py         # runs first
        on_Snapshot__15_singlefile.py    # runs second
        on_Snapshot__26_readability.py   # runs later (depends on singlefile)

    Args:
        event_name: Hook event family or event class name.
            Examples: 'BinaryRequestEvent', 'Snapshot'.
            Event names are normalized by stripping a trailing `Event`.
            If no matching `on_{EventFamily}__*` scripts exist, returns [].
        filter_disabled: If True, skip hooks from disabled plugins (default: True).
            Never applied to the 'BinaryRequest' family (see comment below).
        config: Optional config dict from get_config() (merges file, env, machine, crawl, snapshot).
            If None and filtering is needed, get_config() is called with global scope.

    Returns:
        De-duplicated list of hook script paths, sorted by filename. Hooks that
        share a filename are ordered by their full path so the result is
        deterministic.

    Examples:
        # With proper config context (recommended):
        from archivebox.config.configset import get_config
        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
        discover_hooks('Snapshot', config=config)
        # Returns: [Path('.../on_Snapshot__10_title.py'), ...]  (wget excluded if disabled)

        # Without config (uses global defaults):
        discover_hooks('Snapshot')

        # Show all plugins regardless of enabled status:
        discover_hooks('Snapshot', filter_disabled=False)
    """
    hook_event_name = normalize_hook_event_name(event_name)
    if not hook_event_name:
        return []

    hooks: list[Path] = []
    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue
        for ext in ("sh", "py", "js"):
            # Hook scripts inside plugin subdirectories
            hooks.extend(base_dir.glob(f"*/on_{hook_event_name}__*.{ext}"))
            # Hook scripts placed directly in the plugins directory
            hooks.extend(base_dir.glob(f"on_{hook_event_name}__*.{ext}"))

    # Binary provider hooks are not end-user extractors. They
    # self-filter via `binproviders`, so applying the PLUGINS whitelist here
    # can hide the very installer needed by a selected plugin (e.g.
    # `--plugins=singlefile` still needs the `npm` BinaryRequest hook).
    if filter_disabled and hook_event_name != "BinaryRequest":
        # Get merged config if not provided (lazy import to avoid circular dependency)
        if config is None:
            from archivebox.config.configset import get_config

            config = get_config()

        # Resolved once instead of once per hook.
        root_dirs = {BUILTIN_PLUGINS_DIR.resolve(), USER_PLUGINS_DIR.resolve()}
        # The enabled flag depends only on (plugin name, config), so compute it
        # once per plugin rather than once per hook file.
        enabled_by_plugin: dict[str, bool] = {}

        enabled_hooks: list[Path] = []
        for hook in hooks:
            # e.g., abx_plugins/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
            plugin_name = hook.parent.name

            if hook.parent.resolve() in root_dirs:
                # Hook sits in a plugins root, not a plugin subdir: never filtered.
                enabled_hooks.append(hook)
                continue

            if plugin_name not in enabled_by_plugin:
                enabled_by_plugin[plugin_name] = get_plugin_special_config(plugin_name, config)["enabled"]
            if enabled_by_plugin[plugin_name]:
                enabled_hooks.append(hook)
        hooks = enabled_hooks

    # Sort by filename (not full path) so the numeric prefix controls ordering,
    # e.g. on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py.
    # The full path is a tie-breaker: set iteration order is arbitrary, so without
    # it equally-named hooks from different plugins could swap places between runs.
    return sorted(set(hooks), key=lambda p: (p.name, str(p)))
def run_hook(
    script: Path,
    output_dir: Path,
    config: dict[str, Any],
    timeout: int | None = None,
    parent: Optional["Process"] = None,
    **kwargs: Any,
) -> "Process":
    """
    Execute a hook script with the given arguments using the Process model.

    This is the low-level hook executor: it builds the command line and the
    environment, creates a Process record, and starts it with Process.launch().

    Config is passed to hooks via environment variables. Caller MUST use
    get_config() to merge all sources (file, env, machine, crawl, snapshot).

    Args:
        script: Path to the hook script. `.sh` runs under bash, `.py` under the
            current Python interpreter, `.js` under node; anything else is
            executed directly (assumes a shebang).
        output_dir: Working directory for the script (created if missing).
        config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
        timeout: Maximum execution time in seconds.
            If None, taken from get_plugin_special_config() for the script's
            parent directory name. A truthy timeout is capped at
            CONSTANTS.MAX_HOOK_RUNTIME_SECONDS.
        parent: Optional parent Process (for tracking worker->hook hierarchy).
            If None, Process.current() is tried; failures leave it as None.
        **kwargs: Arguments passed to the script as --key=value. Underscores in
            keys become dashes; keys starting with '_' are not passed; True
            booleans become bare flags; False, None and empty values are omitted;
            dicts and lists are JSON-encoded.

    Returns:
        Process model instance (use process.exit_code, process.stdout, process.get_records()).
        If the script does not exist, or creating/launching the process raises,
        a Process record with status EXITED, exit_code 1 and an explanatory
        stderr is created and returned instead.

    Example:
        from archivebox.config.configset import get_config
        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
        process = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
        if process.status == 'exited':
            records = process.get_records()  # Get parsed JSONL output
    """
    from archivebox.machine.models import Process, Machine, NetworkInterface
    from archivebox.config.constants import CONSTANTS
    import sys

    # Auto-detect timeout from plugin config if not explicitly provided
    if timeout is None:
        plugin_name = script.parent.name
        plugin_config = get_plugin_special_config(plugin_name, config)
        timeout = plugin_config["timeout"]
    if timeout:
        timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS))

    # Get current machine and network interface.
    # NOTE(review): the Machine.current() result is immediately replaced by
    # iface.machine; the call is kept in case it has side effects - confirm.
    machine = Machine.current()
    iface = NetworkInterface.current(refresh=True)
    machine = iface.machine

    # Auto-detect parent process if not explicitly provided
    # This enables automatic hierarchy tracking: Worker -> Hook
    if parent is None:
        try:
            parent = Process.current()
        except Exception:
            # If Process.current() fails (e.g., not in a worker context), leave parent as None
            pass

    if not script.exists():
        # Create a failed Process record for hooks that don't exist
        process = Process.objects.create(
            machine=machine,
            iface=iface,
            parent=parent,
            process_type=Process.TypeChoices.HOOK,
            pwd=str(output_dir),
            cmd=["echo", f"Hook script not found: {script}"],
            timeout=timeout,
            status=Process.StatusChoices.EXITED,
            exit_code=1,
            stderr=f"Hook script not found: {script}",
        )
        return process

    # Determine the interpreter based on file extension
    ext = script.suffix.lower()
    if ext == ".sh":
        cmd = ["bash", str(script)]
    elif ext == ".py":
        cmd = [sys.executable, str(script)]
    elif ext == ".js":
        cmd = ["node", str(script)]
    else:
        # Try to execute directly (assumes shebang)
        cmd = [str(script)]

    # Build CLI arguments from kwargs
    for key, value in kwargs.items():
        # Skip keys that start with underscore (internal parameters)
        if key.startswith("_"):
            continue

        arg_key = f"--{key.replace('_', '-')}"
        if isinstance(value, bool):
            if value:
                cmd.append(arg_key)
        elif value is not None and value != "":
            # JSON-encode complex values, use str for simple ones
            # Skip empty strings to avoid --key= which breaks argument parsers
            if isinstance(value, (dict, list)):
                cmd.append(f"{arg_key}={json.dumps(value)}")
            else:
                # Ensure value is converted to string and strip whitespace
                str_value = str(value).strip()
                if str_value:  # Only add if non-empty after stripping
                    cmd.append(f"{arg_key}={str_value}")

    # Set up environment with base paths
    env = os.environ.copy()
    env["DATA_DIR"] = str(getattr(settings, "DATA_DIR", Path.cwd()))
    env["ARCHIVE_DIR"] = str(getattr(settings, "ARCHIVE_DIR", Path.cwd() / "archive"))
    env["ABX_RUNTIME"] = "archivebox"
    env.setdefault("MACHINE_ID", getattr(settings, "MACHINE_ID", "") or os.environ.get("MACHINE_ID", ""))

    # Derive SNAP_DIR / CRAWL_DIR from the output path when it contains a
    # 'snapshots' / 'crawls' component: the parent of output_dir is used.
    resolved_output_dir = output_dir.resolve()
    output_parts = set(resolved_output_dir.parts)
    if "snapshots" in output_parts:
        env["SNAP_DIR"] = str(resolved_output_dir.parent)
    if "crawls" in output_parts:
        env["CRAWL_DIR"] = str(resolved_output_dir.parent)

    # An explicit crawl id takes precedence over the path-derived CRAWL_DIR.
    crawl_id = kwargs.get("_crawl_id") or kwargs.get("crawl_id")
    if crawl_id:
        try:
            from archivebox.crawls.models import Crawl

            crawl = Crawl.objects.filter(id=crawl_id).first()
            if crawl:
                env["CRAWL_DIR"] = str(crawl.output_dir)
        except Exception:
            # Best effort: keep whatever CRAWL_DIR (if any) was derived above.
            pass

    # Get LIB_DIR and LIB_BIN_DIR from config
    lib_dir = config.get("LIB_DIR", getattr(settings, "LIB_DIR", None))
    lib_bin_dir = config.get("LIB_BIN_DIR", getattr(settings, "LIB_BIN_DIR", None))
    if lib_dir:
        env["LIB_DIR"] = str(lib_dir)
    if not lib_bin_dir and lib_dir:
        # Derive LIB_BIN_DIR from LIB_DIR if not set
        lib_bin_dir = Path(lib_dir) / "bin"

    # Set NODE_PATH for Node.js module resolution.
    # Priority: config dict > derive from LIB_DIR
    node_path = config.get("NODE_PATH")
    if not node_path and lib_dir:
        # Derive from LIB_DIR/npm/node_modules (create if needed)
        node_modules_dir = Path(lib_dir) / "npm" / "node_modules"
        node_modules_dir.mkdir(parents=True, exist_ok=True)
        node_path = str(node_modules_dir)
    if node_path:
        env["NODE_PATH"] = node_path
        env["NODE_MODULES_DIR"] = node_path  # For backwards compatibility

    # Export all config values to environment (already merged by get_config()).
    # These keys are set explicitly elsewhere in this function (PATH and
    # LIB_BIN_DIR further below, the rest above), so config must not overwrite them.
    SKIP_KEYS = {
        "PATH",
        "LIB_DIR",
        "LIB_BIN_DIR",
        "NODE_PATH",
        "NODE_MODULES_DIR",
        "DATA_DIR",
        "ARCHIVE_DIR",
        "MACHINE_ID",
        "SNAP_DIR",
        "CRAWL_DIR",
    }
    for key, value in config.items():
        if key in SKIP_KEYS:
            continue  # Handled specially, don't overwrite
        if value is None:
            continue
        elif isinstance(value, bool):
            env[key] = "true" if value else "false"
        elif isinstance(value, (list, dict)):
            env[key] = json.dumps(value)
        else:
            env[key] = str(value)

    # Build PATH with proper precedence:
    #  1. path-like *_BINARY parents (explicit binary overrides / cached abspaths)
    #  2. LIB_BIN_DIR (local symlinked binaries)
    #  3. directory of the UV binary, if UV is set
    #  4. existing PATH
    runtime_bin_dirs: list[str] = []
    if lib_bin_dir:
        lib_bin_dir = str(lib_bin_dir)
        env["LIB_BIN_DIR"] = lib_bin_dir

    for key, raw_value in env.items():
        if not key.endswith("_BINARY"):
            continue
        value = str(raw_value or "").strip()
        if not value:
            continue
        path_value = Path(value).expanduser()
        # Bare command names (no path separator) contribute nothing to PATH.
        if not (path_value.is_absolute() or "/" in value or "\\" in value):
            continue
        binary_dir = str(path_value.resolve(strict=False).parent)
        if binary_dir and binary_dir not in runtime_bin_dirs:
            runtime_bin_dirs.append(binary_dir)

    if lib_bin_dir and lib_bin_dir not in runtime_bin_dirs:
        runtime_bin_dirs.append(lib_bin_dir)

    uv_value = str(env.get("UV") or "").strip()
    if uv_value:
        uv_bin_dir = str(Path(uv_value).expanduser().resolve(strict=False).parent)
        if uv_bin_dir and uv_bin_dir not in runtime_bin_dirs:
            runtime_bin_dirs.append(uv_bin_dir)

    current_path = env.get("PATH", "")
    path_parts = [part for part in current_path.split(os.pathsep) if part]
    # Insert in reverse so runtime_bin_dirs keeps its own order at the front.
    # Directories already on PATH are left where they are.
    for extra_dir in reversed(runtime_bin_dirs):
        if extra_dir not in path_parts:
            path_parts.insert(0, extra_dir)
    env["PATH"] = os.pathsep.join(path_parts)

    # Create output directory if needed
    output_dir.mkdir(parents=True, exist_ok=True)

    # Detect if this is a background hook (.daemon.bg. / .finite.bg. filename
    # convention, or the old __background marker) using the shared helper.
    is_background = is_background_hook(script.name)

    try:
        # Create Process record
        process = Process.objects.create(
            machine=machine,
            iface=iface,
            parent=parent,
            process_type=Process.TypeChoices.HOOK,
            pwd=str(output_dir),
            cmd=cmd,
            timeout=timeout,
        )

        # Copy the env dict we already built (includes os.environ + all customizations)
        process.env = env.copy()
        process.hydrate_binary_from_context(plugin_name=script.parent.name, hook_path=str(script))

        # Save env before launching
        process.save()

        # Launch subprocess using Process.launch()
        process.launch(background=is_background)

        # Return Process object (caller can use process.exit_code, process.stdout, process.get_records())
        return process
    except Exception as e:
        # Create a failed Process record for exceptions. The parent is recorded
        # here too, matching the script-not-found path above, so failed hooks
        # stay attached to the worker -> hook hierarchy.
        process = Process.objects.create(
            machine=machine,
            iface=iface,
            parent=parent,
            process_type=Process.TypeChoices.HOOK,
            pwd=str(output_dir),
            cmd=cmd,
            timeout=timeout,
            status=Process.StatusChoices.EXITED,
            exit_code=1,
            stderr=f"Failed to run hook: {type(e).__name__}: {e}",
        )
        return process
def extract_records_from_process(process: "Process") -> list[dict[str, Any]]:
    """
    Extract JSONL records from a Process and tag them with plugin metadata.

    For each record returned by process.get_records(), the keys 'plugin',
    'hook_name' and 'plugin_hook' are filled in with setdefault(), so values
    already present in a record are kept. The records are modified in place.

    Metadata is derived as follows:
        plugin:      last component of process.pwd ('unknown' if pwd is empty)
        plugin_hook: the hook script entry of process.cmd ('' if none)
        hook_name:   filename of plugin_hook ('unknown' if none)

    Args:
        process: Process model instance with stdout captured

    Returns:
        List of parsed JSONL records with plugin metadata ([] if there are none).
    """
    records = process.get_records()
    if not records:
        return []

    plugin_name = Path(process.pwd).name if process.pwd else "unknown"

    # Tolerate a missing/None cmd instead of raising on len().
    cmd = list(process.cmd or [])

    # run_hook() builds cmd as [interpreter, script, *args] for .sh/.py/.js hooks
    # but as [script, *args] for directly-executed scripts. Hook files are named
    # on_*, so when cmd[0] itself looks like a hook it is the script; otherwise
    # the script is expected in cmd[1].
    if cmd and Path(str(cmd[0])).name.startswith("on_"):
        plugin_hook = cmd[0]
    elif len(cmd) > 1:
        plugin_hook = cmd[1]
    else:
        plugin_hook = ""
    hook_name = Path(str(plugin_hook)).name if plugin_hook else "unknown"

    for record in records:
        record.setdefault("plugin", plugin_name)
        record.setdefault("hook_name", hook_name)
        record.setdefault("plugin_hook", plugin_hook)

    return records
def collect_urls_from_plugins(snapshot_dir: Path) -> list[dict[str, Any]]:
    """
    Gather urls.jsonl entries from every immediate subdirectory of snapshot_dir.

    Each parser plugin writes its discovered URLs to its own subdirectory:
        snapshot_dir/parse_rss_urls/urls.jsonl
        snapshot_dir/parse_html_urls/urls.jsonl
        etc.

    urls.jsonl is an ordinary output file; this helper just reads them all.
    Entries without a 'url', or whose URL is empty after cleanup, are dropped.
    Kept entries get their cleaned 'url' and a 'plugin' key set to the
    subdirectory name. Any error while handling one file is ignored and the
    remaining subdirectories are still processed.

    Returns:
        List of entry dicts ([] if snapshot_dir does not exist).
    """
    collected: list[dict[str, Any]] = []
    if not snapshot_dir.exists():
        return collected

    for plugin_dir in snapshot_dir.iterdir():
        jsonl_path = plugin_dir / "urls.jsonl"
        if not (plugin_dir.is_dir() and jsonl_path.exists()):
            continue

        try:
            from archivebox.machine.models import Process

            raw_text = jsonl_path.read_text()
            for entry in Process.parse_records_from_text(raw_text):
                if not entry.get("url"):
                    continue
                cleaned_url = sanitize_extracted_url(fix_url_from_markdown(str(entry["url"]).strip()))
                entry["url"] = cleaned_url
                if not cleaned_url:
                    continue
                # Remember which parser plugin produced this URL
                entry["plugin"] = plugin_dir.name
                collected.append(entry)
        except Exception:
            pass

    return collected
@lru_cache(maxsize=1)
def get_plugins() -> list[str]:
    """
    List the names of all available plugins, discovered from plugin directories.

    A plugin directory counts when it has at least one of: a hook file matching
    on_*__*.*, a config.json, or a templates/icon.html asset. This therefore
    includes non-extractor plugins such as binary providers and shared base
    plugins. The result is de-duplicated, sorted, and cached after the first call.
    """
    names = {
        plugin_dir.name
        for plugin_dir in iter_plugin_dirs()
        if any(plugin_dir.glob("on_*__*.*"))
        or (plugin_dir / "config.json").exists()
        or (plugin_dir / "templates" / "icon.html").exists()
    }
    return sorted(names)
def get_plugin_name(plugin: str) -> str:
    """
    Strip a leading all-digit ordering prefix from a plugin name.

    Only the part before the first underscore is examined; if it consists of
    digits, everything after that underscore is returned. Otherwise the name
    is returned unchanged.

    Examples:
        '10_title' -> 'title'
        '26_readability' -> 'readability'
        '50_parse_html_urls' -> 'parse_html_urls'
        'wget' -> 'wget'
    """
    prefix, underscore, remainder = plugin.partition("_")
    if underscore and prefix.isdigit():
        return remainder
    return plugin
def get_enabled_plugins(config: dict[str, Any] | None = None) -> list[str]:
"""
Get the list of enabled plugins based on config and available hooks.
Filters plugins by USE_/SAVE_ flags. Only returns plugins that are enabled.
Args:
config: Merged config dict from get_config() - if None, uses global config
Returns:
Plugin names sorted alphabetically (numeric prefix controls order).
Example:
from archivebox.config.configset import get_config
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
enabled = get_enabled_plugins(config) # ['wget', 'media', 'chrome', ...]
"""
# Get merged config if not provided
if config is None:
from archivebox.config.configset import get_config
config = get_config()
def normalize_enabled_plugins(value: Any) -> list[str]:
if value is None:
return []
if isinstance(value, str):
raw = value.strip()
if not raw:
return []
if raw.startswith("["):
try:
parsed = json.loads(raw)
except json.JSONDecodeError:
parsed = None
if isinstance(parsed, list):
return [str(plugin).strip() for plugin in parsed if str(plugin).strip()]
return [plugin.strip() for plugin in raw.split(",") if plugin.strip()]
if isinstance(value, (list, tuple, set)):
return [str(plugin).strip() for plugin in value if str(plugin).strip()]
return [str(value).strip()] if str(value).strip() else []
# Support explicit ENABLED_PLUGINS override (legacy)
if "ENABLED_PLUGINS" in config:
return normalize_enabled_plugins(config["ENABLED_PLUGINS"])
if "ENABLED_EXTRACTORS" in config:
return normalize_enabled_plugins(config["ENABLED_EXTRACTORS"])
# Filter all plugins by enabled status
all_plugins = get_plugins()
enabled = []
for plugin in all_plugins:
plugin_config = get_plugin_special_config(plugin, config)
if plugin_config["enabled"]:
enabled.append(plugin)
return enabled
def discover_plugins_that_provide_interface(
    module_name: str,
    required_attrs: list[str],
    plugin_prefix: str | None = None,
) -> dict[str, Any]:
    """
    Load a named Python module from every plugin that ships it with the required attributes.

    Used for dynamic discovery of things like search backends without
    hardcoding imports. Every directory under the built-in and user plugin
    roots is examined; if it contains `{module_name}.py`, that file is executed
    as a module and kept when it exposes all of required_attrs. Plugins whose
    module fails to import are skipped silently.

    Args:
        module_name: Name of the module to look for (e.g., 'search')
        required_attrs: List of attributes the module must have (e.g., ['search', 'flush'])
        plugin_prefix: Optional prefix to filter plugins (e.g., 'search_backend_')

    Returns:
        Dict mapping backend names to imported modules.
        The backend name is the plugin directory name minus the prefix,
        e.g., search_backend_sqlite -> 'sqlite'. When a user plugin yields the
        same backend name as a built-in one, the user plugin's module wins.

    Example:
        backends = discover_plugins_that_provide_interface(
            module_name='search',
            required_attrs=['search', 'flush'],
            plugin_prefix='search_backend_',
        )
        # Returns modules keyed by 'sqlite', 'sonic', 'ripgrep', ...
    """
    import importlib.util

    found: dict[str, Any] = {}
    module_filename = f"{module_name}.py"

    for root in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not root.exists():
            continue

        for candidate in root.iterdir():
            if not candidate.is_dir():
                continue

            dir_name = candidate.name
            if plugin_prefix and not dir_name.startswith(plugin_prefix):
                continue

            source_file = candidate / module_filename
            if not source_file.exists():
                continue

            try:
                spec = importlib.util.spec_from_file_location(
                    f"archivebox.dynamic_plugins.{dir_name}.{module_name}",
                    source_file,
                )
                if spec is None or spec.loader is None:
                    continue

                loaded = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(loaded)

                # Reject modules that lack any part of the required interface
                if any(not hasattr(loaded, attr) for attr in required_attrs):
                    continue

                backend_name = dir_name[len(plugin_prefix) :] if plugin_prefix else dir_name
                found[backend_name] = loaded
            except Exception:
                # Skip plugins that fail to import
                continue

    return found
def get_search_backends() -> dict[str, Any]:
    """
    Discover all available search backend plugins.

    A search backend is a plugin directory named `search_backend_*` that ships
    a search.py module exposing both:
        - search(query: str) -> List[str]  (returns snapshot IDs)
        - flush(snapshot_ids: Iterable[str]) -> None

    Returns:
        Dict mapping backend names (directory name without the prefix, e.g.
        'sqlite', 'sonic', 'ripgrep') to their loaded modules.
    """
    interface_module = "search"
    interface_attrs = ["search", "flush"]
    directory_prefix = "search_backend_"
    return discover_plugins_that_provide_interface(
        interface_module,
        interface_attrs,
        directory_prefix,
    )
def discover_plugin_configs() -> dict[str, dict[str, Any]]:
    """
    Discover all plugin config.json schemas.

    Each plugin can define a config.json file with JSONSchema defining
    its configuration options. This function loads every such file found in
    the directories returned by iter_plugin_dirs().

    The config.json files use JSONSchema draft-07 with custom extensions:
        - x-fallback: Global config key to use as fallback
        - x-aliases: List of old/alternative config key names

    A schema is only kept if it is a JSON object with "type": "object" and a
    "properties" key. Files that cannot be read, decoded as UTF-8, or parsed
    as JSON are reported on stderr and skipped, so one bad file never breaks
    discovery.

    Returns:
        Dict mapping plugin names (directory names) to their parsed schemas.
        e.g., {'wget': {...schema...}, 'chrome': {...schema...}}

    Example config.json:
        {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "SAVE_WGET": {"type": "boolean", "default": true},
                "WGET_TIMEOUT": {"type": "integer", "default": 60, "x-fallback": "TIMEOUT"}
            }
        }
    """
    import sys

    configs: dict[str, dict[str, Any]] = {}

    for plugin_dir in iter_plugin_dirs():
        config_path = plugin_dir / "config.json"
        if not config_path.exists():
            continue

        try:
            # JSON is UTF-8 by specification; don't depend on the locale encoding.
            with open(config_path, encoding="utf-8") as f:
                schema = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            # Warn but continue - a malformed config shouldn't break discovery.
            print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr)
            continue

        # Basic validation: must be an object schema with properties
        if not isinstance(schema, dict):
            continue
        if schema.get("type") != "object":
            continue
        if "properties" not in schema:
            continue

        configs[plugin_dir.name] = schema

    return configs
def get_config_defaults_from_plugins() -> dict[str, Any]:
    """
    Get default values for all plugin config options.

    Walks the "properties" of every schema returned by discover_plugin_configs()
    and collects each property's "default". Properties without a default, and
    property schemas that are not JSON objects (e.g. boolean schemas), are
    ignored. If several plugins declare the same key, the one processed last wins.

    Returns:
        Dict mapping config keys to their default values.
        e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
    """
    defaults: dict[str, Any] = {}

    for schema in discover_plugin_configs().values():
        properties = schema.get("properties")
        if not isinstance(properties, dict):
            # "properties" is only checked for presence during discovery,
            # so guard against a non-object value here.
            continue
        for key, prop_schema in properties.items():
            if isinstance(prop_schema, dict) and "default" in prop_schema:
                defaults[key] = prop_schema["default"]

    return defaults
def get_plugin_special_config(plugin_name: str, config: dict[str, Any]) -> dict[str, Any]:
    """
    Extract special config keys for a plugin following naming conventions.

    ArchiveBox recognizes 3 special config key patterns per plugin:
        - {PLUGIN}_ENABLED: Enable/disable toggle (default True)
        - {PLUGIN}_TIMEOUT: Plugin-specific timeout (fallback to TIMEOUT, default 300)
        - {PLUGIN}_BINARY: Primary binary path (default to plugin_name)

    When config["PLUGINS"] (a comma-separated whitelist, e.g. "wget,favicon")
    is set, a plugin is only enabled if it is listed there or is reachable
    through the "required_plugins" lists in the plugins' config.json files
    (e.g. singlefile -> chrome). A whitelisted plugin can still be switched off
    by {PLUGIN}_ENABLED.

    Args:
        plugin_name: Plugin name (e.g., 'wget', 'media', 'chrome')
        config: Merged config dict from get_config() (properly merges file, env, machine, crawl, snapshot)

    Returns:
        Dict with standardized keys:
            {
                'enabled': True,   # bool
                'timeout': 60,     # int, seconds
                'binary': 'wget',  # str, path or name
            }

    Raises:
        ValueError / TypeError: if the configured timeout cannot be converted to int.

    Examples:
        >>> from archivebox.config.configset import get_config
        >>> config = get_config(crawl=my_crawl, snapshot=my_snapshot)
        >>> get_plugin_special_config('wget', config)
        {'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
    """
    plugin_upper = plugin_name.upper()
    plugin_lower = plugin_name.lower()

    # 1. Enabled: Check PLUGINS whitelist first, then PLUGINNAME_ENABLED (default True)
    #    Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
    whitelisted = True
    plugins_whitelist = config.get("PLUGINS", "")
    if plugins_whitelist:
        plugin_names = {p.strip().lower() for p in plugins_whitelist.split(",") if p.strip()}
        whitelisted = plugin_lower in plugin_names

        if not whitelisted:
            # Not listed directly: expand the whitelist with the transitive
            # required_plugins declared in config.json, so selecting a plugin
            # also enables its plugin-level dependencies. Reading the schemas
            # touches disk, so it is only done when actually needed.
            plugin_configs = discover_plugin_configs()
            pending = list(plugin_names)
            while pending:
                current = pending.pop()
                schema = plugin_configs.get(current, {})
                required_plugins = schema.get("required_plugins", [])
                if not isinstance(required_plugins, list):
                    continue
                for required_plugin in required_plugins:
                    required_plugin_name = str(required_plugin).strip().lower()
                    if not required_plugin_name or required_plugin_name in plugin_names:
                        continue
                    plugin_names.add(required_plugin_name)
                    pending.append(required_plugin_name)
            whitelisted = plugin_lower in plugin_names

    if not whitelisted:
        # Plugin not in whitelist - explicitly disabled
        enabled = False
    else:
        # No whitelist, or plugin is in it - honour PLUGINNAME_ENABLED (default True)
        enabled = config.get(f"{plugin_upper}_ENABLED")
        if enabled is None:
            enabled = True
        elif isinstance(enabled, str):
            # Handle string values from config file / env ("true"/"false")
            enabled = enabled.lower() not in ("false", "0", "no", "")

    # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
    timeout = config.get(f"{plugin_upper}_TIMEOUT") or config.get("TIMEOUT", 300)
    if timeout is None or timeout == "":
        # TIMEOUT key exists but holds no value: use the default instead of
        # failing in int(). An explicit 0 is left untouched.
        timeout = 300

    # 3. Binary: PLUGINNAME_BINARY (default to plugin_name, also when unset/empty)
    binary = config.get(f"{plugin_upper}_BINARY") or plugin_name

    return {
        "enabled": bool(enabled),
        "timeout": int(timeout),
        "binary": str(binary),
    }
# =============================================================================
# Plugin Template Discovery
# =============================================================================
#
# Plugins can provide custom templates for rendering their output in the UI.
# Templates are discovered by filename convention inside each plugin's templates/ dir:
#
# abx_plugins/plugins/<plugin_name>/
# templates/
# icon.html # Icon for admin table view (small inline HTML)
# card.html # Preview card for snapshot header
# full.html # Fullscreen view template
#
# Template context variables available:
# {{ result }} - ArchiveResult object
# {{ snapshot }} - Parent Snapshot object
# {{ output_path }} - Path to output file/dir relative to snapshot dir
# {{ plugin }} - Plugin name (e.g., 'screenshot', 'singlefile')
#
# Default templates used when plugin doesn't provide one.
# Keyed by template type ('icon', 'card', 'full'); get_plugin_template() returns
# these strings when fallback=True and no plugin-provided template file exists.
# The values are template source text using the {{ ... }} placeholders listed above
# (the 'icon' default additionally references {{ icon }}).
# NOTE(review): the 'card' and 'full' iframes combine sandbox="allow-same-origin"
# with "allow-scripts"; if the framed output is served from the same origin as the
# embedding page, that combination lets framed scripts remove the sandbox - confirm
# archived output is served from a separate origin.
DEFAULT_TEMPLATES = {
"icon": """
<span title="{{ plugin }}" style="display:inline-flex; width:20px; height:20px; align-items:center; justify-content:center;">
{{ icon }}
</span>
""",
"card": """
<iframe src="{{ output_path }}"
class="card-img-top"
style="width: 100%; height: 100%; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms"
loading="lazy">
</iframe>
""",
"full": """
<iframe src="{{ output_path }}"
class="full-page-iframe"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>
""",
}
def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> str | None:
    """
    Look up a plugin's template file by plugin name and template type.

    The plugin name is first reduced with get_plugin_name() (dropping a numeric
    prefix), and 'yt-dlp' / 'youtube-dl' are mapped to 'ytdlp'. A plugin
    directory matches when its name equals that base name or ends with
    '_' + base name. The first matching directory that actually contains
    templates/{template_name}.html supplies the result.

    Args:
        plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
        template_name: One of 'icon', 'card', 'full'
        fallback: If True, return the entry from DEFAULT_TEMPLATES ('' for an
            unknown template_name) when no plugin template is found

    Returns:
        Template content as string, or None if not found and fallback=False.
    """
    base_name = get_plugin_name(plugin)
    if base_name in ("yt-dlp", "youtube-dl"):
        base_name = "ytdlp"

    suffixed_name = f"_{base_name}"
    template_filename = f"{template_name}.html"

    for plugin_dir in iter_plugin_dirs():
        dir_name = plugin_dir.name
        # Match by directory name (exact, or ending in _<base_name>)
        if dir_name != base_name and not dir_name.endswith(suffixed_name):
            continue
        candidate = plugin_dir / "templates" / template_filename
        if candidate.exists():
            return candidate.read_text()

    if not fallback:
        return None
    return DEFAULT_TEMPLATES.get(template_name, "")
@lru_cache(maxsize=None)
def get_plugin_icon(plugin: str) -> str:
    """
    Return the icon markup for a plugin, marked safe for template rendering.

    Uses the plugin's own icon.html template (stripped of surrounding
    whitespace) when one exists, otherwise a generic folder emoji. Results are
    cached per plugin name.

    Args:
        plugin: Plugin name (e.g., 'screenshot', '15_singlefile')

    Returns:
        Icon HTML/emoji string.
    """
    plugin_icon_template = get_plugin_template(plugin, "icon", fallback=False)
    # No (or empty) plugin-provided template: fall back to the generic folder icon
    icon_markup = plugin_icon_template.strip() if plugin_icon_template else "📁"
    return mark_safe(icon_markup)
# =============================================================================
# Hook Result Processing Helpers
# =============================================================================
def process_hook_records(records: list[dict[str, Any]], overrides: dict[str, Any] | None = None) -> dict[str, int]:
"""
Process JSONL records emitted by hook stdout.
This handles hook-emitted record types such as Snapshot, Tag, BinaryRequest,
and Binary. It does not process internal bus lifecycle events, since those
are not emitted as JSONL records by hook subprocesses.
Args:
records: List of JSONL record dicts from result['records']
overrides: Dict with 'snapshot', 'crawl', 'dependency', 'created_by_id', etc.
Returns:
Dict with counts by record type
"""
stats = {}
overrides = overrides or {}
for record in records:
record_type = record.get("type")
if not record_type:
continue
# Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones)
if record_type == "ArchiveResult":
continue
try:
# Dispatch to appropriate model's from_json() method
if record_type == "Snapshot":
from archivebox.core.models import Snapshot
if record.get("url"):
record = {
**record,
"url": sanitize_extracted_url(fix_url_from_markdown(str(record["url"]).strip())),
}
if not record["url"]:
continue
# Check if discovered snapshot exceeds crawl max_depth
snapshot_depth = record.get("depth", 0)
crawl = overrides.get("crawl")
if crawl and snapshot_depth > crawl.max_depth:
# Skip - this URL was discovered but exceeds max crawl depth
continue
obj = Snapshot.from_json(record.copy(), overrides)
if obj:
stats["Snapshot"] = stats.get("Snapshot", 0) + 1
elif record_type == "Tag":
from archivebox.core.models import Tag
obj = Tag.from_json(record.copy(), overrides)
if obj:
stats["Tag"] = stats.get("Tag", 0) + 1
elif record_type in {"BinaryRequest", "Binary"}:
from archivebox.machine.models import Binary
obj = Binary.from_json(record.copy(), overrides)
if obj:
stats[record_type] = stats.get(record_type, 0) + 1
else:
import sys
print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
except Exception as e:
import sys
print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr)
continue
return stats