mirror of https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00

wip major changes

archivebox/hooks.py (new file, 867 lines)

@@ -0,0 +1,867 @@
"""
Hook discovery and execution system for ArchiveBox plugins.

Hooks are standalone scripts that run as separate processes and communicate
with ArchiveBox via CLI arguments and stdout JSON output. This keeps the plugin
system simple and language-agnostic.

Directory structure:
    archivebox/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext>   (built-in)
    data/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext>         (user)

Hook contract (a minimal example hook is sketched just below this docstring):
    Input:  --url=<url> (and other --key=value args)
    Output: JSON to stdout, files to $PWD
    Exit:   0 = success, non-zero = failure

Execution order:
- Extractors run sequentially within each Snapshot (ordered by numeric prefix)
- Multiple Snapshots can process in parallel
- Failed extractors don't block subsequent extractors

Dependency handling:
Extractors that depend on other extractors' output should check for it at runtime:

```python
# Example: the screenshot extractor depends on chrome_session
chrome_session_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome_session'
if not (chrome_session_dir / 'session.json').exists():
    print('{"status": "skipped", "output": "chrome_session not available"}')
    sys.exit(1)  # exit non-zero so it gets retried later
```

On retry (Snapshot.retry_failed_archiveresults()):
- Only FAILED/SKIPPED extractors reset to queued (SUCCEEDED stays)
- Extractors run in order again
- If dependencies now succeed, dependents can run

API (all hook logic lives here):
    discover_hooks(event)  -> List[Path]        Find hook scripts
    run_hook(script, ...)  -> HookResult        Execute a hook script
    run_hooks(event, ...)  -> List[HookResult]  Run all hooks for an event
"""

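# A minimal example hook satisfying the contract above (an illustrative sketch,
# not a shipped extractor -- the plugin path and hook name are hypothetical):
#
#   data/plugins/example/on_Snapshot__99_example.py:
#
#       import json, sys
#
#       # parse the --key=value args passed by run_hook()
#       args = dict(arg.lstrip('-').split('=', 1) for arg in sys.argv[1:] if '=' in arg)
#
#       # ... fetch args['url'], write output files to $PWD ...
#
#       print(json.dumps({'status': 'succeeded', 'output': 'example.txt'}))
#       sys.exit(0)  # 0 = success, non-zero = failure
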
__package__ = 'archivebox'

import os
import sys
import json
import time
import subprocess
from pathlib import Path
from typing import List, Dict, Any, Optional, TypedDict

from django.conf import settings
from django.utils import timezone


# Plugin directories
BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins'
USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'


class HookResult(TypedDict, total=False):
    """Raw result from run_hook()."""
    returncode: int
    stdout: str
    stderr: str
    output_json: Optional[Dict[str, Any]]
    output_files: List[str]
    duration_ms: int
    hook: str


def discover_hooks(event_name: str) -> List[Path]:
    """
    Find all hook scripts matching the on_{event_name}__*.{sh,py,js} pattern.

    Searches both built-in and user plugin directories.
    Returns scripts sorted alphabetically by filename for deterministic execution order.

    Hook naming convention uses numeric prefixes to control order:
        on_Snapshot__10_title.py        # runs first
        on_Snapshot__15_singlefile.py   # runs second
        on_Snapshot__26_readability.py  # runs later (depends on singlefile)

    Example:
        discover_hooks('Snapshot')
        # Returns: [Path('.../on_Snapshot__10_title.py'), Path('.../on_Snapshot__15_singlefile.py'), ...]
    """
    hooks = []

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        # Search for hook scripts in all subdirectories
        for ext in ('sh', 'py', 'js'):
            pattern = f'*/on_{event_name}__*.{ext}'
            hooks.extend(base_dir.glob(pattern))

            # Also check for hooks directly in the plugins directory
            pattern_direct = f'on_{event_name}__*.{ext}'
            hooks.extend(base_dir.glob(pattern_direct))

    # Sort by filename (not full path) so numeric prefix ordering works across dirs,
    # e.g. on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
    return sorted(set(hooks), key=lambda p: p.name)


def discover_all_hooks() -> Dict[str, List[Path]]:
    """
    Discover all hooks, organized by event name.

    Returns a dict mapping event names to lists of hook script paths.
    """
    hooks_by_event: Dict[str, List[Path]] = {}

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for ext in ('sh', 'py', 'js'):
            for hook_path in base_dir.glob(f'*/on_*__*.{ext}'):
                # Extract event name from filename: on_EventName__hook_name.ext
                filename = hook_path.stem  # on_EventName__hook_name
                if filename.startswith('on_') and '__' in filename:
                    event_name = filename[3:].split('__')[0]  # EventName
                    if event_name not in hooks_by_event:
                        hooks_by_event[event_name] = []
                    hooks_by_event[event_name].append(hook_path)

    # Sort hooks within each event
    for event_name in hooks_by_event:
        hooks_by_event[event_name] = sorted(set(hooks_by_event[event_name]), key=lambda p: p.name)

    return hooks_by_event


def run_hook(
    script: Path,
    output_dir: Path,
    timeout: int = 300,
    config_objects: Optional[List[Any]] = None,
    **kwargs: Any,
) -> HookResult:
    """
    Execute a hook script with the given arguments.

    This is the low-level hook executor. For running extractors with proper
    metadata handling, use call_extractor() instead.

    Config is passed to hooks via environment variables, resolved in this
    order (later sources override earlier ones):
        1. Plugin schema defaults (config.json)
        2. Config file (ArchiveBox.conf)
        3. Environment variables
        4. Machine.config (auto-included as the first, lowest-priority config object)
        5. config_objects (in order - later objects override earlier ones)

    Args:
        script: Path to the hook script (.sh, .py, or .js)
        output_dir: Working directory for the script (where output files go)
        timeout: Maximum execution time in seconds
        config_objects: Optional list of objects with .config JSON fields
            (e.g., [crawl, snapshot] - later items have higher priority)
        **kwargs: Arguments passed to the script as --key=value

    Returns:
        HookResult with 'returncode', 'stdout', 'stderr', 'output_json',
        'output_files', 'duration_ms', and 'hook'.
    """
    start_time = time.time()

    # Auto-include Machine.config at the start (lowest priority among config_objects)
    from machine.models import Machine
    machine = Machine.current()
    all_config_objects = [machine] + list(config_objects or [])

    if not script.exists():
        return HookResult(
            returncode=1,
            stdout='',
            stderr=f'Hook script not found: {script}',
            output_json=None,
            output_files=[],
            duration_ms=0,
            hook=str(script),
        )

    # Determine the interpreter based on file extension
    ext = script.suffix.lower()
    if ext == '.sh':
        cmd = ['bash', str(script)]
    elif ext == '.py':
        cmd = ['python3', str(script)]
    elif ext == '.js':
        cmd = ['node', str(script)]
    else:
        # Try to execute directly (assumes shebang)
        cmd = [str(script)]

    # Build CLI arguments from kwargs
    for key, value in kwargs.items():
        arg_key = f'--{key.replace("_", "-")}'
        if isinstance(value, bool):
            if value:
                cmd.append(arg_key)
        elif value is not None:
            # JSON-encode complex values, use str() for simple ones
            if isinstance(value, (dict, list)):
                cmd.append(f'{arg_key}={json.dumps(value)}')
            else:
                cmd.append(f'{arg_key}={value}')

    # Set up environment with base paths
    env = os.environ.copy()
    env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
    env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
    env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))

    # Build overrides from any objects with .config fields (in order, later overrides earlier).
    # all_config_objects includes Machine at the start, then any passed config_objects.
    overrides = {}
    for obj in all_config_objects:
        if obj and hasattr(obj, 'config') and obj.config:
            overrides.update(obj.config)

    # Get plugin config from JSON schemas with hierarchy resolution.
    # This merges: schema defaults -> config file -> env vars -> object config overrides
    plugin_config = get_flat_plugin_config(overrides=overrides if overrides else None)
    export_plugin_config_to_env(plugin_config, env)

    # Also pass core config values that aren't in plugin schemas yet.
    # These are legacy values that may still be needed.
    from archivebox import config
    env.setdefault('CHROME_BINARY', str(getattr(config, 'CHROME_BINARY', '')))
    env.setdefault('WGET_BINARY', str(getattr(config, 'WGET_BINARY', '')))
    env.setdefault('CURL_BINARY', str(getattr(config, 'CURL_BINARY', '')))
    env.setdefault('GIT_BINARY', str(getattr(config, 'GIT_BINARY', '')))
    env.setdefault('YOUTUBEDL_BINARY', str(getattr(config, 'YOUTUBEDL_BINARY', '')))
    env.setdefault('SINGLEFILE_BINARY', str(getattr(config, 'SINGLEFILE_BINARY', '')))
    env.setdefault('READABILITY_BINARY', str(getattr(config, 'READABILITY_BINARY', '')))
    env.setdefault('MERCURY_BINARY', str(getattr(config, 'MERCURY_BINARY', '')))
    env.setdefault('NODE_BINARY', str(getattr(config, 'NODE_BINARY', '')))
    env.setdefault('TIMEOUT', str(getattr(config, 'TIMEOUT', 60)))
    env.setdefault('CHECK_SSL_VALIDITY', str(getattr(config, 'CHECK_SSL_VALIDITY', True)))
    env.setdefault('USER_AGENT', str(getattr(config, 'USER_AGENT', '')))
    env.setdefault('RESOLUTION', str(getattr(config, 'RESOLUTION', '')))

    # Create output directory if needed
    output_dir.mkdir(parents=True, exist_ok=True)

    # Capture files before execution to detect new output
    files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()

    try:
        result = subprocess.run(
            cmd,
            cwd=str(output_dir),
            capture_output=True,
            text=True,
            timeout=timeout,
            env=env,
        )

        # Detect new files created by the hook
        files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
        new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]

        # Try to parse stdout as JSON
        output_json = None
        stdout = result.stdout.strip()
        if stdout:
            try:
                output_json = json.loads(stdout)
            except json.JSONDecodeError:
                pass  # not JSON output, that's fine

        duration_ms = int((time.time() - start_time) * 1000)

        return HookResult(
            returncode=result.returncode,
            stdout=result.stdout,
            stderr=result.stderr,
            output_json=output_json,
            output_files=new_files,
            duration_ms=duration_ms,
            hook=str(script),
        )

    except subprocess.TimeoutExpired:
        duration_ms = int((time.time() - start_time) * 1000)
        return HookResult(
            returncode=-1,
            stdout='',
            stderr=f'Hook timed out after {timeout} seconds',
            output_json=None,
            output_files=[],
            duration_ms=duration_ms,
            hook=str(script),
        )
    except Exception as e:
        duration_ms = int((time.time() - start_time) * 1000)
        return HookResult(
            returncode=-1,
            stdout='',
            stderr=f'Failed to run hook: {type(e).__name__}: {e}',
            output_json=None,
            output_files=[],
            duration_ms=duration_ms,
            hook=str(script),
        )


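# Example usage of run_hook() (a sketch: the plugin path, numeric prefix, and
# snapshot directory layout shown here are assumptions, not fixed conventions):
#
#   result = run_hook(
#       BUILTIN_PLUGINS_DIR / 'wget' / 'on_Snapshot__20_wget.py',
#       output_dir=Path('archive/1234567890/wget'),
#       timeout=120,
#       url='https://example.com',
#   )
#   if result['returncode'] == 0:
#       print(result['output_files'], result['duration_ms'])
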
def collect_urls_from_extractors(snapshot_dir: Path) -> List[Dict[str, Any]]:
    """
    Collect all urls.jsonl entries from extractor output subdirectories.

    Each parser extractor outputs urls.jsonl to its own subdir:
        snapshot_dir/parse_rss_urls/urls.jsonl
        snapshot_dir/parse_html_urls/urls.jsonl
        etc.

    This is not special handling - urls.jsonl is just a normal output file.
    This utility collects them all for the crawl system.
    """
    urls = []

    # Look in each immediate subdirectory for urls.jsonl
    if not snapshot_dir.exists():
        return urls

    for subdir in snapshot_dir.iterdir():
        if not subdir.is_dir():
            continue

        urls_file = subdir / 'urls.jsonl'
        if not urls_file.exists():
            continue

        try:
            with open(urls_file, 'r') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        try:
                            entry = json.loads(line)
                            if entry.get('url'):
                                # Track which extractor found this URL
                                entry['via_extractor'] = subdir.name
                                urls.append(entry)
                        except json.JSONDecodeError:
                            continue
        except Exception:
            pass

    return urls


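# Example of a collected entry (the 'title' field is illustrative -- each
# parser decides its own extra fields; only 'url' is required and
# 'via_extractor' is added by this function):
#
#   {"url": "https://example.com/feed/1", "title": "Post 1", "via_extractor": "parse_rss_urls"}
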
def run_hooks(
    event_name: str,
    output_dir: Path,
    timeout: int = 300,
    stop_on_failure: bool = False,
    config_objects: Optional[List[Any]] = None,
    **kwargs: Any,
) -> List[HookResult]:
    """
    Run all hooks for a given event.

    Args:
        event_name: The event name to trigger (e.g., 'Snapshot__wget')
        output_dir: Working directory for hook scripts
        timeout: Maximum execution time per hook
        stop_on_failure: If True, stop executing hooks after the first failure
        config_objects: Optional list of objects with .config JSON fields
            (e.g., [crawl, snapshot] - later items have higher priority)
        **kwargs: Arguments passed to each hook script

    Returns:
        List of results from each hook execution.
    """
    hooks = discover_hooks(event_name)
    results = []

    for hook in hooks:
        # run_hook() already records the hook path in result['hook']
        result = run_hook(hook, output_dir, timeout=timeout, config_objects=config_objects, **kwargs)
        results.append(result)

        if stop_on_failure and result['returncode'] != 0:
            break

    return results


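# Example: run every Snapshot hook against one snapshot directory (a sketch;
# 'crawl' and 'snapshot' here are assumed to be ORM objects with .config fields):
#
#   results = run_hooks(
#       'Snapshot',
#       output_dir=snapshot_dir,
#       config_objects=[crawl, snapshot],
#       url=snapshot.url,
#   )
#   failed = [r['hook'] for r in results if r['returncode'] != 0]
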
def get_extractors() -> List[str]:
    """
    Get the list of available extractors by discovering Snapshot hooks.

    Returns extractor names (including numeric prefix) from hook filenames:
        on_Snapshot__10_title.py        -> '10_title'
        on_Snapshot__26_readability.py  -> '26_readability'

    Sorted alphabetically so numeric prefixes control execution order.
    """
    extractors = []

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for ext in ('sh', 'py', 'js'):
            for hook_path in base_dir.glob(f'*/on_Snapshot__*.{ext}'):
                # Extract extractor name: on_Snapshot__26_readability.py -> 26_readability
                filename = hook_path.stem  # on_Snapshot__26_readability
                if '__' in filename:
                    extractor = filename.split('__', 1)[1]
                    extractors.append(extractor)

    return sorted(set(extractors))


def get_parser_extractors() -> List[str]:
    """
    Get the list of parser extractors by discovering parse_*_urls hooks.

    Parser extractors discover URLs from source files and output urls.jsonl.
    Returns extractor names like: ['50_parse_html_urls', '51_parse_rss_urls', ...]
    """
    return [e for e in get_extractors() if 'parse_' in e and '_urls' in e]


def get_extractor_name(extractor: str) -> str:
    """
    Get the base extractor name without its numeric prefix.

    Examples:
        '10_title'           -> 'title'
        '26_readability'     -> 'readability'
        '50_parse_html_urls' -> 'parse_html_urls'
    """
    # Split on the first underscore; drop the prefix only if it's purely numeric
    parts = extractor.split('_', 1)
    if len(parts) == 2 and parts[0].isdigit():
        return parts[1]
    return extractor


def is_parser_extractor(extractor: str) -> bool:
    """Check if an extractor is a parser extractor (discovers URLs)."""
    name = get_extractor_name(extractor)
    return name.startswith('parse_') and name.endswith('_urls')


# Precedence order for search indexing (lower number = higher priority).
# Used to select which extractor's output to use for full-text search.
# Extractor names here should match the part after the numeric prefix,
# e.g. '31_readability' -> 'readability'
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
    ('readability', 1),
    ('mercury', 2),
    ('htmltotext', 3),
    ('singlefile', 4),
    ('dom', 5),
    ('wget', 6),
]


def get_enabled_extractors(config: Optional[Dict] = None) -> List[str]:
    """
    Get the list of enabled extractors based on config and available hooks.

    Checks for ENABLED_EXTRACTORS in config, falls back to discovering
    available hooks from the plugins directories.

    Returns extractor names sorted alphabetically (numeric prefix controls order).
    """
    if config and 'ENABLED_EXTRACTORS' in config:
        return config['ENABLED_EXTRACTORS']

    # Discover from hooks - this is the source of truth
    return get_extractors()


def discover_plugins_that_provide_interface(
    module_name: str,
    required_attrs: List[str],
    plugin_prefix: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Discover plugins that provide a specific Python module with a required interface.

    This enables dynamic plugin discovery for features like search backends,
    storage backends, etc. without hardcoding imports.

    Args:
        module_name: Name of the module to look for (e.g., 'search')
        required_attrs: List of attributes the module must have (e.g., ['search', 'flush'])
        plugin_prefix: Optional prefix to filter plugins (e.g., 'search_backend_')

    Returns:
        Dict mapping backend names to imported modules.
        Backend name is derived from the plugin directory name minus the prefix,
        e.g. search_backend_sqlite -> 'sqlite'

    Example:
        backends = discover_plugins_that_provide_interface(
            module_name='search',
            required_attrs=['search', 'flush'],
            plugin_prefix='search_backend_',
        )
        # Returns: {'sqlite': <module>, 'sonic': <module>, 'ripgrep': <module>}
    """
    import importlib.util

    backends = {}

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for plugin_dir in base_dir.iterdir():
            if not plugin_dir.is_dir():
                continue

            plugin_name = plugin_dir.name

            # Filter by prefix if specified
            if plugin_prefix and not plugin_name.startswith(plugin_prefix):
                continue

            # Look for the module file
            module_path = plugin_dir / f'{module_name}.py'
            if not module_path.exists():
                continue

            try:
                # Import the module dynamically
                spec = importlib.util.spec_from_file_location(
                    f'archivebox.plugins.{plugin_name}.{module_name}',
                    module_path,
                )
                if spec is None or spec.loader is None:
                    continue

                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)

                # Check for required attributes
                if not all(hasattr(module, attr) for attr in required_attrs):
                    continue

                # Derive backend name from plugin directory name
                if plugin_prefix:
                    backend_name = plugin_name[len(plugin_prefix):]
                else:
                    backend_name = plugin_name

                backends[backend_name] = module

            except Exception:
                # Skip plugins that fail to import
                continue

    return backends


def get_search_backends() -> Dict[str, Any]:
    """
    Discover all available search backend plugins.

    Search backends must provide a search.py module with:
        - search(query: str) -> List[str]  (returns snapshot IDs)
        - flush(snapshot_ids: Iterable[str]) -> None

    Returns:
        Dict mapping backend names to their modules,
        e.g. {'sqlite': <module>, 'sonic': <module>, 'ripgrep': <module>}
    """
    return discover_plugins_that_provide_interface(
        module_name='search',
        required_attrs=['search', 'flush'],
        plugin_prefix='search_backend_',
    )


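# Example backend dispatch (a sketch: 'ripgrep' is only one possible backend
# name, and SEARCH_BACKEND_ENGINE is an assumed config key, not a fixed one):
#
#   backends = get_search_backends()
#   backend = backends.get(os.environ.get('SEARCH_BACKEND_ENGINE', 'ripgrep'))
#   if backend:
#       snapshot_ids = backend.search('some query')
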
def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
    """
    Discover all plugin config.json schemas.

    Each plugin can define a config.json file with a JSONSchema defining
    its configuration options. This function discovers and loads all such schemas.

    The config.json files use JSONSchema draft-07 with custom extensions:
        - x-fallback: Global config key to use as a fallback
        - x-aliases: List of old/alternative config key names

    Returns:
        Dict mapping plugin names to their parsed JSONSchema configs,
        e.g. {'wget': {...schema...}, 'chrome_session': {...schema...}}

    Example config.json:
        {
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "SAVE_WGET": {"type": "boolean", "default": true},
                "WGET_TIMEOUT": {"type": "integer", "default": 60, "x-fallback": "TIMEOUT"}
            }
        }
    """
    configs = {}

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not base_dir.exists():
            continue

        for plugin_dir in base_dir.iterdir():
            if not plugin_dir.is_dir():
                continue

            config_path = plugin_dir / 'config.json'
            if not config_path.exists():
                continue

            try:
                with open(config_path, 'r') as f:
                    schema = json.load(f)

                # Basic validation: must be an object schema with properties
                if not isinstance(schema, dict):
                    continue
                if schema.get('type') != 'object':
                    continue
                if 'properties' not in schema:
                    continue

                configs[plugin_dir.name] = schema

            except (json.JSONDecodeError, OSError) as e:
                # Warn but continue - a malformed config shouldn't break discovery
                print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr)
                continue

    return configs


def get_merged_config_schema() -> Dict[str, Any]:
    """
    Get a merged JSONSchema combining all plugin config schemas.

    This creates a single schema that can validate all plugin config keys.
    Useful for validating the complete configuration at startup.

    Returns:
        Combined JSONSchema with all plugin properties merged.
    """
    plugin_configs = discover_plugin_configs()

    merged_properties = {}
    for plugin_name, schema in plugin_configs.items():
        properties = schema.get('properties', {})
        for key, prop_schema in properties.items():
            if key in merged_properties:
                # Key already exists from another plugin - warn but keep the first definition
                print(f"Warning: Config key '{key}' defined in multiple plugins, using first definition", file=sys.stderr)
                continue
            merged_properties[key] = prop_schema

    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "additionalProperties": True,  # allow unknown keys (core config, etc.)
        "properties": merged_properties,
    }


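# The merged schema can be fed to the third-party `jsonschema` package to
# validate the resolved config at startup (a sketch, assuming jsonschema is
# installed; it is not imported by this module):
#
#   import jsonschema
#   jsonschema.validate(instance=get_flat_plugin_config(), schema=get_merged_config_schema())
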
def get_config_defaults_from_plugins() -> Dict[str, Any]:
    """
    Get default values for all plugin config options.

    Returns:
        Dict mapping config keys to their default values,
        e.g. {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
    """
    plugin_configs = discover_plugin_configs()
    defaults = {}

    for plugin_name, schema in plugin_configs.items():
        properties = schema.get('properties', {})
        for key, prop_schema in properties.items():
            if 'default' in prop_schema:
                defaults[key] = prop_schema['default']

    return defaults


def resolve_config_value(
    key: str,
    prop_schema: Dict[str, Any],
    env_vars: Dict[str, str],
    config_file: Dict[str, str],
    overrides: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Resolve a single config value following the hierarchy and schema rules.

    Resolution order (later overrides earlier):
        1. Schema default
        2. x-fallback (global config key)
        3. Config file (ArchiveBox.conf)
        4. Environment variables (including x-aliases)
        5. Explicit overrides (User/Crawl/Snapshot config)

    Args:
        key: Config key name (e.g., 'WGET_TIMEOUT')
        prop_schema: JSONSchema property definition for this key
        env_vars: Environment variables dict
        config_file: Config file values dict
        overrides: Optional override values (from User/Crawl/Snapshot)

    Returns:
        Resolved value with appropriate type coercion.
    """
    value = None
    prop_type = prop_schema.get('type', 'string')

    # 1. Start with the schema default
    if 'default' in prop_schema:
        value = prop_schema['default']

    # 2. Check x-fallback (global config key)
    fallback_key = prop_schema.get('x-fallback')
    if fallback_key:
        if fallback_key in env_vars:
            value = env_vars[fallback_key]
        elif fallback_key in config_file:
            value = config_file[fallback_key]

    # 3. Check the config file for the main key
    if key in config_file:
        value = config_file[key]

    # 4. Check environment variables (main key and aliases)
    keys_to_check = [key] + prop_schema.get('x-aliases', [])
    for check_key in keys_to_check:
        if check_key in env_vars:
            value = env_vars[check_key]
            break

    # 5. Apply explicit overrides
    if overrides and key in overrides:
        value = overrides[key]

    # Type coercion for env var strings
    if value is not None and isinstance(value, str):
        value = coerce_config_value(value, prop_type, prop_schema)

    return value


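# Worked example of the hierarchy (values are illustrative):
#
#   prop_schema = {'type': 'integer', 'default': 60, 'x-fallback': 'TIMEOUT'}
#   resolve_config_value('WGET_TIMEOUT', prop_schema, env_vars={}, config_file={})
#   # -> 60 (schema default)
#   resolve_config_value('WGET_TIMEOUT', prop_schema, env_vars={'TIMEOUT': '120'}, config_file={})
#   # -> 120 (x-fallback from env, coerced to int)
#   resolve_config_value('WGET_TIMEOUT', prop_schema, env_vars={'WGET_TIMEOUT': '30'}, config_file={}, overrides={'WGET_TIMEOUT': 10})
#   # -> 10 (explicit override wins)
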
def coerce_config_value(value: str, prop_type: str, prop_schema: Dict[str, Any]) -> Any:
    """
    Coerce a string value to the appropriate type based on its schema.

    Args:
        value: String value to coerce
        prop_type: JSONSchema type ('boolean', 'integer', 'number', 'array', 'string')
        prop_schema: Full property schema (for array item types, defaults, etc.)

    Returns:
        Coerced value of the appropriate type.
    """
    if prop_type == 'boolean':
        return value.lower() in ('true', '1', 'yes', 'on')
    elif prop_type == 'integer':
        try:
            return int(value)
        except ValueError:
            return prop_schema.get('default', 0)
    elif prop_type == 'number':
        try:
            return float(value)
        except ValueError:
            return prop_schema.get('default', 0.0)
    elif prop_type == 'array':
        # Try JSON parsing first, fall back to comma-separated
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return [v.strip() for v in value.split(',') if v.strip()]
    else:
        return value


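# Example coercions (illustrative):
#
#   coerce_config_value('true', 'boolean', {})        # -> True
#   coerce_config_value('42', 'integer', {})          # -> 42
#   coerce_config_value('a,b,c', 'array', {})         # -> ['a', 'b', 'c']
#   coerce_config_value('["a","b"]', 'array', {})     # -> ['a', 'b']
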
def get_flat_plugin_config(
    env_vars: Optional[Dict[str, str]] = None,
    config_file: Optional[Dict[str, str]] = None,
    overrides: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Get all plugin config values resolved according to the hierarchy.

    This is the main entry point for getting plugin configuration.
    It discovers all plugin schemas and resolves each config key.

    Args:
        env_vars: Environment variables (defaults to os.environ)
        config_file: Config file values (from ArchiveBox.conf)
        overrides: Override values (from User/Crawl/Snapshot config fields)

    Returns:
        Flat dict of all resolved config values,
        e.g. {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
    """
    if env_vars is None:
        env_vars = dict(os.environ)
    if config_file is None:
        config_file = {}

    plugin_configs = discover_plugin_configs()
    flat_config = {}

    for plugin_name, schema in plugin_configs.items():
        properties = schema.get('properties', {})
        for key, prop_schema in properties.items():
            flat_config[key] = resolve_config_value(
                key, prop_schema, env_vars, config_file, overrides,
            )

    return flat_config


def export_plugin_config_to_env(
    config: Dict[str, Any],
    env: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """
    Export plugin config values to environment variable format.

    Converts all values to strings suitable for a subprocess environment.
    Arrays and dicts are JSON-encoded.

    Args:
        config: Flat config dict from get_flat_plugin_config()
        env: Optional existing env dict to update (creates a new one if None)

    Returns:
        Environment dict with config values as strings.
    """
    if env is None:
        env = {}

    for key, value in config.items():
        if value is None:
            continue
        elif isinstance(value, bool):
            env[key] = 'true' if value else 'false'
        elif isinstance(value, (list, dict)):
            env[key] = json.dumps(value)
        else:
            env[key] = str(value)

    return env
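# End-to-end example: resolve all plugin config and export it for a subprocess
# (a sketch of how run_hook() uses these helpers internally; the keys shown
# depend on which plugin schemas exist):
#
#   env = os.environ.copy()
#   flat = get_flat_plugin_config(overrides={'WGET_TIMEOUT': 30})
#   export_plugin_config_to_env(flat, env)
#   # env now contains e.g. WGET_TIMEOUT='30', SAVE_WGET='true', ...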