diff --git a/archivebox/plugins/chrome/on_Crawl__10_chrome_validate.py b/archivebox/plugins/chrome/on_Crawl__10_chrome_validate.py deleted file mode 100644 index 7aa8639c..00000000 --- a/archivebox/plugins/chrome/on_Crawl__10_chrome_validate.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python3 -""" -Validate and compute derived Chrome config values. - -This hook runs early in the Crawl lifecycle to: -1. Auto-detect Chrome binary location -2. Compute sandbox settings based on Docker detection -3. Validate binary availability and version -4. Set computed env vars for subsequent hooks - -Output: - - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env - - Binary JSONL records to stdout when binaries are found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -# Chrome binary search order -CHROME_BINARY_NAMES = [ - 'chromium', - 'chromium-browser', - 'google-chrome', - 'google-chrome-stable', - 'chrome', -] - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def detect_docker() -> bool: - """Detect if running inside Docker container.""" - return ( - os.path.exists('/.dockerenv') or - os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or - os.path.exists('/run/.containerenv') - ) - - -def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None: - """Find Chrome binary using abx-pkg, checking configured path first.""" - # Try configured binary first - if configured: - try: - binary = Binary(name=configured, binproviders=[provider]).load() - if binary.abspath: - return binary - except Exception: - pass - - # Search common names - for name in CHROME_BINARY_NAMES: - try: - binary = Binary(name=name, binproviders=[provider]).load() - if binary.abspath: - return binary - except Exception: - continue - - return None - - -def output_binary(binary: Binary, name: str): - """Output Binary JSONL record to stdout.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - warnings = [] - errors = [] - computed = {} - - # Get config values - chrome_binary = get_env('CHROME_BINARY', 'chromium') - chrome_sandbox = get_env_bool('CHROME_SANDBOX', True) - screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True) - pdf_enabled = get_env_bool('PDF_ENABLED', True) - dom_enabled = get_env_bool('DOM_ENABLED', True) - - # Compute USE_CHROME (derived from extractor enabled flags) - use_chrome = screenshot_enabled or pdf_enabled or dom_enabled - computed['USE_CHROME'] = str(use_chrome).lower() - - # Detect Docker and adjust sandbox - in_docker = detect_docker() - computed['IN_DOCKER'] = str(in_docker).lower() - - if in_docker and chrome_sandbox: - warnings.append( - "Running in Docker with CHROME_SANDBOX=true. " - "Chrome may fail to start. Consider setting CHROME_SANDBOX=false." - ) - # Auto-disable sandbox in Docker unless explicitly set - if not get_env('CHROME_SANDBOX'): - computed['CHROME_SANDBOX'] = 'false' - - # Find Chrome binary using abx-pkg - provider = EnvProvider() - if use_chrome: - chrome = find_chrome_binary(chrome_binary, provider) - if not chrome or not chrome.abspath: - errors.append( - f"Chrome binary not found (tried: {chrome_binary}). " - "Install Chrome/Chromium or set CHROME_BINARY path." - ) - computed['CHROME_BINARY'] = '' - else: - computed['CHROME_BINARY'] = str(chrome.abspath) - computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown' - - # Output Binary JSONL record for Chrome - output_binary(chrome, name='chrome') - - # Check Node.js for Puppeteer - node_binary_name = get_env('NODE_BINARY', 'node') - try: - node = Binary(name=node_binary_name, binproviders=[provider]).load() - node_path = str(node.abspath) if node.abspath else '' - except Exception: - node = None - node_path = '' - - if use_chrome and not node_path: - errors.append( - f"Node.js not found (tried: {node_binary_name}). " - "Install Node.js or set NODE_BINARY path for Puppeteer." - ) - else: - computed['NODE_BINARY'] = node_path - if node and node.abspath: - # Output Binary JSONL record for Node - output_binary(node, name='node') - - # Output computed values - for key, value in computed.items(): - print(f"COMPUTED:{key}={value}") - - for warning in warnings: - print(f"WARNING:{warning}", file=sys.stderr) - - for error in errors: - print(f"ERROR:{error}", file=sys.stderr) - - sys.exit(1 if errors else 0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/wget/on_Crawl__11_wget_validate.py b/archivebox/plugins/wget/on_Crawl__06_wget_install.py similarity index 100% rename from archivebox/plugins/wget/on_Crawl__11_wget_validate.py rename to archivebox/plugins/wget/on_Crawl__06_wget_install.py