Remove redundant chrome_validate hook, rename wget_validate to wget_install

- Delete chrome/on_Crawl__10_chrome_validate.py (duplicates chrome_install)
- Rename wget/on_Crawl__11_wget_validate.py → on_Crawl__06_wget_install.py

All hooks now follow consistent naming: install, launch, or config
This commit is contained in:
Claude
2025-12-31 23:41:40 +00:00
parent 8f518500a4
commit 4d33084496
2 changed files with 0 additions and 172 deletions

View File

@@ -1,172 +0,0 @@
#!/usr/bin/env python3
"""
Validate and compute derived Chrome config values.
This hook runs early in the Crawl lifecycle to:
1. Auto-detect Chrome binary location
2. Compute sandbox settings based on Docker detection
3. Validate binary availability and version
4. Set computed env vars for subsequent hooks
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- Binary JSONL records to stdout when binaries are found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
# Chrome binary search order
CHROME_BINARY_NAMES = [
'chromium',
'chromium-browser',
'google-chrome',
'google-chrome-stable',
'chrome',
]
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def detect_docker() -> bool:
"""Detect if running inside Docker container."""
return (
os.path.exists('/.dockerenv') or
os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or
os.path.exists('/run/.containerenv')
)
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
"""Find Chrome binary using abx-pkg, checking configured path first."""
# Try configured binary first
if configured:
try:
binary = Binary(name=configured, binproviders=[provider]).load()
if binary.abspath:
return binary
except Exception:
pass
# Search common names
for name in CHROME_BINARY_NAMES:
try:
binary = Binary(name=name, binproviders=[provider]).load()
if binary.abspath:
return binary
except Exception:
continue
return None
def output_binary(binary: Binary, name: str):
"""Output Binary JSONL record to stdout."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env',
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
warnings = []
errors = []
computed = {}
# Get config values
chrome_binary = get_env('CHROME_BINARY', 'chromium')
chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True)
pdf_enabled = get_env_bool('PDF_ENABLED', True)
dom_enabled = get_env_bool('DOM_ENABLED', True)
# Compute USE_CHROME (derived from extractor enabled flags)
use_chrome = screenshot_enabled or pdf_enabled or dom_enabled
computed['USE_CHROME'] = str(use_chrome).lower()
# Detect Docker and adjust sandbox
in_docker = detect_docker()
computed['IN_DOCKER'] = str(in_docker).lower()
if in_docker and chrome_sandbox:
warnings.append(
"Running in Docker with CHROME_SANDBOX=true. "
"Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
)
# Auto-disable sandbox in Docker unless explicitly set
if not get_env('CHROME_SANDBOX'):
computed['CHROME_SANDBOX'] = 'false'
# Find Chrome binary using abx-pkg
provider = EnvProvider()
if use_chrome:
chrome = find_chrome_binary(chrome_binary, provider)
if not chrome or not chrome.abspath:
errors.append(
f"Chrome binary not found (tried: {chrome_binary}). "
"Install Chrome/Chromium or set CHROME_BINARY path."
)
computed['CHROME_BINARY'] = ''
else:
computed['CHROME_BINARY'] = str(chrome.abspath)
computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'
# Output Binary JSONL record for Chrome
output_binary(chrome, name='chrome')
# Check Node.js for Puppeteer
node_binary_name = get_env('NODE_BINARY', 'node')
try:
node = Binary(name=node_binary_name, binproviders=[provider]).load()
node_path = str(node.abspath) if node.abspath else ''
except Exception:
node = None
node_path = ''
if use_chrome and not node_path:
errors.append(
f"Node.js not found (tried: {node_binary_name}). "
"Install Node.js or set NODE_BINARY path for Puppeteer."
)
else:
computed['NODE_BINARY'] = node_path
if node and node.abspath:
# Output Binary JSONL record for Node
output_binary(node, name='node')
# Output computed values
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
for error in errors:
print(f"ERROR:{error}", file=sys.stderr)
sys.exit(1 if errors else 0)
if __name__ == '__main__':
main()