mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
173 lines
5.0 KiB
Python
173 lines
5.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Validate and compute derived Chrome config values.
|
|
|
|
This hook runs early in the Crawl lifecycle to:
|
|
1. Auto-detect Chrome binary location
|
|
2. Compute sandbox settings based on Docker detection
|
|
3. Validate binary availability and version
|
|
4. Set computed env vars for subsequent hooks
|
|
|
|
Output:
|
|
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
|
- Binary JSONL records to stdout when binaries are found
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
|
|
from abx_pkg import Binary, EnvProvider
|
|
|
|
|
|
# Fallback executable names, in priority order, that find_chrome_binary()
# tries when the configured binary is empty or cannot be loaded.
CHROME_BINARY_NAMES = [
    'chromium', 'chromium-browser',
    'google-chrome', 'google-chrome-stable',
    'chrome',
]
|
|
|
|
def get_env(name: str, default: str = '') -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    When the variable is not set, *default* is used instead (and is stripped
    the same way).
    """
    raw_value = os.getenv(name, default)
    return raw_value.strip()
|
|
|
|
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret environment variable *name* as a boolean flag.

    The value is read through get_env() and compared case-insensitively:
    'true', '1', 'yes', 'on' give True; 'false', '0', 'no', 'off' give False.
    Any other value (including an unset or empty variable) gives *default*.
    """
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')

    normalized = get_env(name, '').lower()
    if normalized in truthy:
        return True
    return False if normalized in falsy else default
|
|
|
|
|
|
def detect_docker() -> bool:
    """Report whether the process appears to run inside a container.

    Returns True when either marker file ('/.dockerenv' or
    '/run/.containerenv') exists, or when the IN_DOCKER environment variable
    is 'true', '1' or 'yes' (case-insensitive).
    """
    for marker_path in ('/.dockerenv', '/run/.containerenv'):
        if os.path.exists(marker_path):
            return True

    in_docker_flag = os.environ.get('IN_DOCKER', '').lower()
    return in_docker_flag in ('true', '1', 'yes')
|
|
|
|
|
|
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
    """Locate a Chrome binary through abx-pkg, preferring the configured name.

    The *configured* name (when non-empty) is tried first, followed by every
    entry of CHROME_BINARY_NAMES in order. Each candidate is loaded with
    *provider*; the first one that loads with a truthy ``abspath`` is
    returned. Returns None when no candidate qualifies.
    """
    if configured:
        candidates = [configured, *CHROME_BINARY_NAMES]
    else:
        candidates = CHROME_BINARY_NAMES

    for candidate in candidates:
        try:
            loaded = Binary(name=candidate, binproviders=[provider]).load()
            if loaded.abspath:
                return loaded
        except Exception:
            # Best-effort lookup: a candidate that fails to load is skipped
            # and the next name is tried.
            continue

    return None
|
|
|
|
|
|
def output_binary(binary: Binary, name: str):
    """Print one JSON line describing *binary* as a 'Binary' record.

    The record carries the given *name*, the binary's abspath, version and
    sha256 (empty strings when version/sha256 are falsy), a fixed
    'binprovider' of 'env', and the MACHINE_ID environment variable
    (empty string when unset).
    """
    # Pairs are listed in the order the keys must appear in the emitted JSON.
    fields = [
        ('type', 'Binary'),
        ('name', name),
        ('abspath', str(binary.abspath)),
        ('version', str(binary.version) if binary.version else ''),
        ('sha256', binary.sha256 or ''),
        ('binprovider', 'env'),
        ('machine_id', os.environ.get('MACHINE_ID', '')),
    ]
    print(json.dumps(dict(fields)))
|
|
|
|
|
|
def main():
    """Validate Chrome-related config, print computed values, then exit.

    Reads CHROME_BINARY, CHROME_SANDBOX, SAVE_SCREENSHOT, SAVE_PDF, SAVE_DOM
    and NODE_BINARY from the environment. Writes Binary JSONL records and
    ``COMPUTED:KEY=VALUE`` lines to stdout, and ``WARNING:`` / ``ERROR:``
    lines to stderr. Always terminates via ``sys.exit``: status 1 when any
    error was recorded, otherwise 0.
    """
    warnings = []
    errors = []
    computed = {}

    # Get config values
    chrome_binary = get_env('CHROME_BINARY', 'chromium')
    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
    save_screenshot = get_env_bool('SAVE_SCREENSHOT', True)
    save_pdf = get_env_bool('SAVE_PDF', True)
    save_dom = get_env_bool('SAVE_DOM', True)

    # Compute USE_CHROME (derived from SAVE_* flags)
    use_chrome = save_screenshot or save_pdf or save_dom
    computed['USE_CHROME'] = str(use_chrome).lower()

    # Detect Docker and adjust sandbox
    in_docker = detect_docker()
    computed['IN_DOCKER'] = str(in_docker).lower()

    if in_docker and chrome_sandbox:
        if get_env('CHROME_SANDBOX'):
            # The sandbox was requested explicitly: respect the setting and
            # only warn about the likely startup failure.
            warnings.append(
                "Running in Docker with CHROME_SANDBOX=true. "
                "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
            )
        else:
            # Auto-disable sandbox in Docker unless explicitly set, and say
            # so accurately instead of asking for a change already applied.
            computed['CHROME_SANDBOX'] = 'false'
            warnings.append(
                "Running in Docker and CHROME_SANDBOX is not set. "
                "Auto-disabling the Chrome sandbox (CHROME_SANDBOX=false)."
            )

    # Find Chrome binary using abx-pkg
    provider = EnvProvider()
    if use_chrome:
        chrome = find_chrome_binary(chrome_binary, provider)
        if not chrome or not chrome.abspath:
            errors.append(
                f"Chrome binary not found (tried: {chrome_binary}). "
                "Install Chrome/Chromium or set CHROME_BINARY path."
            )
            computed['CHROME_BINARY'] = ''
        else:
            computed['CHROME_BINARY'] = str(chrome.abspath)
            computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'

            # Output Binary JSONL record for Chrome
            output_binary(chrome, name='chrome')

    # Check Node.js for Puppeteer
    node_binary_name = get_env('NODE_BINARY', 'node')
    try:
        node = Binary(name=node_binary_name, binproviders=[provider]).load()
        node_path = str(node.abspath) if node.abspath else ''
    except Exception:
        # A failed lookup is treated as "not found"; it only becomes an
        # error below when Chrome is actually in use.
        node = None
        node_path = ''

    if use_chrome and not node_path:
        errors.append(
            f"Node.js not found (tried: {node_binary_name}). "
            "Install Node.js or set NODE_BINARY path for Puppeteer."
        )
    else:
        computed['NODE_BINARY'] = node_path
        if node and node.abspath:
            # Output Binary JSONL record for Node
            output_binary(node, name='node')

    # Output computed values
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")

    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)

    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)

    sys.exit(1 if errors else 0)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|