mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Remove redundant chrome_validate hook, rename wget_validate to wget_install (#1752)
- Delete chrome/on_Crawl__10_chrome_validate.py (duplicates
chrome_install)
- Rename wget/on_Crawl__11_wget_validate.py →
on_Crawl__06_wget_install.py
All hooks now follow consistent naming: install, launch, or config
<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->
# Summary
<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->
# Related issues
<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->
# Changes these areas
- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Removed the redundant Chrome validate hook, renamed the Wget validate
hook to wget_install, and standardized hook names and priorities to
match the install/launch/config lifecycle. This removes duplicate logic
and fixes priority conflicts across Crawl, Binary, and Snapshot hooks.
- **Refactors**
  - Deleted chrome/on_Crawl__10_chrome_validate.py (dup of chrome_install)
  - Renamed wget validate to on_Crawl__06_wget_install.py
  - Standardized on_Binary hook priorities: npm 10, pip 11, brew 12, apt 13, custom 14, env 15
  - Fixed on_Snapshot order: staticfile 32, readability 56, mercury 57, htmltotext 58
<sup>Written for commit 09a1ca3134.
Summary will update on new commits.</sup>
<!-- End of auto-generated description by cubic. -->
This commit is contained in:
@@ -1,172 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Validate and compute derived Chrome config values.
|
|
||||||
|
|
||||||
This hook runs early in the Crawl lifecycle to:
|
|
||||||
1. Auto-detect Chrome binary location
|
|
||||||
2. Compute sandbox settings based on Docker detection
|
|
||||||
3. Validate binary availability and version
|
|
||||||
4. Set computed env vars for subsequent hooks
|
|
||||||
|
|
||||||
Output:
|
|
||||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
|
||||||
- Binary JSONL records to stdout when binaries are found
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from abx_pkg import Binary, EnvProvider
|
|
||||||
|
|
||||||
|
|
||||||
# Executable names probed when looking for Chrome, in priority order
# (first match wins).
CHROME_BINARY_NAMES = [
    'chromium', 'chromium-browser',
    'google-chrome', 'google-chrome-stable',
    'chrome',
]
|
|
||||||
|
|
||||||
def get_env(name: str, default: str = '') -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    When the variable is not set, *default* is used instead (and is stripped
    the same way).
    """
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
|
|
||||||
|
|
||||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret environment variable *name* as a boolean flag.

    The value is compared case-insensitively. ``true``/``1``/``yes``/``on``
    give ``True``; ``false``/``0``/``no``/``off`` give ``False``. Anything
    else, including an unset or empty variable, yields *default*.
    """
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')

    text = get_env(name, '').lower()
    if text in truthy:
        return True
    return False if text in falsy else default
|
|
||||||
|
|
||||||
|
|
||||||
def detect_docker() -> bool:
    """Report whether the process appears to run inside a container.

    Checks, in order: the ``/.dockerenv`` marker file, the ``IN_DOCKER``
    environment variable (``true``/``1``/``yes``, case-insensitive), and the
    ``/run/.containerenv`` marker file. The first positive check wins.
    """
    if os.path.exists('/.dockerenv'):
        return True

    env_flag = os.environ.get('IN_DOCKER', '').lower()
    if env_flag in ('true', '1', 'yes'):
        return True

    return os.path.exists('/run/.containerenv')
|
|
||||||
|
|
||||||
|
|
||||||
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
    """Locate a Chrome binary through abx-pkg.

    The *configured* name (when non-empty) is tried first, followed by every
    entry of ``CHROME_BINARY_NAMES`` in order. The first candidate that loads
    with a truthy ``abspath`` is returned; ``None`` if no candidate qualifies.
    """
    if configured:
        candidates = [configured, *CHROME_BINARY_NAMES]
    else:
        candidates = CHROME_BINARY_NAMES

    for candidate in candidates:
        try:
            loaded = Binary(name=candidate, binproviders=[provider]).load()
            if loaded.abspath:
                return loaded
        except Exception:
            # Best-effort probe: a candidate that fails to load is skipped.
            continue

    return None
|
|
||||||
|
|
||||||
|
|
||||||
def output_binary(binary: Binary, name: str):
    """Print one ``Binary`` JSONL record describing *binary* to stdout.

    *name* is the value written to the record's ``name`` field. The
    ``machine_id`` field comes from the ``MACHINE_ID`` environment variable
    (empty string when unset), and ``binprovider`` is always ``'env'``.
    """
    version_text = str(binary.version) if binary.version else ''
    print(json.dumps({
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': version_text,
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }))
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Validate Chrome/Node availability and emit derived config values.

    Writes Binary JSONL records and ``COMPUTED:KEY=VALUE`` lines to stdout,
    ``WARNING:``/``ERROR:`` lines to stderr, then exits with status 1 when at
    least one error was recorded and 0 otherwise.
    """
    computed = {}
    warnings = []
    errors = []

    # Configuration read from the environment.
    chrome_binary = get_env('CHROME_BINARY', 'chromium')
    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
    extractor_flags = [
        get_env_bool('SCREENSHOT_ENABLED', True),
        get_env_bool('PDF_ENABLED', True),
        get_env_bool('DOM_ENABLED', True),
    ]

    # Chrome is needed as soon as any Chrome-based extractor is enabled.
    use_chrome = any(extractor_flags)
    computed['USE_CHROME'] = str(use_chrome).lower()

    in_docker = detect_docker()
    computed['IN_DOCKER'] = str(in_docker).lower()

    if in_docker and chrome_sandbox:
        warnings.append(
            "Running in Docker with CHROME_SANDBOX=true. "
            "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
        )
        # Only override the sandbox when the user did not set it explicitly.
        # NOTE(review): nesting under the Docker check was reconstructed from
        # a flattened listing; confirm against the original file.
        if not get_env('CHROME_SANDBOX'):
            computed['CHROME_SANDBOX'] = 'false'

    provider = EnvProvider()

    # Resolve the Chrome binary (only when some extractor needs it).
    if use_chrome:
        chrome = find_chrome_binary(chrome_binary, provider)
        if chrome and chrome.abspath:
            computed['CHROME_BINARY'] = str(chrome.abspath)
            computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'
            output_binary(chrome, name='chrome')
        else:
            errors.append(
                f"Chrome binary not found (tried: {chrome_binary}). "
                "Install Chrome/Chromium or set CHROME_BINARY path."
            )
            computed['CHROME_BINARY'] = ''

    # Resolve Node.js, which the error message below says Puppeteer needs.
    node_binary_name = get_env('NODE_BINARY', 'node')
    try:
        node = Binary(name=node_binary_name, binproviders=[provider]).load()
        node_path = str(node.abspath) if node.abspath else ''
    except Exception:
        node = None
        node_path = ''

    if not use_chrome or node_path:
        computed['NODE_BINARY'] = node_path
        if node and node.abspath:
            output_binary(node, name='node')
    else:
        errors.append(
            f"Node.js not found (tried: {node_binary_name}). "
            "Install Node.js or set NODE_BINARY path for Puppeteer."
        )

    # Emit results: computed values on stdout, diagnostics on stderr.
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")
    for message in warnings:
        print(f"WARNING:{message}", file=sys.stderr)
    for message in errors:
        print(f"ERROR:{message}", file=sys.stderr)

    sys.exit(1 if errors else 0)
|
|
||||||
|
|
||||||
|
|
||||||
# Run the validation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
|
||||||
Reference in New Issue
Block a user