Files
ArchiveBox/archivebox/plugins/readability/on_Crawl__00_validate_readability.py
Nick Sweeting d95f0dc186 remove huey
2025-12-24 23:40:18 -08:00

124 lines
3.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Validation hook for readability-extractor binary.
Runs at crawl start to verify readability-extractor is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_readability() -> dict | None:
"""Find readability-extractor binary."""
try:
from abx_pkg import Binary, NpmProvider, EnvProvider
class ReadabilityBinary(Binary):
name: str = 'readability-extractor'
binproviders_supported = [NpmProvider(), EnvProvider()]
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
binary = ReadabilityBinary()
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'readability-extractor',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
except ImportError:
pass
except Exception:
pass
# Fallback to shutil.which
abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'readability-extractor',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None
def main():
result = find_readability()
if result and result.get('abspath'):
print(json.dumps({
'type': 'InstalledBinary',
'name': result['name'],
'abspath': result['abspath'],
'version': result['version'],
'sha256': result['sha256'],
'binprovider': result['binprovider'],
}))
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/READABILITY_BINARY',
'value': result['abspath'],
}))
if result['version']:
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/READABILITY_VERSION',
'value': result['version'],
}))
sys.exit(0)
else:
print(json.dumps({
'type': 'Dependency',
'bin_name': 'readability-extractor',
'bin_providers': 'npm,env',
}))
print(f"readability-extractor binary not found", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
main()