way better plugin hooks system wip

This commit is contained in:
Nick Sweeting
2025-12-28 03:39:59 -08:00
parent a38624a4dd
commit 50e527ec65
156 changed files with 10275 additions and 7149 deletions

View File

@@ -0,0 +1 @@
{"type": "Binary", "name": "readability-extractor", "binproviders": "npm,env", "overrides": {"npm": {"packages": ["https://github.com/ArchiveBox/readability-extractor"]}}}

View File

@@ -1,101 +0,0 @@
#!/usr/bin/env python3
"""
Install hook for readability-extractor binary.
Runs at crawl start to verify readability-extractor is available.
Outputs JSONL for InstalledBinary and Machine config updates.
Respects READABILITY_BINARY env var for custom binary paths.
"""
import os
import sys
import json
from pathlib import Path
def find_readability() -> dict | None:
"""Find readability-extractor binary, respecting READABILITY_BINARY env var."""
try:
from abx_pkg import Binary, NpmProvider, EnvProvider
# Check if user has configured a custom binary
configured_binary = os.environ.get('READABILITY_BINARY', '').strip()
if configured_binary:
if '/' in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
else:
bin_name = 'readability-extractor'
binary = Binary(name=bin_name, binproviders=[NpmProvider(), EnvProvider()])
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': bin_name,
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
except Exception:
pass
return None
def main():
    """Report the readability-extractor binary as JSONL and exit.

    When find_readability() returns a result with an 'abspath', prints an
    'InstalledBinary' record, a 'Machine' update for
    'config/READABILITY_BINARY', and (only if a version is known) a 'Machine'
    update for 'config/READABILITY_VERSION', then exits with status 0.

    Otherwise prints a 'Dependency' record to stdout, a "not found" message
    to stderr, and exits with status 1.
    """
    # Determine the binary name from config (same rules as the lookup:
    # basename of a path, bare name as-is, default when unset).
    override = os.environ.get('READABILITY_BINARY', '').strip()
    if not override:
        bin_name = 'readability-extractor'
    elif '/' in override:
        bin_name = Path(override).name
    else:
        bin_name = override

    def emit(record: dict) -> None:
        """Write one JSONL record to stdout."""
        print(json.dumps(record))

    def machine_update(key: str, value) -> dict:
        """Build a 'Machine' config-update record."""
        return {
            'type': 'Machine',
            '_method': 'update',
            'key': key,
            'value': value,
        }

    result = find_readability()

    if not (result and result.get('abspath')):
        # readability-extractor is installed from GitHub
        emit({
            'type': 'Dependency',
            'bin_name': bin_name,
            'bin_providers': 'npm,env',
            'overrides': {
                'npm': {'packages': ['github:ArchiveBox/readability-extractor']}
            }
        })
        print(f"{bin_name} binary not found", file=sys.stderr)
        sys.exit(1)

    emit({
        'type': 'InstalledBinary',
        'name': result['name'],
        'abspath': result['abspath'],
        'version': result['version'],
        'sha256': result['sha256'],
        'binprovider': result['binprovider'],
    })
    emit(machine_update('config/READABILITY_BINARY', result['abspath']))

    # The version record is optional: skip it when no version was detected.
    if result['version']:
        emit(machine_update('config/READABILITY_VERSION', result['version']))

    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -7,7 +7,10 @@ Output: Creates readability/ directory with content.html, content.txt, article.j
Environment variables:
READABILITY_BINARY: Path to readability-extractor binary
TIMEOUT: Timeout in seconds (default: 60)
READABILITY_TIMEOUT: Timeout in seconds (default: 60)
# Fallback to ARCHIVING_CONFIG values if READABILITY_* not set:
TIMEOUT: Fallback timeout
Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
This extractor looks for HTML source from other extractors (wget, singlefile, dom)
@@ -15,11 +18,9 @@ Note: Requires readability-extractor from https://github.com/ArchiveBox/readabil
import json
import os
import shutil
import subprocess
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
@@ -43,29 +44,6 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def find_readability() -> str | None:
    """Return the path of the readability-extractor binary, or None.

    The value of READABILITY_BINARY (read through get_env) is returned
    unchanged when it names an existing file. Otherwise the candidate
    command names are searched with shutil.which and the first hit is
    returned.
    """
    configured = get_env('READABILITY_BINARY')
    if configured and os.path.isfile(configured):
        return configured

    candidates = ['readability-extractor']
    # First truthy which() result wins; None when nothing is found.
    return next(
        (found for found in map(shutil.which, candidates) if found),
        None,
    )
def get_version(binary: str) -> str:
    """Return the `--version` output of *binary*, or '' on any failure.

    Runs ``binary --version`` with a 10 second timeout, captures its output
    as text, and returns stdout stripped of surrounding whitespace and
    truncated to at most 64 characters. Any exception (missing executable,
    timeout, ...) results in an empty string.
    """
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
        reported = proc.stdout.strip()
        return reported[:64]
    except Exception:
        # Version detection is best-effort; callers treat '' as "unknown".
        return ''
def find_html_source() -> str | None:
"""Find HTML content from other extractors in the snapshot directory."""
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
@@ -94,7 +72,7 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('TIMEOUT', 60)
timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60)
# Find HTML source
html_source = find_html_source()
@@ -145,42 +123,22 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
"""Extract article content using Mozilla's Readability."""
start_ts = datetime.now(timezone.utc)
version = ''
output = None
status = 'failed'
error = ''
binary = None
try:
# Find binary
binary = find_readability()
if not binary:
print(f'ERROR: readability-extractor binary not found', file=sys.stderr)
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
sys.exit(1)
version = get_version(binary)
# Get binary from environment
binary = get_env('READABILITY_BINARY', 'readability-extractor')
# Run extraction
success, output, error = extract_readability(url, binary)
status = 'succeeded' if success else 'failed'
if success:
text_file = Path(output) / 'content.txt'
html_file = Path(output) / 'content.html'
text_len = text_file.stat().st_size if text_file.exists() else 0
html_len = html_file.stat().st_size if html_file.exists() else 0
print(f'Readability extracted: {text_len} chars text, {html_len} chars HTML')
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
# Calculate duration
end_ts = datetime.now(timezone.utc)
if error:
print(f'ERROR: {error}', file=sys.stderr)
@@ -190,10 +148,6 @@ def main(url: str, snapshot_id: str):
'status': status,
'output_str': output or error or '',
}
if binary:
result['cmd'] = [binary, '<html>']
if version:
result['cmd_version'] = version
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)

View File

@@ -21,7 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
READABILITY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_readability.py'
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
TEST_URL = 'https://example.com'
@@ -101,10 +101,10 @@ def test_reports_missing_dependency_when_not_installed():
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
def test_readability_validate_hook():
"""Test readability validate hook checks for readability-extractor binary."""
def test_readability_install_hook():
"""Test readability install hook checks for readability-extractor binary."""
result = subprocess.run(
[sys.executable, str(READABILITY_VALIDATE_HOOK)],
[sys.executable, str(READABILITY_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
@@ -112,20 +112,20 @@ def test_readability_validate_hook():
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
if record.get('type') == 'Binary':
assert record['name'] == 'readability-extractor'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
assert found_binary, "Should output Binary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
@@ -170,7 +170,7 @@ def test_extracts_article_after_installation():
# Create example.com HTML for readability to process
create_example_html(tmpdir)
# Run readability extraction (should find the installed binary)
# Run readability extraction (should find the binary)
result = subprocess.run(
[sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
cwd=tmpdir,
@@ -181,14 +181,26 @@ def test_extracts_article_after_installation():
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
# Verify output directory created
readability_dir = tmpdir / 'readability'
assert readability_dir.exists(), "Output directory not created"
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
# Verify output files exist
html_file = readability_dir / 'content.html'
txt_file = readability_dir / 'content.txt'
json_file = readability_dir / 'article.json'
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output files exist (hook writes to current directory)
html_file = tmpdir / 'content.html'
txt_file = tmpdir / 'content.txt'
json_file = tmpdir / 'article.json'
assert html_file.exists(), "content.html not created"
assert txt_file.exists(), "content.txt not created"
@@ -212,10 +224,6 @@ def test_extracts_article_after_installation():
json_data = json.loads(json_file.read_text())
assert isinstance(json_data, dict), "article.json should be a dict"
# Verify stdout contains expected output
assert 'STATUS=succeeded' in result.stdout, "Should report success"
assert 'OUTPUT=readability' in result.stdout, "Should report output directory"
def test_fails_gracefully_without_html_source():
"""Test that extraction fails gracefully when no HTML source is available."""