mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-04 18:05:36 +10:00
Fix hook priority conflicts and standardize on_Binary naming
on_Snapshot priority fixes: - redirects.bg.js stays at 31, staticfile.bg.js → 32 - headers.js stays at 55, readability.py → 56 - mercury.py → 57, htmltotext.py → 58 on_Binary hooks now have numeric priorities: - 10: npm_install.py - 11: pip_install.py - 12: brew_install.py - 13: apt_install.py - 14: custom_install.py - 15: env_install.py
This commit is contained in:
170
archivebox/plugins/readability/on_Snapshot__56_readability.py
Normal file
170
archivebox/plugins/readability/on_Snapshot__56_readability.py
Normal file
@@ -0,0 +1,170 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract article content using Mozilla's Readability.
|
||||
|
||||
Usage: on_Snapshot__56_readability.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Creates readability/ directory with content.html, content.txt, article.json
|
||||
|
||||
Environment variables:
|
||||
READABILITY_BINARY: Path to readability-extractor binary
|
||||
READABILITY_TIMEOUT: Timeout in seconds (default: 60)
|
||||
READABILITY_ARGS: Default Readability arguments (JSON array)
|
||||
READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
TIMEOUT: Fallback timeout
|
||||
|
||||
Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
|
||||
This extractor looks for HTML source from other extractors (wget, singlefile, dom)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
# NOTE(review): PLUGIN_NAME, BIN_NAME and BIN_PROVIDERS are not referenced
# anywhere else in this file; presumably they are read by the hook runner.
PLUGIN_NAME: str = 'readability'
BIN_NAME: str = 'readability-extractor'
BIN_PROVIDERS: str = 'npm,env'
# Directory (relative to the hook's working directory) that receives the output files.
OUTPUT_DIR: str = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return environment variable ``name`` (or ``default`` if unset), stripped of surrounding whitespace."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Return environment variable ``name`` parsed as an int.

    Falls back to ``default`` when the variable is unset or its (stripped)
    value is not a valid integer literal.
    """
    raw_value = get_env(name, str(default))
    try:
        parsed = int(raw_value)
    except ValueError:
        return default
    return parsed
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON array from environment variable.

    Returns the array's items converted to strings. When the variable is
    unset/empty, is not valid JSON, or decodes to something other than a
    list, ``default`` is returned instead (an empty list if ``default`` is None).
    """
    fallback = [] if default is None else default

    raw_value = get_env(name, '')
    if not raw_value:
        return fallback

    try:
        parsed = json.loads(raw_value)
    except json.JSONDecodeError:
        return fallback

    if not isinstance(parsed, list):
        return fallback
    return [str(item) for item in parsed]
|
||||
|
||||
|
||||
def find_html_source() -> str | None:
|
||||
"""Find HTML content from other extractors in the snapshot directory."""
|
||||
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
|
||||
search_patterns = [
|
||||
'singlefile/singlefile.html',
|
||||
'singlefile/*.html',
|
||||
'dom/output.html',
|
||||
'dom/*.html',
|
||||
'wget/**/*.html',
|
||||
'wget/**/*.htm',
|
||||
]
|
||||
|
||||
cwd = Path.cwd()
|
||||
for pattern in search_patterns:
|
||||
matches = list(cwd.glob(pattern))
|
||||
for match in matches:
|
||||
if match.is_file() and match.stat().st_size > 0:
|
||||
return str(match)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Extract article using Readability.

    Runs ``binary`` on the HTML file located by find_html_source() and writes
    content.html, content.txt and article.json into OUTPUT_DIR.

    Args:
        url: URL of the snapshot. Not used by the extraction itself; the HTML
            is read from a file saved by another extractor.
        binary: Name or path of the readability-extractor executable.

    Returns: (success, output_path, error_message)
        On success ``output_path`` is OUTPUT_DIR and ``error_message`` is ''.
        On failure ``output_path`` is None and ``error_message`` says why.
        Failures are always reported through the return value, never raised.
    """
    # READABILITY_TIMEOUT wins when set to a non-zero integer, otherwise TIMEOUT (default 60)
    timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60)
    readability_args = get_env_array('READABILITY_ARGS', [])
    readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', [])

    # Find HTML source
    html_source = find_html_source()
    if not html_source:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'

    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)

    try:
        # Run readability-extractor (outputs JSON by default)
        cmd = [binary, *readability_args, *readability_args_extra, html_source]
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)

        if result.returncode != 0:
            stderr = result.stderr.decode('utf-8', errors='replace')
            return False, None, f'readability-extractor failed: {stderr[:200]}'

        # Parse JSON output. ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised when stdout is not valid UTF-8/16/32.
        try:
            result_json = json.loads(result.stdout)
        except ValueError:
            return False, None, 'readability-extractor returned invalid JSON'

        # Valid JSON that is not an object (e.g. `null` or a list) has no
        # article fields to extract.
        if not isinstance(result_json, dict):
            return False, None, 'readability-extractor returned invalid JSON'

        # Extract and save content
        # readability-extractor uses camelCase field names (textContent, content)
        text_content = _pop_text_field(result_json, 'textContent', 'text-content')
        html_content = _pop_text_field(result_json, 'content', 'html-content')

        if not text_content and not html_content:
            return False, None, 'No content extracted'

        (output_dir / 'content.html').write_text(html_content, encoding='utf-8')
        (output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
        # Remaining fields (content fields were popped above) become the article metadata
        (output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')

        return True, OUTPUT_DIR, ''

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        # Boundary handler: e.g. missing binary or unwritable output directory
        return False, None, f'{type(e).__name__}: {e}'


def _pop_text_field(data: dict, *keys: str) -> str:
    """Remove every key in ``keys`` from ``data`` and return the first string value found ('' if none).

    Null or non-string values are skipped so one missing field cannot break
    writing the other output files.
    """
    popped = [data.pop(key, None) for key in keys]
    return next((value for value in popped if isinstance(value, str)), '')
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to extract article from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract article content using Mozilla's Readability."""
    # Exit codes: 0 after printing one ArchiveResult JSON line on stdout,
    # 1 after printing an ERROR line on stderr (no JSON is emitted then).
    # snapshot_id is accepted as a CLI option but not used below.
    try:
        # Binary path comes from the environment, defaulting to a PATH lookup
        binary = get_env('READABILITY_BINARY', 'readability-extractor')

        succeeded, output_path, error_message = extract_readability(url, binary)

        if not succeeded:
            # Transient error - emit NO JSONL
            print(f'ERROR: {error_message}', file=sys.stderr)
            sys.exit(1)

        # Success - emit ArchiveResult
        archive_result = {
            'type': 'ArchiveResult',
            'status': 'succeeded',
            'output_str': output_path or '',
        }
        print(json.dumps(archive_result))
        sys.exit(0)

    except Exception as exc:
        # Transient error - emit NO JSONL
        # (SystemExit from sys.exit() is not an Exception, so it passes through.)
        print(f'ERROR: {type(exc).__name__}: {exc}', file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the click command only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user