Fix hook priority conflicts and standardize on_Binary naming

on_Snapshot priority fixes:
- redirects.bg.js stays at 31, staticfile.bg.js → 32
- headers.js stays at 55, readability.py → 56
- mercury.py → 57, htmltotext.py → 58

on_Binary hooks now have numeric priorities:
- 10: npm_install.py
- 11: pip_install.py
- 12: brew_install.py
- 13: apt_install.py
- 14: custom_install.py
- 15: env_install.py
2026-01-04 18:05:36 +10:00 · 2026-01-01 01:31:52 +00:00
parent 4d33084496
commit 09a1ca3134
10 changed files with 0 additions and 0 deletions
--- a/archivebox/plugins/readability/on_Snapshot__56_readability.py
+++ b/archivebox/plugins/readability/on_Snapshot__56_readability.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""
+Extract article content using Mozilla's Readability.
+
+Usage: on_Snapshot__56_readability.py --url=<url> --snapshot-id=<uuid>
+Output: Writes content.html, content.txt, article.json into the current working directory (the hook's output dir, e.g. readability/)
+
+Environment variables:
+    READABILITY_BINARY: Path to readability-extractor binary
+    READABILITY_TIMEOUT: Timeout in seconds (default: 60)
+    READABILITY_ARGS: Default Readability arguments (JSON array)
+    READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array)
+    TIMEOUT: Fallback timeout
+
+Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
+      This extractor looks for HTML source from other extractors (wget, singlefile, dom)
+"""
+
+import json
+import os
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+import rich_click as click
+
+
+# Extractor metadata
+# NOTE(review): PLUGIN_NAME, BIN_NAME and BIN_PROVIDERS are not referenced
+# anywhere else in this file; presumably they are read by the hook runner —
+# confirm before renaming or removing them.
+PLUGIN_NAME = 'readability'
+BIN_NAME = 'readability-extractor'
+BIN_PROVIDERS = 'npm,env'
+# Directory extract_readability() writes its output files into; '.' resolves
+# against the process's current working directory.
+OUTPUT_DIR = '.'
+
+
+def get_env(name: str, default: str = '') -> str:
+    """Return environment variable ``name`` with surrounding whitespace removed.
+
+    When the variable is unset, ``default`` is used instead (and is stripped
+    the same way).
+    """
+    raw_value = os.environ.get(name, default)
+    return raw_value.strip()
+
+
+def get_env_int(name: str, default: int = 0) -> int:
+    """Read environment variable ``name`` as an integer.
+
+    Returns ``default`` when the variable is unset or when its (stripped)
+    value cannot be parsed by ``int()``.
+    """
+    raw_value = get_env(name, str(default))
+    try:
+        parsed = int(raw_value)
+    except ValueError:
+        return default
+    return parsed
+
+
+def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
+    """Parse a JSON array from environment variable.
+
+    Every element of the decoded array is converted with ``str()``.
+    ``default`` (or an empty list when ``default`` is None) is returned when
+    the variable is unset/empty, is not valid JSON, or decodes to something
+    other than a list.
+    """
+    fallback = [] if default is None else default
+
+    raw_value = get_env(name, '')
+    if not raw_value:
+        return fallback
+
+    try:
+        decoded = json.loads(raw_value)
+    except json.JSONDecodeError:
+        return fallback
+
+    if not isinstance(decoded, list):
+        return fallback
+    return [str(element) for element in decoded]
+
+
+def find_html_source() -> str | None:
+    """Find HTML content from other extractors in the snapshot directory.
+
+    Patterns are tried in priority order: singlefile, then dom, then wget.
+    The current working directory is searched first. If nothing matches
+    there, its parent directory is searched with the same patterns, so the
+    lookup works both when the hook is started in the snapshot directory
+    and when it is started inside its own output subdirectory.
+    NOTE(review): which of the two the hook runner actually does is not
+    visible from this file — confirm and drop the unused case if desired.
+
+    Returns:
+        The path (as a string) of the first non-empty regular file that
+        matches, or None when no candidate exists.
+    """
+    search_patterns = (
+        'singlefile/singlefile.html',
+        'singlefile/*.html',
+        'dom/output.html',
+        'dom/*.html',
+        'wget/**/*.html',
+        'wget/**/*.htm',
+    )
+
+    cwd = Path.cwd()
+    # dict.fromkeys de-duplicates while keeping order (cwd == parent at the
+    # filesystem root).
+    search_roots = list(dict.fromkeys((cwd, cwd.parent)))
+
+    for root in search_roots:
+        for pattern in search_patterns:
+            # glob() yields in filesystem order; sort so that the file picked
+            # for a wildcard pattern is the same on every run and platform.
+            for candidate in sorted(root.glob(pattern)):
+                if candidate.is_file() and candidate.stat().st_size > 0:
+                    return str(candidate)
+
+    return None
+
+
+def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
+    """
+    Extract article using Readability.
+
+    Runs ``binary`` on the HTML file located by find_html_source(), parses
+    the JSON it prints on stdout, and writes content.html, content.txt and
+    article.json (the remaining JSON fields) into OUTPUT_DIR.
+
+    Args:
+        url: URL of the page. Not used by this function; extraction works on
+            the local HTML file only.
+        binary: Name or path of the readability-extractor executable.
+
+    Returns: (success, output_path, error_message)
+        On success: (True, OUTPUT_DIR, '').
+        On failure: (False, None, <reason>). No exception is raised; every
+        error is reported through the returned message.
+    """
+    timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60)
+    readability_args = get_env_array('READABILITY_ARGS', [])
+    readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', [])
+
+    # Find HTML source
+    html_source = find_html_source()
+    if not html_source:
+        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
+
+    output_dir = Path(OUTPUT_DIR)
+
+    try:
+        # Run readability-extractor (outputs JSON by default)
+        cmd = [binary, *readability_args, *readability_args_extra, html_source]
+        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
+
+        if result.returncode != 0:
+            stderr = result.stderr.decode('utf-8', errors='replace')
+            return False, None, f'readability-extractor failed: {stderr[:200]}'
+
+        # Parse JSON output. ValueError covers both json.JSONDecodeError and
+        # the UnicodeDecodeError raised when stdout is not valid UTF-8.
+        try:
+            result_json = json.loads(result.stdout)
+        except ValueError:
+            return False, None, 'readability-extractor returned invalid JSON'
+
+        # Valid JSON that is not an object (e.g. `null`) carries no article.
+        if not isinstance(result_json, dict):
+            return False, None, 'No content extracted'
+
+        # Extract and save content
+        # readability-extractor uses camelCase field names (textContent, content);
+        # the hyphenated names are accepted as a fallback. All four keys are
+        # removed so they are not duplicated in article.json, and a JSON
+        # null is treated the same as a missing field.
+        text_fallback = result_json.pop('text-content', None)
+        html_fallback = result_json.pop('html-content', None)
+        text_content = result_json.pop('textContent', None) or text_fallback or ''
+        html_content = result_json.pop('content', None) or html_fallback or ''
+
+        if not text_content and not html_content:
+            return False, None, 'No content extracted'
+
+        (output_dir / 'content.html').write_text(html_content, encoding='utf-8')
+        (output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
+        (output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')
+
+        return True, OUTPUT_DIR, ''
+
+    except subprocess.TimeoutExpired:
+        return False, None, f'Timed out after {timeout} seconds'
+    except Exception as e:
+        # Boundary catch-all: callers expect an error tuple, never a raise.
+        return False, None, f'{type(e).__name__}: {e}'
+
+
+@click.command()
+@click.option('--url', required=True, help='URL to extract article from')
+@click.option('--snapshot-id', required=True, help='Snapshot UUID')
+def main(url: str, snapshot_id: str):
+    """Extract article content using Mozilla's Readability."""
+    # Exit status: 0 after printing one ArchiveResult JSON line on stdout,
+    # 1 after printing an "ERROR: ..." line on stderr. snapshot_id is
+    # accepted as an option but not used in this function.
+    try:
+        # Binary path comes from the environment, with the bare name as default
+        readability_bin = get_env('READABILITY_BINARY', 'readability-extractor')
+
+        ok, output_path, failure_reason = extract_readability(url, readability_bin)
+
+        if not ok:
+            # Failure: report on stderr only, emit no JSONL
+            print(f'ERROR: {failure_reason}', file=sys.stderr)
+            sys.exit(1)
+
+        # Success: emit a single ArchiveResult record
+        record = {
+            'type': 'ArchiveResult',
+            'status': 'succeeded',
+            'output_str': output_path or '',
+        }
+        print(json.dumps(record))
+        sys.exit(0)
+
+    except Exception as exc:
+        # Unexpected failure: report on stderr only, emit no JSONL.
+        # (sys.exit raises SystemExit, which is not caught here.)
+        print(f'ERROR: {type(exc).__name__}: {exc}', file=sys.stderr)
+        sys.exit(1)
+
+
+# Invoke the click command only when this file is executed as a script.
+if __name__ == '__main__':
+    main()