#!/usr/bin/env python3
"""
Extract article content using Postlight's Mercury Parser.

Usage: on_Snapshot__mercury.py --url=<url> --snapshot-id=<uuid>
Output: Writes content.html, content.txt and article.json into the current working directory (the hook is expected to be run from inside its output directory)

Environment variables:
    MERCURY_BINARY: Path to postlight-parser binary
    MERCURY_TIMEOUT: Timeout in seconds (default: 60)

    # Fallback to ARCHIVING_CONFIG values if MERCURY_* not set:
    TIMEOUT: Fallback timeout

Note: Requires postlight-parser: npm install -g @postlight/parser
"""

import json
import os
import subprocess
import sys
from pathlib import Path

import rich_click as click


# Extractor metadata
# NOTE(review): EXTRACTOR_NAME, BIN_NAME and BIN_PROVIDERS are not referenced
# by any code visible in this file; presumably the hook runner reads them —
# confirm before renaming or removing.
EXTRACTOR_NAME = 'mercury'
BIN_NAME = 'postlight-parser'  # same string main() uses as the MERCURY_BINARY default
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = '.'  # relative path; extract_mercury() writes its output files here


def get_env(name: str, default: str = '') -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    When the variable is not set, *default* is used instead (and is stripped
    the same way). A variable that is set to an empty string yields ``''``,
    not *default*.
    """
    raw_value = os.environ.get(name)
    if raw_value is None:
        # Variable is unset: fall back to the caller-supplied default.
        raw_value = default
    return raw_value.strip()


def get_env_int(name: str, default: int = 0) -> int:
    """Return environment variable *name* parsed as an integer.

    The value is read through get_env(), so surrounding whitespace is
    ignored. *default* is returned when the variable is unset or when its
    value cannot be parsed by ``int()`` (including an empty string).
    """
    fallback_text = str(default)
    try:
        parsed = int(get_env(name, fallback_text))
    except ValueError:
        # Unparseable value: behave as if the variable were not set.
        parsed = default
    return parsed


def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Extract article using Mercury Parser.

    Runs ``binary`` twice against ``url`` (first with ``--format=text``, then
    with ``--format=html``) and writes three files into OUTPUT_DIR:

    - content.txt:  the ``content`` field of the text run
    - content.html: the ``content`` field of the HTML run
    - article.json: every field of the text run except ``content``

    Each run is limited to MERCURY_TIMEOUT seconds, falling back to TIMEOUT,
    falling back to 60.

    The text run decides success: a non-zero exit code, output that is not a
    JSON object, or a truthy ``failed`` field makes the whole extraction fail.
    The HTML run is best-effort: if its output cannot be parsed as a JSON
    object, content.html is written empty and the extraction still succeeds.

    A ``content`` field that is missing, JSON ``null`` or otherwise not a
    string is written as an empty file instead of aborting the extraction.

    Returns: (success, output_path, error_message)
        On success: (True, OUTPUT_DIR, '').
        On failure: (False, None, <human-readable reason>).
    """
    timeout = get_env_int('MERCURY_TIMEOUT') or get_env_int('TIMEOUT', 60)

    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)

    try:
        # Get text version
        cmd_text = [binary, url, '--format=text']
        result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout)

        if result_text.returncode != 0:
            stderr = result_text.stderr.decode('utf-8', errors='replace')
            return False, None, f'postlight-parser failed: {stderr[:200]}'

        try:
            text_json = json.loads(result_text.stdout)
        except json.JSONDecodeError:
            return False, None, 'postlight-parser returned invalid JSON'

        # Valid JSON that is not an object (null, list, string, ...) has no
        # fields to read; report it as invalid output rather than letting
        # .get() raise an opaque AttributeError below.
        if not isinstance(text_json, dict):
            return False, None, 'postlight-parser returned invalid JSON'

        if text_json.get('failed'):
            return False, None, 'Mercury was not able to extract article'

        # Save text content. The field may be JSON null; write_text() only
        # accepts str, so anything else is normalised to an empty string.
        text_content = text_json.get('content')
        if not isinstance(text_content, str):
            text_content = ''
        (output_dir / 'content.txt').write_text(text_content, encoding='utf-8')

        # Get HTML version
        cmd_html = [binary, url, '--format=html']
        result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout)

        # Best-effort: any unparseable HTML-run output degrades to an empty
        # content.html. ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError json.loads() raises for undecodable bytes.
        try:
            html_json = json.loads(result_html.stdout)
        except ValueError:
            html_json = {}
        if not isinstance(html_json, dict):
            html_json = {}

        # Save HTML content (same null/non-string handling as the text content)
        html_content = html_json.get('content')
        if not isinstance(html_content, str):
            html_content = ''
        (output_dir / 'content.html').write_text(html_content, encoding='utf-8')

        # Save article metadata (taken from the text run, minus the body)
        metadata = {k: v for k, v in text_json.items() if k != 'content'}
        (output_dir / 'article.json').write_text(json.dumps(metadata, indent=2), encoding='utf-8')

        return True, OUTPUT_DIR, ''

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        # Boundary handler: report anything unexpected (missing binary,
        # unwritable directory, ...) as a failed result instead of raising.
        return False, None, f'{type(e).__name__}: {e}'


@click.command()
@click.option('--url', required=True, help='URL to extract article from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract article content using Postlight's Mercury Parser."""
    # NOTE: the docstring above is left untouched because click uses a
    # command function's docstring as its --help text.
    # snapshot_id is a required option but is not used in this function.

    output = None
    error = ''
    succeeded = False

    try:
        # Binary path comes from the environment, defaulting to the bare name
        parser_binary = get_env('MERCURY_BINARY', 'postlight-parser')
        succeeded, output, error = extract_mercury(url, parser_binary)
    except Exception as exc:
        succeeded = False
        error = f'{type(exc).__name__}: {exc}'

    status = 'succeeded' if succeeded else 'failed'

    # Human-readable diagnostics go to stderr so stdout stays machine-readable
    if error:
        print(f'ERROR: {error}', file=sys.stderr)

    # Output clean JSONL (no RESULT_JSON= prefix): a single JSON object line.
    # output_str is the output path when there is one, otherwise the error.
    summary = output or error or ''
    print(json.dumps({
        'type': 'ArchiveResult',
        'status': status,
        'output_str': summary,
    }))

    # Exit code mirrors the status: 0 on success, 1 on failure
    sys.exit(0 if succeeded else 1)


# Run the click command only when executed as a script (not on import);
# click reads --url and --snapshot-id from the command line.
if __name__ == '__main__':
    main()