ArchiveBox/archivebox/plugins/title/tests/test_title.py

"""
Integration tests for title plugin

Tests verify:
1. Plugin script exists
2. Node.js is available
3. Title extraction works for real example.com
4. Output file contains actual page title
5. Handles various title sources (<title>, og:title, twitter:title)
6. Config options work (TIMEOUT, USER_AGENT)
7. Fallback to HTTP when chrome not available
"""

import json
import shutil
import subprocess
import tempfile
from pathlib import Path

import pytest

from archivebox.plugins.chrome.tests.chrome_test_helpers import parse_jsonl_output


PLUGIN_DIR = Path(__file__).parent.parent
TITLE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_title.*'), None)
TEST_URL = 'https://example.com'


def test_hook_script_exists():
    """Verify hook script exists."""
    assert TITLE_HOOK.exists(), f"Hook script not found: {TITLE_HOOK}"


def test_extracts_title_from_example_com():
    """Test full workflow: extract title from real example.com."""

    # Check node is available
    if not shutil.which('node'):
        pass

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run title extraction
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

        # Parse clean JSONL output
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if line.startswith('{'):
                pass
                try:
                    record = json.loads(line)
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break
                except json.JSONDecodeError:
                    pass

        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

        # Verify output file exists (hook writes to current directory)
        title_file = tmpdir / 'title.txt'
        assert title_file.exists(), "title.txt not created"

        # Verify title contains REAL example.com title
        title_text = title_file.read_text().strip()
        assert len(title_text) > 0, "Title should not be empty"
        assert 'example' in title_text.lower(), "Title should contain 'example'"

        # example.com has title "Example Domain"
        assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"


def test_falls_back_to_http_when_chrome_unavailable():
    """Test that title plugin falls back to HTTP when chrome unavailable."""

    if not shutil.which('node'):
        pass

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Don't create chrome directory - force HTTP fallback

        # Run title extraction
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

        # Parse clean JSONL output
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if line.startswith('{'):
                pass
                try:
                    record = json.loads(line)
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break
                except json.JSONDecodeError:
                    pass

        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

        # Verify output exists and has real title (hook writes to current directory)
        output_title_file = tmpdir / 'title.txt'
        assert output_title_file.exists(), "Output title.txt not created"

        title_text = output_title_file.read_text().strip()
        assert 'example' in title_text.lower()


def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""

    if not shutil.which('node'):
        pass

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set very short timeout (but example.com should still succeed)
        import os
        env = os.environ.copy()
        env['TIMEOUT'] = '5'

        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )

        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"


def test_config_user_agent():
    """Test that USER_AGENT config is used."""

    if not shutil.which('node'):
        pass

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set custom user agent
        import os
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'

        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should succeed (example.com doesn't block)
        if result.returncode == 0:
            # Parse clean JSONL output
            result_json = None
            for line in result.stdout.strip().split('\n'):
                line = line.strip()
                if line.startswith('{'):
                    pass
                    try:
                        record = json.loads(line)
                        if record.get('type') == 'ArchiveResult':
                            result_json = record
                            break
                    except json.JSONDecodeError:
                        pass

            assert result_json, "Should have ArchiveResult JSONL output"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"


def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""

    if not shutil.which('node'):
        pass

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        result = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.returncode == 0:
            # Hook writes to current directory
            output_title_file = tmpdir / 'title.txt'
            if output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert len(title_text) > 0, "Title should not be empty"
                assert 'example' in title_text.lower()


def test_handles_404_gracefully():
    """Test that title plugin handles 404 pages.

    Note: example.com returns valid HTML even for 404 pages, so extraction may succeed
    with the generic "Example Domain" title.
    """

    if not shutil.which('node'):
        pass

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        result = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        # May succeed or fail depending on server behavior
        # example.com returns "Example Domain" even for 404s
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"


def test_handles_redirects():
    """Test that title plugin handles redirects correctly."""

    if not shutil.which('node'):
        pass

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # http://example.com redirects to https://example.com
        result = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=http://example.com', '--snapshot-id=testredirect'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        # Should succeed and follow redirect
        if result.returncode == 0:
            # Hook writes to current directory
            output_title_file = tmpdir / 'title.txt'
            if output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert 'example' in title_text.lower()


if __name__ == '__main__':
    pytest.main([__file__, '-v'])