# ArchiveBox/archivebox/plugins/staticfile/tests/test_staticfile.py

"""
Tests for the staticfile plugin.

Tests the real staticfile hook with actual URLs to verify
static file detection and download.
"""

import json
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

import pytest
from django.test import TestCase

# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
    chrome_session,
    get_test_env,
    get_plugin_dir,
    get_hook_script,
)


def chrome_available() -> bool:
    """Report whether any known Chrome/Chromium executable name resolves via shutil.which."""
    candidate_binaries = ('chromium', 'chromium-browser', 'google-chrome', 'chrome')
    # shutil.which returns a path string on a hit and None otherwise.
    return any(shutil.which(binary) for binary in candidate_binaries)


# Locate the staticfile hook script once, at import time, using the shared
# chrome test helpers. The glob matches any hook file in this plugin whose
# name has the form on_Snapshot__<anything>_staticfile.<ext>.
# NOTE(review): get_hook_script presumably returns None when nothing matches
# (test_staticfile_hook_exists asserts it is not None) — confirm in
# chrome_test_helpers.
PLUGIN_DIR = get_plugin_dir(__file__)
STATICFILE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_staticfile.*')


class TestStaticfilePlugin(TestCase):
    """Sanity checks for the staticfile plugin that need no browser."""

    def test_staticfile_hook_exists(self):
        """The hook script must have been resolved and must exist on disk."""
        hook = STATICFILE_HOOK
        # Check for None first so a failed lookup reports a clear message
        # instead of an AttributeError on .exists().
        self.assertIsNotNone(hook, "Staticfile hook not found in plugin directory")
        self.assertTrue(hook.exists(), f"Hook not found: {hook}")


@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestStaticfileWithChrome(TestCase):
    """Run the staticfile hook against a live Chrome session.

    The whole class is skipped under pytest when no Chrome/Chromium binary
    is found by ``chrome_available()``.
    """

    def setUp(self):
        """Create a fresh temporary directory for the test."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Delete the temporary directory, ignoring any removal errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    @staticmethod
    def _first_archive_result(stdout):
        """Return the first JSON object in *stdout* whose ``type`` is ``ArchiveResult``.

        Lines are split on ``'\\n'`` and stripped; lines that do not start
        with ``{`` or that fail to parse as JSON are ignored. Returns ``None``
        when no matching record is present.
        """
        for raw_line in stdout.split('\n'):
            candidate = raw_line.strip()
            if not candidate.startswith('{'):
                continue
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                # Not valid JSON despite the leading brace; treat as log noise.
                continue
            if parsed.get('type') == 'ArchiveResult':
                return parsed
        return None

    def test_staticfile_skips_html_pages(self):
        """An HTML page reported as skipped must carry the 'Not a static file' reason."""
        snapshot_id = 'test-staticfile-snapshot'
        test_url = 'https://example.com'  # HTML page, not a static file

        try:
            with chrome_session(
                self.temp_dir,
                crawl_id='test-staticfile-crawl',
                snapshot_id=snapshot_id,
                test_url=test_url,
                navigate=True,
                timeout=30,
            ) as (_chrome_process, _chrome_pid, snapshot_chrome_dir, env):
                # The hook is run with the env yielded by chrome_session and
                # with the session's snapshot chrome directory as its cwd.
                hook_cmd = [
                    'node',
                    str(STATICFILE_HOOK),
                    f'--url={test_url}',
                    f'--snapshot-id={snapshot_id}',
                ]
                completed = subprocess.run(
                    hook_cmd,
                    cwd=str(snapshot_chrome_dir),
                    capture_output=True,
                    text=True,
                    timeout=120,  # generous: the hook may wait on navigation
                    env=env,
                )

                # NOTE(review): 'Traceback' is a Python crash marker, but the
                # hook is launched with node — consider also checking the
                # exit code once the hook's exit semantics are confirmed.
                self.assertNotIn('Traceback', completed.stderr)

                # NOTE(review): this passes vacuously when no ArchiveResult
                # record is emitted or its status is not 'skipped'.
                archive_result = self._first_archive_result(completed.stdout)
                if archive_result is not None and archive_result.get('status') == 'skipped':
                    self.assertIn('Not a static file', archive_result.get('output_str', ''))

        except RuntimeError as exc:
            message = str(exc)
            # Only Chrome/CDP setup problems become a skip; anything else is
            # a genuine failure and is re-raised unchanged.
            if 'Chrome' not in message and 'CDP' not in message:
                raise
            self.skipTest(f"Chrome session setup failed: {exc}")


if __name__ == '__main__':
    # Allow running this module directly. pytest.main() returns the exit
    # code instead of exiting, so propagate it; otherwise the process would
    # exit 0 even when tests fail.
    raise SystemExit(pytest.main([__file__, '-v']))