Files
ArchiveBox/archivebox/plugins/staticfile/tests/test_staticfile.py
Claude 9703a8e88c Add tests for responses, staticfile, and env provider plugins
- responses: Tests network response capture during page load
- staticfile: Tests static file detection and download skip for HTML
- env: Tests PATH-based binary discovery (python3, bash)
2025-12-31 18:28:01 +00:00

115 lines
3.8 KiB
Python

"""
Tests for the staticfile plugin.
Tests the real staticfile hook with actual URLs to verify
static file detection and download.
"""
import json
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
get_test_env,
get_plugin_dir,
get_hook_script,
)
def chrome_available(
    names=('chromium', 'chromium-browser', 'google-chrome', 'chrome'),
) -> bool:
    """Check if Chrome/Chromium is available.

    Args:
        names: Iterable of executable names (or paths) to look up with
            ``shutil.which``. Defaults to the common Chrome/Chromium
            binary names.

    Returns:
        True if at least one entry of ``names`` resolves to an executable,
        False otherwise (including when ``names`` is empty).
    """
    # shutil.which returns the resolved path or None, so truthiness is enough.
    return any(shutil.which(name) for name in names)
# Locate the staticfile hook once at import time so all tests share the result.
# Both helpers are imported from chrome_test_helpers; presumably PLUGIN_DIR is
# the plugin directory that owns this tests/ folder -- TODO confirm.
PLUGIN_DIR = get_plugin_dir(__file__)
# Looked up by filename pattern inside PLUGIN_DIR. The tests in this module
# treat the result as either None (hook not found) or an object with .exists().
STATICFILE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_staticfile.*')
class TestStaticfilePlugin(TestCase):
    """Sanity checks for the staticfile plugin's on-disk layout."""

    def test_staticfile_hook_exists(self):
        """The hook script must have been located and must exist on disk."""
        hook = STATICFILE_HOOK
        # Check for None first so a missing hook fails with a clear message
        # instead of an AttributeError on .exists().
        self.assertIsNotNone(hook, "Staticfile hook not found in plugin directory")
        self.assertTrue(hook.exists(), f"Hook not found: {hook}")
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestStaticfileWithChrome(TestCase):
    """Integration tests for staticfile plugin with Chrome.

    Under pytest the whole class is skipped when chrome_available() is False.
    """

    def setUp(self):
        """Create a scratch directory handed to the Chrome session."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Remove the scratch directory (best effort, errors are ignored)."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    @staticmethod
    def _first_archive_result(stdout):
        """Return the first JSONL record of type 'ArchiveResult', or None.

        Lines that do not start with '{' or that are not valid JSON are
        ignored, since the hook may interleave other output with its records.
        """
        for raw_line in stdout.split('\n'):
            line = raw_line.strip()
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                return record
        return None

    def test_staticfile_skips_html_pages(self):
        """Staticfile hook should skip HTML pages (not static files)."""
        # Without this guard a missing hook would hand node the literal path
        # "None"; every later check is conditional on the hook's output, so
        # the test could pass without ever exercising the hook.
        self.assertIsNotNone(STATICFILE_HOOK, "Staticfile hook not found in plugin directory")

        test_url = 'https://example.com'  # HTML page, not a static file
        snapshot_id = 'test-staticfile-snapshot'

        try:
            with chrome_session(
                self.temp_dir,
                crawl_id='test-staticfile-crawl',
                snapshot_id=snapshot_id,
                test_url=test_url,
                navigate=True,
                timeout=30,
            ) as (_chrome_process, _chrome_pid, snapshot_chrome_dir):
                # Get environment and run the staticfile hook
                env = get_test_env()
                env['CHROME_HEADLESS'] = 'true'

                # Run staticfile hook with the active Chrome session
                result = subprocess.run(
                    [
                        'node',
                        str(STATICFILE_HOOK),
                        f'--url={test_url}',
                        f'--snapshot-id={snapshot_id}',
                    ],
                    cwd=str(snapshot_chrome_dir),
                    capture_output=True,
                    text=True,
                    timeout=120,  # Longer timeout as it waits for navigation
                    env=env,
                )

                # Verify hook ran without crash
                self.assertNotIn('Traceback', result.stderr)

                # Only the first ArchiveResult record is inspected. When it is
                # marked as skipped, the reason must say the page is not a
                # static file.
                record = self._first_archive_result(result.stdout)
                if record is not None and record.get('status') == 'skipped':
                    # `or ''` also covers an explicit null output_str, which
                    # would otherwise make assertIn raise TypeError.
                    self.assertIn('Not a static file', record.get('output_str') or '')
        except RuntimeError as e:
            # Chrome/CDP setup problems are environmental, not plugin bugs:
            # skip instead of failing. Any other RuntimeError is re-raised.
            if 'Chrome' in str(e) or 'CDP' in str(e):
                self.skipTest(f"Chrome session setup failed: {e}")
            raise
if __name__ == '__main__':
    # pytest.main returns the run's exit code; pass it to sys.exit so that
    # running this file directly exits non-zero when any test fails.
    sys.exit(pytest.main([__file__, '-v']))