mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-06 10:55:44 +10:00
- responses: Tests network response capture during page load - staticfile: Tests static file detection and download skip for HTML - env: Tests PATH-based binary discovery (python3, bash)
115 lines
3.8 KiB
Python
115 lines
3.8 KiB
Python
"""
Tests for the staticfile plugin.

Tests the real staticfile hook with actual URLs to verify
static file detection and download.
"""
|
|
|
|
import json
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
from django.test import TestCase
|
|
|
|
# Import chrome test helpers
|
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
|
|
from chrome_test_helpers import (
|
|
chrome_session,
|
|
get_test_env,
|
|
get_plugin_dir,
|
|
get_hook_script,
|
|
)
|
|
|
|
|
|
def chrome_available() -> bool:
    """Report whether a Chrome/Chromium executable can be located.

    Returns:
        True as soon as ``shutil.which`` resolves one of the known browser
        binary names, otherwise False.
    """
    candidates = ('chromium', 'chromium-browser', 'google-chrome', 'chrome')
    # any() stops at the first name that resolves, like an early return.
    return any(map(shutil.which, candidates))
|
|
|
|
|
|
# Resolve the plugin directory for this test file, then look up the staticfile
# snapshot hook script by its filename pattern. The lookup result may be None
# (the tests below check for that explicitly).
PLUGIN_DIR = get_plugin_dir(__file__)
STATICFILE_HOOK = get_hook_script(
    PLUGIN_DIR,
    'on_Snapshot__*_staticfile.*',
)
|
|
|
|
|
|
class TestStaticfilePlugin(TestCase):
    """Basic checks on the staticfile plugin that need no browser."""

    def test_staticfile_hook_exists(self):
        """The hook lookup must resolve to a script that exists."""
        hook = STATICFILE_HOOK
        # Check for a failed lookup first so a missing hook is reported with
        # a clear message rather than an AttributeError on None.
        self.assertIsNotNone(hook, "Staticfile hook not found in plugin directory")
        self.assertTrue(hook.exists(), f"Hook not found: {hook}")
|
|
|
|
|
|
@pytest.mark.skipif(not chrome_available(), reason="Chrome not installed")
class TestStaticfileWithChrome(TestCase):
    """Integration tests for staticfile plugin with Chrome."""

    def setUp(self):
        """Create a scratch directory used by the Chrome session of each test."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Remove the scratch directory; errors are ignored during cleanup."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    @staticmethod
    def _archive_results(stdout):
        """Yield every JSONL record of type ``ArchiveResult`` found in *stdout*.

        Lines that do not start with ``{`` or that fail to parse as JSON are
        ignored, so ordinary log output mixed into stdout is tolerated.
        """
        for raw_line in stdout.split('\n'):
            line = raw_line.strip()
            if not line.startswith('{'):
                continue
            # Keep the try body to the parse alone so that only malformed
            # JSON is swallowed, never an unrelated error.
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                yield record

    def test_staticfile_skips_html_pages(self):
        """Staticfile hook should skip HTML pages (not static files).

        The test is skipped (not failed) when the Chrome session cannot be
        set up, i.e. when a RuntimeError mentioning 'Chrome' or 'CDP' is
        raised.
        """
        test_url = 'https://example.com'  # HTML page, not a static file
        snapshot_id = 'test-staticfile-snapshot'

        try:
            with chrome_session(
                self.temp_dir,
                crawl_id='test-staticfile-crawl',
                snapshot_id=snapshot_id,
                test_url=test_url,
                navigate=True,
                timeout=30,
            ) as (_chrome_process, _chrome_pid, snapshot_chrome_dir):
                # Get environment and run the staticfile hook
                env = get_test_env()
                env['CHROME_HEADLESS'] = 'true'

                # Run staticfile hook with the active Chrome session
                result = subprocess.run(
                    ['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                    cwd=str(snapshot_chrome_dir),
                    capture_output=True,
                    text=True,
                    timeout=120,  # Longer timeout as it waits for navigation
                    env=env,
                )

                # Verify hook ran without crash.
                # NOTE(review): the hook is launched with node, so this only
                # catches a Python-style 'Traceback' marker in stderr --
                # confirm that is the intended crash signal.
                self.assertNotIn(
                    'Traceback',
                    result.stderr,
                    f"Staticfile hook crashed; stderr was:\n{result.stderr}",
                )

                # HTML pages should be skipped: check the first skipped
                # ArchiveResult for the expected explanation.
                # NOTE(review): if the hook emits no skipped ArchiveResult at
                # all, this loop asserts nothing and the test still passes --
                # confirm whether a missing record should be a failure.
                for record in self._archive_results(result.stdout):
                    if record.get('status') == 'skipped':
                        self.assertIn('Not a static file', record.get('output_str', ''))
                        break

        except RuntimeError as e:
            # Environment problems with the browser are not a plugin failure.
            if 'Chrome' in str(e) or 'CDP' in str(e):
                self.skipTest(f"Chrome session setup failed: {e}")
            raise
|
|
|
|
|
|
if __name__ == '__main__':
    # pytest.main() returns the exit code instead of exiting the process.
    # Pass it on so that running this file as a script reports test failures
    # through a non-zero exit status instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))
|