Files
ArchiveBox/archivebox/plugins/favicon/tests/test_favicon.py
Claude adeffb4bc5 Add JS-Python path delegation to reduce Chrome-related duplication
- Add getMachineType, getLibDir, getNodeModulesDir, getTestEnv CLI commands to chrome_utils.js
  These are now the single source of truth for path calculations
- Update chrome_test_helpers.py with call_chrome_utils() dispatcher
- Add get_test_env_from_js(), get_machine_type_from_js(), kill_chrome_via_js() helpers
- Update cleanup_chrome and kill_chromium_session to use JS killChrome
- Remove unused Chrome binary search lists from singlefile hook (~25 lines)
- Update readability, mercury, favicon, title tests to use shared helpers
2025-12-31 09:11:11 +00:00

294 lines
9.1 KiB
Python

"""
Integration tests for the favicon plugin.

Tests verify:
1. Plugin script exists
2. requests library is available
3. Favicon extraction works for real example.com
4. Output file is actual image data
5. Tries multiple favicon URLs (exercised only indirectly by the end-to-end runs)
6. Falls back to Google's favicon service (likewise not asserted directly)
7. Config options work (TIMEOUT, USER_AGENT)
8. Handles failures gracefully
"""
import json
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
)
# Plugin directory resolved from this test file's path by the shared helper
# (presumably the directory containing tests/ -- confirm in chrome_test_helpers).
PLUGIN_DIR = get_plugin_dir(__file__)
# Hook script under test, looked up inside PLUGIN_DIR by filename pattern.
# The tests below call .exists() on it and launch it with sys.executable, so it
# is expected to be a Path-like object pointing at a Python script.
FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
# Default page handed to the hook via --url; these tests hit the real network.
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The favicon hook resolved at import time must be present on disk."""
    hook_present = FAVICON_HOOK.exists()
    assert hook_present, f"Hook script not found: {FAVICON_HOOK}"
def test_requests_library_available():
    """Check that ``requests`` is importable by the interpreter running the tests.

    The import is attempted in a child process started from ``sys.executable``,
    the same interpreter the other tests in this module use to launch the
    favicon hook, and the child prints ``requests.__version__``.
    """
    result = subprocess.run(
        [sys.executable, '-c', 'import requests; print(requests.__version__)'],
        capture_output=True,
        text=True,
    )
    # Surface the child's stderr (the ImportError traceback) directly instead
    # of letting a missing dependency show up only as "empty stdout".
    assert result.returncode == 0, (
        f"requests library is not importable: {result.stderr.strip()}"
    )
    assert len(result.stdout.strip()) > 0, "Should report requests version"
def test_extracts_favicon_from_example_com():
    """Test full workflow: extract favicon from real example.com.

    Note: example.com doesn't have a favicon and Google's service may also fail,
    so we test that the extraction completes and reports appropriate status:

    * the hook must exit with code 0 or 1;
    * stdout must contain an ``ArchiveResult`` JSONL record;
    * if that record reports ``succeeded``, ``favicon.ico`` must exist in the
      working directory, be non-empty, be smaller than 1 MiB and start with
      the signature of a known image format;
    * otherwise the record must report ``failed``.

    The test is skipped when ``requests`` cannot be imported by
    ``sys.executable``.
    """
    # Check requests is available; act on the result instead of discarding it.
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    if check_result.returncode != 0:
        pytest.skip("requests library not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run favicon extraction with the temp dir as the hook's working directory
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # May succeed (if Google service works) or fail (if no favicon)
        assert result.returncode in (0, 1), "Should complete extraction attempt"

        # Pick the first ArchiveResult record out of the JSONL on stdout;
        # non-JSON lines (e.g. log output) are ignored.
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break

        assert result_json, "Should have ArchiveResult JSONL output"

        if result_json['status'] == 'succeeded':
            # If it succeeded, verify the favicon file
            favicon_file = tmpdir / 'favicon.ico'
            assert favicon_file.exists(), "favicon.ico not created"

            # Verify file is not empty and not implausibly large for an icon
            file_size = favicon_file.stat().st_size
            assert file_size > 0, "Favicon file should not be empty"
            assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes"

            # Check for common image magic bytes
            favicon_data = favicon_file.read_bytes()
            leading_signatures = (
                b'\x00\x00\x01\x00',    # ICO
                b'\x89PNG\r\n\x1a\n',   # PNG
                b'GIF',                 # GIF
                b'\xff\xd8',            # JPEG
            )
            is_image = (
                favicon_data.startswith(leading_signatures)
                or favicon_data[8:12] == b'WEBP'  # WebP: tag sits at offset 8
            )
            assert is_image, "Favicon file should be a valid image format"
        else:
            # Failed as expected
            assert result_json['status'] == 'failed', f"Should report failure: {result_json}"
def test_config_timeout_honored():
    """Run the hook with ``TIMEOUT=5`` in its environment and check it finishes.

    Only completion is asserted: the hook must exit with code 0 or 1 before
    the 30 second ``subprocess`` timeout fires. The test does not measure
    whether the hook's own network calls actually used the configured value.

    The test is skipped when ``requests`` cannot be imported by
    ``sys.executable``.
    """
    import os  # kept function-local, as in the original

    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    if check_result.returncode != 0:
        # Act on the dependency check instead of silently discarding it.
        pytest.skip("requests library not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set very short timeout (but example.com should still succeed)
        env = os.environ.copy()
        env['TIMEOUT'] = '5'

        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )

        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Run the hook with ``USER_AGENT=TestBot/1.0`` and check the reported status.

    The hook must exit with code 0 or 1. When it exits 0 and emits an
    ``ArchiveResult`` JSONL record, that record must report ``succeeded``.
    The test does not inspect the outgoing request, so it cannot confirm
    that the custom user agent string was actually sent.

    The test is skipped when ``requests`` cannot be imported by
    ``sys.executable``.
    """
    import os  # kept function-local, as in the original

    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    if check_result.returncode != 0:
        # Act on the dependency check instead of silently discarding it.
        pytest.skip("requests library not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set custom user agent
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'

        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        # Same completion contract the sibling tests assert for this hook, so
        # a crash with an unexpected exit code no longer passes silently.
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"

        # Should succeed (example.com doesn't block)
        if result.returncode == 0:
            # Pick the first ArchiveResult record out of the JSONL on stdout;
            # non-JSON lines are ignored.
            result_json = None
            for line in result.stdout.strip().split('\n'):
                line = line.strip()
                if not line.startswith('{'):
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break

            if result_json:
                assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
def test_handles_https_urls():
    """Run the hook against ``https://example.org`` and sanity-check its output.

    The hook must exit with code 0 or 1. When it exits 0 and ``favicon.ico``
    was written to the working directory, the file must be non-empty.

    The test is skipped when ``requests`` cannot be imported by
    ``sys.executable``.
    """
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    if check_result.returncode != 0:
        # Act on the dependency check instead of silently discarding it.
        pytest.skip("requests library not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.org', '--snapshot-id', 'testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Same completion contract the sibling tests assert for this hook, so
        # a crash with an unexpected exit code no longer passes silently.
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"

        if result.returncode == 0:
            favicon_file = tmpdir / 'favicon.ico'
            if favicon_file.exists():
                assert favicon_file.stat().st_size > 0
def test_handles_missing_favicon_gracefully():
    """Test that favicon plugin handles sites without favicons gracefully.

    Note: The plugin falls back to Google's favicon service, which generates
    a generic icon even if the site doesn't have one, so extraction usually succeeds.

    The hook must exit with code 0 or 1. On a non-zero exit, its combined
    stdout/stderr must contain either ``No favicon found`` or ``ERROR=``.

    The test is skipped when ``requests`` cannot be imported by
    ``sys.executable``.
    """
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    if check_result.returncode != 0:
        # Act on the dependency check instead of silently discarding it.
        pytest.skip("requests library not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Try a URL that likely doesn't have a favicon
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.com/nonexistent', '--snapshot-id', 'test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # May succeed (Google fallback) or fail gracefully
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"

        if result.returncode != 0:
            combined = result.stdout + result.stderr
            assert 'No favicon found' in combined or 'ERROR=' in combined
def test_reports_missing_requests_library():
    """Test that script reports error when requests library is missing.

    A missing ``requests`` is simulated by launching the hook with ``-S``
    (the interpreter does not import ``site``, so site-packages is not added
    to ``sys.path``) and with ``PYTHONPATH`` pointed at a nonexistent
    directory. If the hook then exits non-zero, its combined output must
    mention ``requests``, ``import`` or ``ERROR=``. Nothing is asserted when
    the hook still exits 0.
    """
    import os  # kept function-local, as in the original

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Point PYTHONPATH at a directory that does not exist so no extra
        # packages are picked up from the caller's environment.
        env = os.environ.copy()
        env['PYTHONPATH'] = '/nonexistent'

        result = subprocess.run(
            [sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            # Bound the run like every other hook invocation in this module,
            # so a stuck hook fails the test instead of hanging the suite.
            timeout=60,
        )

        # Should fail and report missing requests
        if result.returncode != 0:
            combined = result.stdout + result.stderr
            # May report missing requests or other import errors
            lowered = combined.lower()
            assert 'requests' in lowered or 'import' in lowered or 'ERROR=' in combined
if __name__ == '__main__':
    # Propagate pytest's exit status so running this file directly reports
    # test failures to the calling shell/CI instead of always exiting 0.
    sys.exit(pytest.main([__file__, '-v']))