wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,31 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_FAVICON": {
"type": "boolean",
"default": true,
"description": "Enable favicon downloading"
},
"FAVICON_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for favicon fetch in seconds"
},
"FAVICON_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
},
"FAVICON_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
}
}
}

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
Extract favicon from a URL.
Usage: on_Snapshot__favicon.py --url=<url> --snapshot-id=<uuid>
Output: Writes favicon.ico to $PWD
Environment variables:
TIMEOUT: Timeout in seconds (default: 30)
USER_AGENT: User agent string
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
It can run standalone if requests is installed: pip install requests
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin, urlparse
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'favicon'
OUTPUT_DIR = 'favicon'
OUTPUT_FILE = 'favicon.ico'
def get_env(name: str, default: str = '') -> str:
    """Return environment variable *name* with surrounding whitespace stripped.

    Falls back to *default* when the variable is unset.
    """
    raw_value = os.environ.get(name, default)
    return raw_value.strip()


def get_env_int(name: str, default: int = 0) -> int:
    """Return environment variable *name* parsed as an integer.

    Falls back to *default* when the variable is unset or not a valid int.
    """
    raw_value = get_env(name, str(default))
    try:
        return int(raw_value)
    except ValueError:
        return default
def get_favicon(url: str) -> tuple[bool, str | None, str]:
    """
    Fetch the favicon for *url* and write it to OUTPUT_FILE in the current directory.

    Tries, in priority order:
      1. icon URLs discovered in the page's <link rel="icon"> tags,
      2. conventional well-known paths (/favicon.ico, /favicon.png, /apple-touch-icon.png),
      3. Google's s2/favicons service as a last resort.

    Environment variables honored:
        TIMEOUT            request timeout in seconds (default 30)
        USER_AGENT         User-Agent header sent with all requests
        CHECK_SSL_VALIDITY set to 'false'/'0'/'no' to skip TLS certificate verification

    Returns: (success, output_path, error_message)
    """
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'

    timeout = get_env_int('TIMEOUT', 30)
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    # Fix: honor CHECK_SSL_VALIDITY, which the plugin's config schema declares
    # (FAVICON_CHECK_SSL_VALIDITY with x-fallback CHECK_SSL_VALIDITY) but the
    # original code never applied to its requests.
    verify_ssl = get_env('CHECK_SSL_VALIDITY', 'true').lower() not in ('false', '0', 'no')
    headers = {'User-Agent': user_agent}

    # Build list of conventional favicon URLs for the site root.
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    favicon_urls = [
        urljoin(base_url, '/favicon.ico'),
        urljoin(base_url, '/favicon.png'),
        urljoin(base_url, '/apple-touch-icon.png'),
    ]

    # Try to extract favicon URLs from HTML <link> tags; prepend them so that
    # page-declared icons take priority over the conventional paths.
    try:
        response = requests.get(url, timeout=timeout, headers=headers, verify=verify_ssl)
        if response.ok:
            # Two patterns: rel before href, and href before rel.
            link_patterns = (
                r'<link[^>]+rel=["\'](?:shortcut )?icon["\'][^>]+href=["\']([^"\']+)["\']',
                r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\'](?:shortcut )?icon["\']',
            )
            for pattern in link_patterns:
                for match in re.finditer(pattern, response.text, re.I):
                    favicon_urls.insert(0, urljoin(url, match.group(1)))
    except Exception:
        pass  # Best effort: continue with default favicon URLs

    # Fix: deduplicate candidates while preserving priority order, so the same
    # URL is never fetched twice (HTML-declared icons often equal /favicon.ico).
    favicon_urls = list(dict.fromkeys(favicon_urls))

    # Try each URL until we find one that returns non-empty content.
    for favicon_url in favicon_urls:
        try:
            # Fix: use the configured TIMEOUT instead of a hard-coded 15s.
            response = requests.get(favicon_url, timeout=timeout, headers=headers, verify=verify_ssl)
            if response.ok and len(response.content) > 0:
                Path(OUTPUT_FILE).write_bytes(response.content)
                return True, OUTPUT_FILE, ''
        except Exception:
            continue

    # Try Google's favicon service as fallback.
    try:
        google_url = f'https://www.google.com/s2/favicons?domain={parsed.netloc}'
        response = requests.get(google_url, timeout=timeout, headers=headers, verify=verify_ssl)
        if response.ok and len(response.content) > 0:
            Path(OUTPUT_FILE).write_bytes(response.content)
            return True, OUTPUT_FILE, ''
    except Exception:
        pass

    return False, None, 'No favicon found'
@click.command()
@click.option('--url', required=True, help='URL to extract favicon from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract favicon from a URL."""
    started = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''

    # Attempt the extraction; any unexpected exception becomes a failed status.
    try:
        ok, output, error = get_favicon(url)
        status = 'succeeded' if ok else 'failed'
        if ok:
            size = Path(output).stat().st_size
            print(f'Favicon saved ({size} bytes)')
    except Exception as exc:
        error = f'{type(exc).__name__}: {exc}'
        status = 'failed'

    # Report timing and status as KEY=VALUE lines for the hook runner.
    finished = datetime.now(timezone.utc)
    elapsed = (finished - started).total_seconds()
    print(f'START_TS={started.isoformat()}')
    print(f'END_TS={finished.isoformat()}')
    print(f'DURATION={elapsed:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Emit a machine-readable summary of the run.
    summary = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': started.isoformat(),
        'end_ts': finished.isoformat(),
        'duration': round(elapsed, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(summary)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,262 @@
"""
Integration tests for favicon plugin
Tests verify:
1. Plugin script exists
2. requests library is available
3. Favicon extraction works for real example.com
4. Output file is actual image data
5. Tries multiple favicon URLs
6. Falls back to Google's favicon service
7. Config options work (TIMEOUT, USER_AGENT)
8. Handles failures gracefully
"""
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
# Plugin directory is the parent of this tests/ directory.
PLUGIN_DIR = Path(__file__).parent.parent
# The hook script under test.
FAVICON_HOOK = PLUGIN_DIR / 'on_Snapshot__11_favicon.py'
# Live site used by the integration tests below (stable, minimal page).
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The favicon hook script must be present in the plugin directory."""
    hook = FAVICON_HOOK
    assert hook.exists(), f"Hook script not found: {hook}"
def test_requests_library_available():
    """Test that requests library is available."""
    probe = subprocess.run(
        [sys.executable, '-c', 'import requests; print(requests.__version__)'],
        capture_output=True,
        text=True,
    )
    if probe.returncode != 0:
        pytest.skip("requests library not installed")
    version = probe.stdout.strip()
    assert len(version) > 0, "Should report requests version"
def test_extracts_favicon_from_example_com():
    """Test full workflow: extract favicon from real example.com.

    Note: example.com doesn't have a favicon and Google's service may also fail,
    so we test that the extraction completes and reports appropriate status.
    """
    # Check requests is available; skip the whole test otherwise.
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run favicon extraction with the tmpdir as cwd — the hook writes
        # favicon.ico into its working directory.
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # May succeed (if Google service works) or fail (if no favicon).
        assert result.returncode in (0, 1), "Should complete extraction attempt"
        # Verify RESULT_JSON is present regardless of outcome.
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
        # If it succeeded, verify the favicon file contents.
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout, "Should report success"
            assert 'Favicon saved' in result.stdout, "Should report completion"
            favicon_file = tmpdir / 'favicon.ico'
            assert favicon_file.exists(), "favicon.ico not created"
            # Verify file is not empty and contains actual image data.
            file_size = favicon_file.stat().st_size
            assert file_size > 0, "Favicon file should not be empty"
            assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes"
            # Check for common image magic bytes (the file may be any of these
            # formats despite the .ico filename).
            favicon_data = favicon_file.read_bytes()
            # ICO, PNG, GIF, JPEG, or WebP
            is_image = (
                favicon_data[:4] == b'\x00\x00\x01\x00' or  # ICO
                favicon_data[:8] == b'\x89PNG\r\n\x1a\n' or  # PNG
                favicon_data[:3] == b'GIF' or  # GIF
                favicon_data[:2] == b'\xff\xd8' or  # JPEG
                favicon_data[8:12] == b'WEBP'  # WebP (RIFF container: type at offset 8)
            )
            assert is_image, "Favicon file should be a valid image format"
        else:
            # Failed as expected when no favicon could be retrieved.
            assert 'STATUS=failed' in result.stdout
            assert 'No favicon found' in result.stdout or 'No favicon found' in result.stderr
def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    probe = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    if probe.returncode != 0:
        pytest.skip("requests not installed")
    import os
    with tempfile.TemporaryDirectory() as workdir:
        # Set very short timeout (but example.com should still succeed).
        child_env = dict(os.environ, TIMEOUT='5')
        run = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=Path(workdir),
            capture_output=True,
            text=True,
            env=child_env,
            timeout=30,
        )
        # Should complete (success or fail, but not hang).
        assert run.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    probe = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    if probe.returncode != 0:
        pytest.skip("requests not installed")
    import os
    with tempfile.TemporaryDirectory() as workdir:
        # Set custom user agent for the child process.
        child_env = dict(os.environ, USER_AGENT='TestBot/1.0')
        run = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=Path(workdir),
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )
        # Should succeed (example.com doesn't block custom agents).
        if run.returncode == 0:
            assert 'STATUS=succeeded' in run.stdout
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    probe = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    if probe.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as workdir:
        out_dir = Path(workdir)
        run = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.org', '--snapshot-id', 'testhttps'],
            cwd=out_dir,
            capture_output=True,
            text=True,
            timeout=60,
        )
        # Only inspect the output file on success; the icon may legitimately
        # be absent when every candidate URL failed.
        if run.returncode == 0:
            icon = out_dir / 'favicon.ico'
            if icon.exists():
                assert icon.stat().st_size > 0
def test_handles_missing_favicon_gracefully():
    """Test that favicon plugin handles sites without favicons gracefully.

    Note: The plugin falls back to Google's favicon service, which generates
    a generic icon even if the site doesn't have one, so extraction usually succeeds.
    """
    probe = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    if probe.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as workdir:
        # Try a URL that likely doesn't have a favicon.
        run = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.com/nonexistent', '--snapshot-id', 'test404'],
            cwd=Path(workdir),
            capture_output=True,
            text=True,
            timeout=60,
        )
        # May succeed (Google fallback) or fail gracefully.
        assert run.returncode in (0, 1), "Should complete (may succeed or fail)"
        if run.returncode != 0:
            combined = run.stdout + run.stderr
            assert 'No favicon found' in combined or 'ERROR=' in combined
def test_reports_missing_requests_library():
    """Test that script reports error when requests library is missing."""
    import os
    # Point PYTHONPATH at a nonexistent dir and run with -S (no site-packages)
    # to simulate an environment where requests cannot be imported.
    child_env = dict(os.environ, PYTHONPATH='/nonexistent')
    with tempfile.TemporaryDirectory() as workdir:
        run = subprocess.run(
            [sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=Path(workdir),
            capture_output=True,
            text=True,
            env=child_env,
        )
        # Should fail and report missing requests (or a related import error).
        if run.returncode != 0:
            combined = run.stdout + run.stderr
            # May report missing requests or other import errors
            assert 'requests' in combined.lower() or 'import' in combined.lower() or 'ERROR=' in combined
# Allow running this test module directly (outside of a pytest invocation).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])