# Mirror of https://github.com/ArchiveBox/ArchiveBox.git
# Synced 2026-04-06 07:47:53 +10:00
# 831 lines, 32 KiB, Python
"""
|
|
Integration tests for screenshot plugin
|
|
|
|
Tests verify:
|
|
1. Hook script exists
|
|
2. Dependencies installed via chrome validation hooks
|
|
3. Verify deps with abx-pkg
|
|
4. Screenshot extraction works on https://example.com
|
|
5. JSONL output is correct
|
|
6. Filesystem output is valid PNG image
|
|
7. Config options work
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
|
get_test_env,
|
|
get_plugin_dir,
|
|
get_hook_script,
|
|
run_hook_and_parse,
|
|
LIB_DIR,
|
|
NODE_MODULES_DIR,
|
|
CHROME_PLUGIN_DIR,
|
|
)
|
|
|
|
# Import chrome test fixture to ensure puppeteer is installed
|
|
from archivebox.plugins.chrome.tests.test_chrome import ensure_chromium_and_puppeteer_installed
|
|
|
|
|
|
# Plugin directory derived from this test module's path by get_plugin_dir().
PLUGIN_DIR = get_plugin_dir(__file__)
# Screenshot hook script, looked up by filename pattern inside PLUGIN_DIR.
SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')

# Chrome plugin hooks used by the tests to set up a browser session
# (launch, open tab, navigate), looked up the same way in CHROME_PLUGIN_DIR.
CHROME_LAUNCH_HOOK, CHROME_TAB_HOOK, CHROME_NAVIGATE_HOOK = (
    get_hook_script(CHROME_PLUGIN_DIR, hook_pattern)
    for hook_pattern in (
        'on_Crawl__*_chrome_launch.*',
        'on_Snapshot__*_chrome_tab.*',
        'on_Snapshot__*_chrome_navigate.*',
    )
)

# URL passed to the hooks by most tests in this module.
TEST_URL = 'https://example.com'
|
|
|
|
|
|
def test_hook_script_exists():
    """Check that the screenshot on_Snapshot hook script is present on disk."""
    hook_present = SCREENSHOT_HOOK.exists()
    assert hook_present, f"Hook not found: {SCREENSHOT_HOOK}"
|
|
|
|
|
|
def test_verify_deps_with_abx_pkg():
    """Check that abx-pkg can load the ``node`` binary through its EnvProvider."""
    # NOTE(review): BinProviderOverrides is never referenced below; presumably it
    # must be in scope for model_rebuild() to resolve it -- confirm before removing.
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides

    EnvProvider.model_rebuild()

    # The hook scripts in this module are executed with `node`, so it must be loadable.
    loaded_node = Binary(name='node', binproviders=[EnvProvider()]).load()
    assert loaded_node and loaded_node.abspath, "Node.js required for screenshot plugin"
|
|
|
|
|
|
def test_extracts_screenshot_from_example_com():
    """Test full workflow: extract screenshot from real example.com via hook.

    Builds the following directory structure under a temporary DATA_DIR:
        DATA_DIR/users/testuser/crawls/20240101/example.com/{crawl-id}/chrome/
        DATA_DIR/users/testuser/snapshots/20240101/example.com/{snap-id}/chrome/
        DATA_DIR/users/testuser/snapshots/20240101/example.com/{snap-id}/screenshot/

    Steps: launch Chrome at crawl level, create a tab and navigate at snapshot
    level, then run the screenshot hook, which should connect to the existing
    session. The hook's JSONL output and the resulting PNG file are verified.

    The Chrome launcher and the browser it started are always cleaned up, even
    when startup itself fails.
    """
    import signal
    import time

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir)
        crawl_id = 'test-screenshot-crawl'
        snapshot_id = 'test-screenshot-snap'

        # Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
        crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / crawl_id
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir(parents=True)

        # Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
        snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / snapshot_id
        snapshot_chrome_dir = snapshot_dir / 'chrome'
        snapshot_chrome_dir.mkdir(parents=True)

        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir()

        env = get_test_env()
        env['CHROME_HEADLESS'] = 'true'

        # Step 1: Launch Chrome session at crawl level (background process)
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )
        chrome_pid = None

        # Everything after Popen runs inside try/finally so that a failed
        # startup check cannot leak the launcher or the browser process.
        try:
            # Wait (up to ~15s) for Chrome to publish its CDP URL
            for _ in range(15):
                if chrome_launch_process.poll() is not None:
                    stdout, stderr = chrome_launch_process.communicate()
                    pytest.fail(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
                if (chrome_dir / 'cdp_url.txt').exists():
                    break
                time.sleep(1)

            assert (chrome_dir / 'cdp_url.txt').exists(), "Chrome CDP URL file should exist"
            assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"

            chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())

            # Step 2: Create tab at snapshot level
            env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
            result = subprocess.run(
                ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env
            )
            assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
            assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot CDP URL should exist"

            # Step 3: Navigate to URL
            result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env
            )
            assert result.returncode == 0, f"Navigation failed: {result.stderr}"
            assert (snapshot_chrome_dir / 'navigation.json').exists(), "Navigation JSON should exist"

            # Step 4: Take screenshot (should connect to existing session)
            # Screenshot hook runs in screenshot/ dir and looks for ../chrome/cdp_url.txt
            result = subprocess.run(
                ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
                cwd=str(screenshot_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env
            )

            assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}\nStdout: {result.stdout}"

            # Parse JSONL output: first JSON line whose type is ArchiveResult
            result_json = None
            for line in result.stdout.strip().split('\n'):
                line = line.strip()
                if not line.startswith('{'):
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    # Non-JSON lines that merely start with '{' are ignored
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break

            assert result_json, "Should have ArchiveResult JSONL output"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
            assert 'screenshot.png' in result_json['output_str'], f"Output should be screenshot.png: {result_json}"

            # Verify filesystem output
            screenshot_file = screenshot_dir / 'screenshot.png'
            assert screenshot_file.exists(), f"screenshot.png not created at {screenshot_file}"

            # Verify file has a plausible size
            file_size = screenshot_file.stat().st_size
            assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
            assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"

            # Check PNG magic bytes
            screenshot_data = screenshot_file.read_bytes()
            assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"

        finally:
            # Cleanup 1: stop the launcher; escalate to kill if it ignores SIGTERM
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                chrome_launch_process.kill()
                chrome_launch_process.wait()
            except OSError:
                # Best-effort: the process may already be gone
                pass

            # Cleanup 2: kill Chrome itself. If the test failed before the PID
            # was parsed, try to recover it from the PID file.
            if chrome_pid is None:
                try:
                    chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
                except (OSError, ValueError):
                    chrome_pid = None
            if chrome_pid is not None:
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except OSError:
                    # Best-effort: Chrome may have exited with the launcher
                    pass
|
|
|
|
|
|
def test_extracts_screenshot_without_session():
    """Run the screenshot hook with no Chrome session files present.

    Only the screenshot output directory is created (no chrome/ or staticfile/
    siblings), so the hook is expected to fall back to its own browser. The test
    checks the exit code, the ArchiveResult JSONL record and the output file.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-fallback', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_cmd = ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-fallback']
        proc = subprocess.run(
            hook_cmd,
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"

        # Find the first JSON line on stdout whose type is ArchiveResult.
        archive_result = None
        for raw_line in proc.stdout.strip().split('\n'):
            candidate = raw_line.strip()
            if not candidate.startswith('{'):
                continue
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if parsed.get('type') == 'ArchiveResult':
                archive_result = parsed
                break

        assert archive_result, "Should have ArchiveResult JSONL output"
        assert archive_result['status'] == 'succeeded', f"Should succeed: {archive_result}"
        assert 'screenshot.png' in archive_result['output_str']

        # The PNG must exist and be larger than 1000 bytes.
        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists(), "screenshot.png not created"
        assert png_path.stat().st_size > 1000, "Screenshot too small"
|
|
|
|
|
|
def test_skips_when_staticfile_exists():
    """Expect a 'skipped' ArchiveResult when a staticfile/ output already exists.

    A sibling staticfile/index.html is written next to the screenshot directory
    to simulate the staticfile extractor having already handled the URL.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-skip',
        )
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        # Pre-existing staticfile output that the hook is expected to notice.
        staticfile_dir = snapshot_dir / 'staticfile'
        staticfile_dir.mkdir()
        staticfile_dir.joinpath('index.html').write_text('<html></html>')

        hook_env = get_test_env()
        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-skip'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert proc.returncode == 0, f"Should exit successfully: {proc.stderr}"

        # Find the first JSON line on stdout whose type is ArchiveResult.
        archive_result = None
        for raw_line in proc.stdout.strip().split('\n'):
            candidate = raw_line.strip()
            if not candidate.startswith('{'):
                continue
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if parsed.get('type') == 'ArchiveResult':
                archive_result = parsed
                break

        assert archive_result, "Should have ArchiveResult JSONL output"
        assert archive_result['status'] == 'skipped', f"Should skip: {archive_result}"
|
|
|
|
|
|
def test_config_save_screenshot_false_skips():
    """With SCREENSHOT_ENABLED=False the hook should exit 0, log a reason, and print no JSONL."""
    import os

    with tempfile.TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)
        # Copy of the current process environment with the feature switched off.
        hook_env = {**os.environ, 'SCREENSHOT_ENABLED': 'False'}

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
            cwd=work_dir,
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert proc.returncode == 0, f"Should exit 0 when feature disabled: {proc.stderr}"

        # The skip reason is expected on stderr, not as a JSONL record.
        assert 'Skipping' in proc.stderr or 'False' in proc.stderr, "Should log skip reason to stderr"

        # Any stdout line that looks like a JSON object counts as emitted JSONL.
        jsonl_lines = list(filter(
            lambda out_line: out_line.strip().startswith('{'),
            proc.stdout.strip().split('\n'),
        ))
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
|
|
|
|
|
|
def test_reports_missing_chrome():
    """Point CHROME_BINARY at a nonexistent path and check the failure output mentions it.

    NOTE(review): when the hook exits 0 nothing is asserted, so this test can
    pass without the error path being exercised -- confirm that is intended.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)

        hook_env = get_test_env()
        hook_env['CHROME_BINARY'] = '/nonexistent/chrome'

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test123'],
            cwd=work_dir,
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        if proc.returncode == 0:
            return

        # On failure, stdout+stderr must mention chrome/browser or carry an ERROR= marker.
        combined = proc.stdout + proc.stderr
        lowered = combined.lower()
        assert 'chrome' in lowered or 'browser' in lowered or 'ERROR=' in combined
|
|
|
|
|
|
def test_custom_resolution_and_user_agent():
    """Run the hook with CHROME_RESOLUTION and CHROME_USER_AGENT set and expect a screenshot.

    Only the exit code and the existence/size of the output file are checked.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-config', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_env.update({
            'CHROME_RESOLUTION': '800,600',
            'CHROME_USER_AGENT': 'Test/1.0',
        })

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-config'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"

        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists(), "screenshot.png not created"
        # Lower size floor than the other tests (500 bytes instead of 1000).
        assert png_path.stat().st_size > 500, "Screenshot too small"
|
|
|
|
|
|
def test_ssl_check_disabled():
    """Run the hook with CHROME_CHECK_SSL_VALIDITY=False and expect a screenshot file."""
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-ssl', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_env['CHROME_CHECK_SSL_VALIDITY'] = 'False'

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ssl'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert proc.returncode == 0, f"Should succeed: {proc.stderr}"
        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists()
|
|
|
|
|
|
def test_config_timeout_honored():
    """Run the hook with CHROME_TIMEOUT=5 and check it finishes with exit code 0 or 1.

    The subprocess call itself is limited to 30 seconds, so a hang surfaces as
    subprocess.TimeoutExpired rather than as an assertion failure.
    """
    import os

    with tempfile.TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)

        # Copy of the current process environment with a very short Chrome timeout.
        hook_env = {**os.environ, 'CHROME_TIMEOUT': '5'}

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=work_dir,
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        # Success or failure are both acceptable here.
        assert proc.returncode in (0, 1), "Should complete without hanging"
|
|
|
|
|
|
def test_missing_url_argument():
    """Invoke the hook without --url and expect a non-zero exit plus a usage/url hint on stderr."""
    with tempfile.TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), '--snapshot-id=test-missing-url'],
            cwd=work_dir,
            env=get_test_env(),
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert proc.returncode != 0, "Should fail when URL is missing"
        stderr_text = proc.stderr
        assert 'Usage:' in stderr_text or 'url' in stderr_text.lower()
|
|
|
|
|
|
def test_missing_snapshot_id_argument():
    """Invoke the hook without --snapshot-id and expect a non-zero exit plus a usage/snapshot hint."""
    with tempfile.TemporaryDirectory() as tmpdir:
        work_dir = Path(tmpdir)

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}'],
            cwd=work_dir,
            env=get_test_env(),
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert proc.returncode != 0, "Should fail when snapshot-id is missing"
        stderr_text = proc.stderr
        assert 'Usage:' in stderr_text or 'snapshot' in stderr_text.lower()
|
|
|
|
|
|
def test_invalid_resolution_format():
    """Feed several malformed CHROME_RESOLUTION values to the hook.

    Each run must finish with exit code 0 or 1; whether the hook rejects the
    value or falls back to a default is not checked.
    """
    malformed_values = ('invalid', '1440', '1440x2000', 'abc,def')

    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-badres', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_cmd = ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-badres']

        for bad_resolution in malformed_values:
            hook_env['CHROME_RESOLUTION'] = bad_resolution
            proc = subprocess.run(
                hook_cmd,
                cwd=str(screenshot_dir),
                env=hook_env,
                capture_output=True,
                text=True,
                timeout=120,
            )
            assert proc.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}"
|
|
|
|
|
|
def test_boolean_env_var_parsing():
    """Run the hook once per truthy spelling of CHROME_HEADLESS.

    Every run must exit with code 0 or 1. Any screenshot produced is removed
    before the next spelling is tried.
    """
    import time

    truthy_spellings = ('true', '1', 'yes', 'on', 'True', 'TRUE')

    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-bool', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_cmd = ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-bool']
        png_path = screenshot_dir / 'screenshot.png'

        for bool_val in truthy_spellings:
            hook_env['CHROME_HEADLESS'] = bool_val
            proc = subprocess.run(
                hook_cmd,
                cwd=str(screenshot_dir),
                env=hook_env,
                capture_output=True,
                text=True,
                timeout=120,
            )
            assert proc.returncode in (0, 1), f"Should handle boolean value: {bool_val}"

            # Remove the output so each iteration starts from an empty directory.
            if png_path.exists():
                png_path.unlink()

            # Short pause between consecutive hook runs.
            time.sleep(0.5)
|
|
|
|
|
|
def test_integer_env_var_parsing():
    """Run the hook with valid, invalid and empty CHROME_TIMEOUT values.

    Every run must exit with code 0 or 1.

    NOTE(review): the second element of each case (always True) is never
    consulted by the loop -- confirm whether it was meant to drive an assertion.
    """
    timeout_cases = (
        ('60', True),       # well-formed integer
        ('invalid', True),  # not a number
        ('', True),         # empty string
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-int', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_cmd = ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-int']
        png_path = screenshot_dir / 'screenshot.png'

        for timeout_val, _should_work in timeout_cases:
            hook_env['CHROME_TIMEOUT'] = timeout_val
            proc = subprocess.run(
                hook_cmd,
                cwd=str(screenshot_dir),
                env=hook_env,
                capture_output=True,
                text=True,
                timeout=120,
            )
            assert proc.returncode in (0, 1), f"Should handle timeout value: {timeout_val}"

            # Remove the output so each iteration starts from an empty directory.
            if png_path.exists():
                png_path.unlink()
|
|
|
|
|
|
def test_extracts_screenshot_with_all_config_options():
    """Run the hook with headless, resolution, user agent, SSL and timeout options all set.

    Expects exit code 0, a 'succeeded' ArchiveResult JSONL record naming
    screenshot.png, and an output file larger than 1000 bytes.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-full', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_env.update({
            'CHROME_HEADLESS': 'true',
            'CHROME_RESOLUTION': '800,600',
            'CHROME_USER_AGENT': 'TestBot/1.0',
            'CHROME_CHECK_SSL_VALIDITY': 'false',
            'CHROME_TIMEOUT': '60',
        })

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-full'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert proc.returncode == 0, f"Screenshot should succeed: {proc.stderr}"

        # Find the first JSON line on stdout whose type is ArchiveResult.
        archive_result = None
        for raw_line in proc.stdout.strip().split('\n'):
            if not raw_line.strip().startswith('{'):
                continue
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            if parsed.get('type') == 'ArchiveResult':
                archive_result = parsed
                break

        assert archive_result, "Should have ArchiveResult JSONL output"
        assert archive_result['status'] == 'succeeded', f"Should succeed: {archive_result}"
        assert 'screenshot.png' in archive_result['output_str']

        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists(), "screenshot.png should be created"
        assert png_path.stat().st_size > 1000, "Screenshot should have content"
|
|
|
|
|
|
def test_headless_mode_false():
    """Run the hook with CHROME_HEADLESS=false and accept exit code 0 or 1."""
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-headless', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_env['CHROME_HEADLESS'] = 'false'

        # The snapshot id passed here differs from the directory name above.
        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-headless-false'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        # Either outcome is acceptable; no output file is checked.
        assert proc.returncode in (0, 1), f"Headless=false should handle: {proc.stderr}"
|
|
|
|
|
|
def test_invalid_url_causes_error():
    """Point the hook at an unresolvable host and expect a non-zero exit with no JSONL on stdout."""
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-invalid', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        # Short Chrome timeout so the run fits inside the 30s subprocess limit.
        hook_env['CHROME_TIMEOUT'] = '5'

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), '--url=http://this-domain-does-not-exist-12345.invalid', '--snapshot-id=snap-invalid'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert proc.returncode != 0, "Should fail on invalid URL"

        # Any stdout line that looks like a JSON object counts as emitted JSONL.
        jsonl_lines = list(filter(
            lambda out_line: out_line.strip().startswith('{'),
            proc.stdout.strip().split('\n'),
        ))
        assert len(jsonl_lines) == 0, f"Should not emit JSONL on error: {jsonl_lines}"
|
|
|
|
|
|
def test_with_corrupted_cdp_url_falls_back():
    """Place an unusable CDP URL next to the screenshot dir and expect a successful fallback.

    The hook must exit 0, log 'Failed to connect to CDP' on stderr, and still
    produce screenshot.png.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-corrupt-cdp',
        )
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        # Sibling chrome/ directory whose cdp_url.txt points at an invalid endpoint.
        chrome_dir = snapshot_dir / 'chrome'
        chrome_dir.mkdir()
        chrome_dir.joinpath('cdp_url.txt').write_text('ws://127.0.0.1:99999/invalid')

        hook_env = get_test_env()
        hook_env.update({
            'CHROME_HEADLESS': 'true',
            'CHROME_TIMEOUT': '5',
        })

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-corrupt-cdp'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert proc.returncode == 0, f"Should fallback and succeed: {proc.stderr}"
        assert 'Failed to connect to CDP' in proc.stderr, "Should log CDP connection failure"

        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists(), "Screenshot should be created via fallback"
|
|
|
|
|
|
def test_user_agent_is_applied():
    """Run the hook with a custom CHROME_USER_AGENT and expect a screenshot file.

    Only the exit code and the presence of the output file are checked; the
    user agent actually sent by the browser is not inspected.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-ua', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_env.update({
            'CHROME_USER_AGENT': 'CustomBot/9.9.9 (Testing)',
            'CHROME_HEADLESS': 'true',
        })

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ua'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert proc.returncode == 0, f"Should succeed with custom UA: {proc.stderr}"
        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists(), "Screenshot should be created"
|
|
|
|
|
|
def test_check_ssl_false_branch():
    """Run the hook with CHROME_CHECK_SSL_VALIDITY='false' (lowercase) in headless mode.

    Expects exit code 0 and a screenshot file.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-nossl', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_env.update({
            'CHROME_CHECK_SSL_VALIDITY': 'false',
            'CHROME_HEADLESS': 'true',
        })

        # Only the string value 'false' is exercised by this single run.
        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-nossl'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert proc.returncode == 0, f"Should work with SSL check disabled: {proc.stderr}"
        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists()
|
|
|
|
|
|
def test_alternative_env_var_names():
    """Run the hook with un-prefixed env vars (TIMEOUT, RESOLUTION, USER_AGENT, CHECK_SSL_VALIDITY).

    Expects exit code 0 and a screenshot file.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-altenv', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        # Same settings as elsewhere in this module, but without the CHROME_ prefix.
        hook_env.update({
            'TIMEOUT': '45',
            'RESOLUTION': '1024,768',
            'USER_AGENT': 'AltBot/1.0',
            'CHECK_SSL_VALIDITY': 'false',
        })

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-altenv'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert proc.returncode == 0, f"Should work with alternative env vars: {proc.stderr}"
        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists()
|
|
|
|
|
|
def test_very_large_resolution():
    """Run the hook with CHROME_RESOLUTION='3840,2160' and expect a file larger than 5000 bytes."""
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-large', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_env.update({
            'CHROME_RESOLUTION': '3840,2160',
            'CHROME_HEADLESS': 'true',
        })

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-large'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert proc.returncode == 0, f"Should handle large resolution: {proc.stderr}"
        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists()
        # Higher size floor than the other tests because of the larger viewport.
        assert png_path.stat().st_size > 5000, "4K screenshot should be substantial"
|
|
|
|
|
|
def test_very_small_resolution():
    """Run the hook with CHROME_RESOLUTION='320,240' and expect a screenshot file."""
    with tempfile.TemporaryDirectory() as tmpdir:
        screenshot_dir = Path(tmpdir).joinpath(
            'users', 'testuser', 'snapshots', '20240101', 'example.com', 'snap-small', 'screenshot',
        )
        screenshot_dir.mkdir(parents=True)

        hook_env = get_test_env()
        hook_env.update({
            'CHROME_RESOLUTION': '320,240',
            'CHROME_HEADLESS': 'true',
        })

        proc = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-small'],
            cwd=str(screenshot_dir),
            env=hook_env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert proc.returncode == 0, f"Should handle small resolution: {proc.stderr}"
        png_path = screenshot_dir / 'screenshot.png'
        assert png_path.exists()
|
|
|
|
|
|
if __name__ == '__main__':
    # Running this module directly hands this file to pytest in verbose mode.
    cli_args = [__file__, '-v']
    pytest.main(cli_args)
|