mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 09:25:42 +10:00
244 lines
9.5 KiB
Python
244 lines
9.5 KiB
Python
"""
Integration tests for infiniscroll plugin

Tests verify:
1. Hook script exists
2. Dependencies installed via chrome validation hooks
3. Verify deps with abx-pkg
4. INFINISCROLL_ENABLED=False skips without JSONL
5. Fails gracefully when no chrome session exists
6. Full integration test: scrolls page and outputs stats
7. Config options work (scroll limit, min height, timeout)
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import time
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Import shared Chrome test helpers
|
|
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
|
get_test_env,
|
|
chrome_session,
|
|
)
|
|
|
|
|
|
# Plugin root: two directory levels above this test module.
PLUGIN_DIR = Path(__file__).parent.parent

# First on_Snapshot hook script whose name ends in "_infiniscroll.<ext>",
# or None when the plugin directory contains no such script.
_hook_candidates = PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*')
INFINISCROLL_HOOK = next(iter(_hook_candidates), None)
del _hook_candidates  # keep the module namespace identical to the original

# Page that every hook invocation in this module is pointed at.
TEST_URL = 'https://www.singsing.movie/'
|
|
|
|
|
|
def test_hook_script_exists():
    """Check that the infiniscroll on_Snapshot hook was discovered and exists on disk."""
    hook = INFINISCROLL_HOOK
    # The module-level glob yields None when no matching script is present.
    assert hook is not None, "Infiniscroll hook not found"
    assert hook.exists(), f"Hook not found: {INFINISCROLL_HOOK}"
|
|
|
|
|
|
def test_verify_deps_with_abx_pkg():
    """Check via abx-pkg that a Node.js binary can be loaded from the environment."""
    # BinProviderOverrides is not referenced directly below.
    # NOTE(review): presumably it has to be in scope so that model_rebuild()
    # can resolve forward references -- confirm before removing this import.
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides

    EnvProvider.model_rebuild()

    # Look up `node` using only the environment-based provider.
    env_provider = EnvProvider()
    loaded_node = Binary(name='node', binproviders=[env_provider]).load()
    assert loaded_node and loaded_node.abspath, "Node.js required for infiniscroll plugin"
|
|
|
|
|
|
def test_config_infiniscroll_disabled_skips():
    """Run the hook with INFINISCROLL_ENABLED=False; expect exit 0, a skip log, and no JSONL."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        hook_env = get_test_env()
        hook_env['INFINISCROLL_ENABLED'] = 'False'

        command = [
            'node',
            str(INFINISCROLL_HOOK),
            f'--url={TEST_URL}',
            '--snapshot-id=test-disabled',
        ]
        result = subprocess.run(
            command,
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"

        # Either marker in stderr is accepted as evidence of a logged skip reason.
        skip_logged = any(marker in result.stderr for marker in ('Skipping', 'False'))
        assert skip_logged, "Should log skip reason to stderr"

        # Any stdout line that starts like a JSON object counts as emitted JSONL.
        jsonl_lines = []
        for raw_line in result.stdout.strip().split('\n'):
            if raw_line.strip().startswith('{'):
                jsonl_lines.append(raw_line)
        assert not jsonl_lines, f"Should not emit JSONL when feature disabled, got: {jsonl_lines}"
|
|
|
|
|
|
def test_fails_gracefully_without_chrome_session():
    """Run the hook in an empty directory (no chrome session); expect a non-zero exit and a relevant error."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        command = [
            'node',
            str(INFINISCROLL_HOOK),
            f'--url={TEST_URL}',
            '--snapshot-id=test-no-chrome',
        ]
        result = subprocess.run(
            command,
            cwd=workdir,
            capture_output=True,
            text=True,
            env=get_test_env(),
            timeout=30,
        )

        # Any non-zero exit code is accepted as a failure.
        assert result.returncode != 0, "Should fail when no chrome session exists"

        # The error may be about chrome/CDP not being found, or about the
        # puppeteer module being missing, so several keywords are accepted.
        stderr_lower = result.stderr.lower()
        expected_terms = ('chrome', 'cdp', 'puppeteer', 'module')
        matched_terms = [term for term in expected_terms if term in stderr_lower]
        assert matched_terms, f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
|
|
|
|
|
|
def test_scrolls_page_and_outputs_stats():
    """Run the hook against a live chrome session and check the ArchiveResult JSONL it prints."""
    with tempfile.TemporaryDirectory() as tmp, chrome_session(
        Path(tmp),
        crawl_id='test-infiniscroll',
        snapshot_id='snap-infiniscroll',
        test_url=TEST_URL,
    ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
        # The hook runs from its own output directory, a sibling of the chrome dir.
        infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
        infiniscroll_dir.mkdir()

        # Small values keep the test fast.
        env['INFINISCROLL_SCROLL_LIMIT'] = '3'    # Limit scrolls for faster test
        env['INFINISCROLL_SCROLL_DELAY'] = '500'  # Faster scrolling
        env['INFINISCROLL_MIN_HEIGHT'] = '1000'   # Lower threshold for test

        command = [
            'node',
            str(INFINISCROLL_HOOK),
            f'--url={TEST_URL}',
            '--snapshot-id=snap-infiniscroll',
        ]
        result = subprocess.run(
            command,
            cwd=str(infiniscroll_dir),
            capture_output=True,
            text=True,
            timeout=60,
            env=env,
        )

        assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}\nStdout: {result.stdout}"

        # Pick the first stdout line that parses as JSON with type == 'ArchiveResult'.
        archive_result = None
        for raw_line in result.stdout.strip().split('\n'):
            candidate = raw_line.strip()
            if not candidate.startswith('{'):
                continue
            try:
                record = json.loads(candidate)
            except json.JSONDecodeError:
                # Looks like JSON but is not; skip it and keep scanning.
                continue
            if record.get('type') == 'ArchiveResult':
                archive_result = record
                break

        assert archive_result is not None, f"Should have ArchiveResult JSONL output. Stdout: {result.stdout}"
        assert archive_result['status'] == 'succeeded', f"Should succeed: {archive_result}"

        # Expected output_str shape: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs"
        output_str = archive_result.get('output_str', '')
        assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}"
        assert 'px' in output_str, f"output_str should contain pixel count: {output_str}"
        duration_match = re.search(r'over \d+(\.\d+)?s', output_str)
        assert duration_match, f"output_str should contain duration: {output_str}"

        # The hook is expected to leave its output directory empty.
        output_files = list(infiniscroll_dir.iterdir())
        assert not output_files, f"Should not create any files, but found: {output_files}"
|
|
|
|
|
|
def test_config_scroll_limit_honored():
    """Run the hook with INFINISCROLL_SCROLL_LIMIT=2 and check that it still reports success.

    Only the output format and the 'succeeded' status are asserted; the number
    of scrolls actually performed is not checked here.
    """
    with tempfile.TemporaryDirectory() as tmp, chrome_session(
        Path(tmp),
        crawl_id='test-scroll-limit',
        snapshot_id='snap-limit',
        test_url=TEST_URL,
    ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
        infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
        infiniscroll_dir.mkdir()

        # Reuse the env yielded by chrome_session and cap scrolling at 2.
        env['INFINISCROLL_SCROLL_LIMIT'] = '2'
        env['INFINISCROLL_SCROLL_DELAY'] = '500'
        env['INFINISCROLL_MIN_HEIGHT'] = '100000'  # High threshold so limit kicks in

        command = [
            'node',
            str(INFINISCROLL_HOOK),
            f'--url={TEST_URL}',
            '--snapshot-id=snap-limit',
        ]
        result = subprocess.run(
            command,
            cwd=str(infiniscroll_dir),
            capture_output=True,
            text=True,
            timeout=60,
            env=env,
        )

        assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}"

        # Pick the first stdout line that parses as JSON with type == 'ArchiveResult'.
        archive_result = None
        for raw_line in result.stdout.strip().split('\n'):
            if not raw_line.strip().startswith('{'):
                continue
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                # Looks like JSON but is not; skip it and keep scanning.
                continue
            if record.get('type') == 'ArchiveResult':
                archive_result = record
                break

        assert archive_result is not None, "Should have JSONL output"
        output_str = archive_result.get('output_str', '')

        # Verify output format and that it completed (scroll limit enforced internally)
        assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
        assert archive_result['status'] == 'succeeded', f"Should succeed with scroll limit: {archive_result}"
|
|
|
|
|
|
|
|
def test_config_timeout_honored():
    """Run the hook with a short INFINISCROLL_TIMEOUT and check that it finishes promptly.

    The hook is given a 3 second timeout, a 2 second scroll delay and a high
    scroll limit / min height, so the timeout is what should end the run. The
    test passes when the hook exits 0 in under 15 seconds.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        with chrome_session(
            Path(tmpdir),
            crawl_id='test-timeout',
            snapshot_id='snap-timeout',
            test_url=TEST_URL,
        ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):

            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()

            # Set very short timeout (reusing the env yielded by chrome_session)
            env['INFINISCROLL_TIMEOUT'] = '3'  # 3 seconds
            env['INFINISCROLL_SCROLL_DELAY'] = '2000'  # 2s delay - timeout should trigger
            env['INFINISCROLL_SCROLL_LIMIT'] = '100'  # High limit
            env['INFINISCROLL_MIN_HEIGHT'] = '100000'

            # Use the monotonic clock: wall-clock time.time() can jump (NTP
            # sync, manual adjustment) and would make the bound below flaky.
            start_time = time.monotonic()
            result = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            elapsed = time.monotonic() - start_time

            # Should complete within reasonable time (timeout + buffer)
            assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
            assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}"
|
|
|
|
|
|
|
|
# Allow running this module directly: hand this file to pytest in verbose mode.
if __name__ == '__main__':
    pytest_args = [__file__, '-v']
    pytest.main(pytest_args)
|