diff --git a/archivebox/plugins/infiniscroll/config.json b/archivebox/plugins/infiniscroll/config.json
new file mode 100644
index 00000000..8f0304ad
--- /dev/null
+++ b/archivebox/plugins/infiniscroll/config.json
@@ -0,0 +1,46 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "type": "object",
+    "additionalProperties": false,
+    "required_plugins": ["chrome"],
+    "properties": {
+        "INFINISCROLL_ENABLED": {
+            "type": "boolean",
+            "default": true,
+            "x-aliases": ["SAVE_INFINISCROLL", "USE_INFINISCROLL"],
+            "description": "Enable infinite scroll page expansion"
+        },
+        "INFINISCROLL_TIMEOUT": {
+            "type": "integer",
+            "default": 120,
+            "minimum": 10,
+            "x-fallback": "TIMEOUT",
+            "description": "Maximum timeout for scrolling in seconds"
+        },
+        "INFINISCROLL_SCROLL_DELAY": {
+            "type": "integer",
+            "default": 2000,
+            "minimum": 500,
+            "description": "Delay between scrolls in milliseconds"
+        },
+        "INFINISCROLL_SCROLL_DISTANCE": {
+            "type": "integer",
+            "default": 1600,
+            "minimum": 100,
+            "description": "Distance to scroll per step in pixels"
+        },
+        "INFINISCROLL_SCROLL_LIMIT": {
+            "type": "integer",
+            "default": 10,
+            "minimum": 1,
+            "maximum": 100,
+            "description": "Maximum number of scroll steps"
+        },
+        "INFINISCROLL_MIN_HEIGHT": {
+            "type": "integer",
+            "default": 16000,
+            "minimum": 1000,
+            "description": "Minimum page height to scroll to in pixels"
+        }
+    }
+}
diff --git a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js
new file mode 100755
index 00000000..905f1c12
--- /dev/null
+++ b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js
@@ -0,0 +1,333 @@
+#!/usr/bin/env node
+/**
+ * Scroll the page down to trigger infinite scroll / lazy loading.
+ *
+ * Scrolls down 1 page at a time, up to INFINISCROLL_SCROLL_LIMIT times,
+ * ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached.
+ * Stops early if no new content loads after a scroll.
+ *
+ * Usage: on_Snapshot__45_infiniscroll.js --url= --snapshot-id=
+ * Output: JSONL with scroll stats (no files created)
+ *
+ * Environment variables:
+ *     INFINISCROLL_ENABLED: Enable/disable (default: true)
+ *     INFINISCROLL_TIMEOUT: Max timeout in seconds (default: 120)
+ *     INFINISCROLL_SCROLL_DELAY: Delay between scrolls in ms (default: 2000)
+ *     INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600)
+ *     INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10)
+ *     INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000)
+ */
+
+/**
+ * Read an environment variable as a trimmed string.
+ * An unset or empty variable yields the default value instead.
+ */
+function getEnv(name, defaultValue = '') {
+    return (process.env[name] || defaultValue).trim();
+}
+
+/**
+ * Read an environment variable as a boolean.
+ * Accepts true/1/yes/on and false/0/no/off (case-insensitive);
+ * any other value (including unset) yields the default.
+ */
+function getEnvBool(name, defaultValue = false) {
+    const val = getEnv(name, '').toLowerCase();
+    if (['true', '1', 'yes', 'on'].includes(val)) return true;
+    if (['false', '0', 'no', 'off'].includes(val)) return false;
+    return defaultValue;
+}
+
+/**
+ * Read an environment variable as a base-10 integer.
+ * Unset or unparseable values yield the default.
+ */
+function getEnvInt(name, defaultValue = 0) {
+    const val = parseInt(getEnv(name, String(defaultValue)), 10);
+    return isNaN(val) ? defaultValue : val;
+}
+
+// Check if infiniscroll is enabled BEFORE requiring puppeteer
+if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
+    console.error('Skipping infiniscroll (INFINISCROLL_ENABLED=False)');
+    process.exit(0);
+}
+
+const fs = require('fs');
+const path = require('path');
+const puppeteer = require('puppeteer-core');
+
+const PLUGIN_NAME = 'infiniscroll';
+const CHROME_SESSION_DIR = '../chrome';
+
+/**
+ * Parse --key=value CLI arguments into an object.
+ * Dashes in keys become underscores; a flag without a value maps to true.
+ */
+function parseArgs() {
+    const args = {};
+    process.argv.slice(2).forEach(arg => {
+        if (arg.startsWith('--')) {
+            const [key, ...valueParts] = arg.slice(2).split('=');
+            args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
+        }
+    });
+    return args;
+}
+
+/** Read the CDP URL written to the chrome session dir, or null if absent. */
+function getCdpUrl() {
+    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
+    if (fs.existsSync(cdpFile)) {
+        return fs.readFileSync(cdpFile, 'utf8').trim();
+    }
+    return null;
+}
+
+/** Read the tab target ID written to the chrome session dir, or null if absent. */
+function getPageId() {
+    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
+    if (fs.existsSync(targetIdFile)) {
+        return fs.readFileSync(targetIdFile, 'utf8').trim();
+    }
+    return null;
+}
+
+/**
+ * Poll every 100ms until navigation.json exists in the chrome session dir.
+ * Resolves to true once the file exists, or false after timeoutMs.
+ */
+async function waitForChromeTabLoaded(timeoutMs = 60000) {
+    const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
+    const startTime = Date.now();
+
+    while (Date.now() - startTime < timeoutMs) {
+        if (fs.existsSync(navigationFile)) {
+            return true;
+        }
+        await new Promise(resolve => setTimeout(resolve, 100));
+    }
+    return false;
+}
+
+/** Resolve after ms milliseconds. */
+function sleep(ms) {
+    return new Promise(resolve => setTimeout(resolve, ms));
+}
+
+/**
+ * Return the page's current scroll height in pixels.
+ * Uses document.body when present, falls back to the root element for
+ * documents without a body, and returns 0 when neither exists (no throw).
+ */
+function getPageHeight(page) {
+    return page.evaluate(() => {
+        const root = document.body || document.documentElement;
+        return root ? root.scrollHeight : 0;
+    });
+}
+
+/**
+ * Scroll the page down step by step to trigger lazy loading.
+ *
+ * @param {object} page - Puppeteer page to scroll
+ * @param {object} [options]
+ * @param {number} [options.timeout=120000] - Stop starting new scrolls after this many ms
+ * @param {number} [options.scrollDelay=2000] - Wait after each scroll in ms
+ * @param {number} [options.scrollDistance=1600] - Pixels per scroll step
+ * @param {number} [options.scrollLimit=10] - Maximum number of scroll steps
+ * @param {number} [options.minHeight=16000] - Stop once the page is this tall and fully scrolled
+ * @returns {Promise} resolves to { scrollCount, finalHeight, startingHeight, elapsedMs }
+ */
+async function scrollDown(page, options = {}) {
+    const {
+        timeout = 120000,
+        scrollDelay = 2000,
+        scrollDistance = 1600,
+        scrollLimit = 10,
+        minHeight = 16000,
+    } = options;
+
+    const startTime = Date.now();
+    const startingHeight = await getPageHeight(page);
+    let lastHeight = startingHeight;
+    let scrollCount = 0;
+    let scrollPosition = 0;
+
+    // Scroll to top first
+    await page.evaluate(() => {
+        window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
+    });
+    await sleep(500);
+
+    while (scrollCount < scrollLimit) {
+        // Check timeout
+        const elapsed = Date.now() - startTime;
+        if (elapsed >= timeout) {
+            console.error(`Timeout reached after ${scrollCount} scrolls`);
+            break;
+        }
+
+        scrollPosition = (scrollCount + 1) * scrollDistance;
+        console.error(`Scrolling down ${scrollCount + 1}x ${scrollDistance}px... (${scrollPosition}/${lastHeight})`);
+
+        await page.evaluate((yOffset) => {
+            window.scrollTo({ top: yOffset, left: 0, behavior: 'smooth' });
+        }, scrollPosition);
+
+        scrollCount++;
+        await sleep(scrollDelay);
+
+        // Check if new content was added (infinite scroll detection)
+        const newHeight = await getPageHeight(page);
+        const addedPx = newHeight - lastHeight;
+
+        if (addedPx > 0) {
+            console.error(`Detected infini-scrolling: ${lastHeight}+${addedPx} => ${newHeight}`);
+        } else if (scrollPosition >= newHeight + scrollDistance) {
+            // Reached the bottom (requested position is a full step past the page end)
+            // NOTE(review): the scrollCount > 2 guard presumably gives lazy content time to load - confirm
+            if (scrollCount > 2) {
+                console.error(`Reached bottom of page at ${newHeight}px`);
+                break;
+            }
+        }
+
+        lastHeight = newHeight;
+
+        // Check if we've reached minimum height and can stop
+        if (lastHeight >= minHeight && scrollPosition >= lastHeight) {
+            console.error(`Reached minimum height target (${minHeight}px)`);
+            break;
+        }
+    }
+
+    // Scroll to absolute bottom
+    if (scrollPosition < lastHeight) {
+        await page.evaluate(() => {
+            const root = document.body || document.documentElement;
+            window.scrollTo({ top: root ? root.scrollHeight : 0, left: 0, behavior: 'smooth' });
+        });
+        await sleep(scrollDelay);
+    }
+
+    // Scroll back to top
+    console.error(`Reached bottom of page at ${lastHeight}px, scrolling back to top...`);
+    await page.evaluate(() => {
+        window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
+    });
+    await sleep(scrollDelay);
+
+    const totalElapsed = Date.now() - startTime;
+
+    return {
+        scrollCount,
+        finalHeight: lastHeight,
+        startingHeight,
+        elapsedMs: totalElapsed,
+    };
+}
+
+/**
+ * Entry point: validate arguments, attach to the running Chrome tab,
+ * scroll it, and print one ArchiveResult JSONL record on stdout.
+ * Exits 0 on success and 1 on any error.
+ */
+async function main() {
+    const args = parseArgs();
+    const url = args.url;
+    const snapshotId = args.snapshot_id;
+
+    // parseArgs() maps valueless flags to true, so require real string values
+    if (typeof url !== 'string' || typeof snapshotId !== 'string') {
+        console.error('Usage: on_Snapshot__45_infiniscroll.js --url= --snapshot-id=');
+        process.exit(1);
+    }
+
+    const timeout = getEnvInt('INFINISCROLL_TIMEOUT', 120) * 1000;
+    const scrollDelay = getEnvInt('INFINISCROLL_SCROLL_DELAY', 2000);
+    const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600);
+    const scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10);
+    const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000);
+
+    const cdpUrl = getCdpUrl();
+    if (!cdpUrl) {
+        console.error('ERROR: Chrome CDP URL not found (chrome plugin must run first)');
+        process.exit(1);
+    }
+
+    // Wait for page to be loaded
+    const pageLoaded = await waitForChromeTabLoaded(60000);
+    if (!pageLoaded) {
+        console.error('ERROR: Page not loaded after 60s (chrome_navigate must complete first)');
+        process.exit(1);
+    }
+
+    let browser = null;
+    try {
+        browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
+
+        const pages = await browser.pages();
+        if (pages.length === 0) {
+            throw new Error('No pages found in browser');
+        }
+
+        // Find the right page by target ID
+        // NOTE(review): _targetId does not look like public Puppeteer API - confirm on upgrades
+        const targetId = getPageId();
+        let page = null;
+        if (targetId) {
+            page = pages.find(p => {
+                const target = p.target();
+                return target && target._targetId === targetId;
+            });
+        }
+        if (!page) {
+            // No match (or no target ID file): fall back to the last page in the list
+            page = pages[pages.length - 1];
+        }
+
+        console.error(`Starting infinite scroll on ${url}`);
+        const result = await scrollDown(page, {
+            timeout,
+            scrollDelay,
+            scrollDistance,
+            scrollLimit,
+            minHeight,
+        });
+
+        // disconnect() returns a promise in current Puppeteer releases; await it before exiting
+        await browser.disconnect();
+
+        const elapsedSec = (result.elapsedMs / 1000).toFixed(1);
+        const finalHeightStr = result.finalHeight.toLocaleString();
+        const addedHeight = result.finalHeight - result.startingHeight;
+        const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content';
+        const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}) over ${elapsedSec}s`;
+
+        console.error(`Success: ${outputStr}`);
+        console.log(JSON.stringify({
+            type: 'ArchiveResult',
+            status: 'succeeded',
+            output_str: outputStr,
+        }));
+        process.exit(0);
+
+    } catch (e) {
+        if (browser) {
+            // Best effort: a failed disconnect must not mask the original error
+            try {
+                await browser.disconnect();
+            } catch (disconnectError) {
+                // ignored on purpose; the error below is the one worth reporting
+            }
+        }
+        console.error(`ERROR: ${e.name}: ${e.message}`);
+        process.exit(1);
+    }
+}
+
+main().catch(e => {
+    console.error(`Fatal error: ${e.message}`);
+    process.exit(1);
+});
diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
new file mode 100644
index 00000000..7a178958
--- /dev/null
+++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
@@ -0,0 +1,375 @@
+"""
+Integration tests for infiniscroll plugin
+
+Tests verify:
+1. Hook script exists
+2. Dependencies installed via chrome validation hooks
+3. Verify deps with abx-pkg
+4. INFINISCROLL_ENABLED=False skips without JSONL
+5. Fails gracefully when no chrome session exists
+6. Full integration test: scrolls page and outputs stats
+7. Config options work (scroll limit, timeout)
+"""
+
+import json
+import os
+import re
+import signal
+import subprocess
+import time
+import tempfile
+from pathlib import Path
+
+import pytest
+
+
+PLUGIN_DIR = Path(__file__).parent.parent
+PLUGINS_ROOT = PLUGIN_DIR.parent
+INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
+CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
+CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
+CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
+TEST_URL = 'https://www.singsing.movie/'
+
+
+def get_node_modules_dir():
+    """Get NODE_MODULES_DIR for tests, checking env first."""
+    # Check if NODE_PATH is already set in environment
+    if os.environ.get('NODE_PATH'):
+        return Path(os.environ['NODE_PATH'])
+    # Otherwise compute from LIB_DIR
+    from archivebox.config.common import STORAGE_CONFIG
+    lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
+    return lib_dir / 'npm' / 'node_modules'
+
+
+NODE_MODULES_DIR = get_node_modules_dir()
+
+
+def get_test_env():
+    """Get environment with NODE_PATH set correctly."""
+    env = os.environ.copy()
+    env['NODE_PATH'] = str(NODE_MODULES_DIR)
+    return env
+
+
+def test_hook_script_exists():
+    """Verify on_Snapshot hook exists."""
+    assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found"
+    assert INFINISCROLL_HOOK.exists(), f"Hook not found: {INFINISCROLL_HOOK}"
+
+
+def test_verify_deps_with_abx_pkg():
+    """Verify dependencies are available via abx-pkg after hook installation."""
+    # BinProviderOverrides looks unused, but is presumably imported so that
+    # model_rebuild() can resolve it as a forward reference - confirm before removing.
+    from abx_pkg import Binary, EnvProvider, BinProviderOverrides
+
+    EnvProvider.model_rebuild()
+
+    # Verify node is available
+    node_binary = Binary(name='node', binproviders=[EnvProvider()])
+    node_loaded = node_binary.load()
+    assert node_loaded and node_loaded.abspath, "Node.js required for infiniscroll plugin"
+
+
+def test_config_infiniscroll_disabled_skips():
+    """Test that INFINISCROLL_ENABLED=False exits without emitting JSONL."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmpdir = Path(tmpdir)
+        env = get_test_env()
+        env['INFINISCROLL_ENABLED'] = 'False'
+
+        result = subprocess.run(
+            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
+            cwd=tmpdir,
+            capture_output=True,
+            text=True,
+            env=env,
+            timeout=30
+        )
+
+        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
+        assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
+
+        # Should NOT emit any JSONL
+        jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
+        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, got: {jsonl_lines}"
+
+
+def test_fails_gracefully_without_chrome_session():
+    """Test that hook fails gracefully when no chrome session exists."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmpdir = Path(tmpdir)
+
+        result = subprocess.run(
+            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
+            cwd=tmpdir,
+            capture_output=True,
+            text=True,
+            env=get_test_env(),
+            timeout=30
+        )
+
+        # Should fail (exit 1) when no chrome session
+        assert result.returncode != 0, "Should fail when no chrome session exists"
+        # Error could be about chrome/CDP not found, or puppeteer module missing
+        err_lower = result.stderr.lower()
+        assert any(x in err_lower for x in ['chrome', 'cdp', 'puppeteer', 'module']), \
+            f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
+
+
+def parse_archive_result(stdout):
+    """Return the first ArchiveResult JSONL record in stdout, or None if absent."""
+    for line in stdout.strip().split('\n'):
+        line = line.strip()
+        if not line.startswith('{'):
+            continue
+        try:
+            record = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        if record.get('type') == 'ArchiveResult':
+            return record
+    return None
+
+
+def setup_chrome_session(tmpdir):
+    """Helper to set up Chrome session with tab and navigation.
+
+    Returns (chrome_launch_process, chrome_pid, snapshot_chrome_dir).
+    If any step fails, whatever was already started is cleaned up before
+    the error propagates, so a failed setup does not leak a Chrome process.
+    """
+    crawl_dir = Path(tmpdir) / 'crawl'
+    crawl_dir.mkdir()
+    chrome_dir = crawl_dir / 'chrome'
+
+    env = get_test_env()
+    env['CHROME_HEADLESS'] = 'true'
+
+    # Launch Chrome at crawl level
+    chrome_launch_process = subprocess.Popen(
+        ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
+        cwd=str(crawl_dir),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        env=env
+    )
+    chrome_pid = None
+
+    try:
+        # Wait for Chrome to launch
+        for _ in range(15):
+            if chrome_launch_process.poll() is not None:
+                stdout, stderr = chrome_launch_process.communicate()
+                raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
+            if (chrome_dir / 'cdp_url.txt').exists():
+                break
+            time.sleep(1)
+
+        if not (chrome_dir / 'cdp_url.txt').exists():
+            raise RuntimeError("Chrome CDP URL not found after 15s")
+
+        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
+
+        # Create snapshot directory structure
+        snapshot_dir = Path(tmpdir) / 'snapshot'
+        snapshot_dir.mkdir()
+        snapshot_chrome_dir = snapshot_dir / 'chrome'
+        snapshot_chrome_dir.mkdir()
+
+        # Create tab
+        tab_env = env.copy()
+        tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
+        result = subprocess.run(
+            ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll', '--crawl-id=test-infiniscroll'],
+            cwd=str(snapshot_chrome_dir),
+            capture_output=True,
+            text=True,
+            timeout=60,
+            env=tab_env
+        )
+        if result.returncode != 0:
+            raise RuntimeError(f"Tab creation failed: {result.stderr}")
+
+        # Navigate to URL
+        result = subprocess.run(
+            ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
+            cwd=str(snapshot_chrome_dir),
+            capture_output=True,
+            text=True,
+            timeout=120,
+            env=env
+        )
+        if result.returncode != 0:
+            raise RuntimeError(f"Navigation failed: {result.stderr}")
+    except BaseException:
+        # Don't leak the Chrome launched above when a later step fails
+        cleanup_chrome(chrome_launch_process, chrome_pid)
+        raise
+
+    return chrome_launch_process, chrome_pid, snapshot_chrome_dir
+
+
+def cleanup_chrome(chrome_launch_process, chrome_pid):
+    """Helper to clean up Chrome processes.
+
+    Either argument may be None (nothing to clean up for that one).
+    """
+    if chrome_launch_process is not None:
+        try:
+            chrome_launch_process.send_signal(signal.SIGTERM)
+            chrome_launch_process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            # Did not exit on SIGTERM in time: force-kill and reap it
+            chrome_launch_process.kill()
+            chrome_launch_process.wait()
+        except OSError:
+            pass  # already gone
+    if chrome_pid is not None:
+        try:
+            os.kill(chrome_pid, signal.SIGKILL)
+        except OSError:
+            pass  # already gone
+
+
+def test_scrolls_page_and_outputs_stats():
+    """Integration test: scroll page and verify JSONL output format."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        chrome_launch_process = None
+        chrome_pid = None
+        try:
+            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
+
+            # Create infiniscroll output directory (sibling to chrome)
+            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
+            infiniscroll_dir.mkdir()
+
+            # Run infiniscroll hook
+            env = get_test_env()
+            env['INFINISCROLL_SCROLL_LIMIT'] = '3'  # Limit scrolls for faster test
+            env['INFINISCROLL_SCROLL_DELAY'] = '500'  # Faster scrolling
+            env['INFINISCROLL_MIN_HEIGHT'] = '1000'  # Lower threshold for test
+
+            result = subprocess.run(
+                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
+                cwd=str(infiniscroll_dir),
+                capture_output=True,
+                text=True,
+                timeout=60,
+                env=env
+            )
+
+            assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}\nStdout: {result.stdout}"
+
+            # Parse JSONL output
+            result_json = parse_archive_result(result.stdout)
+
+            assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {result.stdout}"
+            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
+
+            # Verify output_str format: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs"
+            output_str = result_json.get('output_str', '')
+            assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}"
+            assert 'px' in output_str, f"output_str should contain pixel count: {output_str}"
+            assert re.search(r'over \d+(\.\d+)?s', output_str), f"output_str should contain duration: {output_str}"
+
+            # Verify no files created in output directory
+            output_files = list(infiniscroll_dir.iterdir())
+            assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
+
+        finally:
+            if chrome_launch_process is not None:
+                cleanup_chrome(chrome_launch_process, chrome_pid)
+
+
+def test_config_scroll_limit_honored():
+    """Test that INFINISCROLL_SCROLL_LIMIT config is respected."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        chrome_launch_process = None
+        chrome_pid = None
+        try:
+            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
+
+            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
+            infiniscroll_dir.mkdir()
+
+            # Set scroll limit to 2
+            env = get_test_env()
+            env['INFINISCROLL_SCROLL_LIMIT'] = '2'
+            env['INFINISCROLL_SCROLL_DELAY'] = '500'
+            env['INFINISCROLL_MIN_HEIGHT'] = '100000'  # High threshold so limit kicks in
+
+            result = subprocess.run(
+                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
+                cwd=str(infiniscroll_dir),
+                capture_output=True,
+                text=True,
+                timeout=60,
+                env=env
+            )
+
+            assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}"
+
+            # Parse output and verify scroll count
+            result_json = parse_archive_result(result.stdout)
+
+            assert result_json is not None, "Should have JSONL output"
+            output_str = result_json.get('output_str', '')
+
+            # Verify output format and that it completed (scroll limit enforced internally)
+            assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
+            assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}"
+
+            # The hook logs "Scrolling down Nx ..." to stderr for every step,
+            # so a third step must never appear when the limit is 2
+            assert 'Scrolling down 3x' not in result.stderr, \
+                f"Should stop after 2 scrolls: {result.stderr}"
+
+        finally:
+            if chrome_launch_process is not None:
+                cleanup_chrome(chrome_launch_process, chrome_pid)
+
+
+def test_config_timeout_honored():
+    """Test that INFINISCROLL_TIMEOUT config is respected."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        chrome_launch_process = None
+        chrome_pid = None
+        try:
+            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
+
+            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
+            infiniscroll_dir.mkdir()
+
+            # Set very short timeout
+            env = get_test_env()
+            env['INFINISCROLL_TIMEOUT'] = '3'  # 3 seconds
+            env['INFINISCROLL_SCROLL_DELAY'] = '2000'  # 2s delay - timeout should trigger
+            env['INFINISCROLL_SCROLL_LIMIT'] = '100'  # High limit
+            env['INFINISCROLL_MIN_HEIGHT'] = '100000'
+
+            start_time = time.time()
+            result = subprocess.run(
+                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
+                cwd=str(infiniscroll_dir),
+                capture_output=True,
+                text=True,
+                timeout=30,
+                env=env
+            )
+            elapsed = time.time() - start_time
+
+            # Should complete within reasonable time (timeout + buffer)
+            assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
+            assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}"
+
+        finally:
+            if chrome_launch_process is not None:
+                cleanup_chrome(chrome_launch_process, chrome_pid)
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])