add infiniscroll plugin

This commit is contained in:
Nick Sweeting
2025-12-29 13:11:20 -08:00
parent e20fdae2a5
commit 8d76b2b0c6
3 changed files with 665 additions and 0 deletions

View File

@@ -0,0 +1,46 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"INFINISCROLL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_INFINISCROLL", "USE_INFINISCROLL"],
"description": "Enable infinite scroll page expansion"
},
"INFINISCROLL_TIMEOUT": {
"type": "integer",
"default": 120,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Maximum timeout for scrolling in seconds"
},
"INFINISCROLL_SCROLL_DELAY": {
"type": "integer",
"default": 2000,
"minimum": 500,
"description": "Delay between scrolls in milliseconds"
},
"INFINISCROLL_SCROLL_DISTANCE": {
"type": "integer",
"default": 1600,
"minimum": 100,
"description": "Distance to scroll per step in pixels"
},
"INFINISCROLL_SCROLL_LIMIT": {
"type": "integer",
"default": 10,
"minimum": 1,
"maximum": 100,
"description": "Maximum number of scroll steps"
},
"INFINISCROLL_MIN_HEIGHT": {
"type": "integer",
"default": 16000,
"minimum": 1000,
"description": "Minimum page height to scroll to in pixels"
}
}
}

View File

@@ -0,0 +1,267 @@
#!/usr/bin/env node
/**
* Scroll the page down to trigger infinite scroll / lazy loading.
*
* Scrolls down 1 page at a time, up to INFINISCROLL_SCROLL_LIMIT times,
* ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached.
* Stops early if no new content loads after a scroll.
*
* Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>
* Output: JSONL with scroll stats (no files created)
*
* Environment variables:
* INFINISCROLL_ENABLED: Enable/disable (default: true)
* INFINISCROLL_TIMEOUT: Max timeout in seconds (default: 120)
* INFINISCROLL_SCROLL_DELAY: Delay between scrolls in ms (default: 2000)
* INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600)
* INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10)
* INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000)
*/
// Read a string config value from the environment, trimmed of surrounding
// whitespace; returns `defaultValue` (trimmed) when the variable is unset/empty.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
/**
 * Read a boolean config value from the environment.
 * Accepts true/1/yes/on and false/0/no/off (case-insensitive, trimmed);
 * any other value (including unset) yields `defaultValue`.
 */
function getEnvBool(name, defaultValue = false) {
  const normalized = (process.env[name] || '').trim().toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
/**
 * Read an integer config value from the environment.
 * Falls back to `defaultValue` when the variable is unset, empty, or not a
 * parseable base-10 integer.
 */
function getEnvInt(name, defaultValue = 0) {
  const raw = (process.env[name] || String(defaultValue)).trim();
  const parsed = Number.parseInt(raw, 10);
  // Number.isNaN avoids the coercing global isNaN (idiomatic ES2015+).
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Check if infiniscroll is enabled BEFORE requiring puppeteer
// (keeps the disabled path fast and dependency-free; exit 0 signals a clean
// skip, and no JSONL record is written to stdout in that case).
if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
  console.error('Skipping infiniscroll (INFINISCROLL_ENABLED=False)');
  process.exit(0);
}
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Plugin identifier (not referenced elsewhere in this script).
const PLUGIN_NAME = 'infiniscroll';
// Chrome session state dir written by the chrome plugin; relative path —
// presumably this hook's cwd is the snapshot's ./infiniscroll dir, a sibling
// of ./chrome (the integration tests run it that way).
const CHROME_SESSION_DIR = '../chrome';
/**
 * Parse `--key=value` CLI flags from process.argv into an object.
 * Dashes in flag names become underscores; flags with no (or empty) value
 * map to `true`. Non `--` tokens are ignored.
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const body = token.slice(2);
    const eq = body.indexOf('=');
    const key = (eq === -1 ? body : body.slice(0, eq)).replace(/-/g, '_');
    const rawValue = eq === -1 ? '' : body.slice(eq + 1);
    parsed[key] = rawValue || true;
  }
  return parsed;
}
// Read the Chrome DevTools Protocol websocket URL written by the chrome
// plugin into CHROME_SESSION_DIR/cdp_url.txt. Returns null if the chrome
// session has not been started yet.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) return null;
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Read the devtools target id of the tab opened for this snapshot from
// CHROME_SESSION_DIR/target_id.txt. Returns null if no tab was created yet.
function getPageId() {
  const idFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
  if (!fs.existsSync(idFile)) return null;
  return fs.readFileSync(idFile, 'utf8').trim();
}
// Poll (every 100ms, up to timeoutMs) for the navigation.json marker that
// the chrome_navigate hook writes once the page has finished loading.
// Resolves true when the marker appears, false on timeout.
async function waitForChromeTabLoaded(timeoutMs = 60000) {
  const marker = path.join(CHROME_SESSION_DIR, 'navigation.json');
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    if (fs.existsSync(marker)) {
      return true;
    }
    await new Promise((resolve) => setTimeout(resolve, 100));
  }
  return false;
}
// Resolve after `ms` milliseconds (Promise-based delay helper).
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
/**
 * Scroll the page down in fixed-size steps to trigger lazy loading /
 * infinite scroll, tracking document height changes between steps.
 *
 * @param {object} page - puppeteer Page connected to the target tab
 * @param {object} [options]
 * @param {number} [options.timeout=120000] - overall budget in ms; loop stops once exceeded
 * @param {number} [options.scrollDelay=2000] - wait after each scroll step in ms
 * @param {number} [options.scrollDistance=1600] - pixels advanced per step
 * @param {number} [options.scrollLimit=10] - maximum number of scroll steps
 * @param {number} [options.minHeight=16000] - page height at which scrolling may stop early
 * @returns {Promise<{scrollCount: number, finalHeight: number, startingHeight: number, elapsedMs: number}>}
 */
async function scrollDown(page, options = {}) {
  const {
    timeout = 120000,
    scrollDelay = 2000,
    scrollDistance = 1600,
    scrollLimit = 10,
    minHeight = 16000,
  } = options;
  const startTime = Date.now();
  const startingHeight = await page.evaluate(() => document.body.scrollHeight);
  let lastHeight = startingHeight;
  let scrollCount = 0;
  let scrollPosition = 0;
  // Scroll to top first
  await page.evaluate(() => {
    window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
  });
  await sleep(500);
  while (scrollCount < scrollLimit) {
    // Check timeout
    const elapsed = Date.now() - startTime;
    if (elapsed >= timeout) {
      console.error(`Timeout reached after ${scrollCount} scrolls`);
      break;
    }
    // Absolute target offset for this step (step index is 1-based here).
    scrollPosition = (scrollCount + 1) * scrollDistance;
    console.error(`Scrolling down ${scrollCount + 1}x ${scrollDistance}px... (${scrollPosition}/${lastHeight})`);
    await page.evaluate((yOffset) => {
      window.scrollTo({ top: yOffset, left: 0, behavior: 'smooth' });
    }, scrollPosition);
    scrollCount++;
    await sleep(scrollDelay);
    // Check if new content was added (infinite scroll detection)
    const newHeight = await page.evaluate(() => document.body.scrollHeight);
    const addedPx = newHeight - lastHeight;
    if (addedPx > 0) {
      console.error(`Detected infini-scrolling: ${lastHeight}+${addedPx} => ${newHeight}`);
    } else if (scrollPosition >= newHeight + scrollDistance) {
      // Reached the bottom
      // NOTE(review): "bottom" requires being a full scrollDistance PAST the
      // page end AND at least 3 completed steps — presumably slack for
      // smooth-scroll lag / late content; confirm the +scrollDistance margin
      // and the scrollCount > 2 threshold are intentional.
      if (scrollCount > 2) {
        console.error(`Reached bottom of page at ${newHeight}px`);
        break;
      }
    }
    lastHeight = newHeight;
    // Check if we've reached minimum height and can stop
    if (lastHeight >= minHeight && scrollPosition >= lastHeight) {
      console.error(`Reached minimum height target (${minHeight}px)`);
      break;
    }
  }
  // Scroll to absolute bottom (only if the stepped loop stopped short of it)
  if (scrollPosition < lastHeight) {
    await page.evaluate(() => {
      window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' });
    });
    await sleep(scrollDelay);
  }
  // Scroll back to top so later hooks (e.g. screenshots) start from the top.
  console.error(`Reached bottom of page at ${lastHeight}px, scrolling back to top...`);
  await page.evaluate(() => {
    window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
  });
  await sleep(scrollDelay);
  const totalElapsed = Date.now() - startTime;
  return {
    scrollCount,
    finalHeight: lastHeight,
    startingHeight,
    elapsedMs: totalElapsed,
  };
}
/**
 * Entry point: connect to the already-running Chrome session over CDP, find
 * the snapshot's tab, run scrollDown(), then emit a single ArchiveResult
 * JSONL record on stdout (all human-readable logging goes to stderr).
 * Exits 0 on success, 1 on any failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  // Config from env vars (see the plugin's JSON schema); TIMEOUT is s -> ms.
  const timeout = getEnvInt('INFINISCROLL_TIMEOUT', 120) * 1000;
  const scrollDelay = getEnvInt('INFINISCROLL_SCROLL_DELAY', 2000);
  const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600);
  const scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10);
  const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000);
  const cdpUrl = getCdpUrl();
  if (!cdpUrl) {
    console.error('ERROR: Chrome CDP URL not found (chrome plugin must run first)');
    process.exit(1);
  }
  // Wait for the navigation.json marker written by chrome_navigate.
  const pageLoaded = await waitForChromeTabLoaded(60000);
  if (!pageLoaded) {
    console.error('ERROR: Page not loaded after 60s (chrome_navigate must complete first)');
    process.exit(1);
  }
  let browser = null;
  try {
    browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
    const pages = await browser.pages();
    if (pages.length === 0) {
      throw new Error('No pages found in browser');
    }
    // Find the right page by target ID
    // NOTE(review): _targetId is a private puppeteer field and may break
    // across puppeteer-core versions; confirm no public accessor exists.
    const targetId = getPageId();
    let page = null;
    if (targetId) {
      page = pages.find(p => {
        const target = p.target();
        return target && target._targetId === targetId;
      });
    }
    if (!page) {
      // Fall back to the last page in the list when no target id matched.
      page = pages[pages.length - 1];
    }
    console.error(`Starting infinite scroll on ${url}`);
    const result = await scrollDown(page, {
      timeout,
      scrollDelay,
      scrollDistance,
      scrollLimit,
      minHeight,
    });
    // disconnect() detaches without killing Chrome (owned by the chrome plugin).
    browser.disconnect();
    const elapsedSec = (result.elapsedMs / 1000).toFixed(1);
    const finalHeightStr = result.finalHeight.toLocaleString();
    const addedHeight = result.finalHeight - result.startingHeight;
    const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content';
    const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}) over ${elapsedSec}s`;
    console.error(`Success: ${outputStr}`);
    // The only stdout output: one ArchiveResult JSONL record (no files created).
    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status: 'succeeded',
      output_str: outputStr,
    }));
    process.exit(0);
  } catch (e) {
    if (browser) browser.disconnect();
    console.error(`ERROR: ${e.name}: ${e.message}`);
    process.exit(1);
  }
}
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,352 @@
"""
Integration tests for infiniscroll plugin
Tests verify:
1. Hook script exists
2. Dependencies installed via chrome validation hooks
3. Verify deps with abx-pkg
4. INFINISCROLL_ENABLED=False skips without JSONL
5. Fails gracefully when no chrome session exists
6. Full integration test: scrolls page and outputs stats
7. Config options work (scroll limit, min height)
"""
import json
import os
import re
import signal
import subprocess
import time
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
def get_node_modules_dir():
    """Resolve the node_modules dir used as NODE_PATH for test subprocesses."""
    # An explicit NODE_PATH in the environment always wins.
    node_path = os.environ.get('NODE_PATH')
    if node_path:
        return Path(node_path)
    # Fall back to the configured LIB_DIR (env override first, then settings).
    from archivebox.config.common import STORAGE_CONFIG
    lib_dir = os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)
    return Path(lib_dir) / 'npm' / 'node_modules'
NODE_MODULES_DIR = get_node_modules_dir()
def get_test_env():
    """Return a copy of os.environ with NODE_PATH pointing at NODE_MODULES_DIR."""
    child_env = dict(os.environ)
    child_env['NODE_PATH'] = str(NODE_MODULES_DIR)
    return child_env
def test_hook_script_exists():
    """Verify on_Snapshot hook exists.

    INFINISCROLL_HOOK is resolved by glob at import time, so it is None when
    no matching file exists — check that first for a clearer failure message.
    """
    assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found"
    assert INFINISCROLL_HOOK.exists(), f"Hook not found: {INFINISCROLL_HOOK}"
def test_verify_deps_with_abx_pkg():
    """Verify dependencies are available via abx-pkg after hook installation."""
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides
    # Finalize the pydantic model before instantiating EnvProvider.
    EnvProvider.model_rebuild()
    # Verify node is available on PATH (the hook is a node script).
    node_binary = Binary(name='node', binproviders=[EnvProvider()])
    node_loaded = node_binary.load()
    assert node_loaded and node_loaded.abspath, "Node.js required for infiniscroll plugin"
def test_config_infiniscroll_disabled_skips():
    """Disabling the plugin must exit 0, log a skip reason, and emit no JSONL."""
    with tempfile.TemporaryDirectory() as scratch:
        hook_env = get_test_env()
        hook_env['INFINISCROLL_ENABLED'] = 'False'
        proc = subprocess.run(
            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
            cwd=Path(scratch),
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )
        assert proc.returncode == 0, f"Should exit 0 when feature disabled: {proc.stderr}"
        assert 'Skipping' in proc.stderr or 'False' in proc.stderr, "Should log skip reason to stderr"
        # Should NOT emit any JSONL records on stdout when disabled.
        emitted = [ln for ln in proc.stdout.strip().split('\n') if ln.strip().startswith('{')]
        assert len(emitted) == 0, f"Should not emit JSONL when feature disabled, got: {emitted}"
def test_fails_gracefully_without_chrome_session():
    """Running the hook with no chrome session must exit non-zero with a clear error."""
    with tempfile.TemporaryDirectory() as scratch:
        proc = subprocess.run(
            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
            cwd=Path(scratch),
            capture_output=True,
            text=True,
            env=get_test_env(),
            timeout=30,
        )
        # Should fail (exit 1) when no chrome session
        assert proc.returncode != 0, "Should fail when no chrome session exists"
        # Error could be about chrome/CDP not found, or puppeteer module missing
        stderr_text = proc.stderr.lower()
        expected_hints = ('chrome', 'cdp', 'puppeteer', 'module')
        assert any(hint in stderr_text for hint in expected_hints), \
            f"Should mention chrome/CDP/puppeteer in error: {proc.stderr}"
def setup_chrome_session(tmpdir):
    """Helper to set up Chrome session with tab and navigation.

    Launches Chrome via the chrome plugin's crawl-level hook, waits for its
    CDP URL file to appear, then creates a tab and navigates it to TEST_URL
    using the snapshot-level chrome hooks.

    Returns (chrome_launch_process, chrome_pid, snapshot_chrome_dir) so the
    caller can run further hooks from a sibling dir and clean up afterwards.

    Raises RuntimeError if any stage (launch, tab creation, navigation) fails.
    """
    crawl_dir = Path(tmpdir) / 'crawl'
    crawl_dir.mkdir()
    chrome_dir = crawl_dir / 'chrome'
    env = get_test_env()
    env['CHROME_HEADLESS'] = 'true'
    # Launch Chrome at crawl level (stays running in the background).
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
        cwd=str(crawl_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )
    # Wait up to 15s for Chrome to launch (cdp_url.txt appears when ready).
    for i in range(15):
        if chrome_launch_process.poll() is not None:
            # The launch hook exited early => Chrome failed to start.
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if (chrome_dir / 'cdp_url.txt').exists():
            break
        time.sleep(1)
    if not (chrome_dir / 'cdp_url.txt').exists():
        raise RuntimeError("Chrome CDP URL not found after 15s")
    chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
    # Create snapshot directory structure
    snapshot_dir = Path(tmpdir) / 'snapshot'
    snapshot_dir.mkdir()
    snapshot_chrome_dir = snapshot_dir / 'chrome'
    snapshot_chrome_dir.mkdir()
    # Create tab (CRAWL_OUTPUT_DIR points the tab hook at the crawl session)
    tab_env = env.copy()
    tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
    result = subprocess.run(
        ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll', '--crawl-id=test-infiniscroll'],
        cwd=str(snapshot_chrome_dir),
        capture_output=True,
        text=True,
        timeout=60,
        env=tab_env
    )
    if result.returncode != 0:
        raise RuntimeError(f"Tab creation failed: {result.stderr}")
    # Navigate to URL
    result = subprocess.run(
        ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
        cwd=str(snapshot_chrome_dir),
        capture_output=True,
        text=True,
        timeout=120,
        env=env
    )
    if result.returncode != 0:
        raise RuntimeError(f"Navigation failed: {result.stderr}")
    return chrome_launch_process, chrome_pid, snapshot_chrome_dir
def cleanup_chrome(chrome_launch_process, chrome_pid):
    """Terminate the chrome launch hook process and kill the Chrome browser.

    Best-effort: each step ignores errors from processes that already exited.
    """
    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except Exception:
        # A bare `except:` here would also swallow KeyboardInterrupt/SystemExit
        # during test teardown; Exception covers dead-process and timeout errors.
        pass
    try:
        os.kill(chrome_pid, signal.SIGKILL)
    except OSError:
        # Process already gone (ProcessLookupError is a subclass of OSError).
        pass
def test_scrolls_page_and_outputs_stats():
    """Integration test: scroll page and verify JSONL output format.

    Spins up a real Chrome session, runs the infiniscroll hook with fast/small
    config, then checks the ArchiveResult record's output_str format and that
    the hook creates no files in its output dir.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        chrome_launch_process = None
        chrome_pid = None
        try:
            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
            # Create infiniscroll output directory (sibling to chrome) — the
            # hook resolves the chrome session via its relative ../chrome path.
            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()
            # Run infiniscroll hook
            env = get_test_env()
            env['INFINISCROLL_SCROLL_LIMIT'] = '3'  # Limit scrolls for faster test
            env['INFINISCROLL_SCROLL_DELAY'] = '500'  # Faster scrolling
            env['INFINISCROLL_MIN_HEIGHT'] = '1000'  # Lower threshold for test
            result = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env
            )
            assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}\nStdout: {result.stdout}"
            # Parse JSONL output: find the first ArchiveResult record on stdout
            result_json = None
            for line in result.stdout.strip().split('\n'):
                line = line.strip()
                if line.startswith('{'):
                    try:
                        record = json.loads(line)
                        if record.get('type') == 'ArchiveResult':
                            result_json = record
                            break
                    except json.JSONDecodeError:
                        pass
            assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {result.stdout}"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
            # Verify output_str format: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs"
            output_str = result_json.get('output_str', '')
            assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}"
            assert 'px' in output_str, f"output_str should contain pixel count: {output_str}"
            assert re.search(r'over \d+(\.\d+)?s', output_str), f"output_str should contain duration: {output_str}"
            # Verify no files created in output directory (the plugin is stats-only)
            output_files = list(infiniscroll_dir.iterdir())
            assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
        finally:
            if chrome_launch_process and chrome_pid:
                cleanup_chrome(chrome_launch_process, chrome_pid)
def test_config_scroll_limit_honored():
    """INFINISCROLL_SCROLL_LIMIT should cap scrolling without breaking output."""
    with tempfile.TemporaryDirectory() as tmpdir:
        launch_proc = None
        browser_pid = None
        try:
            launch_proc, browser_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
            out_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            out_dir.mkdir()
            # Set scroll limit to 2
            hook_env = get_test_env()
            hook_env['INFINISCROLL_SCROLL_LIMIT'] = '2'
            hook_env['INFINISCROLL_SCROLL_DELAY'] = '500'
            hook_env['INFINISCROLL_MIN_HEIGHT'] = '100000'  # High threshold so limit kicks in
            proc = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
                cwd=str(out_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=hook_env,
            )
            assert proc.returncode == 0, f"Infiniscroll failed: {proc.stderr}"
            # Locate the ArchiveResult record among the stdout lines.
            archive_result = None
            for raw_line in proc.stdout.strip().split('\n'):
                candidate = raw_line.strip()
                if not candidate.startswith('{'):
                    continue
                try:
                    parsed = json.loads(candidate)
                except json.JSONDecodeError:
                    continue
                if parsed.get('type') == 'ArchiveResult':
                    archive_result = parsed
                    break
            assert archive_result is not None, "Should have JSONL output"
            output_str = archive_result.get('output_str', '')
            # Verify output format and that it completed (scroll limit enforced internally)
            assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
            assert archive_result['status'] == 'succeeded', f"Should succeed with scroll limit: {archive_result}"
        finally:
            if launch_proc and browser_pid:
                cleanup_chrome(launch_proc, browser_pid)
def test_config_timeout_honored():
    """INFINISCROLL_TIMEOUT should bound the hook's total wall-clock time."""
    with tempfile.TemporaryDirectory() as tmpdir:
        launch_proc = None
        browser_pid = None
        try:
            launch_proc, browser_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
            out_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            out_dir.mkdir()
            # Set very short timeout
            hook_env = get_test_env()
            hook_env['INFINISCROLL_TIMEOUT'] = '3'  # 3 seconds
            hook_env['INFINISCROLL_SCROLL_DELAY'] = '2000'  # 2s delay - timeout should trigger
            hook_env['INFINISCROLL_SCROLL_LIMIT'] = '100'  # High limit
            hook_env['INFINISCROLL_MIN_HEIGHT'] = '100000'
            started = time.time()
            proc = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
                cwd=str(out_dir),
                capture_output=True,
                text=True,
                timeout=30,
                env=hook_env,
            )
            wall_time = time.time() - started
            # Should complete within reasonable time (timeout + buffer)
            assert wall_time < 15, f"Should respect timeout, took {wall_time:.1f}s"
            assert proc.returncode == 0, f"Should complete even with timeout: {proc.stderr}"
        finally:
            if launch_proc and browser_pid:
                cleanup_chrome(launch_proc, browser_pid)
# Allow running this test module directly (outside the pytest CLI).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])