diff --git a/archivebox/plugins/infiniscroll/config.json b/archivebox/plugins/infiniscroll/config.json index 8f0304ad..5954ff11 100644 --- a/archivebox/plugins/infiniscroll/config.json +++ b/archivebox/plugins/infiniscroll/config.json @@ -41,6 +41,11 @@ "default": 16000, "minimum": 1000, "description": "Minimum page height to scroll to in pixels" + }, + "INFINISCROLL_EXPAND_DETAILS": { + "type": "boolean", + "default": true, + "description": "Expand <details>
elements and click 'load more' buttons for comments" } } } diff --git a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js index 1f574e89..584dc727 100755 --- a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js +++ b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js @@ -6,6 +6,8 @@ * ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached. * Stops early if no new content loads after a scroll. * + * Optionally expands <details>
elements and clicks "load more" buttons. + * * Usage: on_Snapshot__45_infiniscroll.js --url= --snapshot-id= * Output: JSONL with scroll stats (no files created) * @@ -16,6 +18,7 @@ * INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600) * INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10) * INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000) + * INFINISCROLL_EXPAND_DETAILS: Expand <details>
/**
 * Expand collapsed <details> elements and click "load more" style
 * buttons/links (comment threads, replies, pagination) inside the page.
 *
 * Both steps run inside the page via `page.evaluate`, so the callbacks below
 * must be self-contained: they cannot close over variables from this file.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)`; presumably a
 *   Puppeteer Page (confirm against the caller).
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Max time in ms spent clicking
 *   "load more" links.
 * @param {number} [options.limit=500] - Max number of clicks to perform.
 * @param {number} [options.delay=500] - Pause in ms after each click and
 *   between passes, to let newly requested content render.
 * @returns {Promise<{detailsExpanded: number, commentsExpanded: number, total: number}>}
 *   Counts of opened <details> elements, performed clicks, and their sum.
 */
async function expandDetails(page, options = {}) {
    const {
        timeout = 30000,
        limit = 500,
        delay = 500,
    } = options;

    // First, expand all <details> elements.
    // A single generic selector is sufficient: setting `open = true` adds the
    // `open` attribute, so narrower site-specific selectors (GitHub README /
    // issue discussions, HedgeDoc `.markdown-body`) would never match anything
    // after this pass.
    const detailsExpanded = await page.evaluate(() => {
        const closed = document.querySelectorAll('details:not([open])');
        closed.forEach((el) => {
            el.open = true;
        });
        return closed.length;
    });

    if (detailsExpanded > 0) {
        console.error(`Expanded ${detailsExpanded} <details> elements`);
    }

    // Then click "load more" buttons for comments
    const numExpanded = await page.evaluate(async ({ timeout, limit, delay }) => {
        // Helper to find elements by XPath
        const getElementsByXPath = (xpath) => {
            const results = [];
            const xpathResult = document.evaluate(
                xpath,
                document,
                null,
                XPathResult.ORDERED_NODE_ITERATOR_TYPE,
                null
            );
            let node;
            while ((node = xpathResult.iterateNext()) != null) {
                results.push(node);
            }
            return results;
        };

        const wait = (ms) => new Promise((res) => setTimeout(res, ms));

        // Find all "load more" type buttons/links that are worth clicking.
        const getLoadMoreLinks = () => {
            const candidates = [
                // Reddit (new)
                ...document.querySelectorAll('faceplate-partial[loading=action]'),
                // Reddit (old) - show more replies
                ...document.querySelectorAll('a[onclick^="return morechildren"]'),
                // Reddit (old) - show hidden replies
                ...document.querySelectorAll('a[onclick^="return togglecomment"]'),
                // Twitter/X - show more replies
                ...getElementsByXPath("//*[text()='Show more replies']"),
                ...getElementsByXPath("//*[text()='Show replies']"),
                // Generic "load more" / "show more" buttons
                ...getElementsByXPath("//*[contains(text(),'Load more')]"),
                ...getElementsByXPath("//*[contains(text(),'Show more')]"),
                // Hacker News
                ...document.querySelectorAll('a.morelink'),
            ];
            // De-duplicate: one element can match several selectors (e.g.
            // "Show more replies" also contains "Show more") and must only be
            // clicked once per pass. Elements with slot="children" are never
            // clicked; dropping them here (instead of `continue`-ing in the
            // click loop) lets the outer loop end when nothing clickable is left.
            return [...new Set(candidates)].filter((el) => el.slot !== 'children');
        };

        const deadline = Date.now() + timeout;
        let expanded = 0;
        const isDone = () => expanded >= limit || Date.now() >= deadline;

        let loadMoreLinks = getLoadMoreLinks();
        while (loadMoreLinks.length > 0 && !isDone()) {
            for (const link of loadMoreLinks) {
                try {
                    link.scrollIntoView({ behavior: 'smooth' });
                    link.click();
                    expanded++;
                    await wait(delay);
                } catch (e) {
                    // Best effort: a failing click on one element must not
                    // abort the remaining expansions.
                }

                // Check limits after every element so a long list cannot
                // overrun the click or time budget.
                if (isDone()) return expanded;
            }

            // Check for new load more links after clicking
            await wait(delay);
            loadMoreLinks = getLoadMoreLinks();
        }

        return expanded;
    }, { timeout, limit, delay });

    if (numExpanded > 0) {
        console.error(`Clicked ${numExpanded} "load more" buttons`);
    }

    return {
        detailsExpanded,
        commentsExpanded: numExpanded,
        total: detailsExpanded + numExpanded,
    };
}
and comments before scrolling (if enabled) + let expandResult = { total: 0, detailsExpanded: 0, commentsExpanded: 0 }; + if (expandDetailsEnabled) { + console.error('Expanding
and comments...'); + expandResult = await expandDetails(page, { + timeout: Math.min(timeout / 4, 30000), + limit: 500, + delay: scrollDelay / 4, + }); + } + const result = await scrollDown(page, { timeout, scrollDelay, @@ -255,13 +395,26 @@ async function main() { minHeight, }); + // Expand again after scrolling (new content may have loaded) + if (expandDetailsEnabled) { + const expandResult2 = await expandDetails(page, { + timeout: Math.min(timeout / 4, 30000), + limit: 500, + delay: scrollDelay / 4, + }); + expandResult.total += expandResult2.total; + expandResult.detailsExpanded += expandResult2.detailsExpanded; + expandResult.commentsExpanded += expandResult2.commentsExpanded; + } + browser.disconnect(); const elapsedSec = (result.elapsedMs / 1000).toFixed(1); const finalHeightStr = result.finalHeight.toLocaleString(); const addedHeight = result.finalHeight - result.startingHeight; const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content'; - const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}) over ${elapsedSec}s`; + const expandStr = expandResult.total > 0 ? `, expanded ${expandResult.total}` : ''; + const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}${expandStr}) over ${elapsedSec}s`; console.error(`Success: ${outputStr}`); console.log(JSON.stringify({