make infiniscroll plugin also expand details and comments sections

This commit is contained in:
Nick Sweeting
2025-12-29 13:55:22 -08:00
parent 621359c37c
commit 8c69124935
2 changed files with 159 additions and 1 deletions

View File

@@ -41,6 +41,11 @@
"default": 16000,
"minimum": 1000,
"description": "Minimum page height to scroll to in pixels"
},
"INFINISCROLL_EXPAND_DETAILS": {
"type": "boolean",
"default": true,
"description": "Expand <details> elements and click 'load more' buttons for comments"
}
}
}

View File

@@ -6,6 +6,8 @@
* ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached.
* Stops early if no new content loads after a scroll.
*
* Optionally expands <details> elements and clicks "load more" buttons.
*
* Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>
* Output: JSONL with scroll stats (no files created)
*
@@ -16,6 +18,7 @@
* INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600)
* INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10)
* INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000)
* INFINISCROLL_EXPAND_DETAILS: Expand <details> and comments (default: true)
*/
function getEnv(name, defaultValue = '') {
@@ -91,6 +94,130 @@ function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
/**
 * Expand collapsed <details> elements and click "load more" style controls
 * (comment threads, hidden replies) so the hidden content is present in the DOM.
 * Based on archivebox.ts expandComments function.
 *
 * Both steps run through page.evaluate(), so each callback is written to be
 * self-contained: it only uses its own arguments and the page's globals
 * (document, XPathResult), never anything from this file's scope.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)` (e.g. a Puppeteer Page).
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Max milliseconds to spend clicking "load more" controls.
 * @param {number} [options.limit=500] - Max number of clicks to perform.
 * @param {number} [options.delay=500] - Milliseconds to wait after each click and between passes.
 * @returns {Promise<{detailsExpanded: number, commentsExpanded: number, total: number}>}
 *   Counts of opened <details> elements, performed clicks, and their sum.
 */
async function expandDetails(page, options = {}) {
    const {
        timeout = 30000,
        limit = 500,
        delay = 500,
    } = options;

    // First, expand all <details> elements
    const detailsExpanded = await page.evaluate(() => {
        // A single selector list: an element matching several of the
        // site-specific patterns is opened and counted exactly once.
        const selector = [
            // Generic <details> elements
            'details:not([open])',
            // Github README details sections
            'article details:not([open])',
            // Github issue discussion hidden comments
            'div.js-discussion details:not(.details-overlay):not([open])',
            // HedgeDoc/Markdown details sections
            '.markdown-body details:not([open])',
        ].join(', ');

        let count = 0;
        document.querySelectorAll(selector).forEach(el => {
            el.open = true;
            count++;
        });
        return count;
    });

    if (detailsExpanded > 0) {
        console.error(`Expanded ${detailsExpanded} <details> elements`);
    }

    // Then click "load more" buttons for comments
    const numExpanded = await page.evaluate(async ({ timeout, limit, delay }) => {
        // Helper to find elements by XPath
        function getElementsByXPath(xpath) {
            const results = [];
            const xpathResult = document.evaluate(
                xpath,
                document,
                null,
                XPathResult.ORDERED_NODE_ITERATOR_TYPE,
                null
            );
            let node;
            while ((node = xpathResult.iterateNext()) != null) {
                results.push(node);
            }
            return results;
        }

        const wait = (ms) => new Promise(res => setTimeout(res, ms));

        // Find all "load more" type buttons/links. The Set removes duplicates:
        // several patterns can match the same element (e.g. "Show more replies"
        // also contains "Show more"), and clicking a toggle twice would undo it.
        const getLoadMoreLinks = () => [...new Set([
            // Reddit (new)
            ...document.querySelectorAll('faceplate-partial[loading=action]'),
            // Reddit (old) - show more replies
            ...document.querySelectorAll('a[onclick^="return morechildren"]'),
            // Reddit (old) - show hidden replies
            ...document.querySelectorAll('a[onclick^="return togglecomment"]'),
            // Twitter/X - show more replies
            ...getElementsByXPath("//*[text()='Show more replies']"),
            ...getElementsByXPath("//*[text()='Show replies']"),
            // Generic "load more" / "show more" buttons
            ...getElementsByXPath("//*[contains(text(),'Load more')]"),
            ...getElementsByXPath("//*[contains(text(),'Show more')]"),
            // Hacker News
            ...document.querySelectorAll('a.morelink'),
        ])];

        const startTime = Date.now();
        const timedOut = () => Date.now() - startTime >= timeout;

        let expanded = 0;
        let loadMoreLinks = getLoadMoreLinks();

        // The budget is checked on every pass, not only after a successful
        // click, so skipped or failing candidates cannot keep the loop alive.
        while (loadMoreLinks.length > 0 && expanded < limit && !timedOut()) {
            let clickedThisPass = 0;

            for (const link of loadMoreLinks) {
                // Skip certain elements
                if (link.slot === 'children') continue;

                try {
                    link.scrollIntoView({ behavior: 'smooth' });
                    link.click();
                    expanded++;
                    clickedThisPass++;
                    await wait(delay);
                } catch (e) {
                    // Best effort: a candidate that cannot be scrolled to or
                    // clicked is skipped without aborting the whole pass.
                }

                // Check limits
                if (expanded >= limit) return expanded;
                if (timedOut()) return expanded;
            }

            // Nothing was clickable in this pass, so re-querying would only
            // return the same candidates again: stop instead of spinning.
            if (clickedThisPass === 0) break;

            // Check for new load more links after clicking
            await wait(delay);
            loadMoreLinks = getLoadMoreLinks();
        }

        return expanded;
    }, { timeout, limit, delay });

    if (numExpanded > 0) {
        console.error(`Clicked ${numExpanded} "load more" buttons`);
    }

    return {
        detailsExpanded,
        commentsExpanded: numExpanded,
        total: detailsExpanded + numExpanded,
    };
}
async function scrollDown(page, options = {}) {
const {
timeout = 120000,
@@ -206,6 +333,7 @@ async function main() {
const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600);
const scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10);
const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000);
const expandDetailsEnabled = getEnvBool('INFINISCROLL_EXPAND_DETAILS', true);
const cdpUrl = getCdpUrl();
if (!cdpUrl) {
@@ -247,6 +375,18 @@ async function main() {
await page.setViewport({ width: resolution[0] || 1440, height: resolution[1] || 2000 });
console.error(`Starting infinite scroll on ${url}`);
// Expand <details> and comments before scrolling (if enabled)
let expandResult = { total: 0, detailsExpanded: 0, commentsExpanded: 0 };
if (expandDetailsEnabled) {
console.error('Expanding <details> and comments...');
expandResult = await expandDetails(page, {
timeout: Math.min(timeout / 4, 30000),
limit: 500,
delay: scrollDelay / 4,
});
}
const result = await scrollDown(page, {
timeout,
scrollDelay,
@@ -255,13 +395,26 @@ async function main() {
minHeight,
});
// Expand again after scrolling (new content may have loaded)
if (expandDetailsEnabled) {
const expandResult2 = await expandDetails(page, {
timeout: Math.min(timeout / 4, 30000),
limit: 500,
delay: scrollDelay / 4,
});
expandResult.total += expandResult2.total;
expandResult.detailsExpanded += expandResult2.detailsExpanded;
expandResult.commentsExpanded += expandResult2.commentsExpanded;
}
browser.disconnect();
const elapsedSec = (result.elapsedMs / 1000).toFixed(1);
const finalHeightStr = result.finalHeight.toLocaleString();
const addedHeight = result.finalHeight - result.startingHeight;
const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content';
const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}) over ${elapsedSec}s`;
const expandStr = expandResult.total > 0 ? `, expanded ${expandResult.total}` : '';
const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}${expandStr}) over ${elapsedSec}s`;
console.error(`Success: ${outputStr}`);
console.log(JSON.stringify({