mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip major changes
This commit is contained in:
281
archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
Executable file
281
archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
Executable file
@@ -0,0 +1,281 @@
|
||||
#!/usr/bin/env node
|
||||
/**
 * Extract and categorize outgoing links from a page's DOM.
 *
 * Categorizes links by type:
 *   - raw: http(s) URLs matched by regex in the serialized page HTML
 *   - hrefs: All <a> links
 *   - images: <img src>
 *   - css_stylesheets: <link rel=stylesheet>
 *   - css_images: CSS background-image: url()
 *   - js_scripts: <script src>
 *   - iframes: <iframe src>
 *   - links: <link> tags with rel/href (excluding rel=stylesheet)
 *
 * Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
 * Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
 *
 * Environment variables:
 *   SAVE_DOM_OUTLINKS: Enable DOM outlinks extraction (default: true)
 */
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
// Name reported as `via_extractor` in urls.jsonl records and `extractor` in RESULT_JSON.
const EXTRACTOR_NAME = 'parse_dom_outlinks';
// Output directory; joined as a relative path, so it resolves against the working directory.
const OUTPUT_DIR = 'parse_dom_outlinks';
// Full categorized link dump (pretty-printed JSON).
const OUTPUT_FILE = 'outlinks.json';
const URLS_FILE = 'urls.jsonl';  // For crawl system
// Directory expected to contain cdp_url.txt for the running Chrome session.
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
/**
 * Parse `--key=value` / `--flag` command line arguments from process.argv.
 *
 * - Dashes in option names become underscores (`--snapshot-id` -> `snapshot_id`).
 * - Only the first `=` separates the name from the value, so values may
 *   themselves contain `=` (e.g. URLs with query strings).
 * - An option with no value, or with an empty value (`--url=`), is stored as `true`.
 * - Arguments that do not start with `--` are ignored.
 *
 * @returns {Object<string, (string|boolean)>} option name -> value
 */
function parseArgs() {
    const args = {};
    for (const arg of process.argv.slice(2)) {
        if (!arg.startsWith('--')) continue;
        const [key, ...valueParts] = arg.slice(2).split('=');
        // Re-join so that `=` characters inside the value survive the split.
        args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
    }
    return args;
}
|
||||
|
||||
/**
 * Get an environment variable with a default.
 *
 * @param {string} name - environment variable name
 * @param {string} [defaultValue=''] - used when the variable is unset or empty
 * @returns {string} the chosen value with surrounding whitespace trimmed
 */
function getEnv(name, defaultValue = '') {
    // `||` (not `??`) on purpose: an empty variable also falls back to the default.
    return (process.env[name] || defaultValue).trim();
}
|
||||
|
||||
/**
 * Read a boolean flag from the environment.
 *
 * Matching is case-insensitive: `true`/`1`/`yes`/`on` give true,
 * `false`/`0`/`no`/`off` give false. Anything else (including an unset or
 * empty variable) yields `defaultValue`.
 *
 * @param {string} name - environment variable name
 * @param {boolean} [defaultValue=false] - result for unset or unrecognized values
 * @returns {boolean}
 */
function getEnvBool(name, defaultValue = false) {
    const val = getEnv(name, '').toLowerCase();
    if (['true', '1', 'yes', 'on'].includes(val)) return true;
    if (['false', '0', 'no', 'off'].includes(val)) return false;
    return defaultValue;
}
|
||||
|
||||
/**
 * Get the CDP URL recorded in CHROME_SESSION_DIR/cdp_url.txt.
 *
 * @returns {string|null} the trimmed file contents, or null when the file is
 *     missing or contains only whitespace
 */
function getCdpUrl() {
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) {
        return null;
    }
    // Normalize an empty file to null so "no session" has a single representation.
    return fs.readFileSync(cdpFile, 'utf8').trim() || null;
}
|
||||
|
||||
/**
 * Extract outlinks from the page open in the existing Chrome session.
 *
 * Connects to the browser at the endpoint returned by getCdpUrl(), picks the
 * first tab whose URL starts with "http" (falling back to the first tab),
 * collects links by category inside the page, and writes:
 *   - OUTPUT_DIR/OUTPUT_FILE: the full categorized result as pretty-printed JSON
 *   - OUTPUT_DIR/URLS_FILE:   one JSON record per crawlable <a href>; written
 *     only when at least one exists, and removed if left over from an earlier
 *     run so the crawl system never reads stale links.
 *
 * Errors raised while connecting, evaluating or writing are caught and
 * reported as `{ success: false, error }` rather than thrown.
 *
 * @param {string} url - snapshot URL.
 *     NOTE(review): currently unused; the tab is chosen by URL scheme only.
 *     Confirm whether it should be matched against `url`.
 * @returns {Promise<Object>} on success
 *     `{ success: true, output, outlinksData, crawlableCount }`,
 *     otherwise `{ success: false, error }`
 */
async function extractOutlinks(url) {
    // Create output directory
    if (!fs.existsSync(OUTPUT_DIR)) {
        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    let browser = null;

    try {
        // Connect to existing Chrome session
        const cdpUrl = getCdpUrl();
        if (!cdpUrl) {
            return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
        }

        browser = await puppeteer.connect({
            browserWSEndpoint: cdpUrl,
        });

        // Get the page
        const pages = await browser.pages();
        const page = pages.find(p => p.url().startsWith('http')) || pages[0];

        if (!page) {
            return { success: false, error: 'No page found in Chrome session' };
        }

        // Extract outlinks by category.
        // This callback is serialized and executed inside the page, so it must
        // not reference anything from the surrounding Node.js scope.
        const outlinksData = await page.evaluate(() => {
            const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;

            const filterDataUrls = (urls) => urls.filter(url => url && !url.startsWith('data:'));
            // Drops XML namespace identifiers (e.g. the SVG/XHTML namespaces), which are not real links.
            const filterW3Urls = (urls) => urls.filter(url => url && !url.startsWith('http://www.w3.org/'));

            // Get raw links from HTML
            const html = document.documentElement.outerHTML;
            const raw = Array.from(html.matchAll(LINK_REGEX)).map(m => m[0]);

            // Get all <a href> links
            const hrefs = Array.from(document.querySelectorAll('a[href]'))
                .map(elem => elem.href)
                .filter(url => url);

            // Get all <link> tags (not just stylesheets); keyed by href to dedupe
            const linksMap = {};
            document.querySelectorAll('link[href]').forEach(elem => {
                const rel = elem.rel || '';
                const href = elem.href;
                if (href && rel !== 'stylesheet') {
                    linksMap[href] = { rel, href };
                }
            });
            const links = Object.values(linksMap);

            // Get iframes
            const iframes = Array.from(document.querySelectorAll('iframe[src]'))
                .map(elem => elem.src)
                .filter(url => url);

            // Get images
            const images = Array.from(document.querySelectorAll('img[src]'))
                .map(elem => elem.src)
                .filter(url => url && !url.startsWith('data:'));

            // Get CSS background images (first url() of each element's computed background-image)
            const css_images = Array.from(document.querySelectorAll('*'))
                .map(elem => {
                    const bgImg = window.getComputedStyle(elem).getPropertyValue('background-image');
                    const match = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i.exec(bgImg);
                    return match ? match[1] : null;
                })
                .filter(url => url);

            // Get stylesheets
            const css_stylesheets = Array.from(document.querySelectorAll('link[rel=stylesheet]'))
                .map(elem => elem.href)
                .filter(url => url);

            // Get JS scripts
            const js_scripts = Array.from(document.querySelectorAll('script[src]'))
                .map(elem => elem.src)
                .filter(url => url);

            return {
                url: window.location.href,
                raw: [...new Set(filterDataUrls(filterW3Urls(raw)))],
                hrefs: [...new Set(filterDataUrls(hrefs))],
                links,
                iframes: [...new Set(iframes)],
                images: [...new Set(filterDataUrls(images))],
                css_images: [...new Set(filterDataUrls(css_images))],
                css_stylesheets: [...new Set(filterDataUrls(css_stylesheets))],
                js_scripts: [...new Set(filterDataUrls(js_scripts))],
            };
        });

        // Write detailed output (for archival)
        fs.writeFileSync(outputPath, JSON.stringify(outlinksData, null, 2));

        // Write urls.jsonl for crawl system (only hrefs that are crawlable pages)
        const urlsPath = path.join(OUTPUT_DIR, URLS_FILE);
        // Common static file extensions that should not be crawled as pages
        const staticExts = ['.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.woff', '.woff2', '.ttf', '.eot', '.mp4', '.webm', '.mp3', '.pdf'];
        const crawlableUrls = outlinksData.hrefs.filter(href => {
            // Only include http/https URLs, exclude static assets
            if (!href.startsWith('http://') && !href.startsWith('https://')) return false;
            // Compare the extension on the URL with query string and fragment removed
            const urlPath = href.split('?')[0].split('#')[0].toLowerCase();
            return !staticExts.some(ext => urlPath.endsWith(ext));
        });

        const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
            type: 'Snapshot',
            url: href,
            via_extractor: EXTRACTOR_NAME,
        })).join('\n');

        if (urlsJsonl) {
            fs.writeFileSync(urlsPath, urlsJsonl + '\n');
        } else if (fs.existsSync(urlsPath)) {
            // Nothing crawlable this time: drop the file left by a previous run.
            fs.unlinkSync(urlsPath);
        }

        return { success: true, output: outputPath, outlinksData, crawlableCount: crawlableUrls.length };

    } catch (e) {
        return { success: false, error: `${e.name}: ${e.message}` };
    } finally {
        if (browser) {
            // disconnect() returns a promise in current Puppeteer releases; await it
            // so a rejection cannot go unobserved. A failure here must not replace
            // the result already being returned, so it is logged and dropped.
            try {
                await browser.disconnect();
            } catch (e) {
                console.error(`Warning: failed to disconnect from Chrome: ${e.message}`);
            }
        }
    }
}
|
||||
|
||||
/**
 * CLI entry point: validate arguments, run the extractor and report the outcome.
 *
 * Requires `--url` and `--snapshot-id`; prints usage to stderr and exits 1 when
 * either is missing. When SAVE_DOM_OUTLINKS is disabled it reports a `skipped`
 * status and exits 0. Otherwise it prints START_TS / END_TS / DURATION /
 * OUTPUT (if any) / STATUS lines and a final RESULT_JSON line on stdout
 * (ERROR goes to stderr) and exits 0 on success, 1 on failure.
 */
async function main() {
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;

    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }

    const startTs = new Date();
    let status = 'failed';
    let output = null;
    let error = '';

    try {
        // Check if enabled
        if (!getEnvBool('SAVE_DOM_OUTLINKS', true)) {
            console.log('Skipping DOM outlinks (SAVE_DOM_OUTLINKS=False)');
            status = 'skipped';
            const endTs = new Date();
            console.log(`START_TS=${startTs.toISOString()}`);
            console.log(`END_TS=${endTs.toISOString()}`);
            console.log(`STATUS=${status}`);
            console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
            process.exit(0);
        }

        const result = await extractOutlinks(url);

        if (result.success) {
            status = 'succeeded';
            output = result.output;
            const total = result.outlinksData.hrefs.length;
            const crawlable = result.crawlableCount;
            const images = result.outlinksData.images.length;
            const scripts = result.outlinksData.js_scripts.length;
            console.log(`DOM outlinks extracted: ${total} links (${crawlable} crawlable), ${images} images, ${scripts} scripts`);
        } else {
            status = 'failed';
            error = result.error;
        }
    } catch (e) {
        error = `${e.name}: ${e.message}`;
        status = 'failed';
    }

    const endTs = new Date();
    // Date subtraction yields milliseconds; report seconds.
    const duration = (endTs - startTs) / 1000;

    // Print results
    console.log(`START_TS=${startTs.toISOString()}`);
    console.log(`END_TS=${endTs.toISOString()}`);
    console.log(`DURATION=${duration.toFixed(2)}`);
    if (output) {
        console.log(`OUTPUT=${output}`);
    }
    console.log(`STATUS=${status}`);

    if (error) {
        console.error(`ERROR=${error}`);
    }

    // Print JSON result
    const resultJson = {
        extractor: EXTRACTOR_NAME,
        url,
        snapshot_id: snapshotId,
        status,
        start_ts: startTs.toISOString(),
        end_ts: endTs.toISOString(),
        // Rounded to two decimals, matching the DURATION line above.
        duration: Math.round(duration * 100) / 100,
        output,
        error: error || null,
    };
    console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

    process.exit(status === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
// Run the CLI; anything that escapes main() is reported and turned into a non-zero exit.
main().catch(e => {
    console.error(`Fatal error: ${e.message}`);
    process.exit(1);
});
|
||||
Reference in New Issue
Block a user