wip major changes

2026-04-06 07:47:53 +10:00 · 2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions
--- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
+++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
@@ -0,0 +1,281 @@
+#!/usr/bin/env node
+/**
+ * Extract and categorize outgoing links from a page's DOM.
+ *
+ * Categorizes links by type:
+ * - hrefs: All <a> links
+ * - images: <img src>
+ * - css_stylesheets: <link rel=stylesheet>
+ * - css_images: CSS background-image: url()
+ * - js_scripts: <script src>
+ * - iframes: <iframe src>
+ * - links: <link> tags with rel/href
+ *
+ * Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
+ * Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
+ *
+ * Environment variables:
+ *     SAVE_DOM_OUTLINKS: Enable DOM outlinks extraction (default: true)
+ */
+
+const fs = require('fs');
+const path = require('path');
+const puppeteer = require('puppeteer-core');
+
+// Extractor metadata
+const EXTRACTOR_NAME = 'parse_dom_outlinks';
+const OUTPUT_DIR = 'parse_dom_outlinks';
+const OUTPUT_FILE = 'outlinks.json';
+const URLS_FILE = 'urls.jsonl';  // For crawl system
+const CHROME_SESSION_DIR = 'chrome_session';
+
+// Parse command line arguments
+function parseArgs() {
+    const args = {};
+    process.argv.slice(2).forEach(arg => {
+        if (arg.startsWith('--')) {
+            const [key, ...valueParts] = arg.slice(2).split('=');
+            args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
+        }
+    });
+    return args;
+}
+
+// Get environment variable with default
+function getEnv(name, defaultValue = '') {
+    return (process.env[name] || defaultValue).trim();
+}
+
+function getEnvBool(name, defaultValue = false) {
+    const val = getEnv(name, '').toLowerCase();
+    if (['true', '1', 'yes', 'on'].includes(val)) return true;
+    if (['false', '0', 'no', 'off'].includes(val)) return false;
+    return defaultValue;
+}
+
+// Get CDP URL from chrome_session
+function getCdpUrl() {
+    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
+    if (fs.existsSync(cdpFile)) {
+        return fs.readFileSync(cdpFile, 'utf8').trim();
+    }
+    return null;
+}
+
+// Extract outlinks
+async function extractOutlinks(url) {
+    // Create output directory
+    if (!fs.existsSync(OUTPUT_DIR)) {
+        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
+    }
+    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
+
+    let browser = null;
+
+    try {
+        // Connect to existing Chrome session
+        const cdpUrl = getCdpUrl();
+        if (!cdpUrl) {
+            return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
+        }
+
+        browser = await puppeteer.connect({
+            browserWSEndpoint: cdpUrl,
+        });
+
+        // Get the page
+        const pages = await browser.pages();
+        const page = pages.find(p => p.url().startsWith('http')) || pages[0];
+
+        if (!page) {
+            return { success: false, error: 'No page found in Chrome session' };
+        }
+
+        // Extract outlinks by category
+        const outlinksData = await page.evaluate(() => {
+            const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;
+
+            const filterDataUrls = (urls) => urls.filter(url => url && !url.startsWith('data:'));
+            const filterW3Urls = (urls) => urls.filter(url => url && !url.startsWith('http://www.w3.org/'));
+
+            // Get raw links from HTML
+            const html = document.documentElement.outerHTML;
+            const raw = Array.from(html.matchAll(LINK_REGEX)).map(m => m[0]);
+
+            // Get all <a href> links
+            const hrefs = Array.from(document.querySelectorAll('a[href]'))
+                .map(elem => elem.href)
+                .filter(url => url);
+
+            // Get all <link> tags (not just stylesheets)
+            const linksMap = {};
+            document.querySelectorAll('link[href]').forEach(elem => {
+                const rel = elem.rel || '';
+                const href = elem.href;
+                if (href && rel !== 'stylesheet') {
+                    linksMap[href] = { rel, href };
+                }
+            });
+            const links = Object.values(linksMap);
+
+            // Get iframes
+            const iframes = Array.from(document.querySelectorAll('iframe[src]'))
+                .map(elem => elem.src)
+                .filter(url => url);
+
+            // Get images
+            const images = Array.from(document.querySelectorAll('img[src]'))
+                .map(elem => elem.src)
+                .filter(url => url && !url.startsWith('data:'));
+
+            // Get CSS background images
+            const css_images = Array.from(document.querySelectorAll('*'))
+                .map(elem => {
+                    const bgImg = window.getComputedStyle(elem).getPropertyValue('background-image');
+                    const match = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i.exec(bgImg);
+                    return match ? match[1] : null;
+                })
+                .filter(url => url);
+
+            // Get stylesheets
+            const css_stylesheets = Array.from(document.querySelectorAll('link[rel=stylesheet]'))
+                .map(elem => elem.href)
+                .filter(url => url);
+
+            // Get JS scripts
+            const js_scripts = Array.from(document.querySelectorAll('script[src]'))
+                .map(elem => elem.src)
+                .filter(url => url);
+
+            return {
+                url: window.location.href,
+                raw: [...new Set(filterDataUrls(filterW3Urls(raw)))],
+                hrefs: [...new Set(filterDataUrls(hrefs))],
+                links,
+                iframes: [...new Set(iframes)],
+                images: [...new Set(filterDataUrls(images))],
+                css_images: [...new Set(filterDataUrls(css_images))],
+                css_stylesheets: [...new Set(filterDataUrls(css_stylesheets))],
+                js_scripts: [...new Set(filterDataUrls(js_scripts))],
+            };
+        });
+
+        // Write detailed output (for archival)
+        fs.writeFileSync(outputPath, JSON.stringify(outlinksData, null, 2));
+
+        // Write urls.jsonl for crawl system (only hrefs that are crawlable pages)
+        const urlsPath = path.join(OUTPUT_DIR, URLS_FILE);
+        const crawlableUrls = outlinksData.hrefs.filter(href => {
+            // Only include http/https URLs, exclude static assets
+            if (!href.startsWith('http://') && !href.startsWith('https://')) return false;
+            // Exclude common static file extensions
+            const staticExts = ['.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.woff', '.woff2', '.ttf', '.eot', '.mp4', '.webm', '.mp3', '.pdf'];
+            const urlPath = href.split('?')[0].split('#')[0].toLowerCase();
+            return !staticExts.some(ext => urlPath.endsWith(ext));
+        });
+
+        const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
+            type: 'Snapshot',
+            url: href,
+            via_extractor: EXTRACTOR_NAME,
+        })).join('\n');
+
+        if (urlsJsonl) {
+            fs.writeFileSync(urlsPath, urlsJsonl + '\n');
+        }
+
+        return { success: true, output: outputPath, outlinksData, crawlableCount: crawlableUrls.length };
+
+    } catch (e) {
+        return { success: false, error: `${e.name}: ${e.message}` };
+    } finally {
+        if (browser) {
+            browser.disconnect();
+        }
+    }
+}
+
+async function main() {
+    const args = parseArgs();
+    const url = args.url;
+    const snapshotId = args.snapshot_id;
+
+    if (!url || !snapshotId) {
+        console.error('Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
+        process.exit(1);
+    }
+
+    const startTs = new Date();
+    let status = 'failed';
+    let output = null;
+    let error = '';
+
+    try {
+        // Check if enabled
+        if (!getEnvBool('SAVE_DOM_OUTLINKS', true)) {
+            console.log('Skipping DOM outlinks (SAVE_DOM_OUTLINKS=False)');
+            status = 'skipped';
+            const endTs = new Date();
+            console.log(`START_TS=${startTs.toISOString()}`);
+            console.log(`END_TS=${endTs.toISOString()}`);
+            console.log(`STATUS=${status}`);
+            console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
+            process.exit(0);
+        }
+
+        const result = await extractOutlinks(url);
+
+        if (result.success) {
+            status = 'succeeded';
+            output = result.output;
+            const total = result.outlinksData.hrefs.length;
+            const crawlable = result.crawlableCount;
+            const images = result.outlinksData.images.length;
+            const scripts = result.outlinksData.js_scripts.length;
+            console.log(`DOM outlinks extracted: ${total} links (${crawlable} crawlable), ${images} images, ${scripts} scripts`);
+        } else {
+            status = 'failed';
+            error = result.error;
+        }
+    } catch (e) {
+        error = `${e.name}: ${e.message}`;
+        status = 'failed';
+    }
+
+    const endTs = new Date();
+    const duration = (endTs - startTs) / 1000;
+
+    // Print results
+    console.log(`START_TS=${startTs.toISOString()}`);
+    console.log(`END_TS=${endTs.toISOString()}`);
+    console.log(`DURATION=${duration.toFixed(2)}`);
+    if (output) {
+        console.log(`OUTPUT=${output}`);
+    }
+    console.log(`STATUS=${status}`);
+
+    if (error) {
+        console.error(`ERROR=${error}`);
+    }
+
+    // Print JSON result
+    const resultJson = {
+        extractor: EXTRACTOR_NAME,
+        url,
+        snapshot_id: snapshotId,
+        status,
+        start_ts: startTs.toISOString(),
+        end_ts: endTs.toISOString(),
+        duration: Math.round(duration * 100) / 100,
+        output,
+        error: error || null,
+    };
+    console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
+
+    process.exit(status === 'succeeded' ? 0 : 1);
+}
+
+main().catch(e => {
+    console.error(`Fatal error: ${e.message}`);
+    process.exit(1);
+});