continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script

2026-04-05 15:27:53 +10:00 · 2025-12-28 05:29:24 -08:00
parent d2e65cfd38
commit 4ccb0863bb
53 changed files with 456 additions and 493 deletions
--- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
+++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
@@ -23,7 +23,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');

 // Extractor metadata
-const EXTRACTOR_NAME = 'parse_dom_outlinks';
+const PLUGIN_NAME = 'parse_dom_outlinks';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'outlinks.json';
 const URLS_FILE = 'urls.jsonl';  // For crawl system
@@ -190,7 +190,7 @@ async function extractOutlinks(url) {
        const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
            type: 'Snapshot',
            url: href,
-            via_extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
        })).join('\n');

        if (urlsJsonl) {
@@ -236,10 +236,14 @@ async function main() {
            process.exit(0);
        }

-        // Wait for page to be fully loaded
-        const pageLoaded = await waitForChromeTabLoaded(60000);
-        if (!pageLoaded) {
-            throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+        // Check if Chrome session exists, then wait for page load
+        const cdpUrl = getCdpUrl();
+        if (cdpUrl) {
+            // Wait for page to be fully loaded
+            const pageLoaded = await waitForChromeTabLoaded(60000);
+            if (!pageLoaded) {
+                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            }
        }

        const result = await extractOutlinks(url);