continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script

2026-04-04 06:47:57 +10:00 · 2025-12-28 05:29:24 -08:00
parent d2e65cfd38
commit 4ccb0863bb
53 changed files with 456 additions and 493 deletions
--- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
+++ b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
@@ -20,7 +20,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');

 // Extractor metadata
-const EXTRACTOR_NAME = 'accessibility';
+const PLUGIN_NAME = 'accessibility';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'accessibility.json';
 const CHROME_SESSION_DIR = '../chrome';
@@ -223,10 +223,14 @@ async function main() {
            process.exit(0);
        }

-        // Wait for page to be fully loaded
-        const pageLoaded = await waitForChromeTabLoaded(60000);
-        if (!pageLoaded) {
-            throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+        // Check if Chrome session exists, then wait for page load
+        const cdpUrl = getCdpUrl();
+        if (cdpUrl) {
+            // Wait for page to be fully loaded
+            const pageLoaded = await waitForChromeTabLoaded(60000);
+            if (!pageLoaded) {
+                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            }
        }

        const result = await extractAccessibility(url);
--- a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
+++ b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
@@ -25,7 +25,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'archive_org'
+PLUGIN_NAME = 'archive_org'
 OUTPUT_DIR = '.'
 OUTPUT_FILE = 'archive.org.txt'

--- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js
+++ b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js
@@ -26,101 +26,23 @@ const { spawn } = require('child_process');
 const http = require('http');

 // Extractor metadata
-const EXTRACTOR_NAME = 'chrome_launch';
+const PLUGIN_NAME = 'chrome_launch';
 const OUTPUT_DIR = 'chrome';

-// Helper: Write PID file with mtime set to process start time
+// Helpers for PID file creation
 function writePidWithMtime(filePath, pid, startTimeSeconds) {
    fs.writeFileSync(filePath, String(pid));
-    // Set both atime and mtime to process start time for validation
    const startTimeMs = startTimeSeconds * 1000;
    fs.utimesSync(filePath, new Date(startTimeMs), new Date(startTimeMs));
 }

-// Helper: Write command script for validation
 function writeCmdScript(filePath, binary, args) {
-    // Shell escape arguments containing spaces or special characters
-    const escapedArgs = args.map(arg => {
-        if (arg.includes(' ') || arg.includes('"') || arg.includes('$')) {
-            return `"${arg.replace(/"/g, '\\"')}"`;
-        }
-        return arg;
-    });
-    const script = `#!/bin/bash\n${binary} ${escapedArgs.join(' ')}\n`;
-    fs.writeFileSync(filePath, script);
+    const escape = arg => (arg.includes(' ') || arg.includes('"') || arg.includes('$'))
+        ? `"${arg.replace(/"/g, '\\"')}"` : arg;
+    fs.writeFileSync(filePath, `#!/bin/bash\n${binary} ${args.map(escape).join(' ')}\n`);
    fs.chmodSync(filePath, 0o755);
 }

-// Helper: Get process start time (cross-platform)
-function getProcessStartTime(pid) {
-    try {
-        const { execSync } = require('child_process');
-        if (process.platform === 'darwin') {
-            // macOS: ps -p PID -o lstart= gives start time
-            const output = execSync(`ps -p ${pid} -o lstart=`, { encoding: 'utf8', timeout: 1000 });
-            return Date.parse(output.trim()) / 1000;  // Convert to epoch seconds
-        } else {
-            // Linux: read /proc/PID/stat field 22 (starttime in clock ticks)
-            const stat = fs.readFileSync(`/proc/${pid}/stat`, 'utf8');
-            const match = stat.match(/\) \w+ (\d+)/);
-            if (match) {
-                const startTicks = parseInt(match[1], 10);
-                // Convert clock ticks to seconds (assuming 100 ticks/sec)
-                const uptimeSeconds = parseFloat(fs.readFileSync('/proc/uptime', 'utf8').split(' ')[0]);
-                const bootTime = Date.now() / 1000 - uptimeSeconds;
-                return bootTime + (startTicks / 100);
-            }
-        }
-    } catch (e) {
-        // Can't get start time
-        return null;
-    }
-    return null;
-}
-
-// Helper: Validate PID using mtime and command
-function validatePid(pid, pidFile, cmdFile) {
-    try {
-        // Check process exists
-        try {
-            process.kill(pid, 0);  // Signal 0 = check existence
-        } catch (e) {
-            return false;  // Process doesn't exist
-        }
-
-        // Check mtime matches process start time (within 5 sec tolerance)
-        const fileStat = fs.statSync(pidFile);
-        const fileMtime = fileStat.mtimeMs / 1000;  // Convert to seconds
-        const procStartTime = getProcessStartTime(pid);
-
-        if (procStartTime === null) {
-            // Can't validate - fall back to basic existence check
-            return true;
-        }
-
-        if (Math.abs(fileMtime - procStartTime) > 5) {
-            // PID was reused by different process
-            return false;
-        }
-
-        // Validate command if available
-        if (fs.existsSync(cmdFile)) {
-            const cmd = fs.readFileSync(cmdFile, 'utf8');
-            // Check for Chrome/Chromium and debug port
-            if (!cmd.includes('chrome') && !cmd.includes('chromium')) {
-                return false;
-            }
-            if (!cmd.includes('--remote-debugging-port')) {
-                return false;
-            }
-        }
-
-        return true;
-    } catch (e) {
-        return false;
-    }
-}
-
 // Global state for cleanup
 let chromePid = null;

@@ -332,20 +254,20 @@ function killZombieChrome() {
                        const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
                        if (isNaN(pid) || pid <= 0) continue;

-                        // Validate PID before killing
-                        const cmdFile = path.join(chromeDir, 'cmd.sh');
-                        if (!validatePid(pid, pidFile, cmdFile)) {
-                            // PID reused or validation failed
-                            console.error(`[!] PID ${pid} failed validation (reused or wrong process) - cleaning up`);
+                        // Check if process exists (simple check, Python will validate properly)
+                        try {
+                            process.kill(pid, 0);
+                        } catch (e) {
+                            // Process dead, remove stale PID file
                            try { fs.unlinkSync(pidFile); } catch (e) {}
                            continue;
                        }

-                        // Process alive, validated, and crawl is stale - zombie!
-                        console.error(`[!] Found validated zombie (PID ${pid}) from stale crawl ${crawl.name}`);
+                        // Process alive and crawl is stale - zombie!
+                        console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);

                        try {
-                            // Kill process group first
+                            // Kill process group
                            try {
                                process.kill(-pid, 'SIGKILL');
                            } catch (e) {
@@ -354,14 +276,10 @@ function killZombieChrome() {

                            killed++;
                            console.error(`[+] Killed zombie (PID ${pid})`);
-
-                            // Remove PID file
                            try { fs.unlinkSync(pidFile); } catch (e) {}
-
                        } catch (e) {
                            console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
                        }
-
                    } catch (e) {
                        // Skip invalid PID files
                    }
--- a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js
+++ b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js
@@ -29,7 +29,7 @@ const http = require('http');
 const puppeteer = require('puppeteer-core');

 // Extractor metadata
-const EXTRACTOR_NAME = 'chrome_tab';
+const PLUGIN_NAME = 'chrome_tab';
 const OUTPUT_DIR = '.';  // Hook already runs in chrome/ output directory
 const CHROME_SESSION_DIR = '.';

--- a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js
+++ b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js
@@ -19,7 +19,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');

-const EXTRACTOR_NAME = 'chrome_navigate';
+const PLUGIN_NAME = 'chrome_navigate';
 const CHROME_SESSION_DIR = '.';
 const OUTPUT_DIR = '.';

--- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js
+++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js
@@ -14,7 +14,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');

-const EXTRACTOR_NAME = 'consolelog';
+const PLUGIN_NAME = 'consolelog';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'console.jsonl';
 const PID_FILE = 'hook.pid';
--- a/archivebox/plugins/dom/on_Snapshot__36_dom.js
+++ b/archivebox/plugins/dom/on_Snapshot__36_dom.js
@@ -23,7 +23,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');

 // Extractor metadata
-const EXTRACTOR_NAME = 'dom';
+const PLUGIN_NAME = 'dom';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'output.html';
 const CHROME_SESSION_DIR = '../chrome';
@@ -252,10 +252,14 @@ async function main() {
            }));
            process.exit(0);
        } else {
-            // Wait for page to be fully loaded
-            const pageLoaded = await waitForChromeTabLoaded(60000);
-            if (!pageLoaded) {
-                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            // Only wait for page load if using shared Chrome session
+            const cdpUrl = getCdpUrl();
+            if (cdpUrl) {
+                // Wait for page to be fully loaded
+                const pageLoaded = await waitForChromeTabLoaded(60000);
+                if (!pageLoaded) {
+                    throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+                }
            }

            const result = await dumpDom(url);
--- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
+++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
@@ -27,7 +27,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'favicon'
+PLUGIN_NAME = 'favicon'
 OUTPUT_DIR = '.'
 OUTPUT_FILE = 'favicon.ico'

--- a/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py
+++ b/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py
@@ -31,7 +31,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'forumdl'
+PLUGIN_NAME = 'forumdl'
 BIN_NAME = 'forum-dl'
 BIN_PROVIDERS = 'pip,env'
 OUTPUT_DIR = '.'
--- a/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py
+++ b/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py
@@ -32,7 +32,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'gallerydl'
+PLUGIN_NAME = 'gallerydl'
 BIN_NAME = 'gallery-dl'
 BIN_PROVIDERS = 'pip,env'
 OUTPUT_DIR = '.'
--- a/archivebox/plugins/git/on_Snapshot__12_git.py
+++ b/archivebox/plugins/git/on_Snapshot__12_git.py
@@ -24,7 +24,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'git'
+PLUGIN_NAME = 'git'
 BIN_NAME = 'git'
 BIN_PROVIDERS = 'apt,brew,env'
 OUTPUT_DIR = '.'
--- a/archivebox/plugins/headers/on_Snapshot__33_headers.js
+++ b/archivebox/plugins/headers/on_Snapshot__33_headers.js
@@ -21,7 +21,7 @@ const https = require('https');
 const http = require('http');

 // Extractor metadata
-const EXTRACTOR_NAME = 'headers';
+const PLUGIN_NAME = 'headers';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'headers.json';
 const CHROME_SESSION_DIR = '../chrome';
--- a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
+++ b/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
@@ -26,7 +26,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'htmltotext'
+PLUGIN_NAME = 'htmltotext'
 OUTPUT_DIR = '.'
 OUTPUT_FILE = 'htmltotext.txt'

--- a/archivebox/plugins/media/on_Snapshot__51_media.py
+++ b/archivebox/plugins/media/on_Snapshot__51_media.py
@@ -34,7 +34,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'media'
+PLUGIN_NAME = 'media'
 BIN_NAME = 'yt-dlp'
 BIN_PROVIDERS = 'pip,apt,brew,env'
 OUTPUT_DIR = '.'
--- a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py
+++ b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py
@@ -25,7 +25,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'mercury'
+PLUGIN_NAME = 'mercury'
 BIN_NAME = 'postlight-parser'
 BIN_PROVIDERS = 'npm,env'
 OUTPUT_DIR = '.'
--- a/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py
+++ b/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py
@@ -28,7 +28,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'papersdl'
+PLUGIN_NAME = 'papersdl'
 BIN_NAME = 'papers-dl'
 BIN_PROVIDERS = 'pip,env'
 OUTPUT_DIR = '.'
--- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
+++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
@@ -23,7 +23,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');

 // Extractor metadata
-const EXTRACTOR_NAME = 'parse_dom_outlinks';
+const PLUGIN_NAME = 'parse_dom_outlinks';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'outlinks.json';
 const URLS_FILE = 'urls.jsonl';  // For crawl system
@@ -190,7 +190,7 @@ async function extractOutlinks(url) {
        const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
            type: 'Snapshot',
            url: href,
-            via_extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
        })).join('\n');

        if (urlsJsonl) {
@@ -236,10 +236,14 @@ async function main() {
            process.exit(0);
        }

-        // Wait for page to be fully loaded
-        const pageLoaded = await waitForChromeTabLoaded(60000);
-        if (!pageLoaded) {
-            throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+        // Check if Chrome session exists, then wait for page load
+        const cdpUrl = getCdpUrl();
+        if (cdpUrl) {
+            // Wait for page to be fully loaded
+            const pageLoaded = await waitForChromeTabLoaded(60000);
+            if (!pageLoaded) {
+                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            }
        }

        const result = await extractOutlinks(url);
--- a/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
@@ -28,7 +28,7 @@ from urllib.parse import urljoin, urlparse

 import rich_click as click

-EXTRACTOR_NAME = 'parse_html_urls'
+PLUGIN_NAME = 'parse_html_urls'

 # Check if parse_dom_outlinks extractor already ran
 DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
@@ -179,7 +179,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
        record = {
            'type': 'Snapshot',
            'url': found_url,
-            'via_extractor': EXTRACTOR_NAME,
+            'plugin': PLUGIN_NAME,
            'depth': depth + 1,
        }
        if snapshot_id:
--- a/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
@@ -233,7 +233,7 @@ class TestParseHtmlUrls:
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
-        assert 'via_extractor' in entry
+        assert 'plugin' in entry


 if __name__ == '__main__':
--- a/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py
+++ b/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py
@@ -24,7 +24,7 @@ from urllib.parse import urlparse

 import rich_click as click

-EXTRACTOR_NAME = 'parse_jsonl_urls'
+PLUGIN_NAME = 'parse_jsonl_urls'


 def parse_bookmarked_at(link: dict) -> str | None:
@@ -75,7 +75,7 @@ def json_object_to_entry(link: dict) -> dict | None:
    entry = {
        'type': 'Snapshot',
        'url': unescape(url),
-        'via_extractor': EXTRACTOR_NAME,
+        'plugin': PLUGIN_NAME,
    }

    # Parse title
--- a/archivebox/plugins/parse_jsonl_urls/tests/test_parse_jsonl_urls.py
+++ b/archivebox/plugins/parse_jsonl_urls/tests/test_parse_jsonl_urls.py
@@ -265,7 +265,7 @@ class TestParseJsonlUrls:
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
-        assert 'via_extractor' in entry
+        assert 'plugin' in entry


 if __name__ == '__main__':
--- a/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py
+++ b/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py
@@ -22,7 +22,7 @@ from urllib.parse import urlparse

 import rich_click as click

-EXTRACTOR_NAME = 'parse_netscape_urls'
+PLUGIN_NAME = 'parse_netscape_urls'

 # Constants for timestamp epoch detection
 UNIX_EPOCH = 0  # 1970-01-01 00:00:00 UTC
@@ -187,7 +187,7 @@ def main(url: str, snapshot_id: str = None):
            entry = {
                'type': 'Snapshot',
                'url': unescape(bookmark_url),
-                'via_extractor': EXTRACTOR_NAME,
+                'plugin': PLUGIN_NAME,
            }
            if title:
                entry['title'] = unescape(title)
--- a/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py
+++ b/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py
@@ -23,7 +23,7 @@ from urllib.parse import urlparse

 import rich_click as click

-EXTRACTOR_NAME = 'parse_rss_urls'
+PLUGIN_NAME = 'parse_rss_urls'

 try:
    import feedparser
@@ -107,7 +107,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
        entry = {
            'type': 'Snapshot',
            'url': unescape(item_url),
-            'via_extractor': EXTRACTOR_NAME,
+            'plugin': PLUGIN_NAME,
            'depth': depth + 1,
        }
        if snapshot_id:
--- a/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py
+++ b/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py
@@ -47,7 +47,7 @@ class TestRssVariants:

        assert entry['url'] == 'https://example.com/article1'
        assert entry['title'] == 'RSS 0.91 Article'
-        assert entry['via_extractor'] == 'parse_rss_urls'
+        assert entry['plugin'] == 'parse_rss_urls'

    def test_rss_10_rdf(self, tmp_path):
        """Test RSS 1.0 (RDF) format."""
--- a/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py
+++ b/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py
@@ -25,7 +25,7 @@ from urllib.request import urlopen

 import rich_click as click

-EXTRACTOR_NAME = 'parse_txt_urls'
+PLUGIN_NAME = 'parse_txt_urls'

 # URL regex from archivebox/misc/util.py
 # https://mathiasbynens.be/demo/url-regex
@@ -127,7 +127,7 @@ def main(url: str, snapshot_id: str = None):
            f.write(json.dumps({
                'type': 'Snapshot',
                'url': found_url,
-                'via_extractor': EXTRACTOR_NAME,
+                'plugin': PLUGIN_NAME,
            }) + '\n')

    click.echo(f'Found {len(urls_found)} URLs')
--- a/archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py
+++ b/archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py
@@ -186,7 +186,7 @@ https://other.com
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
-        assert 'via_extractor' in entry
+        assert 'plugin' in entry


 if __name__ == '__main__':
--- a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js
+++ b/archivebox/plugins/pdf/on_Snapshot__35_pdf.js
@@ -22,7 +22,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');

 // Extractor metadata
-const EXTRACTOR_NAME = 'pdf';
+const PLUGIN_NAME = 'pdf';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'output.pdf';
 const CHROME_SESSION_DIR = '../chrome';
@@ -254,10 +254,14 @@ async function main() {
            }));
            process.exit(0);  // Permanent skip - staticfile already handled
        } else {
-            // Wait for page to be fully loaded
-            const pageLoaded = await waitForChromeTabLoaded(60000);
-            if (!pageLoaded) {
-                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            // Only wait for page load if using shared Chrome session
+            const cdpUrl = getCdpUrl();
+            if (cdpUrl) {
+                // Wait for page to be fully loaded
+                const pageLoaded = await waitForChromeTabLoaded(60000);
+                if (!pageLoaded) {
+                    throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+                }
            }

            const result = await printToPdf(url);
--- a/archivebox/plugins/extractor_utils.py
+++ b/archivebox/plugins/extractor_utils.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
 """
-Shared utilities for extractor hooks.
+Shared utilities for extractor plugin hooks.

-This module provides common functionality for all extractors to ensure
+This module provides common functionality for all extractor plugins to ensure
 consistent behavior, output format, error handling, and timing.

-All extractors should:
+All extractor plugins should:
 1. Import and use these utilities
 2. Output consistent metadata (CMD, VERSION, OUTPUT, timing)
 3. Write all files to $PWD
@@ -35,7 +35,7 @@ STATIC_EXTENSIONS = (


 def is_static_file(url: str) -> bool:
-    """Check if URL points to a static file that may not need browser extraction."""
+    """Check if URL points to a static file that may not need browser-based extractor plugins."""
    return url.lower().split('?')[0].split('#')[0].endswith(STATIC_EXTENSIONS)


@@ -96,7 +96,7 @@ def get_version(binary: str, version_args: list[str] | None = None) -> str:

 class ExtractorResult:
    """
-    Tracks extractor execution and produces consistent output.
+    Tracks extractor plugin execution and produces consistent output.

    Usage:
        result = ExtractorResult(name='wget', url=url)
@@ -152,7 +152,7 @@ class ExtractorResult:
        return 1

    def finish(self, status: str | None = None):
-        """Mark extraction as finished and print results."""
+        """Mark extractor plugin execution as finished and print results."""
        self.end_ts = datetime.now(timezone.utc)
        if status:
            self.status = status
--- a/archivebox/plugins/readability/on_Snapshot__52_readability.py
+++ b/archivebox/plugins/readability/on_Snapshot__52_readability.py
@@ -27,7 +27,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'readability'
+PLUGIN_NAME = 'readability'
 BIN_NAME = 'readability-extractor'
 BIN_PROVIDERS = 'npm,env'
 OUTPUT_DIR = '.'
--- a/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js
+++ b/archivebox/plugins/redirects/on_Snapshot__31_redirects.bg.js
@@ -14,7 +14,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');

-const EXTRACTOR_NAME = 'redirects';
+const PLUGIN_NAME = 'redirects';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'redirects.jsonl';
 const PID_FILE = 'hook.pid';
@@ -235,7 +235,7 @@ function handleShutdown(signal) {
        type: 'ArchiveResult',
        status: 'succeeded',
        output_str: OUTPUT_FILE,
-        extractor: EXTRACTOR_NAME,
+        plugin: PLUGIN_NAME,
        original_url: originalUrl,
        final_url: finalUrl || originalUrl,
        redirect_count: redirectChain.length,
--- a/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
+++ b/archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
@@ -15,7 +15,7 @@ const path = require('path');
 const crypto = require('crypto');
 const puppeteer = require('puppeteer-core');

-const EXTRACTOR_NAME = 'responses';
+const PLUGIN_NAME = 'responses';
 const OUTPUT_DIR = '.';
 const PID_FILE = 'hook.pid';
 const CHROME_SESSION_DIR = '../chrome';
--- a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
+++ b/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
@@ -22,7 +22,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');

 // Extractor metadata
-const EXTRACTOR_NAME = 'screenshot';
+const PLUGIN_NAME = 'screenshot';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'screenshot.png';
 const CHROME_SESSION_DIR = '../chrome';
@@ -250,10 +250,14 @@ async function main() {
            }));
            process.exit(0);  // Permanent skip - staticfile already handled
        } else {
-            // Wait for page to be fully loaded
-            const pageLoaded = await waitForChromeTabLoaded(60000);
-            if (!pageLoaded) {
-                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            // Only wait for page load if using shared Chrome session
+            const cdpUrl = getCdpUrl();
+            if (cdpUrl) {
+                // Wait for page to be fully loaded
+                const pageLoaded = await waitForChromeTabLoaded(60000);
+                if (!pageLoaded) {
+                    throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+                }
            }

            const result = await takeScreenshot(url);
--- a/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py
+++ b/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py
@@ -27,7 +27,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'index_sonic'
+PLUGIN_NAME = 'index_sonic'
 OUTPUT_DIR = '.'

 # Text file patterns to index
@@ -83,14 +83,14 @@ def find_indexable_content() -> list[tuple[str, str]]:
    cwd = Path.cwd()

    for extractor, file_pattern in INDEXABLE_FILES:
-        extractor_dir = cwd / extractor
-        if not extractor_dir.exists():
+        plugin_dir = cwd / extractor
+        if not plugin_dir.exists():
            continue

        if '*' in file_pattern:
-            matches = list(extractor_dir.glob(file_pattern))
+            matches = list(plugin_dir.glob(file_pattern))
        else:
-            match = extractor_dir / file_pattern
+            match = plugin_dir / file_pattern
            matches = [match] if match.exists() else []

        for match in matches:
--- a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py
+++ b/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py
@@ -25,7 +25,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'index_sqlite'
+PLUGIN_NAME = 'index_sqlite'
 OUTPUT_DIR = '.'

 # Text file patterns to index, in priority order
@@ -74,14 +74,14 @@ def find_indexable_content() -> list[tuple[str, str]]:
    cwd = Path.cwd()

    for extractor, file_pattern in INDEXABLE_FILES:
-        extractor_dir = cwd / extractor
-        if not extractor_dir.exists():
+        plugin_dir = cwd / extractor
+        if not plugin_dir.exists():
            continue

        if '*' in file_pattern:
-            matches = list(extractor_dir.glob(file_pattern))
+            matches = list(plugin_dir.glob(file_pattern))
        else:
-            match = extractor_dir / file_pattern
+            match = plugin_dir / file_pattern
            matches = [match] if match.exists() else []

        for match in matches:
--- a/archivebox/plugins/seo/on_Snapshot__38_seo.js
+++ b/archivebox/plugins/seo/on_Snapshot__38_seo.js
@@ -20,7 +20,7 @@ const path = require('path');
 const puppeteer = require('puppeteer-core');

 // Extractor metadata
-const EXTRACTOR_NAME = 'seo';
+const PLUGIN_NAME = 'seo';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'seo.json';
 const CHROME_SESSION_DIR = '../chrome';
@@ -177,10 +177,14 @@ async function main() {
            process.exit(0);
        }

-        // Wait for page to be fully loaded
-        const pageLoaded = await waitForChromeTabLoaded(60000);
-        if (!pageLoaded) {
-            throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+        // Check if Chrome session exists, then wait for page load
+        const cdpUrl = getCdpUrl();
+        if (cdpUrl) {
+            // Wait for page to be fully loaded
+            const pageLoaded = await waitForChromeTabLoaded(60000);
+            if (!pageLoaded) {
+                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
+            }
        }

        const result = await extractSeo(url);
--- a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
+++ b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
@@ -36,7 +36,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'singlefile'
+PLUGIN_NAME = 'singlefile'
 BIN_NAME = 'single-file'
 BIN_PROVIDERS = 'npm,env'
 OUTPUT_DIR = '.'
--- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
+++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
@@ -14,7 +14,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');

-const EXTRACTOR_NAME = 'ssl';
+const PLUGIN_NAME = 'ssl';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'ssl.jsonl';
 const PID_FILE = 'hook.pid';
--- a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js
+++ b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.bg.js
@@ -14,7 +14,7 @@ const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');

-const EXTRACTOR_NAME = 'staticfile';
+const PLUGIN_NAME = 'staticfile';
 const OUTPUT_DIR = '.';
 const PID_FILE = 'hook.pid';
 const CHROME_SESSION_DIR = '../chrome';
@@ -326,7 +326,7 @@ function handleShutdown(signal) {
            type: 'ArchiveResult',
            status: 'skipped',
            output_str: 'No Content-Type detected',
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
        };
    } else if (!isStaticFile) {
        // Not a static file (normal case for HTML pages)
@@ -334,7 +334,7 @@ function handleShutdown(signal) {
            type: 'ArchiveResult',
            status: 'skipped',
            output_str: `Not a static file (Content-Type: ${detectedContentType})`,
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
            content_type: detectedContentType,
        };
    } else if (downloadError) {
@@ -343,7 +343,7 @@ function handleShutdown(signal) {
            type: 'ArchiveResult',
            status: 'failed',
            output_str: downloadError,
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
            content_type: detectedContentType,
        };
    } else if (downloadedFilePath) {
@@ -352,7 +352,7 @@ function handleShutdown(signal) {
            type: 'ArchiveResult',
            status: 'succeeded',
            output_str: downloadedFilePath,
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
            content_type: detectedContentType,
        };
    } else {
@@ -361,7 +361,7 @@ function handleShutdown(signal) {
            type: 'ArchiveResult',
            status: 'failed',
            output_str: 'Static file detected but download did not complete',
-            extractor: EXTRACTOR_NAME,
+            plugin: PLUGIN_NAME,
            content_type: detectedContentType,
        };
    }
--- a/archivebox/plugins/title/on_Snapshot__32_title.js
+++ b/archivebox/plugins/title/on_Snapshot__32_title.js
@@ -20,7 +20,7 @@ const https = require('https');
 const http = require('http');

 // Extractor metadata
-const EXTRACTOR_NAME = 'title';
+const PLUGIN_NAME = 'title';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'title.txt';
 const CHROME_SESSION_DIR = '../chrome';
--- a/archivebox/plugins/wget/on_Snapshot__50_wget.py
+++ b/archivebox/plugins/wget/on_Snapshot__50_wget.py
@@ -39,7 +39,7 @@ import rich_click as click


 # Extractor metadata
-EXTRACTOR_NAME = 'wget'
+PLUGIN_NAME = 'wget'
 BIN_NAME = 'wget'
 BIN_PROVIDERS = 'apt,brew,env'
 OUTPUT_DIR = '.'