Many fixes across plugins and hooks (assisted by Codex)

This commit is contained in:
Nick Sweeting
2026-01-19 01:00:53 -08:00
parent eaf7256345
commit c7b2217cd6
184 changed files with 3943 additions and 2420 deletions

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--accessibility" title="Accessibility"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="4.5" r="2" fill="currentColor" stroke="none"/><path d="M4 7.5h16"/><path d="M12 7.5v12"/><path d="M7 20l5-6 5 6"/></svg></span>

View File

@@ -1 +0,0 @@
"""Tests for the accessibility plugin."""

View File

@@ -10,7 +10,7 @@ import json
import sys
import rich_click as click
from abx_pkg import Binary, AptProvider
from abx_pkg import Binary, AptProvider, BinProviderOverrides
# Fix pydantic forward reference issue
AptProvider.model_rebuild()

View File

@@ -1 +0,0 @@
"""Tests for the apt binary provider plugin."""

View File

@@ -21,7 +21,7 @@ from django.test import TestCase
# Get the path to the apt provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_apt_provider.py'
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_apt_install.py'), None)
def apt_available() -> bool:
@@ -48,7 +48,7 @@ class TestAptProviderHook(TestCase):
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_skips_when_apt_not_allowed(self):
"""Hook should skip when apt not in allowed binproviders."""

View File

@@ -47,6 +47,9 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
def log(message: str) -> None:
print(f'[archivedotorg] {message}', file=sys.stderr)
try:
import requests
except ImportError:
@@ -56,6 +59,8 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
submit_url = f'https://web.archive.org/save/{url}'
log(f'Submitting to Wayback Machine (timeout={timeout}s)')
log(f'GET {submit_url}')
try:
response = requests.get(
@@ -64,31 +69,40 @@ def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
headers={'User-Agent': user_agent},
allow_redirects=True,
)
log(f'HTTP {response.status_code} final_url={response.url}')
# Check for successful archive
content_location = response.headers.get('Content-Location', '')
x_archive_orig_url = response.headers.get('X-Archive-Orig-Url', '')
if content_location:
log(f'Content-Location: {content_location}')
if x_archive_orig_url:
log(f'X-Archive-Orig-Url: {x_archive_orig_url}')
# Build archive URL
if content_location:
archive_url = f'https://web.archive.org{content_location}'
Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
log(f'Saved archive URL -> {archive_url}')
return True, OUTPUT_FILE, ''
elif 'web.archive.org' in response.url:
# We were redirected to an archive page
Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8')
log(f'Redirected to archive page -> {response.url}')
return True, OUTPUT_FILE, ''
else:
# Check for errors in response
if 'RobotAccessControlException' in response.text:
# Blocked by robots.txt - save submit URL for manual retry
Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
log('Blocked by robots.txt, saved submit URL for manual retry')
return True, OUTPUT_FILE, '' # Consider this a soft success
elif response.status_code >= 400:
return False, None, f'HTTP {response.status_code}'
else:
# Save submit URL anyway
Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
log('No archive URL returned, saved submit URL for manual retry')
return True, OUTPUT_FILE, ''
except requests.Timeout:

View File

@@ -1 +1 @@
🏛️
<span class="abx-output-icon abx-output-icon--archivedotorg" title="Archive.org"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M3 7h18"/><rect x="3" y="7" width="18" height="13" rx="2"/><path d="M9 12h6"/></svg></span>

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "chrome", "binproviders": "npm,env,brew,apt", "overrides": {"npm": {"packages": ["@puppeteer/browsers"]}}}

View File

@@ -1253,7 +1253,7 @@ function getExtensionTargets(browser) {
}
/**
* Find Chromium/Chrome binary path.
* Find Chromium binary path.
* Checks CHROME_BINARY env var first, then falls back to system locations.
*
* @returns {string|null} - Absolute path to browser binary or null if not found
@@ -1276,7 +1276,9 @@ function findChromium() {
const chromeBinary = getEnv('CHROME_BINARY');
if (chromeBinary) {
const absPath = path.resolve(chromeBinary);
if (validateBinary(absPath)) {
if (absPath.includes('Google Chrome') || absPath.includes('google-chrome')) {
console.error('[!] Warning: CHROME_BINARY points to Chrome. Chromium is required for extension support.');
} else if (validateBinary(absPath)) {
return absPath;
}
console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`);
@@ -1309,7 +1311,7 @@ function findChromium() {
return null;
};
// 3. Search fallback locations (Chromium first, then Chrome)
// 3. Search fallback locations (Chromium only)
const fallbackLocations = [
// System Chromium
'/Applications/Chromium.app/Contents/MacOS/Chromium',
@@ -1318,10 +1320,6 @@ function findChromium() {
// Puppeteer cache
path.join(process.env.HOME || '', '.cache/puppeteer/chromium'),
path.join(process.env.HOME || '', '.cache/puppeteer'),
// Chrome (fallback - extensions may not work in 137+)
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
];
for (const loc of fallbackLocations) {
@@ -1332,9 +1330,6 @@ function findChromium() {
return binary;
}
} else if (validateBinary(loc)) {
if (loc.includes('Google Chrome') || loc.includes('google-chrome')) {
console.error('[!] Warning: Using Chrome instead of Chromium. Extension loading may not work in Chrome 137+');
}
return loc;
}
}
@@ -1699,10 +1694,10 @@ module.exports = {
// Chrome launching
launchChromium,
killChrome,
// Chrome/Chromium install
// Chromium install
installChromium,
installPuppeteerCore,
// Chrome/Chromium binary finding
// Chromium binary finding
findChromium,
// Extension utilities
getExtensionId,
@@ -1744,7 +1739,7 @@ if (require.main === module) {
console.log('Usage: chrome_utils.js <command> [args...]');
console.log('');
console.log('Commands:');
console.log(' findChromium Find Chrome/Chromium binary');
console.log(' findChromium Find Chromium binary');
console.log(' installChromium Install Chromium via @puppeteer/browsers');
console.log(' installPuppeteerCore Install puppeteer-core npm package');
console.log(' launchChromium Launch Chrome with CDP debugging');

View File

@@ -7,13 +7,13 @@
"type": "boolean",
"default": true,
"x-aliases": ["USE_CHROME"],
"description": "Enable Chrome/Chromium browser integration for archiving"
"description": "Enable Chromium browser integration for archiving"
},
"CHROME_BINARY": {
"type": "string",
"default": "chromium",
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
"description": "Path to Chrome/Chromium binary"
"description": "Path to Chromium binary"
},
"CHROME_NODE_BINARY": {
"type": "string",

View File

@@ -1,265 +0,0 @@
#!/usr/bin/env python3
"""
Install hook for Chrome/Chromium and puppeteer-core.
Runs at crawl start to install/find Chromium and puppeteer-core.
Also validates config and computes derived values.
Outputs:
- JSONL for Binary and Machine config updates
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
Respects CHROME_BINARY env var for custom binary paths.
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""
import os
import sys
import json
import subprocess
from pathlib import Path
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable *name* with surrounding whitespace stripped, or *default*."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean environment variable.

    Recognizes true/1/yes/on and false/0/no/off (case-insensitive);
    any other value (including unset/empty) falls back to *default*.
    """
    val = get_env(name, '').lower()
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')
    if val in truthy:
        return True
    return False if val in falsy else default
def detect_docker() -> bool:
    """Best-effort check for whether we are running inside a container.

    Checks the IN_DOCKER env var, Docker's /.dockerenv marker file, and
    the podman/cri-o /run/.containerenv marker file.
    """
    if os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes'):
        return True
    return os.path.exists('/.dockerenv') or os.path.exists('/run/.containerenv')
def get_chrome_version(binary_path: str) -> str | None:
"""Get Chrome/Chromium version string."""
try:
result = subprocess.run(
[binary_path, '--version'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
return None
def install_puppeteer_core() -> bool:
    """Ensure puppeteer-core is available in NODE_MODULES_DIR, installing via npm if needed.

    Returns True when nothing needs doing (no isolated node_modules configured,
    or the package already exists) or when the npm install succeeds; False when
    the install fails.
    """
    node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
    if not node_modules_dir:
        # No isolated node_modules configured; the global install will be used.
        return True

    node_modules_path = Path(node_modules_dir)
    if (node_modules_path / 'puppeteer-core').exists():
        return True  # already installed, nothing to do

    # npm installs into <prefix>/node_modules, so the prefix is the parent dir.
    npm_prefix = node_modules_path.parent
    try:
        print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
        result = subprocess.run(
            ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except Exception as e:
        print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
        return False

    if result.returncode != 0:
        print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr)
        return False
    print(f"[+] puppeteer-core installed", file=sys.stderr)
    return True
def install_chromium() -> dict | None:
    """Install Chromium via `npx @puppeteer/browsers` and return its Binary info.

    The installer prints "chromium@<version> <path_to_binary>" on success, e.g.
    "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium".
    npx is fast when chromium is already cached - it returns the path without
    re-downloading. Returns a dict with name/abspath/version/binprovider keys
    on success, or None on any failure.
    """
    try:
        print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
        # --path targets puppeteer's standard cache location.
        cache_path = os.path.expanduser('~/.cache/puppeteer')
        result = subprocess.run(
            ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
            capture_output=True,
            text=True,
            stdin=subprocess.DEVNULL,
            timeout=300,
        )
        if result.returncode != 0:
            print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
            return None

        # Expected output shape: "chromium@1563294 /path/to/Chromium"
        output = result.stdout.strip()
        parts = output.split(' ', 1)
        if len(parts) != 2:
            print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
            return None

        version_str = parts[0]  # e.g. "chromium@1563294"
        binary_path = parts[1].strip()
        if not binary_path or not os.path.exists(binary_path):
            print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
            return None

        version = version_str.split('@')[1] if '@' in version_str else None
        print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
        return {
            'name': 'chromium',
            'abspath': binary_path,
            'version': version,
            'binprovider': 'puppeteer',
        }
    except subprocess.TimeoutExpired:
        print("[!] Chromium install timed out", file=sys.stderr)
    except FileNotFoundError:
        print("[!] npx not found - is Node.js installed?", file=sys.stderr)
    except Exception as e:
        print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
    return None
def main():
    """Validate chrome config, find/install Chromium, and emit results.

    Emits on stdout:
      - JSONL Binary/Machine records
      - COMPUTED:KEY=VALUE lines that hooks.py parses into the env
    Warnings/errors go to stderr. Exits 0 on success, 1 when no Chromium
    binary could be found or installed.
    """
    warnings = []
    errors = []
    computed = {}
    # Install puppeteer-core if NODE_MODULES_DIR is set
    install_puppeteer_core()
    # Check if Chrome is enabled
    # NOTE(review): chrome_enabled is read but never acted upon below — confirm
    # whether an early-exit on disabled was intended.
    chrome_enabled = get_env_bool('CHROME_ENABLED', True)
    # Detect Docker and adjust sandbox
    in_docker = detect_docker()
    computed['IN_DOCKER'] = str(in_docker).lower()
    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
    if in_docker and chrome_sandbox:
        warnings.append(
            "Running in Docker with CHROME_SANDBOX=true. "
            "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
        )
        # Auto-disable sandbox in Docker unless explicitly set
        if not get_env('CHROME_SANDBOX'):
            computed['CHROME_SANDBOX'] = 'false'
    # Check Node.js availability
    node_binary = get_env('NODE_BINARY', 'node')
    computed['NODE_BINARY'] = node_binary
    # Check if CHROME_BINARY is already set and valid (executable file on disk);
    # if so, report it and skip the puppeteer install path entirely.
    configured_binary = get_env('CHROME_BINARY', '')
    if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
        version = get_chrome_version(configured_binary)
        computed['CHROME_BINARY'] = configured_binary
        computed['CHROME_VERSION'] = version or 'unknown'
        print(json.dumps({
            'type': 'Binary',
            'name': 'chromium',
            'abspath': configured_binary,
            'version': version,
            'binprovider': 'env',
        }))
        # Output computed values
        for key, value in computed.items():
            print(f"COMPUTED:{key}={value}")
        for warning in warnings:
            print(f"WARNING:{warning}", file=sys.stderr)
        sys.exit(0)
    # Install/find Chromium via puppeteer
    result = install_chromium()
    if result and result.get('abspath'):
        computed['CHROME_BINARY'] = result['abspath']
        computed['CHROME_VERSION'] = result['version'] or 'unknown'
        print(json.dumps({
            'type': 'Binary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'binprovider': result['binprovider'],
        }))
        # Persist the discovered binary path on the Machine record so later
        # runs can reuse it without reinstalling.
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/CHROME_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/CHROMIUM_VERSION',
                'value': result['version'],
            }))
        # Output computed values
        for key, value in computed.items():
            print(f"COMPUTED:{key}={value}")
        for warning in warnings:
            print(f"WARNING:{warning}", file=sys.stderr)
        sys.exit(0)
    else:
        errors.append("Chromium binary not found")
        computed['CHROME_BINARY'] = ''
    # Output computed values and errors (failure path)
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")
    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)
    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)
    sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""
Emit Chromium Binary dependency for the crawl.
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""
import json
import os
import sys
def main():
    """Emit the Chromium Binary dependency record for this crawl as JSONL.

    Exits 0 without printing anything when CHROME_ENABLED is falsy.
    """
    disabled = os.environ.get('CHROME_ENABLED', 'true').lower() in ('false', '0', 'no', 'off')
    if disabled:
        sys.exit(0)
    print(json.dumps({
        'type': 'Binary',
        'name': 'chromium',
        'binproviders': 'puppeteer,env',
        'overrides': {
            'puppeteer': ['chromium@latest', '--install-deps'],
        },
    }))
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -3,12 +3,12 @@
* Launch a shared Chromium browser session for the entire crawl.
*
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
* Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
* Each snapshot creates its own tab via on_Snapshot__10_chrome_tab.bg.js.
*
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Usage: on_Crawl__90_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
@@ -31,7 +31,7 @@ if (process.env.NODE_MODULES_DIR) {
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const puppeteer = require('puppeteer');
const {
findChromium,
launchChromium,

View File

@@ -2,11 +2,11 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
* If a crawl-level Chrome session exists (from on_Crawl__90_chrome_launch.bg.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
*
* Usage: on_Snapshot__20_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
* Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
* Output: Creates chrome/ directory under snapshot output dir with:
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chrome process ID (from crawl)
@@ -15,11 +15,14 @@
*
* Environment variables:
* CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session)
* CHROME_BINARY: Path to Chrome/Chromium binary (for fallback)
* CHROME_BINARY: Path to Chromium binary (for fallback)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
*
* This is a background hook that stays alive until SIGTERM so the tab
* can be closed cleanly at the end of the snapshot run.
*/
const fs = require('fs');
@@ -28,7 +31,7 @@ const { spawn } = require('child_process');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const puppeteer = require('puppeteer');
const {
findChromium,
getEnv,
@@ -43,6 +46,11 @@ const PLUGIN_NAME = 'chrome_tab';
const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory
const CHROME_SESSION_DIR = '.';
let finalStatus = 'failed';
let finalOutput = '';
let finalError = '';
let cmdVersion = '';
let finalized = false;
// Parse command line arguments
function parseArgs() {
@@ -56,8 +64,31 @@ function parseArgs() {
return args;
}
function emitResult(statusOverride) {
if (finalized) return;
finalized = true;
const status = statusOverride || finalStatus;
const outputStr = status === 'succeeded'
? finalOutput
: (finalError || finalOutput || '');
const result = {
type: 'ArchiveResult',
status,
output_str: outputStr,
};
if (cmdVersion) {
result.cmd_version = cmdVersion;
}
console.log(JSON.stringify(result));
}
// Cleanup handler for SIGTERM - close this snapshot's tab
async function cleanup() {
async function cleanup(signal) {
if (signal) {
console.error(`\nReceived ${signal}, closing chrome tab...`);
}
try {
const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
const targetIdFile = path.join(OUTPUT_DIR, 'target_id.txt');
@@ -78,12 +109,13 @@ async function cleanup() {
} catch (e) {
// Best effort
}
process.exit(0);
emitResult();
process.exit(finalStatus === 'succeeded' ? 0 : 1);
}
// Register signal handlers
process.on('SIGTERM', cleanup);
process.on('SIGINT', cleanup);
process.on('SIGTERM', () => cleanup('SIGTERM'));
process.on('SIGINT', () => cleanup('SIGINT'));
// Try to find the crawl's Chrome session
function findCrawlChromeSession(crawlId) {
@@ -272,23 +304,22 @@ async function main() {
const crawlId = args.crawl_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__20_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
process.exit(1);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let output = '';
let error = '';
let version = '';
try {
const binary = findChromium();
if (!binary) {
console.error('ERROR: Chrome/Chromium binary not found');
console.error('DEPENDENCY_NEEDED=chrome');
console.error('ERROR: Chromium binary not found');
console.error('DEPENDENCY_NEEDED=chromium');
console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
process.exit(1);
}
@@ -327,24 +358,22 @@ async function main() {
status = 'failed';
}
const endTs = new Date();
if (error) {
console.error(`ERROR: ${error}`);
}
// Output clean JSONL (no RESULT_JSON= prefix)
const result = {
type: 'ArchiveResult',
status,
output_str: output || error || '',
};
if (version) {
result.cmd_version = version;
}
console.log(JSON.stringify(result));
finalStatus = status;
finalOutput = output || '';
finalError = error || '';
cmdVersion = version || '';
process.exit(status === 'succeeded' ? 0 : 1);
if (status !== 'succeeded') {
emitResult(status);
process.exit(1);
}
console.log('[*] Chrome tab created, waiting for cleanup signal...');
await new Promise(() => {}); // Keep alive until SIGTERM
}
main().catch(e => {

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env node
/**
* Wait for Chrome session files to exist (cdp_url.txt + target_id.txt).
*
* This is a foreground hook that blocks until the Chrome tab is ready,
* so downstream hooks can safely connect to CDP.
*
* Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
getEnvInt,
waitForChromeSession,
readCdpUrl,
readTargetId,
} = require('./chrome_utils.js');
const CHROME_SESSION_DIR = '.';
function parseArgs() {
    // Collect --key=value (and bare --flag) CLI options into a plain object,
    // normalizing dashes in key names to underscores. Bare flags map to true.
    const parsed = {};
    for (const token of process.argv.slice(2)) {
        if (!token.startsWith('--')) continue;
        const [rawKey, ...valueParts] = token.slice(2).split('=');
        parsed[rawKey.replace(/-/g, '_')] = valueParts.join('=') || true;
    }
    return parsed;
}
async function main() {
    // Block until the chrome tab hook has written its session files
    // (cdp_url.txt + target_id.txt), then report readiness as an
    // ArchiveResult JSONL record on stdout.
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;
    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }

    // Most specific timeout wins: CHROME_TAB_TIMEOUT > CHROME_TIMEOUT > TIMEOUT > 60s.
    const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
    console.error(`[chrome_wait] Waiting for Chrome session (timeout=${timeoutSeconds}s)...`);

    const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutSeconds * 1000);
    if (!ready) {
        const error = `Chrome session not ready after ${timeoutSeconds}s (cdp_url.txt/target_id.txt missing)`;
        console.error(`[chrome_wait] ERROR: ${error}`);
        console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
        process.exit(1);
    }

    // Session files appeared; read and sanity-check their contents.
    const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
    const targetId = readTargetId(CHROME_SESSION_DIR);
    if (!cdpUrl || !targetId) {
        const error = 'Chrome session files incomplete (cdp_url.txt/target_id.txt missing)';
        console.error(`[chrome_wait] ERROR: ${error}`);
        console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
        process.exit(1);
    }

    console.error(`[chrome_wait] Chrome session ready (cdp_url=${cdpUrl.slice(0, 32)}..., target_id=${targetId}).`);
    console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', output_str: 'chrome session ready' }));
    process.exit(0);
}
// Entry point: surface any unhandled rejection from main() as a fatal error
// so the hook exits non-zero instead of hanging with a rejected promise.
main().catch(e => {
    console.error(`Fatal error: ${e.message}`);
    process.exit(1);
});

View File

@@ -19,7 +19,7 @@ const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
const puppeteer = require('puppeteer');
const PLUGIN_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = '.';

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--chrome" title="Chrome"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M3 9h18"/><circle cx="7" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="11" cy="7" r="1" fill="currentColor" stroke="none"/></svg></span>

View File

@@ -60,6 +60,7 @@ import os
import platform
import signal
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
@@ -72,11 +73,14 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
# Hook script locations
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__01_chrome_install.py'
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__70_chrome_install.py'
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__90_chrome_launch.bg.js'
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__10_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
PUPPETEER_BINARY_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Binary__12_puppeteer_install.py'
PUPPETEER_CRAWL_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Crawl__60_puppeteer_install.py'
NPM_BINARY_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__10_npm_install.py'
# =============================================================================
@@ -402,7 +406,7 @@ def run_hook(
# Determine interpreter based on file extension
if hook_script.suffix == '.py':
cmd = ['python', str(hook_script)]
cmd = [sys.executable, str(hook_script)]
elif hook_script.suffix == '.js':
cmd = ['node', str(hook_script)]
else:
@@ -451,6 +455,128 @@ def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optio
return None
def parse_jsonl_records(stdout: str) -> List[Dict[str, Any]]:
"""Parse all JSONL records from stdout."""
records: List[Dict[str, Any]] = []
for line in stdout.strip().split('\n'):
line = line.strip()
if not line.startswith('{'):
continue
try:
records.append(json.loads(line))
except json.JSONDecodeError:
continue
return records
def apply_machine_updates(records: List[Dict[str, Any]], env: dict) -> None:
"""Apply Machine update records to env dict in-place."""
for record in records:
if record.get('type') != 'Machine':
continue
config = record.get('config')
if not isinstance(config, dict):
continue
env.update(config)
def install_chromium_with_hooks(env: dict, timeout: int = 300) -> str:
"""Install Chromium via chrome crawl hook + puppeteer/npm hooks.
Returns absolute path to Chromium binary.
"""
puppeteer_result = subprocess.run(
[sys.executable, str(PUPPETEER_CRAWL_HOOK)],
capture_output=True,
text=True,
timeout=timeout,
env=env,
)
if puppeteer_result.returncode != 0:
raise RuntimeError(f"Puppeteer crawl hook failed: {puppeteer_result.stderr}")
puppeteer_record = parse_jsonl_output(puppeteer_result.stdout, record_type='Binary') or {}
if not puppeteer_record or puppeteer_record.get('name') != 'puppeteer':
raise RuntimeError("Puppeteer Binary record not emitted by crawl hook")
npm_cmd = [
sys.executable,
str(NPM_BINARY_HOOK),
'--machine-id=test-machine',
'--binary-id=test-puppeteer',
'--name=puppeteer',
f"--binproviders={puppeteer_record.get('binproviders', '*')}",
]
puppeteer_overrides = puppeteer_record.get('overrides')
if puppeteer_overrides:
npm_cmd.append(f'--overrides={json.dumps(puppeteer_overrides)}')
npm_result = subprocess.run(
npm_cmd,
capture_output=True,
text=True,
timeout=timeout,
env=env,
)
if npm_result.returncode != 0:
raise RuntimeError(f"Npm install failed: {npm_result.stderr}")
apply_machine_updates(parse_jsonl_records(npm_result.stdout), env)
chrome_result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=timeout,
env=env,
)
if chrome_result.returncode != 0:
raise RuntimeError(f"Chrome install hook failed: {chrome_result.stderr}")
chrome_record = parse_jsonl_output(chrome_result.stdout, record_type='Binary') or {}
if not chrome_record or chrome_record.get('name') not in ('chromium', 'chrome'):
raise RuntimeError("Chrome Binary record not emitted by crawl hook")
chromium_cmd = [
sys.executable,
str(PUPPETEER_BINARY_HOOK),
'--machine-id=test-machine',
'--binary-id=test-chromium',
f"--name={chrome_record.get('name', 'chromium')}",
f"--binproviders={chrome_record.get('binproviders', '*')}",
]
chrome_overrides = chrome_record.get('overrides')
if chrome_overrides:
chromium_cmd.append(f'--overrides={json.dumps(chrome_overrides)}')
result = subprocess.run(
chromium_cmd,
capture_output=True,
text=True,
timeout=timeout,
env=env,
)
if result.returncode != 0:
raise RuntimeError(f"Puppeteer chromium install failed: {result.stderr}")
records = parse_jsonl_records(result.stdout)
chromium_record = None
for record in records:
if record.get('type') == 'Binary' and record.get('name') in ('chromium', 'chrome'):
chromium_record = record
break
if not chromium_record:
chromium_record = parse_jsonl_output(result.stdout, record_type='Binary')
chromium_path = chromium_record.get('abspath')
if not chromium_path or not Path(chromium_path).exists():
raise RuntimeError(f"Chromium binary not found after install: {chromium_path}")
env['CHROME_BINARY'] = chromium_path
apply_machine_updates(records, env)
return chromium_path
def run_hook_and_parse(
hook_script: Path,
url: str,
@@ -499,7 +625,7 @@ def setup_test_env(tmpdir: Path) -> dict:
crawls/
snapshots/
Calls chrome install hook which handles puppeteer-core and chromium installation.
Calls chrome install hook + puppeteer/npm hooks for Chromium installation.
Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc.
Args:
@@ -559,31 +685,10 @@ def setup_test_env(tmpdir: Path) -> dict:
if 'CHROME_HEADLESS' not in os.environ:
env['CHROME_HEADLESS'] = 'true'
# Call chrome install hook (installs puppeteer-core and chromium, outputs JSONL)
result = subprocess.run(
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True, text=True, timeout=120, env=env
)
if result.returncode != 0:
pytest.skip(f"Chrome install hook failed: {result.stderr}")
# Parse JSONL output to get CHROME_BINARY
chrome_binary = None
for line in result.stdout.strip().split('\n'):
if not line.strip():
continue
try:
data = json.loads(line)
if data.get('type') == 'Binary' and data.get('abspath'):
chrome_binary = data['abspath']
break
except json.JSONDecodeError:
continue
if not chrome_binary or not Path(chrome_binary).exists():
pytest.skip(f"Chromium binary not found: {chrome_binary}")
env['CHROME_BINARY'] = chrome_binary
try:
install_chromium_with_hooks(env)
except RuntimeError as e:
pytest.skip(str(e))
return env
@@ -790,17 +895,8 @@ def chrome_session(
'CHROME_HEADLESS': 'true',
})
# CRITICAL: Run chrome install hook first (installs puppeteer-core and chromium)
# chrome_launch assumes chrome_install has already run
install_result = subprocess.run(
['python', str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=120,
env=env
)
if install_result.returncode != 0:
raise RuntimeError(f"Chrome install failed: {install_result.stderr}")
# Install Chromium via npm + puppeteer hooks using normal Binary flow
install_chromium_with_hooks(env)
# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(

View File

@@ -30,9 +30,8 @@ import platform
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_lib_dir,
get_node_modules_dir,
find_chromium_binary,
install_chromium_with_hooks,
CHROME_PLUGIN_DIR as PLUGIN_DIR,
CHROME_LAUNCH_HOOK,
CHROME_TAB_HOOK,
@@ -41,58 +40,24 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
@pytest.fixture(scope="session", autouse=True)
def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
"""Ensure Chromium and puppeteer are installed before running tests.
Puppeteer handles Chromium installation automatically in its own cache.
We only need to install puppeteer itself to LIB_DIR/npm.
"""
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
# Set DATA_DIR if not already set (required by abx_pkg)
"""Ensure Chromium and puppeteer are installed before running tests."""
if not os.environ.get('DATA_DIR'):
# Use isolated temp dir for direct pytest runs
test_data_dir = tmp_path_factory.mktemp('chrome_test_data')
os.environ['DATA_DIR'] = str(test_data_dir)
env = get_test_env()
# Compute paths AFTER setting DATA_DIR
lib_dir = get_lib_dir()
node_modules_dir = get_node_modules_dir()
npm_prefix = lib_dir / 'npm'
try:
chromium_binary = install_chromium_with_hooks(env)
except RuntimeError as e:
pytest.skip(str(e))
# Rebuild pydantic models
NpmProvider.model_rebuild()
# Install puppeteer if not available (it will handle Chromium in its own cache)
puppeteer_core_path = node_modules_dir / 'puppeteer-core'
if not puppeteer_core_path.exists():
print(f"\n[*] Installing puppeteer to {npm_prefix}...")
npm_prefix.mkdir(parents=True, exist_ok=True)
provider = NpmProvider(npm_prefix=npm_prefix)
try:
binary = Binary(
name='puppeteer',
binproviders=[provider],
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
)
binary.install()
print(f"[*] Puppeteer installed successfully to {npm_prefix}")
except Exception as e:
pytest.skip(f"Failed to install puppeteer: {e}")
# Find Chromium binary (puppeteer installs it automatically in its cache)
chromium_binary = find_chromium_binary()
if not chromium_binary:
pytest.skip("Chromium not found - puppeteer should install it automatically")
pytest.skip("Chromium not found after install")
# Set CHROME_BINARY env var for tests
os.environ['CHROME_BINARY'] = chromium_binary
# Get paths from helpers (will use DATA_DIR if set, or compute based on __file__)
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = get_node_modules_dir()
NPM_PREFIX = LIB_DIR / 'npm'
for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'):
if env.get(key):
os.environ[key] = env[key]
def test_hook_scripts_exist():

View File

@@ -32,6 +32,13 @@ const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const CHROME_SESSION_DIR = '../chrome';
let browser = null;
let page = null;
let logCount = 0;
let errorCount = 0;
let requestFailCount = 0;
let shuttingDown = false;
async function serializeArgs(args) {
const serialized = [];
for (const arg of args) {
@@ -73,6 +80,7 @@ async function setupListeners() {
location: msg.location(),
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
logCount += 1;
} catch (e) {
// Ignore errors
}
@@ -87,6 +95,7 @@ async function setupListeners() {
stack: error.stack || '',
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
errorCount += 1;
} catch (e) {
// Ignore
}
@@ -103,6 +112,7 @@ async function setupListeners() {
url: request.url(),
};
fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
requestFailCount += 1;
} catch (e) {
// Ignore
}
@@ -111,6 +121,29 @@ async function setupListeners() {
return { browser, page };
}
function emitResult(status = 'succeeded') {
  // Emit the final ArchiveResult record exactly once; later calls are no-ops.
  if (shuttingDown) {
    return;
  }
  shuttingDown = true;
  const counts = [
    `${logCount} console`,
    `${errorCount} errors`,
    `${requestFailCount} failed requests`,
  ].join(', ');
  const record = {
    type: 'ArchiveResult',
    status,
    output_str: `${OUTPUT_FILE} (${counts})`,
  };
  console.log(JSON.stringify(record));
}
async function handleShutdown(signal) {
  // Flush the final ArchiveResult before tearing the process down on SIGTERM/SIGINT.
  console.error(`\nReceived ${signal}, emitting final results...`);
  emitResult('succeeded');
  try {
    // Detach (not close) so the shared Chrome session keeps running.
    if (browser) browser.disconnect();
  } catch (e) {}
  process.exit(0);
}
async function main() {
const args = parseArgs();
const url = args.url;
@@ -127,23 +160,27 @@ async function main() {
process.exit(0);
}
const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;
try {
// Set up listeners BEFORE navigation
await setupListeners();
const connection = await setupListeners();
browser = connection.browser;
page = connection.page;
// Wait for chrome_navigate to complete (BLOCKING)
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
// Register signal handlers for graceful shutdown
process.on('SIGTERM', () => handleShutdown('SIGTERM'));
process.on('SIGINT', () => handleShutdown('SIGINT'));
// Output clean JSONL
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: OUTPUT_FILE,
}));
// Wait for chrome_navigate to complete (non-fatal)
try {
const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
} catch (e) {
console.error(`WARN: ${e.message}`);
}
process.exit(0);
// console.error('Consolelog active, waiting for cleanup signal...');
await new Promise(() => {}); // Keep alive until SIGTERM
return;
} catch (e) {
const error = `${e.name}: ${e.message}`;

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--consolelog" title="Console Log"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M7 12l2 2-2 2"/><path d="M11 16h6"/></svg></span>

View File

@@ -1 +0,0 @@
"""Tests for the consolelog plugin."""

View File

@@ -10,6 +10,7 @@ import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path
import pytest
@@ -76,26 +77,33 @@ class TestConsolelogWithChrome(TestCase):
# Use the environment from chrome_session (already has CHROME_HEADLESS=true)
# Run consolelog hook with the active Chrome session
result = subprocess.run(
# Run consolelog hook with the active Chrome session (background hook)
result = subprocess.Popen(
['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=120, # Longer timeout as it waits for navigation
env=env
)
# Check for output file
console_output = snapshot_chrome_dir / 'console.jsonl'
# Verify hook ran (may succeed or timeout waiting for navigation)
# The hook is designed to wait for page_loaded.txt from chrome_navigate
# In test mode, that file may not exist, so hook may timeout
# But it should still create the console.jsonl file
# Allow it to run briefly, then terminate (background hook)
time.sleep(3)
if result.poll() is None:
result.terminate()
try:
stdout, stderr = result.communicate(timeout=5)
except subprocess.TimeoutExpired:
result.kill()
stdout, stderr = result.communicate()
else:
stdout, stderr = result.communicate()
# At minimum, verify no crash
self.assertNotIn('Traceback', result.stderr)
self.assertNotIn('Traceback', stderr)
# If output file exists, verify it's valid JSONL
if console_output.exists():

View File

@@ -59,9 +59,16 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
provider = EnvProvider()
try:
binary = Binary(name=name, binproviders=[provider]).load()
except Exception as e:
click.echo(f"{name} not found after custom install: {e}", err=True)
sys.exit(1)
except Exception:
try:
binary = Binary(
name=name,
binproviders=[provider],
overrides={'env': {'version': '0.0.1'}},
).load()
except Exception as e:
click.echo(f"{name} not found after custom install: {e}", err=True)
sys.exit(1)
if not binary.abspath:
click.echo(f"{name} not found after custom install", err=True)

View File

@@ -1 +0,0 @@
"""Tests for the custom binary provider plugin."""

View File

@@ -17,7 +17,7 @@ from django.test import TestCase
# Get the path to the custom provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_custom_bash.py'
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_custom_install.py'), None)
class TestCustomProviderHook(TestCase):
@@ -34,7 +34,7 @@ class TestCustomProviderHook(TestCase):
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_skips_when_custom_not_allowed(self):
"""Hook should skip when custom not in allowed binproviders."""

View File

@@ -32,6 +32,11 @@ const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'dns.jsonl';
const CHROME_SESSION_DIR = '../chrome';
let browser = null;
let page = null;
let recordCount = 0;
let shuttingDown = false;
function extractHostname(url) {
try {
const urlObj = new URL(url);
@@ -121,6 +126,7 @@ async function setupListener(targetUrl) {
// Append to output file
fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n');
recordCount += 1;
} catch (e) {
// Ignore errors
@@ -170,6 +176,7 @@ async function setupListener(targetUrl) {
};
fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n');
recordCount += 1;
}
} catch (e) {
// Ignore errors
@@ -179,6 +186,28 @@ async function setupListener(targetUrl) {
return { browser, page, client };
}
function emitResult(status = 'succeeded') {
  // Emit the final ArchiveResult record exactly once; later calls are no-ops.
  if (shuttingDown) {
    return;
  }
  shuttingDown = true;
  const record = {
    type: 'ArchiveResult',
    status,
    output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`,
  };
  console.log(JSON.stringify(record));
}
async function handleShutdown(signal) {
  // Flush the final ArchiveResult before tearing the process down on SIGTERM/SIGINT.
  console.error(`\nReceived ${signal}, emitting final results...`);
  emitResult('succeeded');
  try {
    // Detach (not close) so the shared Chrome session keeps running.
    if (browser) browser.disconnect();
  } catch (e) {}
  process.exit(0);
}
async function main() {
const args = parseArgs();
const url = args.url;
@@ -195,31 +224,27 @@ async function main() {
process.exit(0);
}
const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;
try {
// Set up listener BEFORE navigation
await setupListener(url);
const connection = await setupListener(url);
browser = connection.browser;
page = connection.page;
// Wait for chrome_navigate to complete (BLOCKING)
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
// Register signal handlers for graceful shutdown
process.on('SIGTERM', () => handleShutdown('SIGTERM'));
process.on('SIGINT', () => handleShutdown('SIGINT'));
// Count DNS records
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let recordCount = 0;
if (fs.existsSync(outputPath)) {
const content = fs.readFileSync(outputPath, 'utf8');
recordCount = content.split('\n').filter(line => line.trim()).length;
// Wait for chrome_navigate to complete (non-fatal)
try {
const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;
await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
} catch (e) {
console.error(`WARN: ${e.message}`);
}
// Output clean JSONL
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`,
}));
process.exit(0);
// console.error('DNS listener active, waiting for cleanup signal...');
await new Promise(() => {}); // Keep alive until SIGTERM
return;
} catch (e) {
const error = `${e.name}: ${e.message}`;

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--dns" title="DNS"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="12" r="2"/><circle cx="18" cy="6" r="2"/><circle cx="18" cy="18" r="2"/><path d="M8 12h6"/><path d="M16 8l-2 2"/><path d="M16 16l-2-2"/></svg></span>

View File

@@ -52,7 +52,21 @@ const CHROME_SESSION_DIR = '../chrome';
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
if (!fs.existsSync(STATICFILE_DIR)) return false;
const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log');
if (!fs.existsSync(stdoutPath)) return false;
const stdout = fs.readFileSync(stdoutPath, 'utf8');
for (const line of stdout.split('\n')) {
const trimmed = line.trim();
if (!trimmed.startsWith('{')) continue;
try {
const record = JSON.parse(trimmed);
if (record.type === 'ArchiveResult' && record.status === 'succeeded') {
return true;
}
} catch (e) {}
}
return false;
}
// Wait for chrome tab to be fully loaded

View File

@@ -1 +1 @@
🌐
<span class="abx-output-icon abx-output-icon--dom" title="DOM"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 9l-3 3 3 3"/><path d="M16 9l3 3-3 3"/><path d="M10 20l4-16"/></svg></span>

View File

@@ -142,7 +142,7 @@ def test_staticfile_present_skips():
# dom/ <- dom extractor runs here, looks for ../staticfile
staticfile_dir = tmpdir / 'staticfile'
staticfile_dir.mkdir()
(staticfile_dir / 'index.html').write_text('<html>test</html>')
(staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n')
dom_dir = tmpdir / 'dom'
dom_dir.mkdir()

View File

@@ -25,7 +25,8 @@ from abx_pkg import Binary, EnvProvider
@click.option('--binary-id', required=True, help="Dependency UUID")
@click.option('--name', required=True, help="Binary name to find")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
def main(binary_id: str, machine_id: str, name: str, binproviders: str):
@click.option('--overrides', default=None, help="JSON-encoded overrides dict (unused)")
def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None):
"""Check if binary is available in PATH and record it."""
# Check if env provider is allowed

View File

@@ -1 +0,0 @@
"""Tests for the env binary provider plugin."""

View File

@@ -17,7 +17,7 @@ from django.test import TestCase
# Get the path to the env provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_env_provider.py'
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_env_install.py'), None)
class TestEnvProviderHook(TestCase):
@@ -34,7 +34,7 @@ class TestEnvProviderHook(TestCase):
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_finds_python(self):
"""Hook should find python3 binary in PATH."""

View File

@@ -126,7 +126,12 @@ def main(url: str, snapshot_id: str):
try:
# Run extraction
success, output, error = get_favicon(url)
status = 'succeeded' if success else 'failed'
if success:
status = 'succeeded'
elif error == 'No favicon found':
status = 'skipped'
else:
status = 'failed'
except Exception as e:
error = f'{type(e).__name__}: {e}'
@@ -143,7 +148,7 @@ def main(url: str, snapshot_id: str):
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
if __name__ == '__main__':

View File

@@ -1 +1 @@
<span class="abx-output-icon abx-output-icon--favicon" title="Favicon"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 3l2.5 5.5 6 .5-4.5 3.8 1.5 5.7L12 15.5 6.5 18.5 8 12.8 3.5 9l6-.5z"/></svg></span>

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "forum-dl", "binproviders": "pip,env"}

View File

@@ -1,80 +0,0 @@
#!/usr/bin/env python3
"""
Detect forum-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if forum-dl is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True)
forumdl_binary = get_env('FORUMDL_BINARY', 'forum-dl')
if not forumdl_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=forumdl_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='forum-dl')
else:
# Binary not found
output_binary_missing(name='forum-dl', binproviders='pip')
except Exception:
# Binary not found
output_binary_missing(name='forum-dl', binproviders='pip')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python3
"""
Emit forum-dl Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary(name: str, binproviders: str, overrides: dict | None = None):
"""Output Binary JSONL record for a dependency."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'machine_id': machine_id,
}
if overrides:
record['overrides'] = overrides
print(json.dumps(record))
def main():
forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True)
if not forumdl_enabled:
sys.exit(0)
output_binary(
name='forum-dl',
binproviders='pip,env',
overrides={
'pip': {
'packages': [
'--no-deps',
'forum-dl',
'pydantic',
'pydantic-core',
'typing-extensions',
'annotated-types',
'typing-inspection',
'beautifulsoup4',
'soupsieve',
'lxml',
'requests',
'urllib3',
'certifi',
'idna',
'charset-normalizer',
'tenacity',
'python-dateutil',
'six',
'html2text',
'warcio',
]
}
},
)
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -2,7 +2,7 @@
"""
Download forum content from a URL using forum-dl.
Usage: on_Snapshot__forumdl.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__04_forumdl.bg.py --url=<url> --snapshot-id=<uuid>
Output: Downloads forum content to $PWD/
Environment variables:
@@ -19,6 +19,7 @@ import json
import os
import subprocess
import sys
import threading
from pathlib import Path
import rich_click as click
@@ -131,13 +132,41 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
cmd.append(url)
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
print(f'[forumdl] Starting download (timeout={timeout}s)', file=sys.stderr)
output_lines: list[str] = []
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
)
def _read_output() -> None:
if not process.stdout:
return
for line in process.stdout:
output_lines.append(line)
sys.stderr.write(line)
reader = threading.Thread(target=_read_output, daemon=True)
reader.start()
try:
process.wait(timeout=timeout)
except subprocess.TimeoutExpired:
process.kill()
reader.join(timeout=1)
return False, None, f'Timed out after {timeout} seconds'
reader.join(timeout=1)
combined_output = ''.join(output_lines)
# Check if output file was created
if output_file.exists() and output_file.stat().st_size > 0:
return True, str(output_file), ''
else:
stderr = result.stderr
stderr = combined_output
# These are NOT errors - page simply has no downloadable forum content
stderr_lower = stderr.lower()
@@ -147,7 +176,7 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
return True, None, '' # No forum found - success, no output
if 'extractornotfounderror' in stderr_lower:
return True, None, '' # No forum extractor for this URL - success, no output
if result.returncode == 0:
if process.returncode == 0:
return True, None, '' # forum-dl exited cleanly, just no forum - success
# These ARE errors - something went wrong

View File

@@ -1 +1 @@
💬
<span class="abx-output-icon abx-output-icon--forumdl" title="Forum"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 5h16v10H7l-3 3V5z"/></svg></span>

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "gallery-dl", "binproviders": "pip,brew,apt,env"}

View File

@@ -1,80 +0,0 @@
#!/usr/bin/env python3
"""
Detect gallery-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if gallery-dl is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True)
gallerydl_binary = get_env('GALLERYDL_BINARY', 'gallery-dl')
if not gallerydl_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=gallerydl_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='gallery-dl')
else:
# Binary not found
output_binary_missing(name='gallery-dl', binproviders='pip')
except Exception:
# Binary not found
output_binary_missing(name='gallery-dl', binproviders='pip')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Emit gallery-dl Binary dependency for the crawl.

Prints a single Binary JSONL record to stdout unless GALLERYDL_ENABLED is off.
"""
import json
import os
import sys

# Env values recognized as booleans (compared after strip + lowercase).
_TRUTHY = ('true', '1', 'yes', 'on')
_FALSY = ('false', '0', 'no', 'off')


def get_env(name: str, default: str = '') -> str:
    """Return env var `name` with surrounding whitespace stripped, or `default`."""
    raw = os.environ.get(name, default)
    return raw.strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret env var `name` as a boolean flag; unrecognized values fall back to `default`."""
    flag = get_env(name, '').lower()
    if flag in _TRUTHY:
        return True
    if flag in _FALSY:
        return False
    return default


def output_binary(name: str, binproviders: str):
    """Print a Binary JSONL record for a dependency to stdout.

    Args:
        name: Binary name (e.g. 'gallery-dl').
        binproviders: Comma-separated providers that can install it.
    """
    record = {
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))


def main():
    # Respect the global enable flag (defaults to enabled).
    if not get_env_bool('GALLERYDL_ENABLED', True):
        sys.exit(0)
    output_binary(name='gallery-dl', binproviders='pip,brew,apt,env')
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -2,7 +2,7 @@
"""
Download image galleries from a URL using gallery-dl.
Usage: on_Snapshot__gallerydl.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__03_gallerydl.bg.py --url=<url> --snapshot-id=<uuid>
Output: Downloads gallery images to $PWD/gallerydl/
Environment variables:
@@ -19,6 +19,7 @@ import json
import os
import subprocess
import sys
import threading
from pathlib import Path
import rich_click as click
@@ -70,7 +71,22 @@ STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
staticfile_dir = Path(STATICFILE_DIR)
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
if not staticfile_dir.exists():
return False
stdout_log = staticfile_dir / 'stdout.log'
if not stdout_log.exists():
return False
for line in stdout_log.read_text(errors='ignore').splitlines():
line = line.strip()
if not line.startswith('{'):
continue
try:
record = json.loads(line)
except json.JSONDecodeError:
continue
if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded':
return True
return False
def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
@@ -109,7 +125,35 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
cmd.append(url)
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
print(f'[gallerydl] Starting download (timeout={timeout}s)', file=sys.stderr)
output_lines: list[str] = []
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
)
def _read_output() -> None:
if not process.stdout:
return
for line in process.stdout:
output_lines.append(line)
sys.stderr.write(line)
reader = threading.Thread(target=_read_output, daemon=True)
reader.start()
try:
process.wait(timeout=timeout)
except subprocess.TimeoutExpired:
process.kill()
reader.join(timeout=1)
return False, None, f'Timed out after {timeout} seconds'
reader.join(timeout=1)
combined_output = ''.join(output_lines)
# Check if any gallery files were downloaded (search recursively)
gallery_extensions = (
@@ -132,7 +176,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
output = str(image_files[0]) if image_files else str(downloaded_files[0])
return True, output, ''
else:
stderr = result.stderr
stderr = combined_output
# These are NOT errors - page simply has no downloadable gallery
# Return success with no output (legitimate "nothing to download")
@@ -141,7 +185,7 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
return True, None, '' # Not a gallery site - success, no output
if 'no results' in stderr_lower:
return True, None, '' # No gallery found - success, no output
if result.returncode == 0:
if process.returncode == 0:
return True, None, '' # gallery-dl exited cleanly, just no gallery - success
# These ARE errors - something went wrong

View File

@@ -1 +1 @@
🖼️
<span class="abx-output-icon abx-output-icon--gallerydl" title="Gallery"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="5" width="18" height="14" rx="2"/><circle cx="8" cy="10" r="1.5" fill="currentColor" stroke="none"/><path d="M21 17l-5-5-5 5"/></svg></span>

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "git", "binproviders": "apt,brew,env"}

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Emit git Binary dependency for the crawl.

Prints a single Binary JSONL record to stdout unless GIT_ENABLED is off.
"""
import json
import os
import sys

# Env values recognized as booleans (compared after strip + lowercase).
_TRUTHY = ('true', '1', 'yes', 'on')
_FALSY = ('false', '0', 'no', 'off')


def get_env(name: str, default: str = '') -> str:
    """Return env var `name` with surrounding whitespace stripped, or `default`."""
    raw = os.environ.get(name, default)
    return raw.strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret env var `name` as a boolean flag; unrecognized values fall back to `default`."""
    flag = get_env(name, '').lower()
    if flag in _TRUTHY:
        return True
    if flag in _FALSY:
        return False
    return default


def output_binary(name: str, binproviders: str):
    """Print a Binary JSONL record for a dependency to stdout.

    Args:
        name: Binary name (e.g. 'git').
        binproviders: Comma-separated providers that can install it.
    """
    record = {
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))


def main():
    # Respect the global enable flag (defaults to enabled).
    if not get_env_bool('GIT_ENABLED', True):
        sys.exit(0)
    output_binary(name='git', binproviders='apt,brew,env')
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -1,80 +0,0 @@
#!/usr/bin/env python3
"""
Detect git binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if git is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
git_enabled = get_env_bool('GIT_ENABLED', True)
git_binary = get_env('GIT_BINARY', 'git')
if not git_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=git_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='git')
else:
# Binary not found
output_binary_missing(name='git', binproviders='apt,brew')
except Exception:
# Binary not found
output_binary_missing(name='git', binproviders='apt,brew')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -2,7 +2,7 @@
"""
Clone a git repository from a URL.
Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__05_git.bg.py --url=<url> --snapshot-id=<uuid>
Output: Clones repository to $PWD/repo
Environment variables:

View File

@@ -1 +1 @@
📂
<span class="abx-output-icon abx-output-icon--git" title="Git"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="6" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="12" r="2"/><path d="M8 6h5a3 3 0 0 1 3 3v1"/><path d="M8 18h5a3 3 0 0 0 3-3v-1"/></svg></span>

View File

@@ -1 +1 @@
📋
<span class="abx-output-icon abx-output-icon--headers" title="Headers"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="4" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="12" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="17" r="1" fill="currentColor" stroke="none"/><path d="M7 7h13"/><path d="M7 12h13"/><path d="M7 17h13"/></svg></span>

View File

@@ -76,22 +76,28 @@ def find_html_source() -> str | None:
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
search_patterns = [
'singlefile/singlefile.html',
'*_singlefile/singlefile.html',
'singlefile/*.html',
'*_singlefile/*.html',
'dom/output.html',
'*_dom/output.html',
'dom/*.html',
'*_dom/*.html',
'wget/**/*.html',
'*_wget/**/*.html',
'wget/**/*.htm',
'*_wget/**/*.htm',
]
cwd = Path.cwd()
for pattern in search_patterns:
matches = list(cwd.glob(pattern))
for match in matches:
if match.is_file() and match.stat().st_size > 0:
try:
return match.read_text(errors='ignore')
except Exception:
continue
for base in (Path.cwd(), Path.cwd().parent):
for pattern in search_patterns:
matches = list(base.glob(pattern))
for match in matches:
if match.is_file() and match.stat().st_size > 0:
try:
return match.read_text(errors='ignore')
except Exception:
continue
return None

View File

@@ -1 +1 @@
📃
<span class="abx-output-icon abx-output-icon--htmltotext" title="HTML to Text"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 7h16"/><path d="M4 12h12"/><path d="M4 17h14"/></svg></span>

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--infiniscroll" title="Infinite Scroll"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 5v9"/><path d="M8 10l4 4 4-4"/><circle cx="6" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="18" cy="19" r="1" fill="currentColor" stroke="none"/></svg></span>

View File

@@ -7,7 +7,7 @@
*
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
*
* Priority: 02 (early) - Must install before Chrome session starts at Crawl level
* Priority: 81 - Must install before Chrome session starts at Crawl level
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* This extension automatically:

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "postlight-parser", "binproviders": "npm,env", "overrides": {"npm": {"packages": ["@postlight/parser"]}}}

View File

@@ -1,85 +0,0 @@
#!/usr/bin/env python3
"""
Detect postlight-parser binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if postlight-parser is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'overrides': {
'npm': {
'packages': ['@postlight/parser'],
}
},
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
mercury_binary = get_env('MERCURY_BINARY', 'postlight-parser')
if not mercury_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=mercury_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='postlight-parser')
else:
# Binary not found
output_binary_missing(name='postlight-parser', binproviders='npm')
except Exception:
# Binary not found
output_binary_missing(name='postlight-parser', binproviders='npm')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python3
"""
Emit postlight-parser Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary(name: str, binproviders: str):
"""Output Binary JSONL record for a dependency."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'overrides': {
'npm': {
'packages': ['@postlight/parser'],
}
},
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
if not mercury_enabled:
sys.exit(0)
output_binary(name='postlight-parser', binproviders='npm,env')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -1 +1 @@
☿️
<span class="abx-output-icon abx-output-icon--mercury" title="Mercury"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="5" width="18" height="14" rx="2"/><path d="M7 9h6"/><path d="M7 13h10"/><path d="M15 9h3"/></svg></span>

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--merkletree" title="Merkle Tree"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="5" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="18" r="2"/><path d="M12 7v6"/><path d="M12 13l-4 3"/><path d="M12 13l4 3"/></svg></span>

View File

@@ -1 +0,0 @@
"""Tests for the merkletree plugin."""

View File

@@ -287,7 +287,7 @@ async function main() {
page = pages[pages.length - 1];
}
console.error(`Modalcloser listening on ${url}`);
// console.error(`Modalcloser listening on ${url}`);
// Set up dialog handler (for JS alert/confirm/prompt/beforeunload)
page.on('dialog', async (dialog) => {

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--modalcloser" title="Modal Closer"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="4" y="4" width="16" height="16" rx="3"/><path d="M9 9l6 6"/><path d="M15 9l-6 6"/></svg></span>

View File

@@ -90,30 +90,34 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
}
print(json.dumps(record))
# Emit PATH update if npm bin dir not already in PATH
npm_bin_dir = str(npm_prefix / 'bin')
# Emit PATH update for npm bin dirs (node_modules/.bin preferred)
npm_bin_dirs = [
str(npm_prefix / 'node_modules' / '.bin'),
str(npm_prefix / 'bin'),
]
current_path = os.environ.get('PATH', '')
path_dirs = current_path.split(':') if current_path else []
new_path = current_path
# Check if npm_bin_dir is already in PATH
path_dirs = current_path.split(':')
if npm_bin_dir not in path_dirs:
# Prepend npm_bin_dir to PATH
new_path = f"{npm_bin_dir}:{current_path}" if current_path else npm_bin_dir
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/PATH',
'value': new_path,
}))
click.echo(f" Added {npm_bin_dir} to PATH", err=True)
for npm_bin_dir in npm_bin_dirs:
if npm_bin_dir and npm_bin_dir not in path_dirs:
new_path = f"{npm_bin_dir}:{new_path}" if new_path else npm_bin_dir
path_dirs.insert(0, npm_bin_dir)
print(json.dumps({
'type': 'Machine',
'config': {
'PATH': new_path,
},
}))
# Also emit NODE_MODULES_DIR for JS module resolution
node_modules_dir = str(npm_prefix / 'node_modules')
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/NODE_MODULES_DIR',
'value': node_modules_dir,
'config': {
'NODE_MODULES_DIR': node_modules_dir,
},
}))
# Log human-readable info to stderr

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
"""
Emit node/npm Binary dependencies for the crawl.
This hook runs early in the Crawl lifecycle so node/npm are installed
before any npm-based extractors (e.g., puppeteer) run.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def output_binary(name: str, binproviders: str, overrides: dict | None = None) -> None:
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'machine_id': machine_id,
}
if overrides:
record['overrides'] = overrides
print(json.dumps(record))
def main() -> None:
output_binary(
name='node',
binproviders='apt,brew,env',
overrides={'apt': {'packages': ['nodejs']}},
)
output_binary(
name='npm',
binproviders='apt,brew,env',
overrides={
'apt': {'packages': ['nodejs', 'npm']},
'brew': {'packages': ['node']},
},
)
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -1 +0,0 @@
"""Tests for the npm binary provider plugin."""

View File

@@ -22,7 +22,7 @@ from django.test import TestCase
# Get the path to the npm provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_npm_provider.py'
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_npm_install.py'), None)
def npm_available() -> bool:
@@ -45,7 +45,7 @@ class TestNpmProviderHook(TestCase):
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_requires_lib_dir(self):
"""Hook should fail when LIB_DIR is not set."""

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "papers-dl", "binproviders": "pip,env"}

View File

@@ -1,80 +0,0 @@
#!/usr/bin/env python3
"""
Detect papers-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if papers-dl is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True)
papersdl_binary = get_env('PAPERSDL_BINARY', 'papers-dl')
if not papersdl_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=papersdl_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='papers-dl')
else:
# Binary not found
output_binary_missing(name='papers-dl', binproviders='pip')
except Exception:
# Binary not found
output_binary_missing(name='papers-dl', binproviders='pip')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python3
"""
Emit papers-dl Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary(name: str, binproviders: str):
"""Output Binary JSONL record for a dependency."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True)
if not papersdl_enabled:
sys.exit(0)
output_binary(name='papers-dl', binproviders='pip,env')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -23,6 +23,7 @@ import os
import re
import subprocess
import sys
import threading
from pathlib import Path
import rich_click as click
@@ -108,7 +109,35 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
cmd.extend(papersdl_args_extra)
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
print(f'[papersdl] Starting download (timeout={timeout}s)', file=sys.stderr)
output_lines: list[str] = []
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
)
def _read_output() -> None:
if not process.stdout:
return
for line in process.stdout:
output_lines.append(line)
sys.stderr.write(line)
reader = threading.Thread(target=_read_output, daemon=True)
reader.start()
try:
process.wait(timeout=timeout)
except subprocess.TimeoutExpired:
process.kill()
reader.join(timeout=1)
return False, None, f'Timed out after {timeout} seconds'
reader.join(timeout=1)
combined_output = ''.join(output_lines)
# Check if any PDF files were downloaded
pdf_files = list(output_dir.glob('*.pdf'))
@@ -117,8 +146,8 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
# Return first PDF file
return True, str(pdf_files[0]), ''
else:
stderr = result.stderr
stdout = result.stdout
stderr = combined_output
stdout = combined_output
# These are NOT errors - page simply has no downloadable paper
stderr_lower = stderr.lower()
@@ -127,7 +156,7 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
return True, None, '' # Paper not available - success, no output
if 'no results' in stderr_lower or 'no results' in stdout_lower:
return True, None, '' # No paper found - success, no output
if result.returncode == 0:
if process.returncode == 0:
return True, None, '' # papers-dl exited cleanly, just no paper - success
# These ARE errors - something went wrong

View File

@@ -1 +1 @@
📄
<span class="abx-output-icon abx-output-icon--papersdl" title="Papers"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M14 3H6a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V9z"/><path d="M14 3v6h6"/><path d="M12 12v5"/><path d="M9.5 14.5L12 17l2.5-2.5"/></svg></span>

View File

@@ -193,6 +193,9 @@ async function extractOutlinks(url) {
type: 'Snapshot',
url: href,
plugin: PLUGIN_NAME,
depth: depth + 1,
parent_snapshot_id: snapshotId || undefined,
crawl_id: crawlId || undefined,
})).join('\n');
if (urlsJsonl) {
@@ -214,6 +217,8 @@ async function main() {
const args = parseArgs();
const url = args.url;
const snapshotId = args.snapshot_id;
const crawlId = args.crawl_id || process.env.CRAWL_ID;
const depth = parseInt(args.depth || process.env.SNAPSHOT_DEPTH || '0', 10) || 0;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__75_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');

View File

@@ -1 +1 @@
🔗
<span class="abx-output-icon abx-output-icon--parse_dom_outlinks" title="Outlinks"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M10 13a4 4 0 0 1 0-6l2-2a4 4 0 0 1 6 6l-1 1"/><path d="M14 11a4 4 0 0 1 0 6l-2 2a4 4 0 0 1-6-6l1-1"/></svg></span>

View File

@@ -1 +0,0 @@
"""Tests for the parse_dom_outlinks plugin."""

View File

@@ -79,8 +79,7 @@ class TestParseDomOutlinksWithChrome(TestCase):
# Run outlinks hook with the active Chrome session
result = subprocess.run(
['node', str(OUTLINKS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,

View File

@@ -24,14 +24,15 @@ from datetime import datetime, timezone
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from urllib.parse import urljoin, urlparse
from urllib.parse import urljoin, urlparse, urlunparse
import rich_click as click
PLUGIN_NAME = 'parse_html_urls'
# Check if parse_dom_outlinks extractor already ran
DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
# Check if parse_dom_outlinks extractor already ran (sibling plugin output dir)
DOM_OUTLINKS_URLS_FILE = Path('..') / 'parse_dom_outlinks' / 'urls.jsonl'
URLS_FILE = Path('urls.jsonl')
# URL regex from archivebox/misc/util.py
@@ -95,8 +96,9 @@ def fix_urljoin_bug(url: str, nesting_limit=5) -> str:
def normalize_url(url: str, root_url: str = None) -> str:
"""Normalize a URL, resolving relative paths if root_url provided."""
url = clean_url_candidate(url)
if not root_url:
return url
return _normalize_trailing_slash(url)
url_is_absolute = url.lower().startswith('http://') or url.lower().startswith('https://')
@@ -110,7 +112,40 @@ def normalize_url(url: str, root_url: str = None) -> str:
if did_urljoin_misbehave(root_url, url, resolved):
resolved = fix_urljoin_bug(resolved)
return resolved
return _normalize_trailing_slash(resolved)
def _normalize_trailing_slash(url: str) -> str:
"""Drop trailing slash for non-root paths when no query/fragment."""
try:
parsed = urlparse(url)
path = parsed.path or ''
if path != '/' and path.endswith('/') and not parsed.query and not parsed.fragment:
path = path.rstrip('/')
return urlunparse((parsed.scheme, parsed.netloc, path, parsed.params, parsed.query, parsed.fragment))
except Exception:
pass
return url
def clean_url_candidate(url: str) -> str:
"""Strip obvious surrounding/trailing punctuation from extracted URLs."""
cleaned = (url or '').strip()
if not cleaned:
return cleaned
# Strip common wrappers
cleaned = cleaned.strip(' \t\r\n')
cleaned = cleaned.strip('"\''"'"'<>[]()')
# Strip trailing punctuation and escape artifacts
cleaned = cleaned.rstrip('.,;:!?)\\\'"')
cleaned = cleaned.rstrip('"')
# Strip leading punctuation artifacts
cleaned = cleaned.lstrip('("'\''<')
return cleaned
def fetch_content(url: str) -> str:
@@ -131,6 +166,43 @@ def fetch_content(url: str) -> str:
return response.read().decode('utf-8', errors='replace')
def find_html_sources() -> list[str]:
"""Find HTML content from other extractors in the snapshot directory."""
search_patterns = [
'readability/content.html',
'*_readability/content.html',
'mercury/content.html',
'*_mercury/content.html',
'singlefile/singlefile.html',
'*_singlefile/singlefile.html',
'singlefile/*.html',
'*_singlefile/*.html',
'dom/output.html',
'*_dom/output.html',
'dom/*.html',
'*_dom/*.html',
'wget/**/*.html',
'*_wget/**/*.html',
'wget/**/*.htm',
'*_wget/**/*.htm',
'wget/**/*.htm*',
'*_wget/**/*.htm*',
]
sources: list[str] = []
for base in (Path.cwd(), Path.cwd().parent):
for pattern in search_patterns:
for match in base.glob(pattern):
if not match.is_file() or match.stat().st_size == 0:
continue
try:
sources.append(match.read_text(errors='ignore'))
except Exception:
continue
return sources
@click.command()
@click.option('--url', required=True, help='HTML URL to parse')
@click.option('--snapshot-id', required=False, help='Parent Snapshot UUID')
@@ -138,6 +210,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse HTML and extract href URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
# If parse_dom_outlinks ran but found nothing, we still try static HTML parsing as fallback
@@ -145,32 +224,38 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
click.echo(f'Skipping parse_html_urls - parse_dom_outlinks already extracted URLs')
sys.exit(0)
try:
content = fetch_content(url)
except Exception as e:
click.echo(f'Failed to fetch {url}: {e}', err=True)
sys.exit(1)
# Parse HTML for hrefs
parser = HrefParser()
try:
parser.feed(content)
except Exception as e:
click.echo(f'Failed to parse HTML: {e}', err=True)
sys.exit(1)
contents = find_html_sources()
if not contents:
try:
contents = [fetch_content(url)]
except Exception as e:
click.echo(f'Failed to fetch {url}: {e}', err=True)
sys.exit(1)
urls_found = set()
for href in parser.urls:
# Normalize URL
normalized = normalize_url(href, root_url=url)
for content in contents:
# Parse HTML for hrefs
parser = HrefParser()
try:
parser.feed(content)
except Exception:
pass
# Only include http/https URLs
if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
# Skip the source URL itself
if normalized != url:
urls_found.add(unescape(normalized))
for href in parser.urls:
normalized = normalize_url(href, root_url=url)
if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
if normalized != url:
urls_found.add(unescape(normalized))
# Emit Snapshot records to stdout (JSONL)
# Also capture explicit URLs in the HTML text
for match in URL_REGEX.findall(content):
normalized = normalize_url(match, root_url=url)
if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
if normalized != url:
urls_found.add(unescape(normalized))
# Emit Snapshot records to stdout (JSONL) and urls.jsonl for crawl system
records = []
for found_url in sorted(urls_found):
record = {
'type': 'Snapshot',
@@ -183,8 +268,12 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
if crawl_id:
record['crawl_id'] = crawl_id
records.append(record)
print(json.dumps(record))
if records:
URLS_FILE.write_text('\n'.join(json.dumps(r) for r in records) + '\n')
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'

View File

@@ -1 +1 @@
🔗
<span class="abx-output-icon abx-output-icon--parse_html_urls" title="HTML URLs"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 9l-3 3 3 3"/><path d="M16 9l3 3-3 3"/><path d="M10 20l4-16"/></svg></span>

View File

@@ -132,6 +132,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse JSONL bookmark file and extract URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
try:
content = fetch_content(url)

View File

@@ -1 +1 @@
📋
<span class="abx-output-icon abx-output-icon--parse_jsonl_urls" title="JSONL URLs"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 4H5v16h3"/><path d="M16 4h3v16h-3"/><circle cx="12" cy="8" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="12" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="16" r="1" fill="currentColor" stroke="none"/></svg></span>

View File

@@ -168,6 +168,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse Netscape bookmark HTML and extract URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
try:
content = fetch_content(url)

View File

@@ -1 +1 @@
🔖
<span class="abx-output-icon abx-output-icon--parse_netscape_urls" title="Netscape Bookmarks"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M6 4h12v16l-6-4-6 4z"/></svg></span>

View File

@@ -56,6 +56,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse RSS/Atom feed and extract article URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
if feedparser is None:
click.echo('feedparser library not installed', err=True)

View File

@@ -1 +1 @@
📡
<span class="abx-output-icon abx-output-icon--parse_rss_urls" title="RSS"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="5" cy="19" r="1.5" fill="currentColor" stroke="none"/><path d="M5 11a8 8 0 0 1 8 8"/><path d="M5 5a14 14 0 0 1 14 14"/></svg></span>

View File

@@ -105,6 +105,13 @@ def fetch_content(url: str) -> str:
@click.option('--depth', type=int, default=0, help='Current depth level')
def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
"""Parse plain text and extract URLs."""
env_depth = os.environ.get('SNAPSHOT_DEPTH')
if env_depth is not None:
try:
depth = int(env_depth)
except Exception:
pass
crawl_id = crawl_id or os.environ.get('CRAWL_ID')
try:
content = fetch_content(url)

View File

@@ -1 +1 @@
📃
<span class="abx-output-icon abx-output-icon--parse_txt_urls" title="Text URLs"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M6 3h8l4 4v14H6z"/><path d="M14 3v5h5"/><path d="M8 12h8"/><path d="M8 16h6"/></svg></span>

View File

@@ -52,7 +52,21 @@ const CHROME_SESSION_DIR = '../chrome';
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
if (!fs.existsSync(STATICFILE_DIR)) return false;
const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log');
if (!fs.existsSync(stdoutPath)) return false;
const stdout = fs.readFileSync(stdoutPath, 'utf8');
for (const line of stdout.split('\n')) {
const trimmed = line.trim();
if (!trimmed.startsWith('{')) continue;
try {
const record = JSON.parse(trimmed);
if (record.type === 'ArchiveResult' && record.status === 'succeeded') {
return true;
}
} catch (e) {}
}
return false;
}
// Wait for chrome tab to be fully loaded

View File

@@ -1 +1 @@
📄
<span class="abx-output-icon abx-output-icon--pdf" title="PDF"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M14 3H6a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V9z"/><path d="M14 3v6h6"/><rect x="8" y="12" width="8" height="4" rx="1"/></svg></span>

View File

@@ -11,6 +11,8 @@ Environment variables:
import json
import os
import shutil
import subprocess
import sys
from pathlib import Path
@@ -46,6 +48,26 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override
# Structure: lib/arm64-darwin/pip/venv (PipProvider will create venv automatically)
pip_venv_path = Path(lib_dir) / 'pip' / 'venv'
pip_venv_path.parent.mkdir(parents=True, exist_ok=True)
venv_python = pip_venv_path / 'bin' / 'python'
# Prefer a stable system python for venv creation if provided/available
preferred_python = os.environ.get('PIP_VENV_PYTHON', '').strip()
if not preferred_python:
for candidate in ('python3.12', 'python3.11', 'python3.10'):
if shutil.which(candidate):
preferred_python = candidate
break
if preferred_python and not venv_python.exists():
try:
subprocess.run(
[preferred_python, '-m', 'venv', str(pip_venv_path), '--upgrade-deps'],
check=True,
capture_output=True,
text=True,
)
except Exception:
# Fall back to PipProvider-managed venv creation
pass
# Use abx-pkg PipProvider to install binary with custom venv
provider = PipProvider(pip_venv=pip_venv_path)
@@ -87,22 +109,21 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override
}
print(json.dumps(record))
# Emit PATH update if pip bin dir not already in PATH
# Emit PATH update for pip bin dir
pip_bin_dir = str(pip_venv_path / 'bin')
current_path = os.environ.get('PATH', '')
# Check if pip_bin_dir is already in PATH
path_dirs = current_path.split(':')
if pip_bin_dir not in path_dirs:
# Prepend pip_bin_dir to PATH
new_path = f"{pip_bin_dir}:{current_path}" if current_path else pip_bin_dir
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/PATH',
'value': new_path,
}))
click.echo(f" Added {pip_bin_dir} to PATH", err=True)
new_path = f"{pip_bin_dir}:{current_path}" if current_path else pip_bin_dir
if pip_bin_dir in path_dirs:
new_path = current_path
print(json.dumps({
'type': 'Machine',
'config': {
'PATH': new_path,
},
}))
# Log human-readable info to stderr
click.echo(f"Installed {name} at {binary.abspath}", err=True)

View File

@@ -1 +0,0 @@
"""Tests for the pip binary provider plugin."""

View File

@@ -22,7 +22,7 @@ from django.test import TestCase
# Get the path to the pip provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = PLUGIN_DIR / 'on_Binary__install_using_pip_provider.py'
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_pip_install.py'), None)
class TestPipProviderHook(TestCase):
@@ -33,6 +33,10 @@ class TestPipProviderHook(TestCase):
self.temp_dir = tempfile.mkdtemp()
self.output_dir = Path(self.temp_dir) / 'output'
self.output_dir.mkdir()
self.lib_dir = Path(self.temp_dir) / 'lib' / 'x86_64-linux'
self.lib_dir.mkdir(parents=True, exist_ok=True)
self.lib_dir = Path(self.temp_dir) / 'lib' / 'x86_64-linux'
self.lib_dir.mkdir(parents=True, exist_ok=True)
def tearDown(self):
"""Clean up."""
@@ -41,7 +45,7 @@ class TestPipProviderHook(TestCase):
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_help(self):
"""Hook should accept --help without error."""
@@ -55,16 +59,19 @@ class TestPipProviderHook(TestCase):
# At minimum should not crash with Python error
self.assertNotIn('Traceback', result.stderr)
def test_hook_finds_python(self):
"""Hook should find Python binary."""
def test_hook_finds_pip(self):
"""Hook should find pip binary."""
env = os.environ.copy()
env['DATA_DIR'] = self.temp_dir
env['LIB_DIR'] = str(self.lib_dir)
result = subprocess.run(
[
sys.executable, str(INSTALL_HOOK),
'--name=python3',
'--binproviders=pip,env',
'--name=pip',
'--binproviders=pip',
'--binary-id=test-uuid',
'--machine-id=test-machine',
],
capture_output=True,
text=True,
@@ -80,7 +87,7 @@ class TestPipProviderHook(TestCase):
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'Binary' and record.get('name') == 'python3':
if record.get('type') == 'Binary' and record.get('name') == 'pip':
jsonl_found = True
# Verify structure
self.assertIn('abspath', record)
@@ -92,19 +99,22 @@ class TestPipProviderHook(TestCase):
# Should not crash
self.assertNotIn('Traceback', result.stderr)
# Should find python3 via pip or env provider
self.assertTrue(jsonl_found, "Expected to find python3 binary in JSONL output")
# Should find pip via pip provider
self.assertTrue(jsonl_found, "Expected to find pip binary in JSONL output")
def test_hook_unknown_package(self):
"""Hook should handle unknown packages gracefully."""
env = os.environ.copy()
env['DATA_DIR'] = self.temp_dir
env['LIB_DIR'] = str(self.lib_dir)
result = subprocess.run(
[
sys.executable, str(INSTALL_HOOK),
'--name=nonexistent_package_xyz123',
'--binproviders=pip',
'--binary-id=test-uuid',
'--machine-id=test-machine',
],
capture_output=True,
text=True,
@@ -148,6 +158,8 @@ class TestPipProviderIntegration(TestCase):
sys.executable, str(INSTALL_HOOK),
'--name=pip',
'--binproviders=pip,env',
'--binary-id=test-uuid',
'--machine-id=test-machine',
],
capture_output=True,
text=True,

View File

@@ -0,0 +1 @@
# Plugin namespace for puppeteer utilities.

View File

@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Install Chromium via the Puppeteer CLI.
Usage: on_Binary__12_puppeteer_install.py --binary-id=<uuid> --machine-id=<uuid> --name=<name>
Output: Binary JSONL record to stdout after installation
"""
import json
import os
import re
import sys
from pathlib import Path
import rich_click as click
from abx_pkg import Binary, EnvProvider, NpmProvider, BinProviderOverrides
# Fix pydantic forward reference issue
NpmProvider.model_rebuild()
@click.command()
@click.option('--machine-id', required=True, help='Machine UUID')
@click.option('--binary-id', required=True, help='Binary UUID')
@click.option('--name', required=True, help='Binary name to install')
@click.option('--binproviders', default='*', help='Allowed providers (comma-separated)')
@click.option('--overrides', default=None, help='JSON-encoded overrides dict')
def main(machine_id: str, binary_id: str, name: str, binproviders: str, overrides: str | None) -> None:
    """Install Chromium via the Puppeteer CLI and emit JSONL records.

    On success, prints a Binary record for the installed Chromium plus a
    Machine config patch (CHROME_BINARY / CHROMIUM_VERSION). Exits 0 silently
    when this hook does not apply; exits 1 on any installation failure.
    """
    # Skip unless the puppeteer provider is allowed. Tolerate whitespace in the
    # comma-separated list (e.g. "npm, puppeteer") instead of failing silently.
    if binproviders != '*':
        allowed = {provider.strip() for provider in binproviders.split(',')}
        if 'puppeteer' not in allowed:
            sys.exit(0)

    # This hook only knows how to install Chromium/Chrome.
    if name not in ('chromium', 'chrome'):
        sys.exit(0)

    lib_dir = os.environ.get('LIB_DIR', '').strip()
    if not lib_dir:
        click.echo('ERROR: LIB_DIR environment variable not set', err=True)
        sys.exit(1)

    # Resolve (or install) the puppeteer CLI under $LIB_DIR/npm, falling back
    # to any copy already on $PATH.
    npm_prefix = Path(lib_dir) / 'npm'
    npm_prefix.mkdir(parents=True, exist_ok=True)
    npm_provider = NpmProvider(npm_prefix=npm_prefix)

    # Puppeteer downloads browsers into PUPPETEER_CACHE_DIR; keep it inside
    # LIB_DIR unless the caller already set one.
    cache_dir = Path(lib_dir) / 'puppeteer'
    cache_dir.mkdir(parents=True, exist_ok=True)
    os.environ.setdefault('PUPPETEER_CACHE_DIR', str(cache_dir))

    puppeteer_binary = Binary(
        name='puppeteer',
        binproviders=[npm_provider, EnvProvider()],
        overrides={'npm': {'packages': ['puppeteer']}},
    ).load()
    if not puppeteer_binary.abspath:
        click.echo('ERROR: puppeteer binary not found (install puppeteer first)', err=True)
        sys.exit(1)

    # --overrides may replace the default install args (see _parse_override_packages).
    install_args = _parse_override_packages(overrides, default=['chromium@latest', '--install-deps'])
    cmd = ['browsers', 'install', *install_args]
    proc = puppeteer_binary.exec(cmd=cmd, timeout=300)
    if proc.returncode != 0:
        click.echo(proc.stdout.strip(), err=True)
        click.echo(proc.stderr.strip(), err=True)
        click.echo(f'ERROR: puppeteer install failed ({proc.returncode})', err=True)
        sys.exit(1)

    # The CLI prints the install path on success; _load_chromium_binary also
    # scans the puppeteer cache dirs as a fallback.
    chromium_binary = _load_chromium_binary(proc.stdout + '\n' + proc.stderr)
    if not chromium_binary or not chromium_binary.abspath:
        click.echo('ERROR: failed to locate Chromium after install', err=True)
        sys.exit(1)

    _emit_chromium_binary_record(
        binary=chromium_binary,
        machine_id=machine_id,
        binary_id=binary_id,
    )

    # Persist the resolved Chromium path/version into the Machine config.
    config_patch = {
        'CHROME_BINARY': str(chromium_binary.abspath),
        'CHROMIUM_VERSION': str(chromium_binary.version) if chromium_binary.version else '',
    }
    print(json.dumps({
        'type': 'Machine',
        'config': config_patch,
    }))
    sys.exit(0)
def _parse_override_packages(overrides: str | None, default: list[str]) -> list[str]:
if not overrides:
return default
try:
overrides_dict = json.loads(overrides)
except json.JSONDecodeError:
return default
if isinstance(overrides_dict, dict):
provider_overrides = overrides_dict.get('puppeteer')
if isinstance(provider_overrides, dict):
packages = provider_overrides.get('packages')
if isinstance(packages, list) and packages:
return [str(arg) for arg in packages]
if isinstance(provider_overrides, list) and provider_overrides:
return [str(arg) for arg in provider_overrides]
if isinstance(overrides_dict, list) and overrides_dict:
return [str(arg) for arg in overrides_dict]
return default
def _emit_chromium_binary_record(binary: Binary, machine_id: str, binary_id: str) -> None:
record = {
'type': 'Binary',
'name': 'chromium',
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'puppeteer',
'machine_id': machine_id,
'binary_id': binary_id,
}
print(json.dumps(record))
def _load_chromium_binary(output: str) -> Binary | None:
    """Locate the Chromium executable downloaded by `puppeteer browsers install`.

    Candidates are gathered in priority order:
      1. The path printed by the puppeteer CLI (e.g. "chrome@131 /path/to/chrome").
      2. Recursive scans of known puppeteer cache dirs ($PUPPETEER_CACHE_DIR,
         ~/.cache/puppeteer on Linux, ~/Library/Caches/puppeteer on macOS)
         for macOS app-bundle executables and plain `chrome` binaries.

    Returns the first candidate that abx_pkg successfully load()s with a
    resolved abspath, or None if nothing usable is found.
    """
    candidates: list[Path] = []
    # 1) Parse the install path out of the CLI output, e.g. "chromium@1234 /path".
    match = re.search(r'(?:chromium|chrome)@[^\s]+\s+(\S+)', output)
    if match:
        candidates.append(Path(match.group(1)))
    cache_dirs: list[Path] = []
    cache_env = os.environ.get('PUPPETEER_CACHE_DIR')
    if cache_env:
        cache_dirs.append(Path(cache_env))
    home = Path.home()
    # Default puppeteer cache locations (Linux, then macOS).
    cache_dirs.extend([
        home / '.cache' / 'puppeteer',
        home / 'Library' / 'Caches' / 'puppeteer',
    ])
    # 2) Best-effort scan: rglob may raise on nonexistent/unreadable roots,
    #    which we treat as "no candidates from this root".
    for base in cache_dirs:
        for root in (base, base / 'chromium', base / 'chrome'):
            try:
                candidates.extend(root.rglob('Chromium.app/Contents/MacOS/Chromium'))
            except Exception:
                pass
            try:
                candidates.extend(root.rglob('chrome'))
            except Exception:
                pass
    # Return the first candidate that loads cleanly; load() failures (bad path,
    # unparsable version, etc.) just move on to the next candidate.
    for candidate in candidates:
        try:
            binary = Binary(
                name='chromium',
                binproviders=[EnvProvider()],
                overrides={'env': {'abspath': str(candidate)}},
            ).load()
        except Exception:
            continue
        if binary.abspath:
            return binary
    return None
# CLI entry point: this hook script is executed directly by the hook runner.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env python3
"""
Emit Puppeteer Binary dependency for the crawl.
"""
import json
import os
import sys
def main() -> None:
    """Emit a Binary JSONL record declaring puppeteer as a crawl dependency.

    Honors PUPPETEER_ENABLED (enabled unless set to false/0/no/off);
    always exits 0.
    """
    disabled = os.environ.get('PUPPETEER_ENABLED', 'true').lower() in ('false', '0', 'no', 'off')
    if disabled:
        sys.exit(0)

    print(json.dumps({
        'type': 'Binary',
        'name': 'puppeteer',
        'binproviders': 'npm,env',
        'overrides': {
            'npm': {
                'packages': ['puppeteer'],
            }
        },
    }))
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,124 @@
"""Integration tests for puppeteer plugin."""
import json
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
)
PLUGIN_DIR = get_plugin_dir(__file__)
CRAWL_HOOK = get_hook_script(PLUGIN_DIR, 'on_Crawl__*_puppeteer_install.py')
BINARY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Binary__*_puppeteer_install.py')
NPM_BINARY_HOOK = PLUGIN_DIR.parent / 'npm' / 'on_Binary__10_npm_install.py'
def test_hook_scripts_exist():
    """Both puppeteer hook scripts must be discoverable in the plugin dir."""
    for hook in (CRAWL_HOOK, BINARY_HOOK):
        assert hook and hook.exists(), f"Hook not found: {hook}"
def test_crawl_hook_emits_puppeteer_binary():
    """Running the crawl hook should print a puppeteer Binary JSONL record."""
    with tempfile.TemporaryDirectory() as tmpdir:
        proc = subprocess.run(
            [sys.executable, str(CRAWL_HOOK)],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=os.environ.copy(),
            timeout=30,
        )
        assert proc.returncode == 0, f"crawl hook failed: {proc.stderr}"

        emitted = [
            json.loads(line)
            for line in proc.stdout.splitlines()
            if line.strip().startswith('{')
        ]
        puppeteer_binaries = [
            rec for rec in emitted
            if rec.get('type') == 'Binary' and rec.get('name') == 'puppeteer'
        ]
        assert puppeteer_binaries, f"Expected Binary record for puppeteer, got: {emitted}"
        assert 'npm' in puppeteer_binaries[0].get('binproviders', ''), "puppeteer should be installable via npm provider"
@pytest.mark.skipif(shutil.which('npm') is None, reason='npm is required for puppeteer installation')
def test_puppeteer_installs_chromium():
    """End-to-end: crawl hook declares puppeteer, npm installs it, binary hook installs Chromium.

    NOTE(review): downloads the puppeteer npm package and a Chromium build, so
    this is a slow, network-dependent integration test.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        lib_dir = tmpdir / 'lib' / 'arm64-darwin'
        lib_dir.mkdir(parents=True, exist_ok=True)
        env = os.environ.copy()
        env['LIB_DIR'] = str(lib_dir)
        # Step 1: crawl hook emits the puppeteer Binary dependency record.
        crawl_result = subprocess.run(
            [sys.executable, str(CRAWL_HOOK)],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        assert crawl_result.returncode == 0, f"crawl hook failed: {crawl_result.stderr}"
        crawl_records = [json.loads(line) for line in crawl_result.stdout.splitlines() if line.strip().startswith('{')]
        puppeteer_record = next(
            (r for r in crawl_records if r.get('type') == 'Binary' and r.get('name') == 'puppeteer'),
            None,
        )
        assert puppeteer_record, f"Expected puppeteer Binary record, got: {crawl_records}"
        # Step 2: install the puppeteer npm package using the record from step 1.
        npm_result = subprocess.run(
            [
                sys.executable,
                str(NPM_BINARY_HOOK),
                '--machine-id=test-machine',
                '--binary-id=test-puppeteer',
                '--name=puppeteer',
                f"--binproviders={puppeteer_record.get('binproviders', '*')}",
                '--overrides=' + json.dumps(puppeteer_record.get('overrides') or {}),
            ],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        assert npm_result.returncode == 0, (
            "puppeteer npm install failed\n"
            f"stdout:\n{npm_result.stdout}\n"
            f"stderr:\n{npm_result.stderr}"
        )
        # Step 3: puppeteer binary hook downloads Chromium itself.
        result = subprocess.run(
            [
                sys.executable,
                str(BINARY_HOOK),
                '--machine-id=test-machine',
                '--binary-id=test-binary',
                '--name=chromium',
                '--binproviders=puppeteer',
                '--overrides=' + json.dumps({'puppeteer': ['chromium@latest', '--install-deps']}),
            ],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        assert result.returncode == 0, (
            "puppeteer binary hook failed\n"
            f"stdout:\n{result.stdout}\n"
            f"stderr:\n{result.stderr}"
        )
        # The hook must emit a chromium Binary record pointing at a real executable.
        records = [json.loads(line) for line in result.stdout.splitlines() if line.strip().startswith('{')]
        binaries = [r for r in records if r.get('type') == 'Binary' and r.get('name') == 'chromium']
        assert binaries, f"Expected Binary record for chromium, got: {records}"
        abspath = binaries[0].get('abspath')
        assert abspath and Path(abspath).exists(), f"Chromium binary path invalid: {abspath}"

View File

@@ -1 +0,0 @@
{"type": "Binary", "name": "readability-extractor", "binproviders": "npm,env", "overrides": {"npm": {"packages": ["https://github.com/ArchiveBox/readability-extractor"]}}}

View File

@@ -1,83 +0,0 @@
#!/usr/bin/env python3
"""
Detect readability-extractor binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if readability is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
    """Return the value of *name* from the environment, whitespace-stripped."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret the env var *name* as a boolean flag.

    Recognizes true/1/yes/on and false/0/no/off (case-insensitive);
    any other value (including unset) yields *default*.
    """
    flag = get_env(name).lower()
    if flag in ('true', '1', 'yes', 'on'):
        return True
    return False if flag in ('false', '0', 'no', 'off') else default
def output_binary_found(binary: Binary, name: str):
    """Emit a Binary JSONL record for a binary already present on this machine."""
    machine_id = os.environ.get('MACHINE_ID', '')
    print(json.dumps({
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',  # already resolved from the environment
        'machine_id': machine_id,
    }))
def output_binary_missing(name: str, binproviders: str):
    """Emit a Binary JSONL record requesting installation of a missing binary."""
    print(json.dumps({
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,  # providers that can install it
        'overrides': {
            'packages': ['git+https://github.com/ArchiveBox/readability-extractor.git'],
        },
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }))
def main():
    """Detect readability-extractor and emit a Binary JSONL record.

    Emits a "found" record (abspath/version/sha256) when the binary resolves
    from the environment, otherwise a "missing" record telling the installer
    to use npm. Always exits 0 so detection never aborts the hook pipeline.
    """
    readability_enabled = get_env_bool('READABILITY_ENABLED', True)
    readability_binary = get_env('READABILITY_BINARY', 'readability-extractor')
    if not readability_enabled:
        sys.exit(0)
    provider = EnvProvider()
    try:
        binary = Binary(name=readability_binary, binproviders=[provider]).load()
        if binary.abspath:
            # Binary found on this machine already
            output_binary_found(binary, name='readability-extractor')
        else:
            # Binary not found — request installation via npm
            output_binary_missing(name='readability-extractor', binproviders='npm')
    except Exception:
        # load() raised (treated the same as "binary not found")
        output_binary_missing(name='readability-extractor', binproviders='npm')
    sys.exit(0)


# CLI entry point: this hook script is executed directly by the hook runner.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python3
"""
Emit readability-extractor Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
    """Look up *name* in the environment and strip surrounding whitespace."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse the env var *name* as a boolean.

    true/1/yes/on -> True, false/0/no/off -> False (case-insensitive);
    anything else (including unset) returns *default*.
    """
    value = get_env(name).lower()
    if value in ('true', '1', 'yes', 'on'):
        return True
    if value in ('false', '0', 'no', 'off'):
        return False
    return default
def output_binary(name: str, binproviders: str):
    """Print a Binary JSONL record declaring a crawl dependency."""
    print(json.dumps({
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,
        'overrides': {
            'npm': {
                'packages': ['https://github.com/ArchiveBox/readability-extractor'],
            },
        },
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }))
def main():
    """Declare readability-extractor as a crawl dependency unless disabled."""
    if not get_env_bool('READABILITY_ENABLED', True):
        sys.exit(0)
    output_binary(name='readability-extractor', binproviders='npm,env')
    sys.exit(0)


if __name__ == '__main__':
    main()

Some files were not shown because too many files have changed in this diff Show More