wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,266 @@
#!/usr/bin/env node
/**
* Extract accessibility tree and page outline from a URL.
*
* Extracts:
* - Page outline (headings h1-h6, sections, articles)
* - Iframe tree
* - Accessibility snapshot
* - ARIA labels and roles
*
* Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>
* Output: Writes accessibility/accessibility.json
*
* Environment variables:
* SAVE_ACCESSIBILITY: Enable accessibility extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'accessibility';      // name reported in RESULT_JSON
const OUTPUT_DIR = 'accessibility';          // output subdirectory (relative to cwd)
const OUTPUT_FILE = 'accessibility.json';    // file written inside OUTPUT_DIR
const CHROME_SESSION_DIR = 'chrome_session'; // where the chrome_session extractor stores cdp_url.txt
// Parse `--key=value` command line arguments into a plain object.
// Dashes in key names become underscores; a bare `--flag` (no `=value`,
// or an empty value) is stored as boolean true.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const eqIdx = token.indexOf('=');
    const key = (eqIdx === -1 ? token.slice(2) : token.slice(2, eqIdx)).replace(/-/g, '_');
    const value = eqIdx === -1 ? '' : token.slice(eqIdx + 1);
    parsed[key] = value || true;
  }
  return parsed;
}
// Read an environment variable with a fallback; result is whitespace-trimmed.
// Unset OR empty-string values fall back to defaultValue (|| semantics).
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an environment variable as a boolean flag.
// Accepts true/1/yes/on and false/0/no/off (case-insensitive, whitespace
// ignored); anything else — including an unset variable — yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = (process.env[name] || '').trim().toLowerCase();
  const TRUTHY = ['true', '1', 'yes', 'on'];
  const FALSY = ['false', '0', 'no', 'off'];
  if (TRUTHY.includes(normalized)) return true;
  if (FALSY.includes(normalized)) return false;
  return defaultValue;
}
// Get CDP URL from chrome_session.
// Returns the trimmed DevTools websocket URL written by the chrome_session
// extractor, or null when the file is missing (i.e. that extractor hasn't run).
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (fs.existsSync(cdpFile)) {
    return fs.readFileSync(cdpFile, 'utf8').trim();
  }
  return null;
}
// Extract accessibility info
/**
 * Extract the accessibility tree, a heading/landmark outline, and the iframe
 * tree from the page currently open in the shared Chrome session, and write
 * the combined result to accessibility/accessibility.json.
 *
 * @param {string} url - URL being archived (recorded in the output JSON).
 * @returns {Promise<{success: boolean, output?: string, accessibilityData?: object, error?: string}>}
 */
async function extractAccessibility(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page (prefer the first http(s) tab, fall back to any open page)
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Get accessibility snapshot (interestingOnly filters out nodes with no a11y relevance)
    const accessibilityTree = await page.accessibility.snapshot({ interestingOnly: true });
    // Extract page outline (headings, sections, etc.) — this callback runs inside the page
    const outline = await page.evaluate(() => {
      const headings = [];
      const elements = document.querySelectorAll(
        'h1, h2, h3, h4, h5, h6, a[name], header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe'
      );
      elements.forEach(elem => {
        // Skip unnamed anchors
        if (elem.tagName.toLowerCase() === 'a' && !elem.name) return;
        const tagName = elem.tagName.toLowerCase();
        // Best available identifier: id, name, aria-label, or role (in that order)
        const elemId = elem.id || elem.name || elem.getAttribute('aria-label') || elem.role || '';
        const elemClasses = (elem.className || '').toString().trim().split(/\s+/).slice(0, 3).join(' .');
        const action = elem.action?.split('/').pop() || '';
        // First 128 chars of visible text as a summary
        let summary = (elem.innerText || '').slice(0, 128);
        if (summary.length >= 128) summary += '...';
        let prefix = '';
        let title = '';
        // Format headings with # prefix (markdown-style, one # per heading level)
        const level = parseInt(tagName.replace('h', ''));
        if (!isNaN(level)) {
          prefix = '#'.repeat(level);
          title = elem.innerText || elemId || elemClasses;
        } else {
          // For other elements, create a breadcrumb path of up to 5 ancestors,
          // blanking out generic wrappers (div/span/p/body/html)
          const parents = [tagName];
          let node = elem.parentNode;
          while (node && parents.length < 5) {
            if (node.tagName) {
              const tag = node.tagName.toLowerCase();
              if (!['div', 'span', 'p', 'body', 'html'].includes(tag)) {
                parents.unshift(tag);
              } else {
                parents.unshift('');
              }
            }
            node = node.parentNode;
          }
          prefix = parents.join('>');
          title = elemId ? `#${elemId}` : '';
          if (!title && elemClasses) title = `.${elemClasses}`;
          if (action) title += ` /${action}`;
          if (summary && !title.includes(summary)) title += `: ${summary}`;
        }
        // Clean up title (collapse whitespace runs)
        title = title.replace(/\s+/g, ' ').trim();
        if (prefix) {
          headings.push(`${prefix} ${title}`);
        }
      });
      return headings;
    });
    // Get iframe tree (nesting depth encoded by the number of '>' prefixes)
    const iframes = [];
    function dumpFrameTree(frame, indent = '>') {
      iframes.push(indent + frame.url());
      for (const child of frame.childFrames()) {
        dumpFrameTree(child, indent + '>');
      }
    }
    dumpFrameTree(page.mainFrame(), '');
    const accessibilityData = {
      url,
      headings: outline,
      iframes,
      tree: accessibilityTree,
    };
    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(accessibilityData, null, 2));
    return { success: true, output: outputPath, accessibilityData };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect (not close): the Chrome session is shared with other extractors
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entry point: parse --url/--snapshot-id, honor SAVE_ACCESSIBILITY,
 * run the extraction, and print the START_TS/END_TS/DURATION/OUTPUT/STATUS/
 * RESULT_JSON lines the orchestrator parses from stdout.
 * Exits 0 on success or skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled (defaults to on)
    if (!getEnvBool('SAVE_ACCESSIBILITY', true)) {
      console.log('Skipping accessibility (SAVE_ACCESSIBILITY=False)');
      status = 'skipped';
      // Skipped runs report a minimal result and exit 0 immediately
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await extractAccessibility(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const headingCount = result.accessibilityData.headings.length;
      const iframeCount = result.accessibilityData.iframes.length;
      console.log(`Accessibility extracted: ${headingCount} headings, ${iframeCount} iframes`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (orchestrator parses these KEY=VALUE lines)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level invocation; any rejection not handled inside main() exits 1.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
Install a binary using apt package manager.
Usage: on_Dependency__install_using_apt_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, AptProvider, BinProviderOverrides
# Fix pydantic forward reference issue
AptProvider.model_rebuild()

@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command (overrides default)")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using apt package manager.

    Emits a single InstalledBinary JSONL record on stdout (machine-parsed by
    the orchestrator); human-readable progress goes to stderr.
    Exit codes: 0 = installed, or apt not in the allowed provider list (a
    skip, not an error); 1 = apt unavailable or the install failed.

    NOTE(review): --custom-cmd is accepted but never referenced below —
    confirm whether a custom command should override the default apt install.
    """
    # Check if apt provider is allowed for this dependency
    if bin_providers != '*' and 'apt' not in bin_providers.split(','):
        click.echo(f"apt provider not allowed for {bin_name}", err=True)
        sys.exit(0)  # Not an error, just skip
    # Use abx-pkg AptProvider to install binary
    provider = AptProvider()
    if not provider.INSTALLER_BIN:
        click.echo("apt not available on this system", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via apt...", err=True)
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"apt install failed: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after apt install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'apt',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)
    sys.exit(0)

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,26 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_ARCHIVE_DOT_ORG": {
"type": "boolean",
"default": true,
"x-aliases": ["SUBMIT_ARCHIVE_DOT_ORG"],
"description": "Submit URLs to archive.org Wayback Machine"
},
"ARCHIVE_ORG_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for archive.org submission in seconds"
},
"ARCHIVE_ORG_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
}
}
}

View File

@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Submit a URL to archive.org for archiving.
Usage: on_Snapshot__archive_org.py --url=<url> --snapshot-id=<uuid>
Output: Writes archive.org.txt to $PWD with the archived URL
Environment variables:
TIMEOUT: Timeout in seconds (default: 60)
USER_AGENT: User agent string
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
It can run standalone if requests is installed: pip install requests
"""
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'archive_org'    # name reported in RESULT_JSON
# NOTE(review): OUTPUT_DIR appears unused — submit_to_archive_org() writes
# OUTPUT_FILE directly into $PWD, not into this directory. Confirm intent.
OUTPUT_DIR = 'archive_org'
OUTPUT_FILE = 'archive.org.txt'   # file containing the resulting archive URL
def get_env(name: str, default: str = '') -> str:
    """Return environment variable ``name`` (stripped), or ``default`` when unset."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Parse environment variable ``name`` as an int.

    Falls back to ``default`` when the variable is unset or not a valid
    integer (surrounding whitespace is tolerated).
    """
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
    """
    Submit URL to archive.org Wayback Machine via https://web.archive.org/save/.

    On success, writes the resulting archive URL (or, as a fallback, the
    submit URL itself for manual retry) to OUTPUT_FILE in the current
    directory.

    Returns: (success, output_path, error_message)
    """
    # Imported lazily so a missing 'requests' dependency produces a clean
    # error tuple instead of crashing the whole script at import time.
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'
    timeout = get_env_int('TIMEOUT', 60)
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    submit_url = f'https://web.archive.org/save/{url}'
    try:
        response = requests.get(
            submit_url,
            timeout=timeout,
            headers={'User-Agent': user_agent},
            allow_redirects=True,
        )
        # Check for successful archive
        content_location = response.headers.get('Content-Location', '')
        x_archive_orig_url = response.headers.get('X-Archive-Orig-Url', '')  # NOTE(review): read but never used
        # Build archive URL by prefixing the host onto the Content-Location path
        if content_location:
            archive_url = f'https://web.archive.org{content_location}'
            Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
            return True, OUTPUT_FILE, ''
        elif 'web.archive.org' in response.url:
            # We were redirected to an archive page
            Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8')
            return True, OUTPUT_FILE, ''
        else:
            # Check for errors in response
            if 'RobotAccessControlException' in response.text:
                # Blocked by robots.txt - save submit URL for manual retry
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                return True, OUTPUT_FILE, ''  # Consider this a soft success
            elif response.status_code >= 400:
                return False, None, f'HTTP {response.status_code}'
            else:
                # Save submit URL anyway
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                return True, OUTPUT_FILE, ''
    except requests.Timeout:
        return False, None, f'Request timed out after {timeout} seconds'
    except requests.RequestException as e:
        return False, None, f'{type(e).__name__}: {e}'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to submit to archive.org')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Submit a URL to archive.org for archiving.

    Prints START_TS/END_TS/DURATION/OUTPUT/STATUS plus a RESULT_JSON line
    for the orchestrator to parse; exits 0 on success, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    try:
        # Run extraction (writes archive.org.txt on success)
        success, output, error = submit_to_archive_org(url)
        status = 'succeeded' if success else 'failed'
        if success:
            archive_url = Path(output).read_text().strip()
            print(f'Archived at: {archive_url}')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results (orchestrator parses these KEY=VALUE lines from stdout)
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
Install a binary using Homebrew package manager.
Usage: on_Dependency__install_using_brew_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, BrewProvider, BinProviderOverrides
# Fix pydantic forward reference issue
BrewProvider.model_rebuild()

@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using Homebrew.

    Emits a single InstalledBinary JSONL record on stdout (machine-parsed by
    the orchestrator); human-readable progress goes to stderr.
    Exit codes: 0 = installed, or brew not in the allowed provider list (a
    skip, not an error); 1 = brew unavailable or the install failed.

    NOTE(review): --custom-cmd is accepted but never referenced below —
    confirm whether a custom command should override the default brew install.
    """
    # Check if brew provider is allowed for this dependency
    if bin_providers != '*' and 'brew' not in bin_providers.split(','):
        click.echo(f"brew provider not allowed for {bin_name}", err=True)
        sys.exit(0)
    # Use abx-pkg BrewProvider to install binary
    provider = BrewProvider()
    if not provider.INSTALLER_BIN:
        click.echo("brew not available on this system", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via brew...", err=True)
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"brew install failed: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after brew install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'brew',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)
    sys.exit(0)

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
Create symlinks from plugin outputs to canonical legacy locations.
This plugin runs after all extractors complete and creates symlinks from the
new plugin-based output structure to the legacy canonical output paths that
ArchiveBox has historically used. This maintains backward compatibility with
existing tools and scripts that expect outputs at specific locations.
Canonical output paths (from Snapshot.canonical_outputs()):
- favicon.ico → favicon/favicon.ico
- singlefile.html → singlefile/singlefile.html
- readability/content.html → readability/content.html
- mercury/content.html → mercury/content.html
- htmltotext.txt → htmltotext/htmltotext.txt
- output.pdf → pdf/output.pdf
- screenshot.png → screenshot/screenshot.png
- output.html → dom/output.html
- headers.json → headers/headers.json
- warc/{timestamp} → wget/warc/{timestamp}
New plugin outputs:
- ssl.json → ssl/ssl.json
- seo.json → seo/seo.json
- accessibility.json → accessibility/accessibility.json
- outlinks.json → outlinks/outlinks.json
- redirects.json → redirects/redirects.json
- console.jsonl → consolelog/console.jsonl
Usage: on_Snapshot__91_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
"""
__package__ = 'archivebox.plugins.canonical_outputs'
import os
import sys
from pathlib import Path
from typing import Dict, Optional
# Configure Django if running standalone: hook scripts are normally invoked
# with Django already configured, so this bootstrap only runs on direct CLI
# execution. It puts the archivebox package root on sys.path and initializes
# Django so the ORM (Snapshot model) is importable inside main().
if __name__ == '__main__':
    parent_dir = str(Path(__file__).resolve().parent.parent.parent)
    if parent_dir not in sys.path:
        sys.path.insert(0, parent_dir)
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    import django
    django.setup()
import rich_click as click
# Mapping from canonical (legacy) output path -> plugin output path.
# Keys are the legacy paths created as symlinks; values are the actual files
# written by each plugin. Both are relative to the snapshot directory.
CANONICAL_MAPPINGS = {
    # Legacy extractors
    'favicon.ico': 'favicon/favicon.ico',
    'singlefile.html': 'singlefile/singlefile.html',
    'readability/content.html': 'readability/content.html',
    'mercury/content.html': 'mercury/content.html',
    'htmltotext.txt': 'htmltotext/htmltotext.txt',
    'output.pdf': 'pdf/output.pdf',
    'screenshot.png': 'screenshot/screenshot.png',
    'output.html': 'dom/output.html',
    'headers.json': 'headers/headers.json',
    # New plugins
    'ssl.json': 'ssl/ssl.json',
    'seo.json': 'seo/seo.json',
    'accessibility.json': 'accessibility/accessibility.json',
    # NOTE(review): the module docstring says outlinks.json -> outlinks/outlinks.json,
    # but the source directory here is parse_dom_outlinks/ — confirm which is correct.
    'outlinks.json': 'parse_dom_outlinks/outlinks.json',
    'redirects.json': 'redirects/redirects.json',
    'console.jsonl': 'consolelog/console.jsonl',
}
def create_symlink(target: Path, link: Path, relative: bool = True) -> bool:
    """
    Create a symlink at ``link`` pointing to ``target``.

    Args:
        target: The actual file/directory the symlink should point to.
        link: The symlink path to create (parent dirs created as needed).
        relative: Create a relative symlink (default) instead of absolute.

    Returns:
        True if the symlink was created or already points at ``target``,
        False when ``target`` does not exist or the link could not be made.
    """
    try:
        # Nothing to link to: the plugin never produced this output.
        if not target.exists():
            return False
        # Replace any stale file/symlink already occupying the link path.
        # (link.exists() is False for broken symlinks, hence the extra check.)
        if link.exists() or link.is_symlink():
            if link.is_symlink() and link.resolve() == target.resolve():
                # Already correctly symlinked
                return True
            link.unlink()
        # Create parent directory
        link.parent.mkdir(parents=True, exist_ok=True)
        if relative:
            # Relative links keep working if the snapshot dir is moved wholesale
            link.symlink_to(os.path.relpath(target, link.parent))
        else:
            link.symlink_to(target)
        return True
    except OSError:
        # FileNotFoundError and PermissionError are OSError subclasses, so a
        # single except clause covers them all; any filesystem failure means
        # we simply skip this symlink.
        return False
def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
    """
    Create every canonical symlink for one snapshot directory.

    Args:
        snapshot_dir: The snapshot directory (e.g. archive/<timestamp>/).

    Returns:
        Dict mapping each canonical path to whether its symlink succeeded.
    """
    results = {
        canonical: create_symlink(snapshot_dir / plugin_path, snapshot_dir / canonical, relative=True)
        for canonical, plugin_path in CANONICAL_MAPPINGS.items()
    }
    # wget writes WARCs under wget/warc/, but the legacy layout expects a
    # warc/ directory at the snapshot root — symlink the whole directory.
    wget_warc_dir = snapshot_dir / 'wget' / 'warc'
    if wget_warc_dir.exists():
        results['warc/'] = create_symlink(wget_warc_dir, snapshot_dir / 'warc', relative=True)
    return results
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Create symlinks from plugin outputs to canonical legacy locations.

    Looks up the Snapshot by UUID, then symlinks each plugin output to the
    legacy canonical path (see CANONICAL_MAPPINGS). Prints START_TS/END_TS/
    DURATION/OUTPUT/STATUS/RESULT_JSON lines for the orchestrator and exits
    0 on success or skip, 1 on failure.
    """
    from datetime import datetime
    from archivebox.core.models import Snapshot
    start_ts = datetime.now()
    status = 'failed'
    output = None
    error = ''
    symlinks_created = 0
    try:
        # Check if enabled (truthy values: true/1/yes/on, case-insensitive)
        from archivebox.config import CONSTANTS
        save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
        if not save_canonical:
            click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)')
            status = 'skipped'
            end_ts = datetime.now()
            click.echo(f'START_TS={start_ts.isoformat()}')
            click.echo(f'END_TS={end_ts.isoformat()}')
            click.echo(f'STATUS={status}')
            # NOTE(review): this RESULT_JSON is hand-built with an f-string, so a url
            # containing a double quote would emit invalid JSON — consider json.dumps.
            click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
            sys.exit(0)
        # Get snapshot
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            error = f'Snapshot {snapshot_id} not found'
            raise ValueError(error)
        # Get snapshot directory
        snapshot_dir = Path(snapshot.output_dir)
        if not snapshot_dir.exists():
            error = f'Snapshot directory not found: {snapshot_dir}'
            raise FileNotFoundError(error)
        # Create canonical symlinks
        results = create_canonical_symlinks(snapshot_dir)
        # Count successful symlinks
        symlinks_created = sum(1 for success in results.values() if success)
        total_mappings = len(results)
        status = 'succeeded'
        output = str(snapshot_dir)
        click.echo(f'Created {symlinks_created}/{total_mappings} canonical symlinks')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
        click.echo(f'Error: {error}', err=True)
    end_ts = datetime.now()
    duration = (end_ts - start_ts).total_seconds()
    # Print results (orchestrator parses these KEY=VALUE lines from stdout)
    click.echo(f'START_TS={start_ts.isoformat()}')
    click.echo(f'END_TS={end_ts.isoformat()}')
    click.echo(f'DURATION={duration:.2f}')
    if output:
        click.echo(f'OUTPUT={output}')
    click.echo(f'STATUS={status}')
    if error:
        click.echo(f'ERROR={error}', err=True)
    # Print JSON result
    import json
    result_json = {
        'extractor': 'canonical_outputs',
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'symlinks_created': symlinks_created,
        'error': error or None,
    }
    click.echo(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,121 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Plugin
*
* Installs and configures the 2captcha Chrome extension for automatic
* CAPTCHA solving during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
*
* Priority: 01 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {
  webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', // 2captcha solver on the Chrome Web Store
  name: 'captcha2',
};
// Get extensions directory from environment or use default
// (data/personas/<ACTIVE_PERSONA>/chrome_extensions unless CHROME_EXTENSIONS_DIR overrides it)
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
  path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Install and configure the 2captcha extension.
 *
 * Delegates the download/unpack to chrome_extension_utils, then warns
 * (without failing) when API_KEY_2CAPTCHA is unset or left at placeholder.
 *
 * @returns {Promise<object|null>} extension metadata, or null on install failure
 */
async function installCaptchaExtension() {
  console.log('[*] Installing 2captcha extension...');
  // Install the extension into EXTENSIONS_DIR
  const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!extension) {
    console.error('[❌] Failed to install 2captcha extension');
    return null;
  }
  // Check if API key is configured (the placeholder value counts as unset)
  const apiKey = process.env.API_KEY_2CAPTCHA;
  if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
    console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
  } else {
    console.log('[+] 2captcha extension installed and API key configured');
  }
  return extension;
}
/**
 * Note: 2captcha configuration is now handled by chrome_session plugin
 * during first-time browser setup to avoid repeated configuration on every snapshot.
 * The API key is injected via chrome.storage API once per browser session.
 */
/**
 * Main entry point - install the extension (or reuse a cached install) before
 * archiving, and write its metadata where chrome_session can read it.
 *
 * @returns {Promise<object|null>} extension metadata, or null on failure
 */
async function main() {
  // Check if extension is already cached
  const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
      // Only trust the cache if the unpacked extension is still on disk
      if (fs.existsSync(manifestPath)) {
        console.log('[*] 2captcha extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Cache file corrupted, re-install
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }
  // Install extension
  const extension = await installCaptchaExtension();
  // Export extension metadata for chrome_session to load
  if (extension) {
    // Write extension info to a cache file that chrome_session can read
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
      cacheFile,
      JSON.stringify(extension, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }
  return extension;
}
// Export functions for use by other plugins
module.exports = {
  EXTENSION,
  installCaptchaExtension,
};
// Run if executed directly.
// NOTE(review): main() resolves (with null) even when the install fails, so a
// direct run exits 0 in that case — only thrown/rejected errors exit 1.
if (require.main === module) {
  main().then(() => {
    console.log('[✓] 2captcha extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] 2captcha extension setup failed:', err);
    process.exit(1);
  });
}

View File

@@ -0,0 +1,284 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Configuration
*
* Configures the 2captcha extension with API key after Chrome session starts.
* Runs once per browser session to inject API key into extension storage.
*
* Priority: 21 (after chrome_session at 20, before navigation at 30)
* Hook: on_Snapshot
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - chrome_session must have loaded extensions (extensions.json must exist)
*/
const path = require('path');
const fs = require('fs');
const puppeteer = require('puppeteer-core');
// Paths shared with the chrome_session plugin (relative to $PWD).
const OUTPUT_DIR = 'chrome_session';
// Marker file whose presence means the API key was already injected this session.
const CONFIG_MARKER = path.join(OUTPUT_DIR, '.captcha2_configured');
// Look up an environment variable; unset or empty values fall back to
// defaultValue. The returned string is always whitespace-trimmed.
function getEnv(name, defaultValue = '') {
  return String(process.env[name] || defaultValue).trim();
}
// Parse command line arguments (`--key=value` / bare `--flag`) into an
// object keyed by snake_cased names; valueless flags become boolean true.
function parseArgs() {
  const result = {};
  process.argv.slice(2).filter(a => a.startsWith('--')).forEach(a => {
    const [name, ...rest] = a.slice(2).split('=');
    const value = rest.join('=');
    result[name.replace(/-/g, '_')] = value === '' ? true : value;
  });
  return result;
}
/**
 * Inject the 2captcha API key into the extension loaded in the shared Chrome
 * session, trying (1) the extension's background page/service-worker storage,
 * then (2) its options.html page. A marker file (CONFIG_MARKER) prevents
 * repeating the configuration for the same browser session.
 *
 * @returns {Promise<{success: boolean, skipped?: boolean, method?: string, error?: string}>}
 */
async function configure2Captcha() {
  // Check if already configured in this session
  if (fs.existsSync(CONFIG_MARKER)) {
    console.log('[*] 2captcha already configured in this browser session');
    return { success: true, skipped: true };
  }
  // Check if API key is set (the placeholder value counts as unset)
  const apiKey = getEnv('API_KEY_2CAPTCHA');
  if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
    console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
    return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
  }
  // Load extensions metadata written by the chrome_session plugin
  const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
  if (!fs.existsSync(extensionsFile)) {
    return { success: false, error: 'extensions.json not found - chrome_session must run first' };
  }
  const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
  const captchaExt = extensions.find(ext => ext.name === 'captcha2');
  if (!captchaExt) {
    console.log('[*] 2captcha extension not installed, skipping configuration');
    return { success: true, skipped: true };
  }
  console.log('[*] Configuring 2captcha extension with API key...');
  try {
    // Connect to the existing Chrome session via CDP
    const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) {
      return { success: false, error: 'CDP URL not found - chrome_session must run first' };
    }
    const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
    try {
      // Method 1: Try to inject via extension background page
      if (captchaExt.target && captchaExt.target_ctx) {
        console.log('[*] Attempting to configure via extension background page...');
        // Reconnect to the browser to get fresh target context
        const targets = await browser.targets();
        const extTarget = targets.find(t =>
          t.url().startsWith(`chrome-extension://${captchaExt.id}`)
        );
        if (extTarget) {
          // worker() resolves for service-worker targets, page() for background pages
          const extContext = await extTarget.worker() || await extTarget.page();
          if (extContext) {
            await extContext.evaluate((key) => {
              // Try all common storage patterns — the exact key name the
              // extension reads varies between versions, so set them all
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
                chrome.storage.sync.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
              }
              // Also try localStorage as fallback
              if (typeof localStorage !== 'undefined') {
                localStorage.setItem('apiKey', key);
                localStorage.setItem('2captcha_apikey', key);
                localStorage.setItem('solver-api-key', key);
              }
            }, apiKey);
            console.log('[+] 2captcha API key configured successfully via background page');
            // Mark as configured
            fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
            return { success: true, method: 'background_page' };
          }
        }
      }
      // Method 2: Try to configure via options page
      console.log('[*] Attempting to configure via options page...');
      const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
      const configPage = await browser.newPage();
      try {
        await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
        const configured = await configPage.evaluate((key) => {
          // Try to find API key input field
          const selectors = [
            'input[name*="apikey" i]',
            'input[id*="apikey" i]',
            'input[name*="api-key" i]',
            'input[id*="api-key" i]',
            'input[name*="key" i]',
            'input[placeholder*="api" i]',
            'input[type="text"]',
          ];
          for (const selector of selectors) {
            const input = document.querySelector(selector);
            if (input) {
              input.value = key;
              input.dispatchEvent(new Event('input', { bubbles: true }));
              input.dispatchEvent(new Event('change', { bubbles: true }));
              // Try to find and click save button
              // NOTE(review): ':contains(...)' is not standard CSS — querySelector
              // throws on those two selectors, so only the first two can ever match
              // and reaching a ':contains' entry aborts this evaluate with an error.
              const saveSelectors = [
                'button[type="submit"]',
                'input[type="submit"]',
                'button:contains("Save")',
                'button:contains("Apply")',
              ];
              for (const btnSel of saveSelectors) {
                const btn = document.querySelector(btnSel);
                if (btn) {
                  btn.click();
                  break;
                }
              }
              // Also save to storage
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
              }
              return true;
            }
          }
          // Fallback: Just save to storage
          if (typeof chrome !== 'undefined' && chrome.storage) {
            chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            return true;
          }
          return false;
        }, apiKey);
        await configPage.close();
        if (configured) {
          console.log('[+] 2captcha API key configured successfully via options page');
          // Mark as configured
          fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
          return { success: true, method: 'options_page' };
        }
      } catch (e) {
        console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
        try {
          await configPage.close();
        } catch (e2) {}
      }
      return { success: false, error: 'Could not configure via any method' };
    } finally {
      // Disconnect (not close) — the Chrome session is shared with other plugins
      browser.disconnect();
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
/**
 * CLI entrypoint: run the 2captcha configuration step and report results.
 *
 * Emits the runner protocol lines (START_TS / END_TS / DURATION / STATUS,
 * plus a single-line RESULT_JSON blob), then exits 0 when the step
 * succeeded or was skipped, 1 otherwise.
 */
async function main() {
  const cliArgs = parseArgs();
  const { url, snapshot_id: snapshotId } = cliArgs;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__21_captcha2_config.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let error = '';

  try {
    const outcome = await configure2Captcha();
    if (outcome.skipped) {
      status = 'skipped';
    } else if (outcome.success) {
      status = 'succeeded';
    } else {
      error = outcome.error || 'Configuration failed';
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
  }

  const endTs = new Date();
  const durationSec = (endTs - startTs) / 1000;

  // Runner protocol: key=value lines on stdout, errors on stderr
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${durationSec.toFixed(2)}`);
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Structured summary for machine consumers
  const resultJson = {
    extractor: 'captcha2_config',
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(durationSec * 100) / 100,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
}
// Top-level runner: treat any unhandled rejection from main() as fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,184 @@
"""
Unit tests for captcha2 plugin
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
# Paths to the plugin's hook scripts, resolved relative to this test file
# (tests/ lives one level below the plugin directory).
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__01_captcha2.js"
CONFIG_SCRIPT = PLUGIN_DIR / "on_Snapshot__21_captcha2_config.js"
def test_install_script_exists():
    """The on_Snapshot__01 install hook must ship with the plugin."""
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_config_script_exists():
    """The on_Snapshot__21 config hook must ship with the plugin."""
    assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
def test_extension_metadata():
    """The install script must export EXTENSION metadata with the expected id/name."""
    with tempfile.TemporaryDirectory() as tmpdir:
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
        # Load the module in node and dump its EXTENSION export as JSON
        loader = f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"
        result = subprocess.run(
            ["node", "-e", loader],
            capture_output=True,
            text=True,
            env=child_env,
        )
        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert metadata["name"] == "captcha2"
def test_install_creates_cache():
    """Running the install hook should write captcha2.extension.json into the extensions dir."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        child_env["API_KEY_2CAPTCHA"] = "test_api_key"
        # Run the install hook exactly as the plugin runner would
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )
        # The script logs either a fresh install or a cache hit
        assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout
        cache_file = ext_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should be created"
        # The cache must round-trip the extension metadata
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert cache_data["name"] == "captcha2"
        assert "unpacked_path" in cache_data
        assert "version" in cache_data
def test_install_uses_existing_cache():
    """With a pre-populated cache file + unpacked dir, the install hook should not reinstall."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Fabricate a previously-installed extension: unpacked dir + manifest + cache record
        fake_extension_dir = ext_dir / "ifibfemgeogfhoebkmokieepdoobkbpo__captcha2"
        fake_extension_dir.mkdir(parents=True)
        (fake_extension_dir / "manifest.json").write_text(
            json.dumps({"version": "3.7.0", "name": "2Captcha Solver"})
        )
        (ext_dir / "captcha2.extension.json").write_text(json.dumps({
            "webstore_id": "ifibfemgeogfhoebkmokieepdoobkbpo",
            "name": "captcha2",
            "unpacked_path": str(fake_extension_dir),
            "version": "3.7.0",
        }))
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        child_env["API_KEY_2CAPTCHA"] = "test_api_key"
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=30,
        )
        # Should use cache
        assert "already installed (using cache)" in result.stdout or "Installed extension captcha2" in result.stdout
def test_install_warns_without_api_key():
    """When API_KEY_2CAPTCHA is unset, the install hook must warn about it."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Deliberately omit API_KEY_2CAPTCHA from the child environment
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )
        # The warning may land on stdout or stderr, so check both
        combined_output = result.stdout + result.stderr
        assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
def test_install_success_with_api_key():
    """When API_KEY_2CAPTCHA is set, the install hook should acknowledge the key."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        child_env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123"
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )
        # The acknowledgement may land on stdout or stderr, so check both
        combined_output = result.stdout + result.stderr
        assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
def test_config_script_structure():
    """Static sanity-check of the config script's contents (no execution)."""
    content = CONFIG_SCRIPT.read_text()
    # Should mention configuration marker file
    assert "CONFIG_MARKER" in content or "captcha2_configured" in content
    # Should mention API key
    assert "API_KEY_2CAPTCHA" in content
    # Should have main function or be executable
    assert "async function" in content or "main" in content

View File

@@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""
Clean up Chrome browser session started by chrome_session extractor.
This extractor runs after all Chrome-based extractors (screenshot, pdf, dom)
to terminate the Chrome process and clean up any leftover files.
Usage: on_Snapshot__24_chrome_cleanup.py --url=<url> --snapshot-id=<uuid>
Output: Terminates Chrome process and removes lock files
Environment variables:
CHROME_USER_DATA_DIR: Chrome profile directory (for lock file cleanup)
CHROME_PROFILE_NAME: Chrome profile name (default: Default)
"""
import json
import os
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'chrome_cleanup'
CHROME_SESSION_DIR = 'chrome_session'  # directory written by the chrome_session extractor (pid.txt lives here)
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripping surrounding whitespace (the default is stripped too)."""
    value = os.environ.get(name, default)
    return value.strip()
def cleanup_chrome_session() -> tuple[bool, str | None, str]:
    """
    Clean up the Chrome session started by the chrome_session extractor.

    Reads the Chrome PID from chrome_session/pid.txt, sends SIGTERM, waits up
    to ~1s for a graceful exit, then escalates to SIGKILL. Also removes Chrome
    profile SingletonLock files when CHROME_USER_DATA_DIR is configured.

    Returns: (success, output_info, error_message)
    """
    session_dir = Path(CHROME_SESSION_DIR)
    if not session_dir.exists():
        return True, 'No chrome_session directory found', ''
    pid_file = session_dir / 'pid.txt'
    killed = False
    if pid_file.exists():
        try:
            pid = int(pid_file.read_text().strip())
            # Try graceful termination first
            try:
                os.kill(pid, signal.SIGTERM)
                killed = True
                # Wait briefly (up to ~1s) for graceful shutdown
                for _ in range(10):
                    try:
                        os.kill(pid, 0)  # signal 0 = existence check only
                        time.sleep(0.1)
                    except OSError:
                        break  # Process is gone
                else:
                    # Still alive after the grace period: force kill
                    try:
                        os.kill(pid, signal.SIGKILL)
                    except OSError:
                        pass
            except ProcessLookupError:
                # Process already dead, that's fine.
                # (Replaces the previous hardcoded `e.errno == 3` check:
                # ProcessLookupError is the errno.ESRCH subclass of OSError,
                # and is portable across platforms.)
                pass
            except OSError as e:
                return False, None, f'Failed to kill Chrome PID {pid}: {e}'
        except ValueError:
            return False, None, f'Invalid PID in {pid_file}'
        except Exception as e:
            return False, None, f'{type(e).__name__}: {e}'
    # Clean up Chrome profile lock files if configured
    user_data_dir = get_env('CHROME_USER_DATA_DIR', '')
    profile_name = get_env('CHROME_PROFILE_NAME', 'Default')
    if user_data_dir:
        user_data_path = Path(user_data_dir)
        for lockfile in [
            user_data_path / 'SingletonLock',
            user_data_path / profile_name / 'SingletonLock',
        ]:
            try:
                lockfile.unlink(missing_ok=True)
            except Exception:
                pass  # Best effort cleanup
    result_info = f'Chrome cleanup: PID {"killed" if killed else "not found"}'
    return True, result_info, ''
@click.command()
@click.option('--url', required=True, help='URL that was loaded')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Clean up Chrome browser session."""
    start_ts = datetime.now(timezone.utc)
    status = 'failed'
    output = None
    error = ''
    try:
        success, output, error = cleanup_chrome_session()
        status = 'succeeded' if success else 'failed'
        if success:
            print(f'Chrome cleanup completed: {output}')
    except Exception as e:
        status = 'failed'
        error = f'{type(e).__name__}: {e}'
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    # Emit the machine-readable key=value lines consumed by the plugin runner
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Single-line JSON summary for structured consumers
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,483 @@
#!/usr/bin/env node
/**
* Chrome Extension Management Utilities
*
* Handles downloading, installing, and managing Chrome extensions for browser automation.
* Ported from the TypeScript implementation in archivebox.ts
*/
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const { exec } = require('child_process');
const { promisify } = require('util');
const { Readable } = require('stream');
const { finished } = require('stream/promises');
const execAsync = promisify(exec);
// Try to import unzipper, fallback to system unzip if not available.
// `unzip` stays null when the library is missing; installExtension() then
// relies solely on the /usr/bin/unzip binary.
let unzip = null;
try {
  const unzipper = require('unzipper');
  // Wrap unzipper's stream API into a simple (sourcePath, destPath) => Promise helper
  unzip = async (sourcePath, destPath) => {
    const stream = fs.createReadStream(sourcePath).pipe(unzipper.Extract({ path: destPath }));
    return stream.promise();
  };
} catch (err) {
  // Will use system unzip command as fallback
}
/**
 * Compute the dynamic extension ID Chrome assigns to an unpacked extension.
 *
 * Chrome hashes the unpacked directory path with SHA256 and remaps the first
 * 32 hex digits onto the letters 'a'-'p' (hex 0 -> 'a', ... hex f -> 'p').
 *
 * @param {string} unpacked_path - Path to the unpacked extension directory
 * @returns {string} - 32-character extension ID (characters a-p only)
 */
function getExtensionId(unpacked_path) {
  const digest = crypto
    .createHash('sha256')
    .update(Buffer.from(unpacked_path, 'utf-8'))
    .digest('hex');
  const BASE = 'a'.charCodeAt(0);
  let id = '';
  for (const hexChar of digest.slice(0, 32)) {
    id += String.fromCharCode(BASE + parseInt(hexChar, 16));
  }
  return id;
}
/**
 * Download and install a Chrome extension from the Chrome Web Store.
 *
 * Downloads the CRX to extension.crx_path (only when neither the unpacked
 * manifest nor the CRX already exists), then extracts it into
 * extension.unpacked_path via the system unzip binary, falling back to the
 * optional unzipper library when available.
 *
 * @param {Object} extension - Extension metadata object
 * @param {string} extension.webstore_id - Chrome Web Store extension ID
 * @param {string} extension.name - Human-readable extension name
 * @param {string} extension.crx_url - URL to download the CRX file
 * @param {string} extension.crx_path - Local path to save the CRX file
 * @param {string} extension.unpacked_path - Path to extract the extension
 * @returns {Promise<boolean>} - True if installation succeeded
 */
async function installExtension(extension) {
  const manifest_path = path.join(extension.unpacked_path, 'manifest.json');
  // Download CRX file if not already downloaded
  if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
    console.log(`[🛠️] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`);
    try {
      // Ensure parent directory exists
      const crxDir = path.dirname(extension.crx_path);
      if (!fs.existsSync(crxDir)) {
        fs.mkdirSync(crxDir, { recursive: true });
      }
      // Download CRX file from Chrome Web Store
      const response = await fetch(extension.crx_url);
      if (!response.ok) {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: HTTP ${response.status}`);
        return false;
      }
      if (response.body) {
        // Stream the response body straight to disk
        const crx_file = fs.createWriteStream(extension.crx_path);
        const crx_stream = Readable.fromWeb(response.body);
        await finished(crx_stream.pipe(crx_file));
      } else {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: No response body`);
        return false;
      }
    } catch (err) {
      console.error(`[❌] Failed to download extension ${extension.name}:`, err);
      return false;
    }
  }
  // Unzip CRX file to unpacked_path
  await fs.promises.mkdir(extension.unpacked_path, { recursive: true });
  try {
    // Try system unzip command first.
    // Paths are double-quoted so spaces don't split arguments; paths still must
    // not contain double quotes themselves (they come from our own config).
    await execAsync(`/usr/bin/unzip -o "${extension.crx_path}" -d "${extension.unpacked_path}"`);
  } catch (err1) {
    if (unzip) {
      // Fallback to unzipper library
      try {
        await unzip(extension.crx_path, extension.unpacked_path);
      } catch (err2) {
        // Report both failures: err1 = system unzip, err2 = unzipper fallback
        // (previously only err1 was logged here, hiding the fallback's error)
        console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message, err2.message);
        return false;
      }
    } else {
      console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message);
      return false;
    }
  }
  if (!fs.existsSync(manifest_path)) {
    console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`);
    return false;
  }
  return true;
}
/**
 * Load or install a Chrome extension, computing all metadata.
 *
 * Mutates and returns `ext`: fills in webstore/crx URLs and paths, installs
 * the extension on first use, and derives the dynamic `id` from the unpacked
 * path (unpacked extensions don't have stable webstore IDs at runtime).
 *
 * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path)
 * @param {string} [ext.webstore_id] - Chrome Web Store extension ID
 * @param {string} [ext.name] - Human-readable extension name
 * @param {string} [ext.unpacked_path] - Path to unpacked extension
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Object>} - Complete extension metadata object
 */
async function loadOrInstallExtension(ext, extensions_dir = null) {
  if (!ext.webstore_id && !ext.unpacked_path) {
    throw new Error('Extension must have either {webstore_id} or {unpacked_path}');
  }
  // Resolve the extensions directory: explicit arg > env var > default
  const EXTENSIONS_DIR = extensions_dir || process.env.CHROME_EXTENSIONS_DIR || './data/chrome_extensions';
  // Fill in statically-derivable metadata, keeping any caller-provided values
  ext.webstore_id = ext.webstore_id || ext.id;
  ext.name = ext.name || ext.webstore_id;
  ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`;
  ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`;
  ext.crx_path = ext.crx_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`);
  ext.unpacked_path = ext.unpacked_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`);
  const manifest_path = path.join(ext.unpacked_path, 'manifest.json');
  // Lazy readers so callers can re-check the manifest after installation
  ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'));
  ext.read_version = () => (fs.existsSync(manifest_path) ? ext.read_manifest()?.version : null) || null;
  // Download + unpack on first use
  if (!ext.read_version()) {
    await installExtension(ext);
  }
  // Autodetect ID from filesystem path (unpacked extensions don't have stable IDs)
  ext.id = getExtensionId(ext.unpacked_path);
  ext.version = ext.read_version();
  if (ext.version) {
    console.log(`[] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`);
  } else {
    console.warn(`[❌] Unable to detect ID and version of installed extension ${ext.unpacked_path}`);
  }
  return ext;
}
/**
 * Check if a Puppeteer target is an extension background page/service worker.
 *
 * Tolerates targets that close mid-inspection (a harmless race): those are
 * reported as type 'closed' with url 'about:closed' instead of throwing.
 *
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object>} - Object with target_is_bg, extension_id, manifest_version, etc.
 */
async function isTargetExtension(target) {
  let target_type;
  let target_ctx;
  let target_url;
  try {
    target_type = target.type();
    // Prefer the service-worker context, fall back to the page context
    target_ctx = (await target.worker()) || (await target.page()) || null;
    target_url = target.url() || target_ctx?.url() || null;
  } catch (err) {
    if (String(err).includes('No target with given id found')) {
      // Target closed during check, ignore harmless race condition
      target_type = 'closed';
      target_ctx = null;
      target_url = 'about:closed';
    } else {
      throw err;
    }
  }
  // Check if this is an extension background page or service worker
  const is_chrome_extension = target_url?.startsWith('chrome-extension://');
  const is_background_page = target_type === 'background_page';
  const is_service_worker = target_type === 'service_worker';
  const target_is_bg = is_chrome_extension && (is_background_page || is_service_worker);
  let extension_id = null;
  let manifest_version = null;
  const target_is_extension = is_chrome_extension || target_is_bg;
  if (target_is_extension) {
    try {
      // chrome-extension://<id>/... -> <id>
      extension_id = target_url?.split('://')[1]?.split('/')[0] || null;
      if (target_ctx) {
        const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
        manifest_version = manifest?.manifest_version || null;
      }
    } catch (err) {
      // Failed to get extension metadata; leave extension_id/manifest_version null
    }
  }
  return {
    target_is_extension,
    target_is_bg,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  };
}
/**
 * Load extension metadata and connection handlers from a browser target.
 *
 * Only acts on extension background pages / service workers (as reported by
 * isTargetExtension); all other targets return null. On a match, the entry
 * in `extensions` is mutated in place (via Object.assign) and the enriched
 * object is also returned.
 *
 * @param {Array} extensions - Array of extension metadata objects to update
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object|null>} - Updated extension object or null if not an extension
 */
async function loadExtensionFromTarget(extensions, target) {
  const {
    target_is_bg,
    target_is_extension,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  } = await isTargetExtension(target);
  // Only background contexts with a resolvable id are connectable
  if (!(target_is_bg && extension_id && target_ctx)) {
    return null;
  }
  // Find matching extension in our list (matched by dynamic extension id)
  const extension = extensions.find(ext => ext.id === extension_id);
  if (!extension) {
    console.warn(`[⚠️] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`);
    return null;
  }
  // Load manifest from the extension context
  let manifest = null;
  try {
    manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
  } catch (err) {
    console.error(`[❌] Failed to read manifest for extension ${extension_id}:`, err);
    return null;
  }
  // Create dispatch methods for communicating with the extension
  const new_extension = {
    ...extension,
    target,
    target_type,
    target_url,
    manifest,
    manifest_version,
    // Trigger extension toolbar button click.
    // NOTE(review): every call registers a fresh onClicked listener that is
    // never removed, and the passed tabId is unused inside the evaluate —
    // confirm whether that's intentional.
    dispatchAction: async (tab) => {
      return await target_ctx.evaluate((tabId) => {
        return new Promise((resolve) => {
          chrome.action.onClicked.addListener((tab) => {
            resolve({ success: true, tab });
          });
          chrome.action.openPopup();
        });
      }, tab?.id || null);
    },
    // Send message to extension via chrome.runtime.sendMessage
    dispatchMessage: async (message, options = {}) => {
      return await target_ctx.evaluate((msg, opts) => {
        return new Promise((resolve) => {
          chrome.runtime.sendMessage(msg, opts, (response) => {
            resolve(response);
          });
        });
      }, message, options);
    },
    // Trigger extension command (keyboard shortcut).
    // NOTE(review): this promise only settles if something else fires the
    // command, so callers awaiting it may hang — see the note below.
    dispatchCommand: async (command) => {
      return await target_ctx.evaluate((cmd) => {
        return new Promise((resolve) => {
          chrome.commands.onCommand.addListener((receivedCommand) => {
            if (receivedCommand === cmd) {
              resolve({ success: true, command: receivedCommand });
            }
          });
          // Note: Actually triggering commands programmatically is not directly supported
          // This would need to be done via CDP or keyboard simulation
        });
      }, command);
    },
  };
  // Update the extension in the array
  Object.assign(extension, new_extension);
  console.log(`[🔌] Connected to extension ${extension.name} (${extension.version})`);
  return new_extension;
}
/**
 * Ensure every extension in the list is installed locally.
 * Installs sequentially (downloads + unzips are disk/network bound).
 *
 * @param {Array} extensions - Array of extension metadata objects
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Array>} - Array of installed extension objects
 */
async function installAllExtensions(extensions, extensions_dir = null) {
  console.log(`[⚙️] Installing ${extensions.length} chrome extensions...`);
  for (let idx = 0; idx < extensions.length; idx++) {
    await loadOrInstallExtension(extensions[idx], extensions_dir);
  }
  return extensions;
}
/**
 * Load and connect to all extensions from a running browser.
 * Scans every browser target and wires up connection handlers for any
 * that match an entry in `extensions`.
 *
 * @param {Object} browser - Puppeteer browser instance
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Promise<Array>} - Array of loaded extension objects with connection handlers
 */
async function loadAllExtensionsFromBrowser(browser, extensions) {
  console.log(`[⚙️] Loading ${extensions.length} chrome extensions from browser...`);
  const targets = browser.targets();
  for (const browserTarget of targets) {
    await loadExtensionFromTarget(extensions, browserTarget);
  }
  return extensions;
}
/**
 * Read and parse an extension's manifest.json.
 *
 * @param {string} unpacked_path - Path to unpacked extension directory
 * @returns {object|null} - Parsed manifest object, or null when the file is
 *                          missing, unreadable, or not valid JSON
 */
function loadExtensionManifest(unpacked_path) {
  const manifestFile = path.join(unpacked_path, 'manifest.json');
  try {
    const raw = fs.readFileSync(manifestFile, 'utf-8');
    return JSON.parse(raw);
  } catch (error) {
    // Missing file, read error, and invalid JSON all yield null
    return null;
  }
}
/**
 * Generate Chrome launch arguments for loading extensions.
 * Extensions without an unpacked_path are skipped; ids fall back from
 * webstore_id to the computed dynamic id.
 *
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Array<string>} - Chrome CLI arguments for loading extensions
 */
function getExtensionLaunchArgs(extensions) {
  if (!extensions?.length) {
    return [];
  }
  // Only extensions that are actually unpacked on disk can be loaded
  const usable = extensions.filter((ext) => ext.unpacked_path);
  const unpackedPaths = [];
  const extensionIds = [];
  for (const ext of usable) {
    unpackedPaths.push(ext.unpacked_path);
    extensionIds.push(ext.webstore_id || ext.id);
  }
  return [
    `--load-extension=${unpackedPaths.join(',')}`,
    `--allowlisted-extension-id=${extensionIds.join(',')}`,
    '--allow-legacy-extension-manifests',
    '--disable-extensions-auto-update',
  ];
}
// Export all functions (public API of this module)
module.exports = {
  getExtensionId,
  loadExtensionManifest,
  installExtension,
  loadOrInstallExtension,
  isTargetExtension,
  loadExtensionFromTarget,
  installAllExtensions,
  loadAllExtensionsFromBrowser,
  getExtensionLaunchArgs,
};
// CLI usage: expose a subset of helpers for shelling out from other scripts
if (require.main === module) {
  const args = process.argv.slice(2);
  if (args.length === 0) {
    console.log('Usage: chrome_extension_utils.js <command> [args...]');
    console.log('');
    console.log('Commands:');
    console.log('  getExtensionId <path>');
    console.log('  loadExtensionManifest <path>');
    console.log('  getExtensionLaunchArgs <extensions_json>');
    console.log('  loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
    process.exit(1);
  }
  const [command, ...commandArgs] = args;
  (async () => {
    try {
      switch (command) {
        case 'getExtensionId': {
          const [unpacked_path] = commandArgs;
          const id = getExtensionId(unpacked_path);
          console.log(id);
          break;
        }
        case 'loadExtensionManifest': {
          const [unpacked_path] = commandArgs;
          const manifest = loadExtensionManifest(unpacked_path);
          console.log(JSON.stringify(manifest));
          break;
        }
        case 'getExtensionLaunchArgs': {
          const [extensions_json] = commandArgs;
          const extensions = JSON.parse(extensions_json);
          // shadows the outer `args` intentionally (block-scoped)
          const args = getExtensionLaunchArgs(extensions);
          console.log(JSON.stringify(args));
          break;
        }
        case 'loadOrInstallExtension': {
          const [webstore_id, name, extensions_dir] = commandArgs;
          const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir);
          console.log(JSON.stringify(ext, null, 2));
          break;
        }
        default:
          console.error(`Unknown command: ${command}`);
          process.exit(1);
      }
    } catch (error) {
      console.error(`Error: ${error.message}`);
      process.exit(1);
    }
  })();
}

View File

@@ -0,0 +1,329 @@
/**
* Unit tests for chrome_extension_utils.js
*
* Run with: npm test
* Or: node --test tests/test_chrome_extension_utils.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Import module under test
const extensionUtils = require('../chrome_extension_utils.js');
// Test fixtures: scratch directories created in before() and removed in after()
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
describe('chrome_extension_utils', () => {
before(() => {
  // Create the shared fixtures directory once for the whole suite
  if (!fs.existsSync(TEST_DIR)) {
    fs.mkdirSync(TEST_DIR, { recursive: true });
  }
});
after(() => {
  // Remove all fixtures after the suite finishes
  if (fs.existsSync(TEST_DIR)) {
    fs.rmSync(TEST_DIR, { recursive: true, force: true });
  }
});
describe('getExtensionId', () => {
  it('should compute extension ID from path', () => {
    // 32 chars drawn from 'a'..'p' (hex digits remapped)
    const id = extensionUtils.getExtensionId('/path/to/extension');
    assert.strictEqual(typeof id, 'string');
    assert.strictEqual(id.length, 32);
    assert.match(id, /^[a-p]+$/);
  });
  it('should compute ID even for non-existent paths', () => {
    // The ID is derived purely from the path string, not the filesystem
    const id = extensionUtils.getExtensionId('/nonexistent/path');
    assert.strictEqual(typeof id, 'string');
    assert.strictEqual(id.length, 32);
    assert.match(id, /^[a-p]+$/);
  });
  it('should return consistent ID for same path', () => {
    const samePath = '/path/to/extension';
    assert.strictEqual(
      extensionUtils.getExtensionId(samePath),
      extensionUtils.getExtensionId(samePath)
    );
  });
  it('should return different IDs for different paths', () => {
    assert.notStrictEqual(
      extensionUtils.getExtensionId('/path/to/extension1'),
      extensionUtils.getExtensionId('/path/to/extension2')
    );
  });
});
describe('loadExtensionManifest', () => {
  const testExtDir = path.join(TEST_DIR, 'test_extension');
  beforeEach(() => {
    // Fixture: an unpacked extension directory with a valid manifest.json
    fs.mkdirSync(testExtDir, { recursive: true });
    fs.writeFileSync(
      path.join(testExtDir, 'manifest.json'),
      JSON.stringify({ manifest_version: 3, name: "Test Extension", version: "1.0.0" })
    );
  });
  afterEach(() => {
    if (fs.existsSync(testExtDir)) {
      fs.rmSync(testExtDir, { recursive: true });
    }
  });
  it('should load valid manifest.json', () => {
    const manifest = extensionUtils.loadExtensionManifest(testExtDir);
    assert.notStrictEqual(manifest, null);
    assert.strictEqual(manifest.manifest_version, 3);
    assert.strictEqual(manifest.name, "Test Extension");
    assert.strictEqual(manifest.version, "1.0.0");
  });
  it('should return null for missing manifest', () => {
    assert.strictEqual(
      extensionUtils.loadExtensionManifest(path.join(TEST_DIR, 'nonexistent')),
      null
    );
  });
  it('should handle invalid JSON gracefully', () => {
    const brokenDir = path.join(TEST_DIR, 'invalid_extension');
    fs.mkdirSync(brokenDir, { recursive: true });
    fs.writeFileSync(path.join(brokenDir, 'manifest.json'), 'invalid json content');
    assert.strictEqual(extensionUtils.loadExtensionManifest(brokenDir), null);
    fs.rmSync(brokenDir, { recursive: true });
  });
});
describe('getExtensionLaunchArgs', () => {
  it('should return empty array for no extensions', () => {
    assert.deepStrictEqual(extensionUtils.getExtensionLaunchArgs([]), []);
  });
  it('should generate correct launch args for single extension', () => {
    const launchArgs = extensionUtils.getExtensionLaunchArgs([
      { webstore_id: 'abcd1234', unpacked_path: '/path/to/extension' },
    ]);
    assert.deepStrictEqual(launchArgs, [
      '--load-extension=/path/to/extension',
      '--allowlisted-extension-id=abcd1234',
      '--allow-legacy-extension-manifests',
      '--disable-extensions-auto-update',
    ]);
  });
  it('should generate correct launch args for multiple extensions', () => {
    const launchArgs = extensionUtils.getExtensionLaunchArgs([
      { webstore_id: 'ext1', unpacked_path: '/path/ext1' },
      { webstore_id: 'ext2', unpacked_path: '/path/ext2' },
      { webstore_id: 'ext3', unpacked_path: '/path/ext3' },
    ]);
    assert.strictEqual(launchArgs.length, 4);
    assert.strictEqual(launchArgs[0], '--load-extension=/path/ext1,/path/ext2,/path/ext3');
    assert.strictEqual(launchArgs[1], '--allowlisted-extension-id=ext1,ext2,ext3');
  });
  it('should handle extensions with id instead of webstore_id', () => {
    // Falls back to the computed dynamic `id` when webstore_id is absent
    const launchArgs = extensionUtils.getExtensionLaunchArgs([
      { id: 'computed_id', unpacked_path: '/path/to/extension' },
    ]);
    assert.strictEqual(launchArgs[1], '--allowlisted-extension-id=computed_id');
  });
  it('should filter out extensions without paths', () => {
    const launchArgs = extensionUtils.getExtensionLaunchArgs([
      { webstore_id: 'ext1', unpacked_path: '/path/ext1' },
      { webstore_id: 'ext2', unpacked_path: null },
      { webstore_id: 'ext3', unpacked_path: '/path/ext3' },
    ]);
    assert.strictEqual(launchArgs[0], '--load-extension=/path/ext1,/path/ext3');
    assert.strictEqual(launchArgs[1], '--allowlisted-extension-id=ext1,ext3');
  });
});
describe('loadOrInstallExtension', () => {
  beforeEach(() => {
    // Create test extensions directory
    if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
  });
  afterEach(() => {
    // Cleanup test extensions directory
    if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
  });
  it('should throw error if neither webstore_id nor unpacked_path provided', async () => {
    await assert.rejects(
      async () => {
        await extensionUtils.loadOrInstallExtension({}, TEST_EXTENSIONS_DIR);
      },
      /Extension must have either/
    );
  });
  it('should set correct default values for extension metadata', async () => {
    const input = {
      webstore_id: 'test123',
      name: 'test_extension'
    };
    // FIX: previously this test reassigned extensionUtils.installExtension as
    // a "mock", but loadOrInstallExtension calls the module-internal binding,
    // not the exported property — so the mock never ran and the test kicked
    // off a real network download. Pre-creating the unpacked manifest makes
    // read_version() succeed and skips installation entirely.
    const extDir = path.join(TEST_EXTENSIONS_DIR, 'test123__test_extension');
    fs.mkdirSync(extDir, { recursive: true });
    fs.writeFileSync(
      path.join(extDir, 'manifest.json'),
      JSON.stringify({ version: '1.0.0' })
    );
    const ext = await extensionUtils.loadOrInstallExtension(input, TEST_EXTENSIONS_DIR);
    assert.strictEqual(ext.webstore_id, 'test123');
    assert.strictEqual(ext.name, 'test_extension');
    assert.ok(ext.webstore_url.includes(ext.webstore_id));
    assert.ok(ext.crx_url.includes(ext.webstore_id));
    assert.ok(ext.crx_path.includes('test123__test_extension.crx'));
    assert.ok(ext.unpacked_path.includes('test123__test_extension'));
  });
  it('should detect version from manifest after installation', async () => {
    const input = {
      webstore_id: 'test456',
      name: 'versioned_extension'
    };
    // Create pre-installed extension so no download is attempted
    const extDir = path.join(TEST_EXTENSIONS_DIR, 'test456__versioned_extension');
    fs.mkdirSync(extDir, { recursive: true });
    fs.writeFileSync(
      path.join(extDir, 'manifest.json'),
      JSON.stringify({
        manifest_version: 3,
        name: "Versioned Extension",
        version: "2.5.1"
      })
    );
    const ext = await extensionUtils.loadOrInstallExtension(input, TEST_EXTENSIONS_DIR);
    assert.strictEqual(ext.version, '2.5.1');
  });
});
describe('isTargetExtension', () => {
  // Build a minimal stand-in for a Puppeteer Target object.
  const makeTarget = ({ type, url }) => ({
    type: () => type,
    url: () => url,
    worker: async () => null,
    page: async () => null
  });
  it('should identify extension targets by URL', async () => {
    const target = makeTarget({
      type: 'service_worker',
      url: 'chrome-extension://abcdefgh/background.js'
    });
    const result = await extensionUtils.isTargetExtension(target);
    assert.strictEqual(result.target_is_extension, true);
    assert.strictEqual(result.target_is_bg, true);
    assert.strictEqual(result.extension_id, 'abcdefgh');
  });
  it('should not identify non-extension targets', async () => {
    const target = makeTarget({ type: 'page', url: 'https://example.com' });
    const result = await extensionUtils.isTargetExtension(target);
    assert.strictEqual(result.target_is_extension, false);
    assert.strictEqual(result.target_is_bg, false);
    assert.strictEqual(result.extension_id, null);
  });
  it('should handle closed targets gracefully', async () => {
    // Every accessor throws the way a detached CDP target does.
    const boom = () => { throw new Error('No target with given id found'); };
    const target = {
      type: boom,
      url: boom,
      worker: async () => boom(),
      page: async () => boom()
    };
    const result = await extensionUtils.isTargetExtension(target);
    assert.strictEqual(result.target_type, 'closed');
    assert.strictEqual(result.target_url, 'about:closed');
  });
});
});
// Run tests if executed directly
if (require.main === module) {
  [
    'Run tests with: npm test',
    'Or: node --test tests/test_chrome_extension_utils.js',
  ].forEach((line) => console.log(line));
}

View File

@@ -0,0 +1,224 @@
"""
Unit tests for chrome_extension_utils.js
Tests invoke the script as an external process and verify outputs/side effects.
"""
import json
import subprocess
import tempfile
from pathlib import Path
import pytest
# Path to the Node.js script under test (a sibling of the tests/ directory).
SCRIPT_PATH = Path(__file__).parent.parent / "chrome_extension_utils.js"
def test_script_exists():
    """Verify the script file exists and is executable via node"""
    script = SCRIPT_PATH
    assert script.exists(), f"Script not found: {script}"
def test_get_extension_id():
    """Test extension ID computation from path"""
    # The ID is derived purely from the path string, so no temporary
    # directory is needed (the previous one was created but never used).
    test_path = "/path/to/extension"
    # Run script with test path
    result = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionId", test_path],
        capture_output=True,
        text=True
    )
    assert result.returncode == 0, f"Script failed: {result.stderr}"
    extension_id = result.stdout.strip()
    # Chrome extension IDs are 32 chars drawn only from the a-p alphabet.
    assert len(extension_id) == 32
    assert all(c in 'abcdefghijklmnop' for c in extension_id)
def test_get_extension_id_consistency():
    """Test that same path produces same ID"""
    cmd = ["node", str(SCRIPT_PATH), "getExtensionId", "/path/to/extension"]
    first = subprocess.run(cmd, capture_output=True, text=True)
    second = subprocess.run(cmd, capture_output=True, text=True)
    assert first.returncode == 0
    assert second.returncode == 0
    # Derivation must be deterministic across invocations.
    assert first.stdout.strip() == second.stdout.strip()
def test_get_extension_id_different_paths():
    """Test that different paths produce different IDs"""
    results = [
        subprocess.run(
            ["node", str(SCRIPT_PATH), "getExtensionId", p],
            capture_output=True,
            text=True,
        )
        for p in ("/path1", "/path2")
    ]
    assert all(r.returncode == 0 for r in results)
    assert results[0].stdout.strip() != results[1].stdout.strip()
def test_load_extension_manifest():
    """Test loading extension manifest.json"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "test_extension"
        ext_dir.mkdir()
        # Write a minimal MV3 manifest for the script to read back.
        (ext_dir / "manifest.json").write_text(json.dumps({
            "manifest_version": 3,
            "name": "Test Extension",
            "version": "1.0.0",
        }))
        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "loadExtensionManifest", str(ext_dir)],
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        loaded = json.loads(proc.stdout)
        assert loaded["manifest_version"] == 3
        assert loaded["name"] == "Test Extension"
        assert loaded["version"] == "1.0.0"
def test_load_extension_manifest_missing():
    """Test loading manifest from non-existent directory"""
    with tempfile.TemporaryDirectory() as tmpdir:
        missing_dir = Path(tmpdir) / "nonexistent"
        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "loadExtensionManifest", str(missing_dir)],
            capture_output=True,
            text=True,
        )
        # A missing manifest is not an error: the script reports null/empty.
        assert proc.returncode == 0
        assert proc.stdout.strip() in ("null", "")
def test_load_extension_manifest_invalid_json():
    """Test handling of invalid JSON in manifest"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "test_extension"
        ext_dir.mkdir()
        # Write invalid JSON
        (ext_dir / "manifest.json").write_text("invalid json content")
        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "loadExtensionManifest", str(ext_dir)],
            capture_output=True,
            text=True,
        )
        # Unparseable manifests must be handled gracefully, not crash.
        assert proc.returncode == 0
        assert proc.stdout.strip() in ("null", "")
def test_get_extension_launch_args_empty():
    """Test launch args with no extensions"""
    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", "[]"],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    assert json.loads(proc.stdout) == []
def test_get_extension_launch_args_single():
    """Test launch args with single extension"""
    payload = json.dumps([{
        "webstore_id": "abcd1234",
        "unpacked_path": "/path/to/extension",
    }])
    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", payload],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    args = json.loads(proc.stdout)
    assert args == [
        "--load-extension=/path/to/extension",
        "--allowlisted-extension-id=abcd1234",
        "--allow-legacy-extension-manifests",
        "--disable-extensions-auto-update",
    ]
def test_get_extension_launch_args_multiple():
    """Test launch args with multiple extensions"""
    payload = json.dumps([
        {"webstore_id": "ext1", "unpacked_path": "/path/ext1"},
        {"webstore_id": "ext2", "unpacked_path": "/path/ext2"},
        {"webstore_id": "ext3", "unpacked_path": "/path/ext3"},
    ])
    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", payload],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    args = json.loads(proc.stdout)
    assert args[0] == "--load-extension=/path/ext1,/path/ext2,/path/ext3"
    assert args[1] == "--allowlisted-extension-id=ext1,ext2,ext3"
def test_get_extension_launch_args_filter_null_paths():
    """Test that extensions without paths are filtered out"""
    payload = json.dumps([
        {"webstore_id": "ext1", "unpacked_path": "/path/ext1"},
        {"webstore_id": "ext2", "unpacked_path": None},
        {"webstore_id": "ext3", "unpacked_path": "/path/ext3"},
    ])
    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", payload],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    args = json.loads(proc.stdout)
    assert args[0] == "--load-extension=/path/ext1,/path/ext3"
    assert args[1] == "--allowlisted-extension-id=ext1,ext3"

View File

@@ -0,0 +1,309 @@
#!/usr/bin/env node
/**
* Navigate the Chrome browser to the target URL.
*
* This extractor runs AFTER pre-load extractors (21-29) have registered their
* CDP listeners. It connects to the existing Chrome session, navigates to the URL,
* waits for page load, and captures response headers.
*
* Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>
* Output: Writes to chrome_session/:
* - response_headers.json: HTTP response headers from main document
* - final_url.txt: Final URL after any redirects
* - page_loaded.txt: Marker file indicating navigation is complete
*
* Environment variables:
* CHROME_PAGELOAD_TIMEOUT: Timeout for page load in seconds (default: 60)
* CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0)
* CHROME_WAIT_FOR: Wait condition (default: networkidle2)
* - domcontentloaded: DOM is ready, resources may still load
* - load: Page fully loaded including resources
* - networkidle0: No network activity for 500ms (strictest)
* - networkidle2: At most 2 network connections for 500ms
*
* # Fallbacks
* TIMEOUT: Fallback timeout
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_navigate'; // name reported in the RESULT_JSON line
const CHROME_SESSION_DIR = 'chrome_session'; // shared dir created by the chrome_session extractor
/**
 * Parse `--key=value` CLI flags into an object.
 * Dashes in key names become underscores; a bare `--flag` maps to `true`.
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    // Re-join so values containing '=' survive intact.
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Get environment variable with default
// (empty-string env values fall back to the default, matching `||` semantics).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Parse a boolean env var; unrecognized values yield the default.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Parse an integer env var; non-numeric values yield the default.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Parse a float env var; non-numeric values yield the default.
function getEnvFloat(name, defaultValue = 0) {
  const parsed = Number.parseFloat(getEnv(name, String(defaultValue)));
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Read CDP URL from chrome_session; null when the session was never started.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(cdpFile) ? fs.readFileSync(cdpFile, 'utf8').trim() : null;
}
// Read URL from chrome_session (set by chrome_session extractor); null if absent.
function getTargetUrl() {
  const urlFile = path.join(CHROME_SESSION_DIR, 'url.txt');
  return fs.existsSync(urlFile) ? fs.readFileSync(urlFile, 'utf8').trim() : null;
}
// Validate CHROME_WAIT_FOR against Puppeteer's accepted waitUntil values,
// warning and falling back to networkidle2 on anything else.
function getWaitCondition() {
  const requested = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase();
  const validConditions = ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'];
  if (!validConditions.includes(requested)) {
    console.error(`Warning: Invalid CHROME_WAIT_FOR="${requested}", using networkidle2`);
    return 'networkidle2';
  }
  return requested;
}
// Sleep helper: resolve after `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Navigate the shared Chrome session's page to `url` and record the results.
 *
 * Connects to the already-running browser over CDP, navigates the most
 * recently created page, captures the main document's response headers and
 * redirect chain, writes response_headers.json / final_url.txt /
 * page_loaded.txt into chrome_session/, then disconnects (Chrome keeps
 * running for post-load extractors).
 *
 * @param {string} url - Target URL to load.
 * @param {string} cdpUrl - WebSocket endpoint of the running Chrome instance.
 * @returns {Promise<object>} `{success, output, finalUrl, status, redirectCount}`
 *   on success, `{success: false, error}` on failure.
 */
async function navigateToUrl(url, cdpUrl) {
  // Timeout precedence: CHROME_PAGELOAD_TIMEOUT > CHROME_TIMEOUT > TIMEOUT (default 60s).
  const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000;
  const waitUntil = getWaitCondition();
  let browser = null;
  let responseHeaders = {};
  let redirectChain = [];
  let finalUrl = url;
  try {
    // Connect to existing browser
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get all pages and find our target page
    const pages = await browser.pages();
    if (pages.length === 0) {
      return { success: false, error: 'No pages found in browser' };
    }
    // Use the last created page (most likely the one chrome_session created)
    const page = pages[pages.length - 1];
    // Set up response interception to capture headers and redirects.
    // Must be registered BEFORE page.goto() so the main-document response is seen.
    page.on('response', async (response) => {
      const request = response.request();
      // Track redirects
      if (response.status() >= 300 && response.status() < 400) {
        redirectChain.push({
          url: response.url(),
          status: response.status(),
          location: response.headers()['location'] || null,
        });
      }
      // Capture headers from the main document request
      if (request.isNavigationRequest() && request.frame() === page.mainFrame()) {
        try {
          responseHeaders = {
            url: response.url(),
            status: response.status(),
            statusText: response.statusText(),
            headers: response.headers(),
          };
          finalUrl = response.url();
        } catch (e) {
          // Ignore errors capturing headers
        }
      }
    });
    // Navigate to URL and wait for load
    console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);
    const response = await page.goto(url, {
      waitUntil,
      timeout,
    });
    // Capture final response if not already captured by the listener above
    if (response && Object.keys(responseHeaders).length === 0) {
      responseHeaders = {
        url: response.url(),
        status: response.status(),
        statusText: response.statusText(),
        headers: response.headers(),
      };
      finalUrl = response.url();
    }
    // Apply optional delay after load
    if (delayAfterLoad > 0) {
      console.log(`Waiting ${delayAfterLoad}ms after load...`);
      await sleep(delayAfterLoad);
    }
    // Write response headers
    if (Object.keys(responseHeaders).length > 0) {
      // Add redirect chain to headers
      responseHeaders.redirect_chain = redirectChain;
      fs.writeFileSync(
        path.join(CHROME_SESSION_DIR, 'response_headers.json'),
        JSON.stringify(responseHeaders, null, 2)
      );
    }
    // Write final URL (after redirects)
    fs.writeFileSync(path.join(CHROME_SESSION_DIR, 'final_url.txt'), finalUrl);
    // Write marker file indicating page is loaded
    fs.writeFileSync(
      path.join(CHROME_SESSION_DIR, 'page_loaded.txt'),
      new Date().toISOString()
    );
    // Disconnect but leave browser running for post-load extractors
    browser.disconnect();
    return {
      success: true,
      output: CHROME_SESSION_DIR,
      finalUrl,
      status: responseHeaders.status,
      redirectCount: redirectChain.length,
    };
  } catch (e) {
    // Don't close browser on error - let cleanup handle it
    if (browser) {
      try {
        browser.disconnect();
      } catch (disconnectErr) {
        // Ignore
      }
    }
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
/**
 * Entry point: parse CLI args, require an existing chrome_session, navigate,
 * then emit the extractor result protocol (START_TS/END_TS/DURATION/OUTPUT/
 * STATUS/ERROR lines plus a final RESULT_JSON line) and exit 0 on success,
 * 1 on failure. The output lines are machine-parsed — do not reorder them.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check for chrome_session (this extractor cannot run standalone)
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      console.error('ERROR: chrome_session not found (cdp_url.txt missing)');
      console.error('chrome_navigate requires chrome_session to run first');
      process.exit(1);
    }
    // Get URL from chrome_session or use provided URL
    const targetUrl = getTargetUrl() || url;
    const result = await navigateToUrl(targetUrl, cdpUrl);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Page loaded: ${result.finalUrl}`);
      console.log(`HTTP status: ${result.status}`);
      if (result.redirectCount > 0) {
        console.log(`Redirects: ${result.redirectCount}`);
      }
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (line-oriented protocol consumed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level error barrier: any unhandled rejection exits non-zero.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,80 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"CHROME_BINARY": {
"type": "string",
"default": "chromium",
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
"description": "Path to Chrome/Chromium binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"x-aliases": ["NODEJS_BINARY"],
"description": "Path to Node.js binary (for Puppeteer)"
},
"CHROME_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Chrome operations in seconds"
},
"CHROME_HEADLESS": {
"type": "boolean",
"default": true,
"description": "Run Chrome in headless mode"
},
"CHROME_SANDBOX": {
"type": "boolean",
"default": true,
"description": "Enable Chrome sandbox (disable in Docker with --no-sandbox)"
},
"CHROME_RESOLUTION": {
"type": "string",
"default": "1440,2000",
"pattern": "^\\d+,\\d+$",
"x-fallback": "RESOLUTION",
"description": "Browser viewport resolution (width,height)"
},
"CHROME_USER_DATA_DIR": {
"type": "string",
"default": "",
"description": "Path to Chrome user data directory for persistent sessions"
},
"CHROME_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string for Chrome"
},
"CHROME_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra command-line arguments for Chrome (space-separated)"
},
"CHROME_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"SAVE_SCREENSHOT": {
"type": "boolean",
"default": true,
"description": "Enable screenshot capture"
},
"SAVE_PDF": {
"type": "boolean",
"default": true,
"description": "Enable PDF generation"
},
"SAVE_DOM": {
"type": "boolean",
"default": true,
"description": "Enable DOM capture"
}
}
}

View File

@@ -0,0 +1,150 @@
#!/usr/bin/env python3
"""
Validation hook for Chrome/Chromium binary.
Runs at crawl start to verify Chrome is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
# Common Chrome/Chromium binary names and paths
# Names tried via PATH lookup, in preference order.
CHROME_NAMES = [
    'chromium',
    'chromium-browser',
    'google-chrome',
    'google-chrome-stable',
    'chrome',
]
# Absolute install locations checked when PATH lookup fails
# (macOS app bundles, Linux distro packages, snap).
CHROME_PATHS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    '/snap/bin/chromium',
    '/opt/google/chrome/chrome',
]
def get_binary_version(abspath: str) -> str | None:
"""Get version string from Chrome binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout:
# Chrome version string: "Google Chrome 120.0.6099.109" or "Chromium 120.0.6099.109"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
# Find version number (looks like 120.0.6099.109)
for part in parts:
if '.' in part and part[0].isdigit():
return part
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_chrome() -> dict | None:
    """Find Chrome/Chromium binary.

    Search order:
      1. CHROME_BINARY env var (if it points at an existing file)
      2. PATH lookup of each name in CHROME_NAMES
      3. Well-known absolute locations in CHROME_PATHS

    Returns:
        dict with name/abspath/version/sha256/binprovider keys, or None
        when no binary is found.
    """
    def _describe(abspath: str) -> dict:
        # All three discovery methods report the same record shape,
        # so build it in one place instead of duplicating the dict.
        return {
            'name': 'chrome',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }

    # Check env var first
    env_path = os.environ.get('CHROME_BINARY', '')
    if env_path and Path(env_path).is_file():
        return _describe(env_path)
    # Try shutil.which for various names
    for name in CHROME_NAMES:
        abspath = shutil.which(name)
        if abspath:
            return _describe(abspath)
    # Check common paths
    for path in CHROME_PATHS:
        if Path(path).is_file():
            return _describe(path)
    return None
def main():
    """Report the discovered Chrome binary as JSONL, or request installation.

    Emits InstalledBinary + Machine config-update records when found
    (exit 0); otherwise emits a Dependency record and exits 1.
    """
    result = find_chrome()
    if result and result.get('abspath'):
        # Record the discovered binary itself.
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Persist the resolved path into the machine config.
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/CHROME_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/CHROME_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Ask the orchestrator to install Chrome via a known provider.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        # NOTE: was f-string with no placeholders; plain literal is correct.
        print("Chrome/Chromium binary not found", file=sys.stderr)
        sys.exit(1)
# Run as a standalone validation hook.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
Validate and compute derived Chrome config values.
This hook runs early in the Crawl lifecycle to:
1. Auto-detect Chrome binary location
2. Compute sandbox settings based on Docker detection
3. Validate binary availability and version
4. Set computed env vars for subsequent hooks
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- InstalledBinary JSONL records to stdout when binaries are found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
# Chrome binary search order
# Candidate names resolved through the env PATH provider, in order.
CHROME_BINARY_NAMES = [
    'chromium',
    'chromium-browser',
    'google-chrome',
    'google-chrome-stable',
    'chrome',
]
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripped, falling back to ``default``."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized or missing values yield ``default``."""
    value = get_env(name, '').lower()
    if value in ('true', '1', 'yes', 'on'):
        return True
    if value in ('false', '0', 'no', 'off'):
        return False
    return default
def detect_docker() -> bool:
    """Detect if running inside Docker container.

    Checks the /.dockerenv marker, the IN_DOCKER env override, and the
    podman-style /run/.containerenv marker.
    """
    if os.path.exists('/.dockerenv'):
        return True
    if os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes'):
        return True
    return os.path.exists('/run/.containerenv')
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
    """Find Chrome binary using abx-pkg, checking configured path first.

    Tries ``configured`` (when non-empty) followed by CHROME_BINARY_NAMES;
    returns the first Binary that resolves to a real path, else None.
    """
    candidates = ([configured] if configured else []) + CHROME_BINARY_NAMES
    for name in candidates:
        try:
            binary = Binary(name=name, binproviders=[provider]).load()
        except Exception:
            continue
        if binary.abspath:
            return binary
    return None
def output_installed_binary(binary: Binary, name: str):
    """Emit an InstalledBinary JSONL record for ``binary`` on stdout."""
    machine_id = os.environ.get('MACHINE_ID', '')
    print(json.dumps({
        'type': 'InstalledBinary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': machine_id,
    }))
def main():
    """Validate Chrome config and emit derived values.

    Prints COMPUTED:KEY=VALUE lines (parsed by hooks.py into the env),
    InstalledBinary JSONL records for found binaries, WARNING:/ERROR: lines
    on stderr, and exits 1 when any error was recorded.
    """
    warnings = []
    errors = []
    computed = {}
    # Get config values
    chrome_binary = get_env('CHROME_BINARY', 'chromium')
    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
    save_screenshot = get_env_bool('SAVE_SCREENSHOT', True)
    save_pdf = get_env_bool('SAVE_PDF', True)
    save_dom = get_env_bool('SAVE_DOM', True)
    # Compute USE_CHROME (derived from SAVE_* flags)
    use_chrome = save_screenshot or save_pdf or save_dom
    computed['USE_CHROME'] = str(use_chrome).lower()
    # Detect Docker and adjust sandbox
    in_docker = detect_docker()
    computed['IN_DOCKER'] = str(in_docker).lower()
    if in_docker and chrome_sandbox:
        warnings.append(
            "Running in Docker with CHROME_SANDBOX=true. "
            "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
        )
        # Auto-disable sandbox in Docker unless explicitly set
        # (empty get_env means the True above came from the default,
        # not from the user's environment).
        if not get_env('CHROME_SANDBOX'):
            computed['CHROME_SANDBOX'] = 'false'
    # Find Chrome binary using abx-pkg
    provider = EnvProvider()
    if use_chrome:
        chrome = find_chrome_binary(chrome_binary, provider)
        if not chrome or not chrome.abspath:
            errors.append(
                f"Chrome binary not found (tried: {chrome_binary}). "
                "Install Chrome/Chromium or set CHROME_BINARY path."
            )
            computed['CHROME_BINARY'] = ''
        else:
            computed['CHROME_BINARY'] = str(chrome.abspath)
            computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'
            # Output InstalledBinary JSONL record for Chrome
            output_installed_binary(chrome, name='chrome')
    # Check Node.js for Puppeteer
    node_binary_name = get_env('NODE_BINARY', 'node')
    try:
        node = Binary(name=node_binary_name, binproviders=[provider]).load()
        node_path = str(node.abspath) if node.abspath else ''
    except Exception:
        node = None
        node_path = ''
    if use_chrome and not node_path:
        errors.append(
            f"Node.js not found (tried: {node_binary_name}). "
            "Install Node.js or set NODE_BINARY path for Puppeteer."
        )
    else:
        computed['NODE_BINARY'] = node_path
        if node and node.abspath:
            # Output InstalledBinary JSONL record for Node
            output_installed_binary(node, name='node')
    # Output computed values
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")
    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)
    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)
    sys.exit(1 if errors else 0)
# Run as a standalone validation hook.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,350 @@
#!/usr/bin/env node
/**
* Start a Chrome browser session for use by other extractors.
*
* This extractor ONLY launches Chrome and creates a blank page - it does NOT navigate.
* Pre-load extractors (21-29) can connect via CDP to register listeners before navigation.
* The chrome_navigate extractor (30) performs the actual page load.
*
* Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>
* Output: Creates chrome_session/ with:
* - cdp_url.txt: WebSocket URL for CDP connection
* - pid.txt: Chrome process ID (for cleanup)
* - page_id.txt: Target ID of the page for other extractors to use
* - url.txt: The URL to be navigated to (for chrome_navigate)
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_session';
const OUTPUT_DIR = 'chrome_session';
// Get extensions directory from environment or use default
// (<DATA_DIR>/personas/<ACTIVE_PERSONA>/chrome_extensions)
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
  path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Parse `--key=value` CLI flags into an object.
 * Dashes in key names become underscores; a bare `--flag` maps to `true`.
 */
function parseArgs() {
  return process.argv.slice(2).reduce((acc, token) => {
    if (token.startsWith('--')) {
      const [rawKey, ...rest] = token.slice(2).split('=');
      // Re-join so values containing '=' survive intact.
      acc[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
    }
    return acc;
  }, {});
}
// Get environment variable with default
// (empty-string env values fall back to the default, matching `||` semantics).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Parse a boolean env var; unrecognized values yield the default.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Parse an integer env var; non-numeric values yield the default.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Find Chrome binary: CHROME_BINARY env var first, then well-known install
// locations. Returns an absolute path, or null when nothing exists.
function findChrome() {
  const configured = getEnv('CHROME_BINARY');
  if (configured && fs.existsSync(configured)) {
    return configured;
  }
  const candidates = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];
  return candidates.find((candidate) => fs.existsSync(candidate)) || null;
}
// Parse a "width,height" resolution string; malformed or missing components
// fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',');
  const width = parseInt(parts[0].trim(), 10) || 1440;
  const height = parseInt((parts[1] ?? '').trim(), 10) || 2000;
  return { width, height };
}
// Load installed extensions from cache files.
// Scans EXTENSIONS_DIR for *.extension.json records written by extension
// plugins, keeping only entries whose unpacked manifest actually exists.
function loadInstalledExtensions() {
  if (!fs.existsSync(EXTENSIONS_DIR)) {
    return [];
  }
  const loaded = [];
  for (const file of fs.readdirSync(EXTENSIONS_DIR)) {
    if (!file.endsWith('.extension.json')) continue;
    try {
      const raw = fs.readFileSync(path.join(EXTENSIONS_DIR, file), 'utf-8');
      const extension = JSON.parse(raw);
      // Verify extension is actually installed
      const manifestPath = path.join(extension.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        loaded.push(extension);
        console.log(`[+] Loaded extension: ${extension.name} (${extension.webstore_id})`);
      }
    } catch (e) {
      console.warn(`[⚠️] Failed to load extension from ${file}: ${e.message}`);
    }
  }
  return loaded;
}
/**
 * Launch a Chrome instance for this snapshot and leave it running.
 *
 * Creates a blank page but does NOT navigate to `url` — navigation happens
 * later in chrome_navigate, after pre-load extractors register CDP listeners.
 * Writes handshake files (cdp_url.txt, pid.txt, page_id.txt, url.txt, and
 * optionally extensions.json) into OUTPUT_DIR for subsequent extractors.
 *
 * @param {string} url - URL this snapshot will eventually visit (recorded to url.txt only).
 * @param {string} binary - Path to the Chrome/Chromium executable to launch.
 * @returns {Promise<{success: boolean, output?: string, cdpUrl?: string, targetId?: string, error?: string}>}
 */
async function startChromeSession(url, binary) {
  // CHROME_*-prefixed env vars win over the generic fallbacks.
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Load installed extensions
  const extensions = loadInstalledExtensions();
  const extensionArgs = extensionUtils.getExtensionLaunchArgs(extensions);
  if (extensions.length > 0) {
    console.log(`[*] Loading ${extensions.length} Chrome extensions...`);
  }
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  let browser = null;
  try {
    // Launch browser with Puppeteer
    browser = await puppeteer.launch({
      executablePath: binary,
      headless: headless ? 'new' : false,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-sync',
        '--no-first-run',
        '--no-default-browser-check',
        '--disable-default-apps',
        '--disable-infobars',
        '--disable-blink-features=AutomationControlled',
        '--disable-component-update',
        '--disable-domain-reliability',
        '--disable-breakpad',
        '--disable-background-networking',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-ipc-flooding-protection',
        '--password-store=basic',
        '--use-mock-keychain',
        '--font-render-hinting=none',
        '--force-color-profile=srgb',
        `--window-size=${width},${height}`,
        ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ...extensionArgs,
      ],
      defaultViewport: { width, height },
    });
    // Get the WebSocket endpoint URL
    const cdpUrl = browser.wsEndpoint();
    fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
    // Write PID for cleanup
    const browserProcess = browser.process();
    if (browserProcess) {
      fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(browserProcess.pid));
    }
    // Create a new page (but DON'T navigate yet)
    const page = await browser.newPage();
    // Set user agent if specified
    if (userAgent) {
      await page.setUserAgent(userAgent);
    }
    // Write the page target ID so other extractors can find this specific page
    // NOTE(review): target._targetId is a private puppeteer API — may break across versions; confirm against the pinned puppeteer release
    const target = page.target();
    const targetId = target._targetId;
    fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId);
    // Write the URL for chrome_navigate to use
    fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
    // Connect to loaded extensions at runtime (only if not already done)
    // extensions.json doubles as a "done" marker so re-runs skip this step.
    const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
    if (extensions.length > 0 && !fs.existsSync(extensionsFile)) {
      console.log('[*] Connecting to loaded extensions (first time setup)...');
      try {
        const loadedExtensions = await extensionUtils.loadAllExtensionsFromBrowser(browser, extensions);
        // Write loaded extensions metadata for other extractors to use
        fs.writeFileSync(extensionsFile, JSON.stringify(loadedExtensions, null, 2));
        console.log(`[+] Extensions loaded and available at ${extensionsFile}`);
        console.log(`[+] ${loadedExtensions.length} extensions ready for configuration by subsequent plugins`);
      } catch (e) {
        // Best-effort: extension wiring failure shouldn't kill the session.
        console.warn(`[⚠️] Failed to load extensions from browser: ${e.message}`);
      }
    } else if (extensions.length > 0) {
      console.log('[*] Extensions already loaded from previous snapshot');
    }
    // Don't close browser - leave it running for other extractors
    // Detach puppeteer from browser so it stays running
    browser.disconnect();
    return { success: true, output: OUTPUT_DIR, cdpUrl, targetId };
  } catch (e) {
    // Kill browser if startup failed
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        // Ignore
      }
    }
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
/**
 * CLI entry point for the chrome_session extractor.
 *
 * Parses --url/--snapshot-id, locates a Chrome binary, launches the session
 * via startChromeSession(), and prints KEY=VALUE lines plus a RESULT_JSON
 * record for the orchestrator to parse. Exits 0 on success, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let version = '';
  try {
    // chrome_session launches Chrome and creates a blank page
    // Pre-load extractors (21-29) register CDP listeners
    // chrome_navigate (30) performs actual navigation
    const binary = findChrome();
    if (!binary) {
      // Structured stderr lines tell the orchestrator how to install the dependency.
      console.error('ERROR: Chrome/Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chrome');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
      process.exit(1);
    }
    // Get Chrome version (best effort; empty string on failure)
    try {
      const { execSync } = require('child_process');
      version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
    } catch (e) {
      version = '';
    }
    const result = await startChromeSession(url, binary);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Chrome session started (no navigation yet): ${result.cdpUrl}`);
      console.log(`Page target ID: ${result.targetId}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (KEY=VALUE lines are parsed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (version) {
    console.log(`VERSION=${version}`);
  }
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    cmd_version: version,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Run the extractor; any unhandled rejection is fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,297 @@
#!/usr/bin/env node
/**
* Capture console output from a page.
*
* Captures all console messages during page load:
* - log, warn, error, info, debug
* - Includes stack traces for errors
* - Timestamps for each message
*
* Usage: on_Snapshot__14_consolelog.js --url=<url> --snapshot-id=<uuid>
* Output: Writes consolelog/console.jsonl (one message per line)
*
* Environment variables:
* SAVE_CONSOLELOG: Enable console log capture (default: true)
* CONSOLELOG_TIMEOUT: Capture duration in seconds (default: 5)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'consolelog';
const OUTPUT_DIR = 'consolelog';
const OUTPUT_FILE = 'console.jsonl';
const CHROME_SESSION_DIR = 'chrome_session';
// Parse `--key=value` CLI flags into a plain object; dashes in keys become
// underscores, and bare flags (no `=value`) map to `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable (trimmed); empty/unset values fall back to defaultValue.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Interpret an env var as a boolean; unrecognized values yield defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Interpret an env var as a base-10 integer; non-numeric values yield defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Read the CDP websocket URL written by the chrome_session extractor, or null if absent.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Best-effort conversion of Puppeteer JSHandle console args into JSON-safe
// values: jsonValue() first, then String(), then a placeholder.
async function serializeArgs(args) {
  const out = [];
  for (const handle of args) {
    try {
      out.push(await handle.jsonValue());
    } catch (jsonErr) {
      try {
        out.push(String(handle));
      } catch (strErr) {
        out.push('[Unserializable]');
      }
    }
  }
  return out;
}
// Capture console logs
/**
 * Attach console/error/request-failure listeners to the existing Chrome
 * session's page and record events for CONSOLELOG_TIMEOUT seconds.
 *
 * Each event is appended to consolelog/console.jsonl immediately (one JSON
 * object per line) so partial output survives a crash.
 *
 * @param {string} url - Snapshot URL (unused here; page comes from the shared session).
 * @returns {Promise<{success: boolean, output?: string, logCount?: number, logStats?: object, error?: string}>}
 */
async function captureConsoleLogs(url) {
  const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Clear existing file
  fs.writeFileSync(outputPath, '');
  let browser = null;
  const consoleLogs = [];
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page (prefer one already navigated to an http(s) URL)
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Listen for console messages (log/warn/error/info/debug)
    page.on('console', async (msg) => {
      try {
        const type = msg.type();
        const text = msg.text();
        const location = msg.location();
        const args = await serializeArgs(msg.args());
        const logEntry = {
          timestamp: new Date().toISOString(),
          type,
          text,
          args,
          location: {
            url: location.url || '',
            lineNumber: location.lineNumber,
            columnNumber: location.columnNumber,
          },
        };
        // Write immediately to file
        fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
        consoleLogs.push(logEntry);
      } catch (e) {
        // Error processing console message, skip it
        console.error(`Error processing console message: ${e.message}`);
      }
    });
    // Listen for page errors (uncaught exceptions in the page)
    page.on('pageerror', (error) => {
      try {
        const logEntry = {
          timestamp: new Date().toISOString(),
          type: 'error',
          text: error.message,
          stack: error.stack || '',
          location: {},
        };
        fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
        consoleLogs.push(logEntry);
      } catch (e) {
        console.error(`Error processing page error: ${e.message}`);
      }
    });
    // Listen for request failures (network-level errors)
    page.on('requestfailed', (request) => {
      try {
        const failure = request.failure();
        const logEntry = {
          timestamp: new Date().toISOString(),
          type: 'request_failed',
          text: `Request failed: ${request.url()}`,
          error: failure ? failure.errorText : 'Unknown error',
          url: request.url(),
          location: {},
        };
        fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
        consoleLogs.push(logEntry);
      } catch (e) {
        console.error(`Error processing request failure: ${e.message}`);
      }
    });
    // Wait to capture logs for the configured window
    await new Promise(resolve => setTimeout(resolve, captureTimeout));
    // Group logs by type for the summary message
    const logStats = consoleLogs.reduce((acc, log) => {
      acc[log.type] = (acc[log.type] || 0) + 1;
      return acc;
    }, {});
    return {
      success: true,
      output: outputPath,
      logCount: consoleLogs.length,
      logStats,
    };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Detach only — the shared browser session must stay alive for other extractors.
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entry point for the consolelog extractor.
 *
 * Validates --url/--snapshot-id, honors the SAVE_CONSOLELOG kill switch,
 * runs captureConsoleLogs(), and prints KEY=VALUE lines plus RESULT_JSON
 * for the orchestrator. Exits 0 on success/skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__14_consolelog.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let logCount = 0;
  try {
    // Check if enabled (feature flag — skipping is not a failure)
    if (!getEnvBool('SAVE_CONSOLELOG', true)) {
      console.log('Skipping console log (SAVE_CONSOLELOG=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await captureConsoleLogs(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      logCount = result.logCount || 0;
      // Human-readable per-type summary, e.g. "3 log, 1 error"
      const statsStr = Object.entries(result.logStats || {})
        .map(([type, count]) => `${count} ${type}`)
        .join(', ');
      console.log(`Captured ${logCount} console messages: ${statsStr}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (KEY=VALUE lines are parsed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    log_count: logCount,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Run the extractor; any unhandled rejection is fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Install a binary using a custom bash command.
This provider runs arbitrary shell commands to install binaries
that don't fit into standard package managers.
Usage: on_Dependency__install_using_custom_bash.py --dependency-id=<uuid> --bin-name=<name> --custom-cmd=<cmd>
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import subprocess
import sys
import rich_click as click
from abx_pkg import Binary, EnvProvider
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', required=True, help="Custom bash command to run")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str):
    """Install *bin_name* by running an arbitrary shell command.

    Runs --custom-cmd via the shell, then verifies the binary is reachable on
    PATH and prints an InstalledBinary JSONL record to stdout (all human
    logging goes to stderr). Exits 0 on success or allowed-provider skip,
    1 on any failure.
    """
    # Exit 0 (not an error) when the 'custom' provider isn't in the allowlist.
    if bin_providers != '*' and 'custom' not in bin_providers.split(','):
        click.echo(f"custom provider not allowed for {bin_name}", err=True)
        sys.exit(0)
    if not custom_cmd:
        click.echo("custom provider requires --custom-cmd", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via custom command: {custom_cmd}", err=True)
    try:
        # NOTE: custom_cmd is operator-supplied and intentionally run through the shell.
        result = subprocess.run(
            custom_cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout for custom installs
        )
        if result.returncode != 0:
            click.echo(f"Custom install failed: {result.stderr}", err=True)
            sys.exit(1)
    except subprocess.TimeoutExpired:
        click.echo("Custom install timed out", err=True)
        sys.exit(1)
    # Use abx-pkg to load the installed binary and get its info
    provider = EnvProvider()
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).load()
    except Exception as e:
        click.echo(f"{bin_name} not found after custom install: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after custom install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'custom',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)
    sys.exit(0)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,296 @@
#!/usr/bin/env node
/**
* Dump the DOM of a URL using Chrome/Puppeteer.
*
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>
* Output: Writes dom/output.html
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* SAVE_DOM: Enable DOM extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'dom';
const OUTPUT_DIR = 'dom';
const OUTPUT_FILE = 'output.html';
const CHROME_SESSION_DIR = 'chrome_session';
// Parse command line arguments of the form --key[=value] into an object.
// Keys have dashes converted to underscores; valueless flags become `true`.
function parseArgs() {
  const result = {};
  process.argv.slice(2).forEach((token) => {
    if (!token.startsWith('--')) return;
    const pieces = token.slice(2).split('=');
    const key = pieces.shift().replace(/-/g, '_');
    result[key] = pieces.join('=') || true;
  });
  return result;
}
// Fetch a trimmed environment variable; empty/unset values use defaultValue.
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an env var as a boolean flag; unrecognized text yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  const TRUTHY = ['true', '1', 'yes', 'on'];
  const FALSY = ['false', '0', 'no', 'off'];
  if (TRUTHY.includes(normalized)) return true;
  if (FALSY.includes(normalized)) return false;
  return defaultValue;
}
// Interpret an env var as a base-10 integer, falling back on NaN.
function getEnvInt(name, defaultValue = 0) {
  const n = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(n) ? defaultValue : n;
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
function hasStaticFileOutput() {
  if (!fs.existsSync(STATICFILE_DIR)) {
    return false;
  }
  return fs.readdirSync(STATICFILE_DIR).length > 0;
}
// Return the CDP websocket URL left behind by chrome_session, or null.
function getCdpUrl() {
  const marker = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(marker) ? fs.readFileSync(marker, 'utf8').trim() : null;
}
// Locate a Chrome/Chromium executable: $CHROME_BINARY override first,
// then a list of well-known absolute install paths. Returns null if none exist.
function findChrome() {
  const fromEnv = getEnv('CHROME_BINARY');
  if (fromEnv && fs.existsSync(fromEnv)) {
    return fromEnv;
  }
  const knownPaths = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];
  for (const candidate of knownPaths) {
    if (candidate.startsWith('/') && fs.existsSync(candidate)) {
      return candidate;
    }
  }
  return null;
}
// Parse a "width,height" string; missing/invalid parts fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',').map((part) => parseInt(part.trim(), 10));
  return { width: parts[0] || 1440, height: parts[1] || 2000 };
}
/**
 * Save the page's rendered DOM to dom/output.html.
 *
 * Prefers connecting to the shared chrome_session browser over CDP (in which
 * case the page is assumed to be navigated already); otherwise launches a
 * fresh Chrome, navigates to `url` itself, and closes it afterwards.
 *
 * @param {string} url - URL to archive (navigated only in the fresh-launch path).
 * @returns {Promise<{success: boolean, output?: string, error?: string}>}
 */
async function dumpDom(url) {
  // CHROME_*-prefixed env vars win over the generic fallbacks.
  const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  let page = null;
  let connectedToSession = false;
  try {
    // Try to connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (cdpUrl) {
      try {
        browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: { width, height },
        });
        connectedToSession = true;
        // Get existing pages or create new one
        const pages = await browser.pages();
        page = pages.find(p => p.url().startsWith('http')) || pages[0];
        if (!page) {
          page = await browser.newPage();
        }
        // Set viewport on the page
        await page.setViewport({ width, height });
      } catch (e) {
        // Connection failure is non-fatal: fall through to a fresh launch.
        console.error(`Failed to connect to CDP session: ${e.message}`);
        browser = null;
      }
    }
    // Fall back to launching new browser
    if (!browser) {
      const executablePath = findChrome();
      if (!executablePath) {
        return { success: false, error: 'Chrome binary not found' };
      }
      browser = await puppeteer.launch({
        executablePath,
        headless: headless ? 'new' : false,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          `--window-size=${width},${height}`,
          ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ],
        defaultViewport: { width, height },
      });
      page = await browser.newPage();
      // Navigate to URL (only if we launched fresh browser)
      if (userAgent) {
        await page.setUserAgent(userAgent);
      }
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout,
      });
    }
    // Get the full DOM content (serialized current document, not original response)
    const domContent = await page.content();
    // Sanity check: a real page should serialize to more than 100 bytes.
    if (domContent && domContent.length > 100) {
      fs.writeFileSync(outputPath, domContent, 'utf8');
      return { success: true, output: outputPath };
    } else {
      return { success: false, error: 'DOM content too short or empty' };
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Only close browser if we launched it (not if we connected to session)
    if (browser && !connectedToSession) {
      await browser.close();
    }
  }
}
/**
 * CLI entry point for the dom extractor.
 *
 * Validates --url/--snapshot-id, honors the SAVE_DOM kill switch and the
 * staticfile short-circuit, runs dumpDom(), and prints KEY=VALUE lines plus
 * RESULT_JSON for the orchestrator. Exits 0 on success/skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if DOM is enabled (permanent skip - don't retry)
    if (!getEnvBool('SAVE_DOM', true)) {
      console.log('Skipping DOM (SAVE_DOM=False)');
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - feature disabled
    }
    // Check if staticfile extractor already handled this (permanent skip)
    if (hasStaticFileOutput()) {
      console.log(`Skipping DOM - staticfile extractor already downloaded this`);
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - staticfile already handled
    } else {
      const result = await dumpDom(url);
      if (result.success) {
        status = 'succeeded';
        output = result.output;
        const size = fs.statSync(output).size;
        console.log(`DOM saved (${size} bytes)`);
      } else {
        status = 'failed';
        error = result.error;
      }
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (KEY=VALUE lines are parsed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Run the extractor; any unhandled rejection is fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""
Check if a binary is already available in the system PATH.
This is the simplest "provider" - it doesn't install anything,
it just discovers binaries that are already installed.
Usage: on_Dependency__install_using_env_provider.py --dependency-id=<uuid> --bin-name=<name>
Output: InstalledBinary JSONL record to stdout if binary found in PATH
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, EnvProvider
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to find")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
def main(dependency_id: str, bin_name: str, bin_providers: str):
    """Discover *bin_name* on the system PATH and record it.

    Installs nothing: if the binary is already present, an InstalledBinary
    JSONL record is printed to stdout (human logging goes to stderr).
    Exits 0 on success or allowed-provider skip, 1 when not found.
    """
    # Check if env provider is allowed
    if bin_providers != '*' and 'env' not in bin_providers.split(','):
        click.echo(f"env provider not allowed for {bin_name}", err=True)
        sys.exit(0)  # Not an error, just skip
    # Use abx-pkg EnvProvider to find binary
    provider = EnvProvider()
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).load()
    except Exception as e:
        click.echo(f"{bin_name} not found in PATH: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found in PATH", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Found {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)
    sys.exit(0)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
Shared utilities for extractor hooks.
This module provides common functionality for all extractors to ensure
consistent behavior, output format, error handling, and timing.
All extractors should:
1. Import and use these utilities
2. Output consistent metadata (CMD, VERSION, OUTPUT, timing)
3. Write all files to $PWD
4. Return proper exit codes (0=success, 1=failure)
5. Be runnable standalone without any archivebox imports
"""
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Static file extensions that generally don't need browser-based extraction
STATIC_EXTENSIONS = (
    '.pdf', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.ico',
    '.mp4', '.mp3', '.m4a', '.webm', '.mkv', '.avi', '.mov',
    '.zip', '.tar', '.gz', '.bz2', '.xz', '.7z', '.rar',
    '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.exe', '.dmg', '.apk', '.deb', '.rpm',
)
def is_static_file(url: str) -> bool:
    """Return True when the URL path (query/fragment stripped, case-insensitive)
    ends in one of STATIC_EXTENSIONS."""
    path_only = url.lower().split('?')[0].split('#')[0]
    return path_only.endswith(STATIC_EXTENSIONS)
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable *name* with whitespace stripped, or *default*."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var: true/1/yes/on or false/0/no/off (case-insensitive);
    anything else yields *default*."""
    mapping = {
        'true': True, '1': True, 'yes': True, 'on': True,
        'false': False, '0': False, 'no': False, 'off': False,
    }
    return mapping.get(get_env(name, '').lower(), default)
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var, falling back to *default* when missing or invalid."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def find_binary(bin_name: str, env_var: str | None = None) -> str | None:
    """Resolve a binary path: explicit *env_var* override first, then PATH lookup."""
    if env_var:
        override = get_env(env_var)
        if override and os.path.isfile(override):
            return override
    return shutil.which(bin_name)
def get_version(binary: str, version_args: list[str] | None = None) -> str:
"""Get binary version string."""
if not binary or not os.path.isfile(binary):
return ''
args = version_args or ['--version']
try:
result = subprocess.run(
[binary] + args,
capture_output=True,
text=True,
timeout=10
)
# Return first non-empty line, truncated
for line in result.stdout.split('\n'):
line = line.strip()
if line:
return line[:64]
return ''
except Exception:
return ''
class ExtractorResult:
    """
    Tracks extractor execution and produces consistent output.
    Usage:
        result = ExtractorResult(name='wget', url=url)
        result.cmd = ['wget', url]
        result.version = '1.21'
        # ... do extraction ...
        result.output = 'example.com/index.html'
        result.status = 'succeeded'
        result.finish()
        sys.exit(result.exit_code)

    The KEY=VALUE lines and RESULT_JSON emitted by finish() are a parsing
    contract with hooks.py — do not change their names or order casually.
    """
    def __init__(self, name: str, url: str, snapshot_id: str = ''):
        # Identity of this extraction run
        self.name = name
        self.url = url
        self.snapshot_id = snapshot_id
        # Timing: start is recorded now; end is set by finish()
        self.start_ts = datetime.now(timezone.utc)
        self.end_ts: datetime | None = None
        # Command-under-test metadata (filled in by the extractor)
        self.cmd: list[str] = []
        self.version: str = ''
        self.output: str | Path | None = None
        self.status: str = 'failed'  # 'succeeded', 'failed', 'skipped'
        # Captured subprocess results
        self.stdout: str = ''
        self.stderr: str = ''
        self.returncode: int | None = None
        # Error description plus optional remediation hints
        self.error: str = ''
        self.hints: list[str] = []
        # Dependency info for missing binary
        self.dependency_needed: str = ''
        self.bin_providers: str = ''
    @property
    def duration(self) -> float:
        """Duration in seconds (live value until finish() is called)."""
        if self.end_ts:
            return (self.end_ts - self.start_ts).total_seconds()
        return (datetime.now(timezone.utc) - self.start_ts).total_seconds()
    @property
    def exit_code(self) -> int:
        """Exit code based on status."""
        if self.status == 'succeeded':
            return 0
        if self.status == 'skipped':
            return 0  # Skipped is not a failure
        return 1
    def finish(self, status: str | None = None):
        """Mark extraction as finished and print results."""
        self.end_ts = datetime.now(timezone.utc)
        if status:
            self.status = status
        self._print_results()
    def _print_results(self):
        """Print consistent output for hooks.py to parse."""
        import sys
        # Print timing
        print(f"START_TS={self.start_ts.isoformat()}")
        print(f"END_TS={self.end_ts.isoformat() if self.end_ts else ''}")
        print(f"DURATION={self.duration:.2f}")
        # Print command info
        if self.cmd:
            print(f"CMD={' '.join(str(c) for c in self.cmd)}")
        if self.version:
            print(f"VERSION={self.version}")
        # Print output path
        if self.output:
            print(f"OUTPUT={self.output}")
        # Print status
        print(f"STATUS={self.status}")
        # Print dependency info if needed (stderr, like errors)
        if self.dependency_needed:
            print(f"DEPENDENCY_NEEDED={self.dependency_needed}", file=sys.stderr)
            if self.bin_providers:
                print(f"BIN_PROVIDERS={self.bin_providers}", file=sys.stderr)
        # Print error info
        if self.error:
            print(f"ERROR={self.error}", file=sys.stderr)
        for hint in self.hints:
            print(f"HINT={hint}", file=sys.stderr)
        # Print JSON result for structured parsing
        result_json = {
            'extractor': self.name,
            'url': self.url,
            'snapshot_id': self.snapshot_id,
            'status': self.status,
            'start_ts': self.start_ts.isoformat(),
            'end_ts': self.end_ts.isoformat() if self.end_ts else None,
            'duration': round(self.duration, 2),
            'cmd': self.cmd,
            'cmd_version': self.version,
            'output': str(self.output) if self.output else None,
            'returncode': self.returncode,
            'error': self.error or None,
        }
        print(f"RESULT_JSON={json.dumps(result_json)}")
def run_shell_command(
cmd: list[str],
cwd: str | Path | None = None,
timeout: int = 60,
result: ExtractorResult | None = None,
) -> subprocess.CompletedProcess:
"""
Run a shell command with proper capturing and timing.
Updates result object if provided with stdout, stderr, returncode.
"""
cwd = cwd or Path.cwd()
try:
proc = subprocess.run(
cmd,
cwd=str(cwd),
capture_output=True,
timeout=timeout,
)
if result:
result.stdout = proc.stdout.decode('utf-8', errors='replace')
result.stderr = proc.stderr.decode('utf-8', errors='replace')
result.returncode = proc.returncode
return proc
except subprocess.TimeoutExpired as e:
if result:
result.error = f"Command timed out after {timeout} seconds"
result.stdout = e.stdout.decode('utf-8', errors='replace') if e.stdout else ''
result.stderr = e.stderr.decode('utf-8', errors='replace') if e.stderr else ''
raise
except Exception as e:
if result:
result.error = f"{type(e).__name__}: {e}"
raise
def chrome_args(
headless: bool = True,
sandbox: bool = False,
resolution: str = '1440,900',
user_agent: str = '',
check_ssl: bool = True,
user_data_dir: str = '',
profile_name: str = 'Default',
extra_args: list[str] | None = None,
) -> list[str]:
"""
Build Chrome/Chromium command line arguments.
Based on the old CHROME_CONFIG.chrome_args() implementation.
"""
args = [
# Disable unnecessary features
'--disable-sync',
'--no-pings',
'--no-first-run',
'--no-default-browser-check',
'--disable-default-apps',
'--disable-infobars',
'--disable-blink-features=AutomationControlled',
# Deterministic behavior
'--js-flags=--random-seed=1157259159',
'--deterministic-mode',
'--deterministic-fetch',
# Performance
'--disable-background-networking',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
'--disable-renderer-backgrounding',
'--disable-ipc-flooding-protection',
# Disable prompts/popups
'--deny-permission-prompts',
'--disable-notifications',
'--disable-popup-blocking',
'--noerrdialogs',
# Security/privacy
'--disable-client-side-phishing-detection',
'--disable-domain-reliability',
'--disable-component-update',
'--safebrowsing-disable-auto-update',
'--password-store=basic',
'--use-mock-keychain',
# GPU/rendering
'--force-gpu-mem-available-mb=4096',
'--font-render-hinting=none',
'--force-color-profile=srgb',
'--disable-partial-raster',
'--disable-skia-runtime-opts',
'--disable-2d-canvas-clip-aa',
'--disable-lazy-loading',
# Media
'--use-fake-device-for-media-stream',
'--disable-gesture-requirement-for-media-playback',
]
if headless:
args.append('--headless=new')
if not sandbox:
args.extend([
'--no-sandbox',
'--no-zygote',
'--disable-dev-shm-usage',
'--disable-software-rasterizer',
])
if resolution:
args.append(f'--window-size={resolution}')
if not check_ssl:
args.extend([
'--disable-web-security',
'--ignore-certificate-errors',
])
if user_agent:
args.append(f'--user-agent={user_agent}')
if user_data_dir:
args.append(f'--user-data-dir={user_data_dir}')
args.append(f'--profile-directory={profile_name}')
if extra_args:
args.extend(extra_args)
return args
def chrome_cleanup_lockfile(user_data_dir: str | Path):
"""Remove Chrome SingletonLock file that can prevent browser from starting."""
if not user_data_dir:
return
lockfile = Path(user_data_dir) / 'SingletonLock'
try:
lockfile.unlink(missing_ok=True)
except Exception:
pass
# Common Chrome binary names to search for
# (tried in order on $PATH; first hit wins)
CHROME_BINARY_NAMES = [
    'google-chrome',
    'google-chrome-stable',
    'chromium',
    'chromium-browser',
    'chrome',
]
# Fixed macOS app-bundle install paths (these are not on $PATH by default)
CHROME_BINARY_NAMES_MACOS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
]
def find_chrome() -> str | None:
    """Locate a Chrome/Chromium executable.

    Resolution order: explicit CHROME_BINARY env override, then common
    binary names on $PATH, then well-known macOS app-bundle paths.
    Returns the binary path, or None if nothing was found.
    """
    # 1. Explicit override via environment
    override = get_env('CHROME_BINARY')
    if override and os.path.isfile(override):
        return override
    # 2. Well-known names on $PATH
    for candidate in CHROME_BINARY_NAMES:
        found = shutil.which(candidate)
        if found:
            return found
    # 3. Fixed macOS install locations
    for app_path in CHROME_BINARY_NAMES_MACOS:
        if os.path.isfile(app_path):
            return app_path
    return None

View File

@@ -0,0 +1,31 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_FAVICON": {
"type": "boolean",
"default": true,
"description": "Enable favicon downloading"
},
"FAVICON_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for favicon fetch in seconds"
},
"FAVICON_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
},
"FAVICON_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
}
}
}

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
Extract favicon from a URL.
Usage: on_Snapshot__favicon.py --url=<url> --snapshot-id=<uuid>
Output: Writes favicon.ico to $PWD
Environment variables:
TIMEOUT: Timeout in seconds (default: 30)
USER_AGENT: User agent string
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
It can run standalone if requests is installed: pip install requests
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin, urlparse
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'favicon'
OUTPUT_DIR = 'favicon'
OUTPUT_FILE = 'favicon.ico'
def get_env(name: str, default: str = '') -> str:
    """Return env var `name` (or `default` when unset), with whitespace stripped."""
    raw = os.environ.get(name, default)
    return raw.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Return env var `name` parsed as an int, falling back to `default`
    when the variable is unset or not a valid integer."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def get_favicon(url: str) -> tuple[bool, str | None, str]:
    """
    Fetch favicon from URL.

    Candidate order: icon URLs declared in the page's <link rel=icon> tags
    (highest priority), then the conventional well-known paths
    (/favicon.ico, /favicon.png, /apple-touch-icon.png), and finally
    Google's s2/favicons service. The first non-empty 2xx response is
    written to OUTPUT_FILE in the current working directory.

    Returns: (success, output_path, error_message)
    """
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'
    timeout = get_env_int('TIMEOUT', 30)
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    headers = {'User-Agent': user_agent}
    # Build list of possible favicon URLs
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    favicon_urls = [
        urljoin(base_url, '/favicon.ico'),
        urljoin(base_url, '/favicon.png'),
        urljoin(base_url, '/apple-touch-icon.png'),
    ]
    # Try to extract favicon URL from HTML link tags
    try:
        response = requests.get(url, timeout=timeout, headers=headers)
        if response.ok:
            # Look for <link rel="icon" href="...">
            # (inserted at the front so declared icons beat the guessed paths)
            for match in re.finditer(
                r'<link[^>]+rel=["\'](?:shortcut )?icon["\'][^>]+href=["\']([^"\']+)["\']',
                response.text,
                re.I
            ):
                favicon_urls.insert(0, urljoin(url, match.group(1)))
            # Also check reverse order: href before rel
            for match in re.finditer(
                r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\'](?:shortcut )?icon["\']',
                response.text,
                re.I
            ):
                favicon_urls.insert(0, urljoin(url, match.group(1)))
    except Exception:
        pass  # Continue with default favicon URLs
    # Try each URL until we find one that works
    for favicon_url in favicon_urls:
        try:
            # Per-candidate fetch has a fixed 15s budget, independent of TIMEOUT
            response = requests.get(favicon_url, timeout=15, headers=headers)
            if response.ok and len(response.content) > 0:
                Path(OUTPUT_FILE).write_bytes(response.content)
                return True, OUTPUT_FILE, ''
        except Exception:
            continue
    # Try Google's favicon service as fallback
    try:
        google_url = f'https://www.google.com/s2/favicons?domain={parsed.netloc}'
        response = requests.get(google_url, timeout=15, headers=headers)
        if response.ok and len(response.content) > 0:
            Path(OUTPUT_FILE).write_bytes(response.content)
            return True, OUTPUT_FILE, ''
    except Exception:
        pass
    return False, None, 'No favicon found'
@click.command()
@click.option('--url', required=True, help='URL to extract favicon from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract favicon from a URL.

    Emits the extractor protocol on stdout (START_TS/END_TS/DURATION/
    OUTPUT/STATUS lines followed by a final RESULT_JSON line) and exits
    0 on success, 1 on failure. Errors go to stderr as ERROR=...
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    try:
        # Run extraction
        success, output, error = get_favicon(url)
        status = 'succeeded' if success else 'failed'
        if success:
            print(f'Favicon saved ({Path(output).stat().st_size} bytes)')
    except Exception as e:
        # Catch-all so a crash still produces the structured result lines below
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,262 @@
"""
Integration tests for favicon plugin
Tests verify:
1. Plugin script exists
2. requests library is available
3. Favicon extraction works for real example.com
4. Output file is actual image data
5. Tries multiple favicon URLs
6. Falls back to Google's favicon service
7. Config options work (TIMEOUT, USER_AGENT)
8. Handles failures gracefully
"""
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
FAVICON_HOOK = PLUGIN_DIR / 'on_Snapshot__11_favicon.py'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The favicon hook script must ship with the plugin."""
    hook = FAVICON_HOOK
    assert hook.exists(), f"Hook script not found: {hook}"
def test_requests_library_available():
    """`requests` must import cleanly and report a non-empty version string."""
    probe = subprocess.run(
        [sys.executable, '-c', 'import requests; print(requests.__version__)'],
        capture_output=True,
        text=True
    )
    if probe.returncode != 0:
        pytest.skip("requests library not installed")
    version_str = probe.stdout.strip()
    assert len(version_str) > 0, "Should report requests version"
def test_extracts_favicon_from_example_com():
    """Test full workflow: extract favicon from real example.com.
    Note: example.com doesn't have a favicon and Google's service may also fail,
    so we test that the extraction completes and reports appropriate status.
    """
    # NOTE(review): this is a live-network test; results vary with connectivity.
    # Check requests is available
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run favicon extraction (hook writes favicon.ico into cwd)
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # May succeed (if Google service works) or fail (if no favicon)
        assert result.returncode in (0, 1), "Should complete extraction attempt"
        # Verify RESULT_JSON is present
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
        # If it succeeded, verify the favicon file
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout, "Should report success"
            assert 'Favicon saved' in result.stdout, "Should report completion"
            favicon_file = tmpdir / 'favicon.ico'
            assert favicon_file.exists(), "favicon.ico not created"
            # Verify file is not empty and contains actual image data
            file_size = favicon_file.stat().st_size
            assert file_size > 0, "Favicon file should not be empty"
            assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes"
            # Check for common image magic bytes
            favicon_data = favicon_file.read_bytes()
            # ICO, PNG, GIF, JPEG, or WebP
            is_image = (
                favicon_data[:4] == b'\x00\x00\x01\x00' or  # ICO
                favicon_data[:8] == b'\x89PNG\r\n\x1a\n' or  # PNG
                favicon_data[:3] == b'GIF' or  # GIF
                favicon_data[:2] == b'\xff\xd8' or  # JPEG
                favicon_data[8:12] == b'WEBP'  # WebP
            )
            assert is_image, "Favicon file should be a valid image format"
        else:
            # Failed as expected
            assert 'STATUS=failed' in result.stdout
            assert 'No favicon found' in result.stdout or 'No favicon found' in result.stderr
def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    # NOTE(review): only proves the hook terminates within the 30s harness
    # timeout; it does not verify that the 5s TIMEOUT was actually applied.
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set very short timeout (but example.com should still succeed)
        import os
        env = os.environ.copy()
        env['TIMEOUT'] = '5'
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    # NOTE(review): assertions only run on success, so a failure silently
    # passes; this test mainly guards against the env var crashing the hook.
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set custom user agent
        import os
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should succeed (example.com doesn't block)
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    # NOTE(review): all assertions are conditional on success and file
    # existence, so a network failure makes this test vacuously pass.
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.org', '--snapshot-id', 'testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode == 0:
            favicon_file = tmpdir / 'favicon.ico'
            if favicon_file.exists():
                assert favicon_file.stat().st_size > 0
def test_handles_missing_favicon_gracefully():
    """Test that favicon plugin handles sites without favicons gracefully.
    Note: The plugin falls back to Google's favicon service, which generates
    a generic icon even if the site doesn't have one, so extraction usually succeeds.
    """
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Try a URL that likely doesn't have a favicon
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.com/nonexistent', '--snapshot-id', 'test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # May succeed (Google fallback) or fail gracefully
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"
        if result.returncode != 0:
            # On failure, the hook must have reported why (stdout or stderr)
            combined = result.stdout + result.stderr
            assert 'No favicon found' in combined or 'ERROR=' in combined
def test_reports_missing_requests_library():
    """Test that script reports error when requests library is missing."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run with PYTHONPATH cleared to simulate missing requests
        import os
        env = os.environ.copy()
        # Keep only minimal PATH, clear PYTHONPATH
        env['PYTHONPATH'] = '/nonexistent'
        # -S skips site-packages, so third-party imports (requests, and
        # possibly rich_click itself) should fail inside the hook
        result = subprocess.run(
            [sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env
        )
        # Should fail and report missing requests
        if result.returncode != 0:
            combined = result.stdout + result.stderr
            # May report missing requests or other import errors
            assert 'requests' in combined.lower() or 'import' in combined.lower() or 'ERROR=' in combined
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,40 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_GIT": {
"type": "boolean",
"default": true,
"description": "Enable git repository cloning"
},
"GIT_BINARY": {
"type": "string",
"default": "git",
"description": "Path to git binary"
},
"GIT_TIMEOUT": {
"type": "integer",
"default": 120,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for git operations in seconds"
},
"GIT_DOMAINS": {
"type": "string",
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
"description": "Comma-separated list of domains to treat as git repositories"
},
"GIT_CLONE_DEPTH": {
"type": "integer",
"default": 1,
"minimum": 0,
"description": "Depth of git clone (0 for full history, 1 for shallow)"
},
"GIT_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for git clone"
}
}
}

View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
Validation hook for git binary.
Runs at crawl start to verify git is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# git version string: "git version 2.43.0"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
if len(parts) >= 3 and parts[0] == 'git':
return parts[2]
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_git() -> dict | None:
    """Locate the git binary and describe it.

    Prefers the abx_pkg Binary loader when that package is importable;
    otherwise falls back to a plain PATH search (or the GIT_BINARY env
    var). Returns a dict with name/abspath/version/sha256/binprovider
    keys, or None when git cannot be found.
    """
    try:
        from abx_pkg import Binary, EnvProvider

        class GitBinary(Binary):
            name: str = 'git'
            binproviders_supported = [EnvProvider()]

        loaded = GitBinary().load()
        if loaded and loaded.abspath:
            return {
                'name': 'git',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        pass  # abx_pkg not installed: use the env fallback below
    except Exception:
        pass  # any loader failure also falls through to the env fallback
    # Fallback: PATH lookup first, then an explicit GIT_BINARY override.
    candidate = shutil.which('git') or os.environ.get('GIT_BINARY', '')
    if not candidate or not Path(candidate).is_file():
        return None
    return {
        'name': 'git',
        'abspath': candidate,
        'version': get_binary_version(candidate),
        'sha256': get_binary_hash(candidate),
        'binprovider': 'env',
    }
def main():
    """Report git availability as JSONL records on stdout.

    Success: emits an InstalledBinary record plus Machine config updates
    (GIT_BINARY, and GIT_VERSION when known), then exits 0.
    Failure: emits a Dependency record so the orchestrator can attempt an
    install, prints a note to stderr, and exits 1.
    """
    result = find_git()
    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/GIT_BINARY',
            'value': result['abspath'],
        }))
        # Only record the version when detection actually produced one
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/GIT_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'git',
            'bin_providers': 'apt,brew,env',
        }))
        print(f"git binary not found", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Clone a git repository from a URL.
Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
Output: Clones repository to $PWD/repo
Environment variables:
GIT_BINARY: Path to git binary
TIMEOUT: Timeout in seconds (default: 120)
GIT_ARGS: Extra arguments for git clone (space-separated)
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'repo'
def get_env(name: str, default: str = '') -> str:
    """Look up env var `name`; unset falls back to `default`. Result is stripped."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Integer-valued env var lookup; unset or unparsable yields `default`."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def is_git_url(url: str) -> bool:
    """Heuristically decide whether `url` points at a git repository.

    Matches known git hosting domains — kept in sync with the plugin's
    GIT_DOMAINS schema default (the original list was missing
    codeberg.org, gitea.com, and git.sr.ht) — plus obvious git markers
    like a '.git' suffix or git:// / ssh://git@ schemes.

    Purely substring-based, so it can over-match (e.g. any URL merely
    containing '.git'); callers treat the result as best-effort.
    """
    git_patterns = [
        '.git',
        'github.com',
        'gitlab.com',
        'bitbucket.org',
        'codeberg.org',
        'gitea.com',
        'git.sr.ht',
        'git://',
        'ssh://git@',
    ]
    lowered = url.lower()
    return any(pattern in lowered for pattern in git_patterns)
def find_git() -> str | None:
"""Find git binary."""
git = get_env('GIT_BINARY')
if git and os.path.isfile(git):
return git
return shutil.which('git')
def get_version(binary: str) -> str:
    """Return the first 64 chars of `<binary> --version` output, or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        return ''
    return proc.stdout.strip()[:64]
def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Clone the git repository at `url` into OUTPUT_DIR using `binary`.

    Config (environment) — now honoring the names declared in this
    plugin's schema, with the old names kept as fallbacks:
      GIT_TIMEOUT (fallback TIMEOUT): timeout in seconds (default 120)
      GIT_CLONE_DEPTH: shallow-clone depth; 0 = full history (default 1,
          matching the previously hard-coded --depth=1)
      GIT_EXTRA_ARGS (fallback GIT_ARGS): extra space-separated clone args

    Returns: (success, output_path, error_message)
    """
    timeout = get_env_int('GIT_TIMEOUT', 0) or get_env_int('TIMEOUT', 120)
    depth = get_env_int('GIT_CLONE_DEPTH', 1)
    extra_args = get_env('GIT_EXTRA_ARGS') or get_env('GIT_ARGS')
    cmd = [
        binary,
        'clone',
        '--recursive',
    ]
    if depth > 0:
        # depth <= 0 means full history (no --depth flag at all)
        cmd.append(f'--depth={depth}')
    if extra_args:
        cmd.extend(extra_args.split())
    cmd.extend([url, OUTPUT_DIR])
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
        if result.returncode == 0 and Path(OUTPUT_DIR).is_dir():
            return True, OUTPUT_DIR, ''
        else:
            stderr = result.stderr.decode('utf-8', errors='replace')
            return False, None, f'git clone failed: {stderr[:200]}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='Git repository URL')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Clone a git repository from a URL.

    Emits the extractor protocol on stdout (START_TS/END_TS/DURATION/
    CMD/VERSION/OUTPUT/STATUS and a final RESULT_JSON line). Exit codes:
    0 on success or skip (non-git URL), 1 on failure or missing binary.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    try:
        # Check if URL looks like a git repo
        if not is_git_url(url):
            # Non-git URLs are a clean skip, with a shortened protocol output
            print(f'Skipping git clone for non-git URL: {url}')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url})}')
            # sys.exit raises SystemExit (a BaseException), so the
            # `except Exception` below does not intercept these exits
            sys.exit(0)
        # Find binary
        binary = find_git()
        if not binary:
            # Signal the orchestrator which dependency to install
            print(f'ERROR: git binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        # Run extraction
        success, output, error = clone_git(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            print(f'git clone completed')
    except Exception as e:
        # Catch-all so a crash still produces the structured result lines below
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if binary:
        print(f'CMD={binary} clone {url}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,203 @@
#!/usr/bin/env node
/**
* Extract HTTP response headers for a URL.
*
* If a Chrome session exists (from chrome_session extractor), reads the captured
* response headers from chrome_session/response_headers.json.
* Otherwise falls back to making an HTTP HEAD request.
*
* Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>
* Output: Writes headers/headers.json
*
* Environment variables:
* TIMEOUT: Timeout in seconds (default: 30)
* USER_AGENT: User agent string (optional)
* CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
*/
const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'headers';
const OUTPUT_DIR = 'headers';
const OUTPUT_FILE = 'headers.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_HEADERS_FILE = 'response_headers.json';
/**
 * Parse `--key=value` CLI flags from process.argv into a plain object.
 * Dashes in keys become underscores; a bare `--flag` maps to `true`.
 * @returns {Object<string, string|true>}
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    // Re-join so values containing '=' (e.g. URLs with queries) survive
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
/** Read env var `name`, falling back to `defaultValue`; result is trimmed. */
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
/** Parse env var `name` as a boolean; unrecognized/unset yields `defaultValue`. */
function getEnvBool(name, defaultValue = false) {
  const val = (process.env[name] || '').trim().toLowerCase();
  const truthy = ['true', '1', 'yes', 'on'];
  const falsy = ['false', '0', 'no', 'off'];
  if (truthy.includes(val)) return true;
  if (falsy.includes(val)) return false;
  return defaultValue;
}
/** Parse env var `name` as a base-10 integer, else return `defaultValue`. */
function getEnvInt(name, defaultValue = 0) {
  const raw = (process.env[name] || String(defaultValue)).trim();
  const parsed = parseInt(raw, 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
/**
 * Load response headers previously captured by the chrome_session
 * extractor, if its output file exists. Returns the parsed JSON object,
 * or null when the file is missing or unparsable.
 */
function getHeadersFromChromeSession() {
  const headersFile = path.join(CHROME_SESSION_DIR, CHROME_HEADERS_FILE);
  if (!fs.existsSync(headersFile)) return null;
  try {
    return JSON.parse(fs.readFileSync(headersFile, 'utf8'));
  } catch (e) {
    // Corrupt or partial JSON from an interrupted chrome_session run
    return null;
  }
}
/**
 * Fetch response headers for `url` via an HTTP HEAD request (fallback
 * path when no chrome_session capture is available).
 *
 * Honors TIMEOUT (seconds), USER_AGENT, and CHECK_SSL_VALIDITY env vars.
 * Resolves with { url, status, statusText, headers }; rejects on
 * network error or timeout.
 *
 * @param {string} url - absolute http(s) URL
 * @returns {Promise<{url: string, status: number, statusText: string, headers: Object}>}
 */
function fetchHeaders(url) {
  return new Promise((resolve, reject) => {
    const timeout = getEnvInt('TIMEOUT', 30) * 1000;
    const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
    // FIX: was getEnvBool('CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true))
    // — the nested call was a redundant duplication; a single lookup is equivalent.
    const checkSsl = getEnvBool('CHECK_SSL_VALIDITY', true);
    const parsedUrl = new URL(url);
    const client = parsedUrl.protocol === 'https:' ? https : http;
    const options = {
      method: 'HEAD',
      hostname: parsedUrl.hostname,
      port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80),
      path: parsedUrl.pathname + parsedUrl.search,
      headers: { 'User-Agent': userAgent },
      timeout,
      rejectUnauthorized: checkSsl, // only meaningful for https requests
    };
    const req = client.request(options, (res) => {
      resolve({
        url: url,
        status: res.statusCode,
        statusText: res.statusMessage,
        headers: res.headers,
      });
    });
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
    req.end();
  });
}
/**
 * Write headers/headers.json for `url`, preferring the chrome_session
 * capture and falling back to a live HTTP HEAD request.
 * @returns {Promise<{success: boolean, output?: string, method?: string, status?: number, error?: string}>}
 */
async function extractHeaders(url) {
  // recursive mkdir is a no-op when the directory already exists
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Prefer headers already captured by the chrome_session extractor
  const cached = getHeadersFromChromeSession();
  if (cached && cached.headers) {
    fs.writeFileSync(outputPath, JSON.stringify(cached, null, 2), 'utf8');
    return { success: true, output: outputPath, method: 'chrome_session', status: cached.status };
  }
  // Fallback: make our own HEAD request
  try {
    const fetched = await fetchHeaders(url);
    fs.writeFileSync(outputPath, JSON.stringify(fetched, null, 2), 'utf8');
    return { success: true, output: outputPath, method: 'http', status: fetched.status };
  } catch (e) {
    return { success: false, error: e.message };
  }
}
/**
 * CLI entry point: extract headers and emit the extractor protocol on
 * stdout (START_TS/END_TS/DURATION/OUTPUT/STATUS plus a final
 * RESULT_JSON line). Exits 0 on success, 1 on failure/bad usage.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    const result = await extractHeaders(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Headers extracted (${result.method}): HTTP ${result.status}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    // Catch-all so a crash still produces the structured result lines below
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level entry: any rejection escaping main() is fatal (exit code 1).
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,319 @@
"""
Integration tests for headers plugin
Tests verify:
1. Plugin script exists and is executable
2. Node.js is available
3. Headers extraction works for real example.com
4. Output JSON contains actual HTTP headers
5. Fallback to HTTP HEAD when chrome_session not available
6. Uses chrome_session headers when available
7. Config options work (TIMEOUT, USER_AGENT, CHECK_SSL_VALIDITY)
"""
import json
import shutil
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
HEADERS_HOOK = PLUGIN_DIR / 'on_Snapshot__33_headers.js'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The headers hook script must ship with the plugin."""
    hook_path = HEADERS_HOOK
    assert hook_path.exists(), f"Hook script not found: {hook_path}"
def test_node_is_available():
    """Node.js must be discoverable on PATH and runnable.

    Uses shutil.which() instead of shelling out to `which` so the probe
    is portable (Windows has no `which`) and consistent with the other
    tests in this file, which already gate on shutil.which('node').
    """
    binary_path = shutil.which('node')
    if not binary_path:
        pytest.skip("node not installed on system")
    assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
    # Verify the binary actually runs and reports a version like "v20.1.0"
    result = subprocess.run(
        ['node', '--version'],
        capture_output=True,
        text=True,
        timeout=10
    )
    assert result.returncode == 0, f"node not executable: {result.stderr}"
    assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
def test_extracts_headers_from_example_com():
    """Test full workflow: extract headers from real example.com."""
    # NOTE(review): live-network test; depends on example.com being reachable.
    # Check node is available
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run headers extraction (no chrome_session dir -> hook uses HTTP HEAD)
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # Verify output in stdout
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'Headers extracted' in result.stdout, "Should report completion"
        # Verify output directory created
        headers_dir = tmpdir / 'headers'
        assert headers_dir.exists(), "Output directory not created"
        # Verify output file exists
        headers_file = headers_dir / 'headers.json'
        assert headers_file.exists(), "headers.json not created"
        # Verify headers JSON contains REAL example.com response
        headers_data = json.loads(headers_file.read_text())
        assert 'url' in headers_data, "Should have url field"
        assert headers_data['url'] == TEST_URL, f"URL should be {TEST_URL}"
        assert 'status' in headers_data, "Should have status field"
        assert headers_data['status'] in [200, 301, 302], \
            f"Should have valid HTTP status, got {headers_data['status']}"
        assert 'headers' in headers_data, "Should have headers field"
        assert isinstance(headers_data['headers'], dict), "Headers should be a dict"
        assert len(headers_data['headers']) > 0, "Headers dict should not be empty"
        # Verify common HTTP headers are present (case-insensitive lookup)
        headers_lower = {k.lower(): v for k, v in headers_data['headers'].items()}
        assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
            "Should have at least one common HTTP header"
        # Verify RESULT_JSON is present and valid
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
        for line in result.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.replace('RESULT_JSON=', ''))
                assert result_json['extractor'] == 'headers'
                assert result_json['status'] == 'succeeded'
                assert result_json['url'] == TEST_URL
                assert result_json['snapshot_id'] == 'test789'
                assert 'duration' in result_json
                assert result_json['duration'] >= 0
                break
def test_uses_chrome_session_headers_when_available():
    """Test that headers plugin prefers chrome_session headers over HTTP HEAD."""
    # Fully offline: the mock chrome_session file should be used without
    # any network request being made by the hook.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Create mock chrome_session directory with response_headers.json
        chrome_session_dir = tmpdir / 'chrome_session'
        chrome_session_dir.mkdir()
        mock_headers = {
            'url': TEST_URL,
            'status': 200,
            'statusText': 'OK',
            'headers': {
                'content-type': 'text/html; charset=UTF-8',
                'server': 'MockChromeServer',
                'x-test-header': 'from-chrome-session'
            }
        }
        headers_file = chrome_session_dir / 'response_headers.json'
        headers_file.write_text(json.dumps(mock_headers))
        # Run headers extraction
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testchrome'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=30
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'chrome_session' in result.stdout, "Should report using chrome_session method"
        # Verify it used chrome_session headers
        output_headers_file = tmpdir / 'headers' / 'headers.json'
        assert output_headers_file.exists(), "Output headers.json not created"
        output_data = json.loads(output_headers_file.read_text())
        assert output_data['headers']['x-test-header'] == 'from-chrome-session', \
            "Should use headers from chrome_session"
        assert output_data['headers']['server'] == 'MockChromeServer', \
            "Should use headers from chrome_session"
def test_falls_back_to_http_when_chrome_session_unavailable():
    """Test that headers plugin falls back to HTTP HEAD when chrome_session unavailable."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Don't create chrome_session directory - force HTTP fallback
        # Run headers extraction
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        # NOTE(review): this assertion is very weak — stdout always contains
        # 'http' (the URL itself), so the left side is effectively always true.
        # Consider asserting "(http)" (the method tag printed by the hook).
        assert 'http' in result.stdout.lower() or 'HEAD' not in result.stdout, \
            "Should use HTTP method"
        # Verify output exists and has real HTTP headers
        output_headers_file = tmpdir / 'headers' / 'headers.json'
        assert output_headers_file.exists(), "Output headers.json not created"
        output_data = json.loads(output_headers_file.read_text())
        assert output_data['url'] == TEST_URL
        assert output_data['status'] in [200, 301, 302]
        assert isinstance(output_data['headers'], dict)
        assert len(output_data['headers']) > 0
def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set very short timeout (but example.com should still succeed)
        import os
        env = os.environ.copy()
        env['TIMEOUT'] = '5'
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            # Outer 30s guard: if the hook ignores TIMEOUT and hangs,
            # subprocess.run raises TimeoutExpired and fails the test.
            timeout=30
        )
        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set custom user agent
        import os
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should succeed (example.com doesn't block)
        # NOTE(review): this only checks the hook did not crash with a custom
        # UA; it does not verify the UA was actually sent on the wire.
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # Best-effort network test: all assertions are gated on success so a
        # flaky network does not fail the suite.
        if result.returncode == 0:
            output_headers_file = tmpdir / 'headers' / 'headers.json'
            if output_headers_file.exists():
                output_data = json.loads(output_headers_file.read_text())
                assert output_data['url'] == 'https://example.org'
                assert output_data['status'] in [200, 301, 302]
def test_handles_404_gracefully():
    """Test that headers plugin handles 404s gracefully."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # May succeed or fail depending on server behavior
        # If it succeeds, verify 404 status is captured
        if result.returncode == 0:
            output_headers_file = tmpdir / 'headers' / 'headers.json'
            if output_headers_file.exists():
                output_data = json.loads(output_headers_file.read_text())
                assert output_data['status'] == 404, "Should capture 404 status"
# Allow running this test module directly without the pytest CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""
Convert HTML to plain text for search indexing.
This extractor reads HTML from other extractors (wget, singlefile, dom)
and converts it to plain text for full-text search.
Usage: on_Snapshot__htmltotext.py --url=<url> --snapshot-id=<uuid>
Output: Writes htmltotext.txt to $PWD
Environment variables:
TIMEOUT: Timeout in seconds (not used, but kept for consistency)
Note: This extractor does not require any external binaries.
It uses Python's built-in html.parser module.
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from html.parser import HTMLParser
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'htmltotext'
OUTPUT_DIR = 'htmltotext'
OUTPUT_FILE = 'htmltotext.txt'
class HTMLTextExtractor(HTMLParser):
    """Extract visible text content from HTML, ignoring scripts/styles.

    Fix: the previous implementation only checked the *immediately* open tag,
    so text inside a child of a skipped region leaked through (e.g. the
    <title> inside <head>, or a <p> inside <noscript>). A depth counter now
    suppresses the entire subtree of each skipped container tag.
    """

    # Container tags whose whole subtree is skipped. Void tags such as
    # <meta>/<link> never get an end tag, so they must NOT be depth-tracked
    # (the counter would never unwind); they are handled by the per-tag
    # check in handle_data() instead.
    CONTAINER_SKIP_TAGS = {'script', 'style', 'head', 'noscript'}

    def __init__(self):
        super().__init__()
        self.result = []
        # Kept for backward compatibility with any external readers.
        self.skip_tags = {'script', 'style', 'head', 'meta', 'link', 'noscript'}
        self.current_tag = None
        self._skip_depth = 0  # >0 while inside a skipped subtree

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag.lower()
        if self.current_tag in self.CONTAINER_SKIP_TAGS:
            self._skip_depth += 1
        elif self.current_tag == 'body':
            # Safety net for malformed HTML that never closes <head>:
            # entering <body> always re-enables text collection.
            self._skip_depth = 0

    def handle_endtag(self, tag):
        if tag.lower() in self.CONTAINER_SKIP_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1
        self.current_tag = None

    def handle_data(self, data):
        # Suppress text inside skipped subtrees, plus text directly inside
        # a non-container skip tag (meta/link).
        if self._skip_depth == 0 and self.current_tag not in self.skip_tags:
            text = data.strip()
            if text:
                self.result.append(text)

    def get_text(self) -> str:
        return ' '.join(self.result)
def html_to_text(html: str) -> str:
    """Convert an HTML document to whitespace-joined plain text.

    Uses the stdlib HTML parser; if parsing blows up on pathological input,
    falls back to crude regex-based tag stripping.
    """
    extractor = HTMLTextExtractor()
    try:
        extractor.feed(html)
        return extractor.get_text()
    except Exception:
        # Fallback path: drop script/style bodies, then every remaining tag,
        # and collapse runs of whitespace.
        stripped = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<style[^>]*>.*?</style>', '', stripped, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<[^>]+>', ' ', stripped)
        stripped = re.sub(r'\s+', ' ', stripped)
        return stripped.strip()
def find_html_source() -> str | None:
    """Locate HTML saved by a sibling extractor and return its contents.

    Hooks run inside the snapshot directory, so sibling extractor outputs
    live in subdirectories; preferred sources first (singlefile > dom > wget).
    Returns None when no non-empty, readable HTML file is found.
    """
    patterns = (
        'singlefile/singlefile.html',
        'singlefile/*.html',
        'dom/output.html',
        'dom/*.html',
        'wget/**/*.html',
        'wget/**/*.htm',
    )
    root = Path.cwd()
    for pattern in patterns:
        for candidate in root.glob(pattern):
            if not candidate.is_file() or candidate.stat().st_size == 0:
                continue
            try:
                return candidate.read_text(errors='ignore')
            except Exception:
                # Unreadable file - keep scanning for another source.
                continue
    return None
def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
    """
    Extract plain text from HTML saved by earlier extractors.

    Returns: (success, output_path, error_message)
    """
    source_html = find_html_source()
    if not source_html:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
    plain_text = html_to_text(source_html)
    # Fewer than 10 characters is treated as "nothing useful to index".
    if not plain_text or len(plain_text) < 10:
        return False, None, 'No meaningful text extracted from HTML'
    destination_dir = Path(OUTPUT_DIR)
    destination_dir.mkdir(exist_ok=True)
    destination = destination_dir / OUTPUT_FILE
    destination.write_text(plain_text, encoding='utf-8')
    return True, str(destination), ''
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Convert HTML to plain text for search indexing.

    Emits machine-parseable key=value lines (START_TS/END_TS/DURATION/
    OUTPUT/STATUS/RESULT_JSON) on stdout for the orchestrator; exits 0 on
    success and 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    try:
        # Run extraction
        success, output, error = extract_htmltotext(url)
        status = 'succeeded' if success else 'failed'
        if success:
            # NOTE(review): st_size is a byte count, not a character count -
            # the message is approximate for non-ASCII text.
            text_len = Path(output).stat().st_size
            print(f'Extracted {text_len} characters of text')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)

View File

@@ -0,0 +1,115 @@
#!/usr/bin/env node
/**
* I Still Don't Care About Cookies Extension Plugin
*
* Installs and configures the "I still don't care about cookies" Chrome extension
* for automatic cookie consent banner dismissal during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
*
* Priority: 02 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* This extension automatically:
* - Dismisses cookie consent popups
* - Removes cookie banners
* - Accepts necessary cookies to proceed with browsing
* - Works on thousands of websites out of the box
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Download (or load from cache) the "I still don't care about cookies"
 * extension via the shared chrome_extensions utilities.
 *
 * @returns {Promise<object|null>} extension metadata, or null on failure
 */
async function installCookiesExtension() {
  console.log('[*] Installing I Still Don\'t Care About Cookies extension...');
  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!installed) {
    console.error('[❌] Failed to install I Still Don\'t Care About Cookies extension');
    return null;
  }
  console.log('[+] I Still Don\'t Care About Cookies extension installed');
  console.log('[+] Cookie banners will be automatically dismissed during archiving');
  return installed;
}
/**
* Note: This extension works out of the box with no configuration needed.
* It automatically detects and dismisses cookie banners on page load.
*/
/**
 * Main entry point - install extension before archiving.
 *
 * Fast path: if a cache JSON exists and its unpacked_path still contains a
 * manifest.json, reuse it without touching the network. Otherwise install
 * fresh and write the metadata back so chrome_session can load it.
 *
 * @returns {Promise<object|null>} extension metadata, or null on failure
 */
async function main() {
  // Check if extension is already cached
  const cacheFile = path.join(EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      // Cache is only trusted if the unpacked extension still has a manifest
      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        console.log('[*] I Still Don\'t Care About Cookies extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Cache file corrupted, re-install
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }
  // Install extension
  const extension = await installCookiesExtension();
  // Export extension metadata for chrome_session to load
  if (extension) {
    // Write extension info to a cache file that chrome_session can read
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
      cacheFile,
      JSON.stringify(extension, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }
  return extension;
}
// Export functions for use by other plugins
module.exports = {
  EXTENSION,
  installCookiesExtension,
};
// Run if executed directly (hook-runner invokes this file as a script);
// exit code signals setup success/failure to the orchestrator.
if (require.main === module) {
  main().then(() => {
    console.log('[✓] I Still Don\'t Care About Cookies extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err);
    process.exit(1);
  });
}

View File

@@ -0,0 +1,279 @@
/**
* Unit tests for istilldontcareaboutcookies plugin
*
* Run with: node --test tests/test_istilldontcareaboutcookies.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
describe('istilldontcareaboutcookies plugin', () => {
before(() => {
if (!fs.existsSync(TEST_DIR)) {
fs.mkdirSync(TEST_DIR, { recursive: true });
}
});
after(() => {
if (fs.existsSync(TEST_DIR)) {
fs.rmSync(TEST_DIR, { recursive: true, force: true });
}
});
describe('EXTENSION metadata', () => {
it('should have correct webstore_id', () => {
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.strictEqual(EXTENSION.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
});
it('should have correct name', () => {
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.strictEqual(EXTENSION.name, 'istilldontcareaboutcookies');
});
});
describe('installCookiesExtension', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should use cached extension if available', async () => {
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Create fake cache
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies');
fs.mkdirSync(fakeExtensionDir, { recursive: true });
fs.writeFileSync(
path.join(fakeExtensionDir, 'manifest.json'),
JSON.stringify({ version: '1.1.8' })
);
const fakeCache = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
unpacked_path: fakeExtensionDir,
version: '1.1.8'
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const result = await installCookiesExtension();
assert.notStrictEqual(result, null);
assert.strictEqual(result.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
});
it('should not require any configuration', async () => {
// This extension works out of the box
// No API keys or config needed
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.ok(EXTENSION);
// No config fields should be required
});
});
describe('cache file creation', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should create cache file with correct extension name', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
// Create mock extension
const mockExtension = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
};
await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));
assert.ok(fs.existsSync(cacheFile));
const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
assert.strictEqual(cache.name, 'istilldontcareaboutcookies');
});
it('should use correct filename pattern', () => {
const expectedPattern = 'istilldontcareaboutcookies.extension.json';
const cacheFile = path.join(TEST_EXTENSIONS_DIR, expectedPattern);
// Pattern should match expected format
assert.ok(path.basename(cacheFile).endsWith('.extension.json'));
assert.ok(path.basename(cacheFile).includes('istilldontcareaboutcookies'));
});
});
describe('extension functionality', () => {
it('should work automatically without configuration', () => {
// This extension automatically dismisses cookie banners
// No manual trigger or configuration needed
const features = {
automaticBannerDismissal: true,
requiresConfiguration: false,
requiresApiKey: false,
requiresUserAction: false
};
assert.strictEqual(features.automaticBannerDismissal, true);
assert.strictEqual(features.requiresConfiguration, false);
assert.strictEqual(features.requiresApiKey, false);
assert.strictEqual(features.requiresUserAction, false);
});
it('should not require any runtime hooks', () => {
// Extension works purely via Chrome's content script injection
// No need for additional hooks or configuration
const requiresHooks = {
preNavigation: false,
postNavigation: false,
onPageLoad: false
};
assert.strictEqual(requiresHooks.preNavigation, false);
assert.strictEqual(requiresHooks.postNavigation, false);
assert.strictEqual(requiresHooks.onPageLoad, false);
});
});
describe('priority and execution order', () => {
it('should have priority 02 (early)', () => {
const filename = 'on_Snapshot__02_istilldontcareaboutcookies.js';
// Extract priority from filename
const match = filename.match(/on_Snapshot__(\d+)_/);
assert.ok(match);
const priority = parseInt(match[1]);
assert.strictEqual(priority, 2);
});
it('should run before chrome_session (priority 20)', () => {
const extensionPriority = 2;
const chromeSessionPriority = 20;
assert.ok(extensionPriority < chromeSessionPriority);
});
});
describe('error handling', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should handle corrupted cache gracefully', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
// Create corrupted cache
fs.writeFileSync(cacheFile, 'invalid json content');
// Should detect corruption and proceed with fresh install
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Mock loadOrInstallExtension to avoid actual download
const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
const originalFunc = extensionUtils.loadOrInstallExtension;
extensionUtils.loadOrInstallExtension = async () => ({
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
});
const result = await installCookiesExtension();
extensionUtils.loadOrInstallExtension = originalFunc;
assert.notStrictEqual(result, null);
});
it('should handle missing manifest gracefully', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies_no_manifest');
// Create directory without manifest
fs.mkdirSync(fakeExtensionDir, { recursive: true });
const fakeCache = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
unpacked_path: fakeExtensionDir
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Mock to return fresh extension when manifest missing
const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
const originalFunc = extensionUtils.loadOrInstallExtension;
let freshInstallCalled = false;
extensionUtils.loadOrInstallExtension = async () => {
freshInstallCalled = true;
return {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
};
};
const result = await installCookiesExtension();
extensionUtils.loadOrInstallExtension = originalFunc;
// Should trigger fresh install when manifest missing
assert.ok(freshInstallCalled || result);
});
});
});

View File

@@ -0,0 +1,122 @@
"""
Unit tests for istilldontcareaboutcookies plugin
Tests invoke the plugin hook as an external process and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__02_istilldontcareaboutcookies.js"
def test_install_script_exists():
    """Verify install script exists"""
    # Sanity check: catches path drift if the hook file is renamed.
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_extension_metadata():
    """Test that extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
        # Require the module from node and dump its EXTENSION export as JSON,
        # so the assertion runs against the real exported object.
        result = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env
        )
        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "edibdbjcniadpccecjdfdjjppcpchdlm"
        assert metadata["name"] == "istilldontcareaboutcookies"
def test_install_creates_cache():
    """Test that install creates extension cache"""
    # NOTE(review): with no cache pre-seeded, this runs a real install which
    # may hit the network - confirm this is acceptable in CI.
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Check output mentions installation
        assert "Installing" in result.stdout or "installed" in result.stdout or "istilldontcareaboutcookies" in result.stdout
        # Check cache file was created
        cache_file = ext_dir / "istilldontcareaboutcookies.extension.json"
        assert cache_file.exists(), "Cache file should be created"
        # Verify cache content
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "edibdbjcniadpccecjdfdjjppcpchdlm"
        assert cache_data["name"] == "istilldontcareaboutcookies"
def test_install_uses_existing_cache():
    """Test that install uses existing cache when available.

    The install script's fast path reads istilldontcareaboutcookies.extension.json
    and reuses the unpacked extension when its manifest.json still exists.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Create fake unpacked extension with a manifest
        fake_extension_dir = ext_dir / "edibdbjcniadpccecjdfdjjppcpchdlm__istilldontcareaboutcookies"
        fake_extension_dir.mkdir(parents=True)
        manifest = {"version": "1.1.8", "name": "I still don't care about cookies"}
        (fake_extension_dir / "manifest.json").write_text(json.dumps(manifest))
        # FIX: the script's cache check reads this JSON (with a valid
        # unpacked_path); previously the test never wrote it, so the cache
        # path was never actually exercised and a fresh install could run.
        cache = {
            "webstore_id": "edibdbjcniadpccecjdfdjjppcpchdlm",
            "name": "istilldontcareaboutcookies",
            "unpacked_path": str(fake_extension_dir),
            "version": "1.1.8",
        }
        (ext_dir / "istilldontcareaboutcookies.extension.json").write_text(json.dumps(cache))
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        # Should short-circuit on the cache and exit cleanly
        assert result.returncode == 0
        assert "using cache" in result.stdout
def test_no_configuration_required():
    """Test that extension works without any configuration"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # No special env vars needed - works out of the box
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should not require any API keys or configuration
        # NOTE(review): this disjunction is nearly vacuous - it passes whenever
        # the script exits 0, regardless of output; consider asserting that
        # the script never prompts for credentials instead.
        assert "API" not in (result.stdout + result.stderr) or result.returncode == 0

View File

@@ -0,0 +1,55 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_MEDIA": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_YTDLP", "FETCH_MEDIA"],
"description": "Enable media downloading with yt-dlp"
},
"YOUTUBEDL_BINARY": {
"type": "string",
"default": "yt-dlp",
"x-aliases": ["YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
"description": "Path to yt-dlp binary"
},
"MEDIA_TIMEOUT": {
"type": "integer",
"default": 3600,
"minimum": 30,
"x-fallback": "TIMEOUT",
"description": "Timeout for media downloads in seconds"
},
"MEDIA_MAX_SIZE": {
"type": "string",
"default": "750m",
"pattern": "^\\d+[kmgKMG]?$",
"description": "Maximum file size for media downloads"
},
"YTDLP_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"YTDLP_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
"--write-info-json",
"--write-thumbnail",
"--write-sub",
"--embed-subs",
"--write-auto-sub"
],
"description": "Default yt-dlp arguments"
},
"YTDLP_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for yt-dlp (space-separated)"
}
}
}

View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
Download media from a URL using yt-dlp.
Usage: on_Snapshot__media.py --url=<url> --snapshot-id=<uuid>
Output: Downloads media files to $PWD/media/
Environment variables:
YTDLP_BINARY: Path to yt-dlp binary
YTDLP_TIMEOUT: Timeout in seconds (default: 3600 for large media)
YTDLP_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
YTDLP_EXTRA_ARGS: Extra arguments for yt-dlp (space-separated)
# Media feature toggles
USE_YTDLP: Enable yt-dlp media extraction (default: True)
SAVE_MEDIA: Alias for USE_YTDLP
# Media size limits
MEDIA_MAX_SIZE: Maximum media file size (default: 750m)
# Fallback to ARCHIVING_CONFIG values if YTDLP_* not set:
MEDIA_TIMEOUT: Fallback timeout for media
TIMEOUT: Fallback timeout
CHECK_SSL_VALIDITY: Fallback SSL check
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'media'
BIN_NAME = 'yt-dlp'
BIN_PROVIDERS = 'pip,apt,brew,env'
OUTPUT_DIR = 'media'
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripped of surrounding whitespace."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean-ish env var; unset/unrecognized values yield `default`."""
    value = get_env(name).lower()
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')
    if value in truthy:
        return True
    if value in falsy:
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var, falling back to `default` on parse errors."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
STATICFILE_DIR = 'staticfile'


def has_staticfile_output() -> bool:
    """Return True if the staticfile extractor already saved files for this URL.

    An existing-but-empty directory counts as "no output".
    """
    candidate = Path(STATICFILE_DIR)
    if not candidate.exists():
        return False
    return any(candidate.iterdir())
def find_ytdlp() -> str | None:
    """Locate a yt-dlp (or legacy youtube-dl) executable.

    Resolution order: explicit YTDLP_BINARY / YOUTUBEDL_BINARY override
    (must point at an existing file), then PATH lookup.
    """
    override = get_env('YTDLP_BINARY') or get_env('YOUTUBEDL_BINARY')
    if override and os.path.isfile(override):
        return override
    for candidate in ('yt-dlp', 'youtube-dl'):
        resolved = shutil.which(candidate)
        if resolved:
            return resolved
    return None
def get_version(binary: str) -> str:
    """Return the tool's reported version (first 64 chars), or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
        return proc.stdout.strip()[:64]
    except Exception:
        # Missing binary, timeout, etc. - version is best-effort metadata.
        return ''
# Default yt-dlp args (from old YTDLP_CONFIG)
def get_ytdlp_default_args(media_max_size: str = '750m') -> list[str]:
    """Build the baseline yt-dlp argument list (from the old YTDLP_CONFIG).

    The format selector prefers best video+audio under `media_max_size`,
    falling back to best available when no size-capped stream exists.
    """
    size_cap = f'[filesize<={media_max_size}][filesize_approx<=?{media_max_size}]'
    return [
        '--restrict-filenames',
        '--trim-filenames', '128',
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--write-thumbnail',
        '--no-call-home',
        '--write-sub',
        '--write-auto-subs',
        '--convert-subs=srt',
        '--yes-playlist',
        '--continue',
        '--no-abort-on-error',
        '--ignore-errors',
        '--geo-bypass',
        '--add-metadata',
        f'--format=(bv*+ba/b){size_cap}/(bv*+ba/b)',
    ]
def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Download media using yt-dlp.
    Returns: (success, output_path, error_message)

    "Nothing to download" (unsupported URL, clean exit with no files) is
    treated as success with output=None; only genuine failures (HTTP errors,
    extraction failures, timeouts) return success=False.
    """
    # Get config from env (with YTDLP_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('MEDIA_TIMEOUT') or get_env_int('TIMEOUT', 3600)
    check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
    media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    # Build command (later options take precedence)
    cmd = [
        binary,
        *get_ytdlp_default_args(media_max_size),
        '--no-progress',
        '-o', f'{OUTPUT_DIR}/%(title)s.%(ext)s',
    ]
    if not check_ssl:
        cmd.append('--no-check-certificate')
    if extra_args:
        # NOTE(review): naive whitespace split - extra args containing quoted
        # spaces will be mangled; confirm callers never need those.
        cmd.extend(extra_args.split())
    cmd.append(url)
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
        # Check if any media files were downloaded
        media_extensions = (
            '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m4v',
            '.mp3', '.m4a', '.ogg', '.wav', '.flac', '.aac', '.opus',
            '.json', '.jpg', '.png', '.webp', '.jpeg',
            '.vtt', '.srt', '.ass', '.lrc',
            '.description',
        )
        downloaded_files = [
            f for f in output_dir.glob('*')
            if f.is_file() and f.suffix.lower() in media_extensions
        ]
        if downloaded_files:
            # Return first video/audio file, or first file if no media
            video_audio = [
                f for f in downloaded_files
                if f.suffix.lower() in ('.mp4', '.webm', '.mkv', '.avi', '.mov', '.mp3', '.m4a', '.ogg', '.wav', '.flac')
            ]
            output = str(video_audio[0]) if video_audio else str(downloaded_files[0])
            return True, output, ''
        else:
            stderr = result.stderr
            # These are NOT errors - page simply has no downloadable media
            # Return success with no output (legitimate "nothing to download")
            if 'ERROR: Unsupported URL' in stderr:
                return True, None, ''  # Not a media site - success, no output
            if 'URL could be a direct video link' in stderr:
                return True, None, ''  # Not a supported media URL - success, no output
            if result.returncode == 0:
                return True, None, ''  # yt-dlp exited cleanly, just no media - success
            # These ARE errors - something went wrong
            if 'HTTP Error 404' in stderr:
                return False, None, '404 Not Found'
            if 'HTTP Error 403' in stderr:
                return False, None, '403 Forbidden'
            if 'Unable to extract' in stderr:
                return False, None, 'Unable to extract media info'
            # Unclassified failure: surface a truncated stderr excerpt
            return False, None, f'yt-dlp error: {stderr[:200]}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to download media from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download media from a URL using yt-dlp.

    Emits machine-parseable key=value lines (START_TS/END_TS/STATUS/
    RESULT_JSON, ...) on stdout for the orchestrator; exits 0 on success
    or skip, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''
    try:
        # Feature toggles: skip cleanly if media archiving is disabled.
        if not (get_env_bool('USE_YTDLP', True) and get_env_bool('SAVE_MEDIA', True)):
            print('Skipping media (USE_YTDLP=False or SAVE_MEDIA=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)
        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            # BUG FIX: status was previously left at its initial 'failed'
            # value here, so this branch printed STATUS=failed while
            # exiting 0. It is a skip, so report it as one.
            status = 'skipped'
            print('Skipping media - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)
        # Find binary; report dependency hints for the orchestrator on failure
        binary = find_ytdlp()
        if not binary:
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print('INSTALL_HINT=pip install yt-dlp OR brew install yt-dlp', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        cmd_str = f'{binary} {url}'
        # Run extraction
        success, output, error = save_media(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            output_dir = Path(OUTPUT_DIR)
            files = list(output_dir.glob('*'))
            file_count = len([f for f in files if f.is_file()])
            if file_count > 0:
                print(f'yt-dlp completed: {file_count} files downloaded')
            else:
                # Success with zero files: the page simply has no media.
                print('yt-dlp completed: no media found on page (this is normal)')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)

View File

@@ -0,0 +1,30 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_MERCURY": {
"type": "boolean",
"default": true,
"description": "Enable Mercury text extraction"
},
"MERCURY_BINARY": {
"type": "string",
"default": "postlight-parser",
"x-aliases": ["POSTLIGHT_PARSER_BINARY"],
"description": "Path to Mercury/Postlight parser binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"MERCURY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Mercury in seconds"
}
}
}

View File

@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""
Extract article content using Postlight's Mercury Parser.
Usage: on_Snapshot__mercury.py --url=<url> --snapshot-id=<uuid>
Output: Creates mercury/ directory with content.html, content.txt, article.json
Environment variables:
MERCURY_BINARY: Path to mercury-parser binary
TIMEOUT: Timeout in seconds (default: 60)
Note: Requires mercury-parser: npm install -g @postlight/mercury-parser
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'mercury'
BIN_NAME = 'mercury-parser'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'mercury'
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, falling back to `default`, with surrounding whitespace stripped."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Read an integer environment variable; return `default` if unset or unparsable."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def find_mercury() -> str | None:
"""Find mercury-parser binary."""
mercury = get_env('MERCURY_BINARY')
if mercury and os.path.isfile(mercury):
return mercury
for name in ['mercury-parser', 'mercury']:
binary = shutil.which(name)
if binary:
return binary
return None
def get_version(binary: str) -> str:
    """Return the binary's `--version` output (first 64 chars), or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        # Missing binary, timeout, permission error, etc. — treat all as "unknown"
        return ''
    return proc.stdout.strip()[:64]
def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Extract article using Mercury Parser.

    Runs the parser twice (once per output format) and writes three files
    into ./mercury/: content.txt, content.html, and article.json.

    Args:
        url: Page URL to extract the article from.
        binary: Path to the mercury-parser executable.
    Returns: (success, output_path, error_message)
    """
    timeout = get_env_int('TIMEOUT', 60)
    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    try:
        # Get text version (mercury always emits a JSON envelope; --format
        # only controls the format of the 'content' field inside it)
        cmd_text = [binary, url, '--format=text']
        result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout)
        if result_text.returncode != 0:
            stderr = result_text.stderr.decode('utf-8', errors='replace')
            return False, None, f'mercury-parser failed: {stderr[:200]}'
        try:
            text_json = json.loads(result_text.stdout)
        except json.JSONDecodeError:
            return False, None, 'mercury-parser returned invalid JSON'
        if text_json.get('failed'):
            return False, None, 'Mercury was not able to extract article'
        # Save text content
        text_content = text_json.get('content', '')
        (output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
        # Get HTML version
        cmd_html = [binary, url, '--format=html']
        result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout)
        try:
            html_json = json.loads(result_html.stdout)
        except json.JSONDecodeError:
            # HTML pass is best-effort: fall back to empty content rather than failing
            html_json = {}
        # Save HTML content and metadata
        html_content = html_json.pop('content', '')
        (output_dir / 'content.html').write_text(html_content, encoding='utf-8')
        # Save article metadata (everything from the text pass except the body itself)
        metadata = {k: v for k, v in text_json.items() if k != 'content'}
        (output_dir / 'article.json').write_text(json.dumps(metadata, indent=2), encoding='utf-8')
        return True, OUTPUT_DIR, ''
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to extract article from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract article content using Postlight's Mercury Parser.

    Emits the extractor protocol lines (START_TS/END_TS/DURATION/STATUS/
    RESULT_JSON) on stdout; exits 0 on success, 1 on failure or when the
    mercury-parser binary cannot be found.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    try:
        # Find binary
        binary = find_mercury()
        if not binary:
            # Signal the orchestrator that a dependency install is needed
            print(f'ERROR: mercury-parser binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        # Run extraction
        success, output, error = extract_mercury(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            text_file = Path(output) / 'content.txt'
            html_file = Path(output) / 'content.html'
            # NOTE(review): st_size is bytes, not characters — the log line
            # below slightly misstates this for multi-byte encodings.
            text_len = text_file.stat().st_size if text_file.exists() else 0
            html_len = html_file.stat().st_size if html_file.exists() else 0
            print(f'Mercury extracted: {text_len} chars text, {html_len} chars HTML')
    except Exception as e:
        # sys.exit() above raises SystemExit (a BaseException), so it is NOT caught here
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if binary:
        print(f'CMD={binary} {url}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,295 @@
#!/usr/bin/env python3
"""
Create a Merkle tree of all archived outputs.
This plugin runs after all extractors and post-processing complete (priority 92)
and generates a cryptographic Merkle tree of all files in the snapshot directory.
This provides:
- Tamper detection: verify archive integrity
- Efficient updates: only re-hash changed files
- Compact proofs: prove file inclusion without sending all files
- Deduplication: identify identical content across snapshots
Output: merkletree/merkletree.json containing:
- root_hash: SHA256 hash of the Merkle root
- tree: Full tree structure with internal nodes
- files: List of all files with their hashes
- metadata: Timestamp, file count, total size
Usage: on_Snapshot__92_merkletree.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
"""
__package__ = 'archivebox.plugins.merkletree'
import os
import sys
import json
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
# Configure Django if running standalone
if __name__ == '__main__':
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
import django
django.setup()
import rich_click as click
def sha256_file(filepath: Path) -> str:
    """Return the hex SHA256 digest of a file's contents.

    Unreadable files yield a sentinel all-zero hash instead of raising,
    so one bad file cannot abort the whole tree build.
    """
    digest = hashlib.sha256()
    try:
        with open(filepath, 'rb') as fh:
            # Stream in 64 KiB chunks to keep memory flat for large files
            for chunk in iter(lambda: fh.read(65536), b''):
                digest.update(chunk)
    except (OSError, PermissionError):
        return '0' * 64
    return digest.hexdigest()
def sha256_data(data: bytes) -> str:
    """Return the hex SHA256 digest of a raw bytes payload."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
    """
    Walk `snapshot_dir` and hash every regular file found.

    Args:
        snapshot_dir: Root directory to scan
        exclude_dirs: Directory names to prune from the walk
                      (defaults to ['merkletree', '.git', '__pycache__'])
    Returns:
        List of (relative_path, sha256_hash, file_size) tuples, sorted by
        path so the resulting Merkle tree is deterministic.
    """
    skip_names = exclude_dirs or ['merkletree', '.git', '__pycache__']
    collected: List[Tuple[Path, str, int]] = []
    for dirpath, subdirs, names in os.walk(snapshot_dir):
        # Prune excluded directories in place so os.walk never descends into them
        subdirs[:] = [d for d in subdirs if d not in skip_names]
        for name in names:
            full_path = Path(dirpath) / name
            # Skip symlinks: we hash targets only, and avoid link cycles
            if full_path.is_symlink():
                continue
            digest = sha256_file(full_path)
            size = full_path.stat().st_size if full_path.exists() else 0
            collected.append((full_path.relative_to(snapshot_dir), digest, size))
    # Deterministic ordering -> deterministic tree
    collected.sort(key=lambda entry: str(entry[0]))
    return collected
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
    """
    Build a binary Merkle tree from a list of leaf hashes.

    Sibling hashes are concatenated pairwise and re-hashed to form each
    parent level; an odd node at the end of a level is paired with itself.

    Args:
        file_hashes: List of SHA256 hex digests (the leaves)
    Returns:
        (root_hash, tree_levels): tree_levels[0] is the leaf level and
        tree_levels[-1] contains only the root. An empty input yields the
        hash of the empty byte string and a single empty level.
    """
    if not file_hashes:
        # Empty tree: well-defined sentinel root
        return sha256_data(b''), [[]]
    levels: List[List[str]] = [list(file_hashes)]
    # Repeatedly collapse the newest level until a single root remains
    while len(levels[-1]) > 1:
        below = levels[-1]
        above = []
        for idx in range(0, len(below), 2):
            lhs = below[idx]
            # Odd node count: the trailing node is paired with itself
            rhs = below[idx + 1] if idx + 1 < len(below) else lhs
            above.append(sha256_data((lhs + rhs).encode('utf-8')))
        levels.append(above)
    return levels[-1][0], levels
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
    """
    Create a complete Merkle tree of all files in snapshot directory.

    Delegates file discovery/hashing to collect_files() (which excludes
    the 'merkletree' output dir itself by default) and tree construction
    to build_merkle_tree().

    Args:
        snapshot_dir: The snapshot directory to scan
    Returns:
        Dict containing root_hash, tree structure, file list, and metadata
    """
    # Collect all files
    files = collect_files(snapshot_dir)
    # Extract just the hashes for tree building
    file_hashes = [file_hash for _, file_hash, _ in files]
    # Build Merkle tree
    root_hash, tree_levels = build_merkle_tree(file_hashes)
    # Calculate total size
    total_size = sum(size for _, _, size in files)
    # Prepare file list with metadata
    file_list = [
        {
            'path': str(path),
            'hash': file_hash,
            'size': size,
        }
        for path, file_hash, size in files
    ]
    # Prepare result
    result = {
        'root_hash': root_hash,
        'tree_levels': tree_levels,
        'files': file_list,
        'metadata': {
            # NOTE(review): naive local timestamp; other extractors in this
            # codebase use timezone-aware UTC — confirm intended.
            'timestamp': datetime.now().isoformat(),
            'file_count': len(files),
            'total_size': total_size,
            'tree_depth': len(tree_levels),
        },
    }
    return result
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Generate Merkle tree of all archived outputs.

    Looks up the Snapshot by UUID, hashes every file in its output
    directory into a Merkle tree, and writes merkletree/merkletree.json.
    Emits the standard extractor protocol lines (START_TS/END_TS/STATUS/
    RESULT_JSON) on stdout; exits 0 on success or skip, 1 on failure.
    """
    # Imported here so the module can be loaded before Django is configured
    from archivebox.core.models import Snapshot
    start_ts = datetime.now()
    status = 'failed'
    output = None
    error = ''
    root_hash = None
    file_count = 0
    try:
        # Check if enabled
        save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
        if not save_merkletree:
            click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)')
            status = 'skipped'
            end_ts = datetime.now()
            click.echo(f'START_TS={start_ts.isoformat()}')
            click.echo(f'END_TS={end_ts.isoformat()}')
            click.echo(f'STATUS={status}')
            # BUG FIX: build the result via json.dumps instead of hand-rolled
            # f-string interpolation, so quotes/backslashes in url or
            # snapshot_id cannot produce invalid JSON.
            skip_result = {'extractor': 'merkletree', 'status': status, 'url': url, 'snapshot_id': snapshot_id}
            click.echo(f'RESULT_JSON={json.dumps(skip_result)}')
            sys.exit(0)
        # Get snapshot
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            error = f'Snapshot {snapshot_id} not found'
            raise ValueError(error)
        # Get snapshot directory
        snapshot_dir = Path(snapshot.output_dir)
        if not snapshot_dir.exists():
            error = f'Snapshot directory not found: {snapshot_dir}'
            raise FileNotFoundError(error)
        # Create output directory
        output_dir = snapshot_dir / 'merkletree'
        output_dir.mkdir(exist_ok=True)
        output_path = output_dir / 'merkletree.json'
        # Generate Merkle tree (excludes the merkletree/ dir itself)
        merkle_data = create_merkle_tree(snapshot_dir)
        # Write output
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(merkle_data, f, indent=2)
        status = 'succeeded'
        output = str(output_path)
        root_hash = merkle_data['root_hash']
        file_count = merkle_data['metadata']['file_count']
        total_size = merkle_data['metadata']['total_size']
        click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
        click.echo(f'Error: {error}', err=True)
    end_ts = datetime.now()
    duration = (end_ts - start_ts).total_seconds()
    # Print results
    click.echo(f'START_TS={start_ts.isoformat()}')
    click.echo(f'END_TS={end_ts.isoformat()}')
    click.echo(f'DURATION={duration:.2f}')
    if output:
        click.echo(f'OUTPUT={output}')
    click.echo(f'STATUS={status}')
    if error:
        click.echo(f'ERROR={error}', err=True)
    # Print JSON result
    result_json = {
        'extractor': 'merkletree',
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'root_hash': root_hash,
        'file_count': file_count,
        'error': error or None,
    }
    click.echo(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
Install a binary using npm package manager.
Usage: on_Dependency__install_using_npm_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
# Fix pydantic forward reference issue
NpmProvider.model_rebuild()
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using npm.

    Exits 0 (not an error) when npm is not in the allowed provider list so
    the orchestrator can try other providers. On success prints an
    InstalledBinary JSONL record to stdout; human-readable logs go to stderr.
    """
    # NOTE(review): --custom-cmd is accepted but never used below — confirm
    # whether custom install commands should be honored by this provider.
    if bin_providers != '*' and 'npm' not in bin_providers.split(','):
        click.echo(f"npm provider not allowed for {bin_name}", err=True)
        sys.exit(0)
    # Use abx-pkg NpmProvider to install binary
    provider = NpmProvider()
    if not provider.INSTALLER_BIN:
        click.echo("npm not available on this system", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via npm...", err=True)
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"npm install failed: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after npm install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout (machine-readable channel)
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'npm',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)
    sys.exit(0)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env node
/**
* Extract and categorize outgoing links from a page's DOM.
*
* Categorizes links by type:
* - hrefs: All <a> links
* - images: <img src>
* - css_stylesheets: <link rel=stylesheet>
* - css_images: CSS background-image: url()
* - js_scripts: <script src>
* - iframes: <iframe src>
* - links: <link> tags with rel/href
*
* Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
* Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
*
* Environment variables:
* SAVE_DOM_OUTLINKS: Enable DOM outlinks extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'parse_dom_outlinks';
const OUTPUT_DIR = 'parse_dom_outlinks';
const OUTPUT_FILE = 'outlinks.json';
const URLS_FILE = 'urls.jsonl'; // For crawl system
const CHROME_SESSION_DIR = 'chrome_session';
// Parse --key=value CLI flags into an object.
// Keys have the leading '--' removed and dashes converted to underscores;
// flags without a value (or with an empty value) are stored as `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    // rejoin so values containing '=' (e.g. URLs with query strings) survive
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable (trimmed), with a fallback for unset values.
// NOTE: a variable explicitly set to '' also falls back (|| treats '' as falsy).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name];
  return (raw || defaultValue).trim();
}
// Interpret an environment variable as a boolean.
// Accepts true/1/yes/on and false/0/no/off (case-insensitive);
// anything else — including unset — yields `defaultValue`.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Read the Chrome DevTools Protocol websocket URL written by the
// chrome_session extractor (chrome_session/cdp_url.txt, relative to cwd).
// Returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (fs.existsSync(cdpFile)) {
    return fs.readFileSync(cdpFile, 'utf8').trim();
  }
  return null;
}
// Extract and categorize outgoing links from the page currently open in the
// shared Chrome session. Writes OUTPUT_DIR/outlinks.json (full categorized
// dump) and OUTPUT_DIR/urls.jsonl (crawlable hrefs for the crawl system).
// Returns { success, output?, outlinksData?, crawlableCount?, error? }.
// NOTE(review): the `url` parameter is unused — extraction targets whichever
// page is open in the connected session; confirm that is intended.
async function extractOutlinks(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page (prefer the first http(s) tab, fall back to any tab)
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Extract outlinks by category; this whole callback runs inside the page
    const outlinksData = await page.evaluate(() => {
      const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;
      const filterDataUrls = (urls) => urls.filter(url => url && !url.startsWith('data:'));
      const filterW3Urls = (urls) => urls.filter(url => url && !url.startsWith('http://www.w3.org/'));
      // Get raw links from HTML (regex scan of the serialized document)
      const html = document.documentElement.outerHTML;
      const raw = Array.from(html.matchAll(LINK_REGEX)).map(m => m[0]);
      // Get all <a href> links
      const hrefs = Array.from(document.querySelectorAll('a[href]'))
        .map(elem => elem.href)
        .filter(url => url);
      // Get all <link> tags (not just stylesheets); keyed by href to dedupe
      const linksMap = {};
      document.querySelectorAll('link[href]').forEach(elem => {
        const rel = elem.rel || '';
        const href = elem.href;
        if (href && rel !== 'stylesheet') {
          linksMap[href] = { rel, href };
        }
      });
      const links = Object.values(linksMap);
      // Get iframes
      const iframes = Array.from(document.querySelectorAll('iframe[src]'))
        .map(elem => elem.src)
        .filter(url => url);
      // Get images
      const images = Array.from(document.querySelectorAll('img[src]'))
        .map(elem => elem.src)
        .filter(url => url && !url.startsWith('data:'));
      // Get CSS background images (computed style of every element)
      const css_images = Array.from(document.querySelectorAll('*'))
        .map(elem => {
          const bgImg = window.getComputedStyle(elem).getPropertyValue('background-image');
          const match = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i.exec(bgImg);
          return match ? match[1] : null;
        })
        .filter(url => url);
      // Get stylesheets
      const css_stylesheets = Array.from(document.querySelectorAll('link[rel=stylesheet]'))
        .map(elem => elem.href)
        .filter(url => url);
      // Get JS scripts
      const js_scripts = Array.from(document.querySelectorAll('script[src]'))
        .map(elem => elem.src)
        .filter(url => url);
      // Spread-into-Set dedupes each category while preserving first-seen order
      return {
        url: window.location.href,
        raw: [...new Set(filterDataUrls(filterW3Urls(raw)))],
        hrefs: [...new Set(filterDataUrls(hrefs))],
        links,
        iframes: [...new Set(iframes)],
        images: [...new Set(filterDataUrls(images))],
        css_images: [...new Set(filterDataUrls(css_images))],
        css_stylesheets: [...new Set(filterDataUrls(css_stylesheets))],
        js_scripts: [...new Set(filterDataUrls(js_scripts))],
      };
    });
    // Write detailed output (for archival)
    fs.writeFileSync(outputPath, JSON.stringify(outlinksData, null, 2));
    // Write urls.jsonl for crawl system (only hrefs that are crawlable pages)
    const urlsPath = path.join(OUTPUT_DIR, URLS_FILE);
    const crawlableUrls = outlinksData.hrefs.filter(href => {
      // Only include http/https URLs, exclude static assets
      if (!href.startsWith('http://') && !href.startsWith('https://')) return false;
      // Exclude common static file extensions
      const staticExts = ['.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.woff', '.woff2', '.ttf', '.eot', '.mp4', '.webm', '.mp3', '.pdf'];
      const urlPath = href.split('?')[0].split('#')[0].toLowerCase();
      return !staticExts.some(ext => urlPath.endsWith(ext));
    });
    const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
      type: 'Snapshot',
      url: href,
      via_extractor: EXTRACTOR_NAME,
    })).join('\n');
    if (urlsJsonl) {
      fs.writeFileSync(urlsPath, urlsJsonl + '\n');
    }
    return { success: true, output: outputPath, outlinksData, crawlableCount: crawlableUrls.length };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // disconnect (not close): the Chrome session is shared with other extractors
    if (browser) {
      browser.disconnect();
    }
  }
}
// CLI entry point: parses --url/--snapshot-id, runs the extraction, and
// emits the extractor protocol lines (START_TS/END_TS/DURATION/OUTPUT/
// STATUS/RESULT_JSON). Exits 0 on success or skip, 1 on failure.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled (short-circuit with a minimal protocol report)
    if (!getEnvBool('SAVE_DOM_OUTLINKS', true)) {
      console.log('Skipping DOM outlinks (SAVE_DOM_OUTLINKS=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await extractOutlinks(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const total = result.outlinksData.hrefs.length;
      const crawlable = result.crawlableCount;
      const images = result.outlinksData.images.length;
      const scripts = result.outlinksData.js_scripts.length;
      console.log(`DOM outlinks extracted: ${total} links (${crawlable} crawlable), ${images} images, ${scripts} scripts`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result (machine-readable summary, one line)
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level invocation: any unhandled rejection is fatal and exits non-zero.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Parse HTML files and extract href URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads HTML content and extracts all <a href="..."> URLs.
NOTE: If parse_dom_outlinks already ran (parse_dom_outlinks/urls.jsonl exists),
this extractor will skip since parse_dom_outlinks provides better coverage via Chrome.
Usage: ./on_Snapshot__60_parse_html_urls.py --url=<url>
Output: Appends discovered URLs to urls.jsonl in current directory
Examples:
./on_Snapshot__60_parse_html_urls.py --url=file:///path/to/page.html
./on_Snapshot__60_parse_html_urls.py --url=https://example.com/page.html
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from urllib.parse import urljoin, urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_html_urls'
# Check if parse_dom_outlinks extractor already ran
DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
# URL regex from archivebox/misc/util.py
URL_REGEX = re.compile(
r'(?=('
r'http[s]?://'
r'(?:[a-zA-Z]|[0-9]'
r'|[-_$@.&+!*\(\),]'
r'|[^\u0000-\u007F])+'
r'[^\]\[<>"\'\s]+'
r'))',
re.IGNORECASE | re.UNICODE,
)
class HrefParser(HTMLParser):
    """HTML parser that collects the href value of every <a> tag.

    After feed(), the non-empty hrefs are available on `self.urls`
    in document order (duplicates included).
    """

    def __init__(self):
        super().__init__()
        # hrefs in document order, duplicates preserved
        self.urls = []

    def handle_starttag(self, tag, attrs):
        # HTMLParser lowercases tag and attribute names for us
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href' and value:
                self.urls.append(value)
def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
    """Detect urljoin() collapsing an embedded sub-URL's '://' into ':/'.

    Archive-style URLs can embed a full URL inside the path (e.g.
    https://web.archive.org/web/https://example.com/). If either input
    contained such a sub-URL past its own scheme but the joined result
    does not, urljoin mangled it.
    """
    rel = relative_path.lower()
    # Strip the relative path's own scheme so it doesn't count as a sub-URL marker
    if rel.startswith(('http://', 'https://')):
        rel = rel.split('://', 1)[-1]
    # [8:] skips past 'https://' so the outer scheme's '://' is ignored
    had_suburl_before = ('://' in rel) or ('://' in root_url[8:])
    has_suburl_after = '://' in final_url[8:]
    return had_suburl_before and not has_suburl_after
def fix_urljoin_bug(url: str, nesting_limit: int = 5) -> str:
    """Repair sub-URLs inside a URL where '://' was collapsed to ':/'.

    urljoin() normalizes consecutive slashes, which breaks URLs that embed
    other URLs in their path (e.g. web.archive.org links). Repeatedly
    re-inserts the missing slash after any `scheme:/` found mid-URL, up to
    `nesting_limit` passes to handle nested sub-URLs.

    Args:
        url: The possibly-mangled URL.
        nesting_limit: Maximum repair passes (one per nesting level).
    Returns:
        The URL with embedded `scheme:/` sequences restored to `scheme://`.
    """
    pattern = (
        r'(?P<root>.+?)'
        r'(?P<separator>[-=/_&+%$#@!*\(\\])'
        r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/'
        r'(?P<suburl>[^/\\]+)'
    )
    input_url = url
    for _ in range(nesting_limit):
        # BUG FIX: the flags were previously passed as re.sub's positional
        # `count` argument, which applied no flags at all and silently capped
        # the number of substitutions. Pass them via the `flags` keyword.
        url = re.sub(pattern, r'\1\2\3://\4', input_url, flags=re.IGNORECASE | re.UNICODE)
        if url == input_url:
            break
        input_url = url
    return url
def normalize_url(url: str, root_url: str = None) -> str:
    """Resolve `url` against `root_url` when it is relative.

    Absolute http(s) URLs — and any URL when no root is given — are returned
    unchanged. Relative URLs are resolved with urljoin(), then repaired if
    urljoin mangled an embedded sub-URL (see did_urljoin_misbehave).
    """
    if not root_url:
        return url
    lowered = url.lower()
    if lowered.startswith(('http://', 'https://')):
        # Already absolute: nothing to resolve
        return url
    joined = urljoin(root_url, url)
    if did_urljoin_misbehave(root_url, url, joined):
        joined = fix_urljoin_bug(joined)
    return joined
def fetch_content(url: str) -> str:
    """Fetch content from a URL (supports file:// and http(s)://).

    Honors TIMEOUT and USER_AGENT environment variables for network fetches.
    Raises OSError / urllib errors on failure; callers handle and report.
    """
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        file_path = parsed.path
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    else:
        timeout = int(os.environ.get('TIMEOUT', '60'))
        user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
        # Imported lazily: only needed for remote fetches
        import urllib.request
        req = urllib.request.Request(url, headers={'User-Agent': user_agent})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='HTML URL to parse')
def main(url: str):
    """Parse HTML and extract href URLs.

    Writes discovered URLs as JSONL Snapshot records to ./urls.jsonl.
    Exits 0 on success or skip, 1 on fetch/parse failure or when no URLs
    are found.
    """
    # Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
    # If parse_dom_outlinks ran but found nothing, we still try static HTML parsing as fallback
    if DOM_OUTLINKS_URLS_FILE.exists() and DOM_OUTLINKS_URLS_FILE.stat().st_size > 0:
        click.echo(f'Skipping parse_html_urls - parse_dom_outlinks already extracted URLs')
        sys.exit(0)
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)
    # Parse HTML for hrefs
    parser = HrefParser()
    try:
        parser.feed(content)
    except Exception as e:
        click.echo(f'Failed to parse HTML: {e}', err=True)
        sys.exit(1)
    urls_found = set()
    for href in parser.urls:
        # Normalize URL (resolves relative hrefs against the page URL)
        normalized = normalize_url(href, root_url=url)
        # Only include http/https URLs
        if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
            # Skip the source URL itself
            if normalized != url:
                # unescape() decodes HTML entities like &amp; inside hrefs
                urls_found.add(unescape(normalized))
    if not urls_found:
        click.echo('No URLs found', err=True)
        sys.exit(1)
    # Write urls.jsonl (one record per URL, sorted for deterministic output)
    with open('urls.jsonl', 'w') as f:
        for found_url in sorted(urls_found):
            f.write(json.dumps({
                'type': 'Snapshot',
                'url': found_url,
                'via_extractor': EXTRACTOR_NAME,
            }) + '\n')
    click.echo(f'Found {len(urls_found)} URLs')
    sys.exit(0)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""Unit tests for parse_html_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.py'), None)
class TestParseHtmlUrls:
"""Test the parse_html_urls extractor CLI."""
    def test_parses_real_example_com(self, tmp_path):
        """Test parsing real https://example.com and extracting its links."""
        # NOTE(review): requires live network access to example.com; this
        # test will fail in offline/sandboxed CI environments.
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'https://example.com'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
            timeout=30
        )
        assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists(), "Output file not created"
        # Verify output contains IANA link (example.com links to iana.org)
        content = output_file.read_text()
        assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"
    def test_extracts_href_urls(self, tmp_path):
        """Test extracting URLs from anchor tags."""
        # Script runs with cwd=tmp_path, so urls.jsonl is written there.
        input_file = tmp_path / 'page.html'
        input_file.write_text('''
            <!DOCTYPE html>
            <html>
            <body>
                <a href="https://example.com">Example</a>
                <a href="https://foo.bar/page">Foo</a>
                <a href="http://test.org">Test</a>
            </body>
            </html>
        ''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()
        # One JSONL record per unique URL
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3
        urls = set()
        for line in lines:
            entry = json.loads(line)
            assert 'url' in entry
            urls.add(entry['url'])
        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'http://test.org' in urls
    def test_ignores_non_http_schemes(self, tmp_path):
        """Test that non-http schemes are ignored."""
        # mailto:/javascript:/tel: hrefs must be filtered out of the output.
        input_file = tmp_path / 'page.html'
        input_file.write_text('''
            <html>
            <body>
                <a href="mailto:test@example.com">Email</a>
                <a href="javascript:void(0)">JS</a>
                <a href="tel:+1234567890">Phone</a>
                <a href="https://valid.com">Valid</a>
            </body>
            </html>
        ''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 1
        entry = json.loads(lines[0])
        assert entry['url'] == 'https://valid.com'
    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs are decoded."""
        # &amp; in the href should come out as a literal & (html.unescape).
        input_file = tmp_path / 'page.html'
        input_file.write_text('''
            <html>
            <body>
                <a href="https://example.com/page?a=1&amp;b=2">Link</a>
            </body>
            </html>
        ''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
def test_deduplicates_urls(self, tmp_path):
    """Test that duplicate URLs are deduplicated."""
    # Three anchors to the same target must yield a single output record.
    page = tmp_path / 'page.html'
    page.write_text('''
<html>
<body>
<a href="https://example.com">Link 1</a>
<a href="https://example.com">Link 2</a>
<a href="https://example.com">Link 3</a>
</body>
</html>
''')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    records = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
    assert len(records) == 1
def test_excludes_source_url(self, tmp_path):
    """Test that the source URL itself is excluded from results."""
    page = tmp_path / 'page.html'
    source_url = f'file://{page}'
    # The page links back to itself; that self-link must not be emitted.
    page.write_text(f'''
<html>
<body>
<a href="{source_url}">Self</a>
<a href="https://other.com">Other</a>
</body>
</html>
''')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', source_url],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    records = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
    assert len(records) == 1
    assert json.loads(records[0])['url'] == 'https://other.com'
def test_exits_1_when_no_urls_found(self, tmp_path):
    """Test that script exits with code 1 when no URLs found."""
    page = tmp_path / 'page.html'
    page.write_text('<html><body>No links here</body></html>')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    # A page with no extractable links is reported as a failure.
    assert proc.returncode == 1
    assert 'No URLs found' in proc.stderr
def test_handles_malformed_html(self, tmp_path):
    """Test handling of malformed HTML."""
    # Unclosed <a> tag and missing </html>: both links should still be found.
    page = tmp_path / 'malformed.html'
    page.write_text('''
<html>
<body>
<a href="https://example.com">Unclosed tag
<a href="https://other.com">Another link</a>
</body>
''')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    records = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
    assert len(records) == 2
def test_output_is_valid_json(self, tmp_path):
    """Test that output contains required fields."""
    page = tmp_path / 'page.html'
    page.write_text('<a href="https://example.com">Link</a>')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    record = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
    # Every record must carry the url plus type/via_extractor metadata.
    assert record['url'] == 'https://example.com'
    assert 'type' in record
    assert 'via_extractor' in record
# Support running this test module directly, outside of a pytest invocation.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
Parse JSONL bookmark files and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads JSONL-format bookmark exports (one JSON object per line).
Usage: ./on_Snapshot__54_parse_jsonl_urls.py --url=<url>
Output: Appends discovered URLs to urls.jsonl in current directory
Expected JSONL format (one object per line):
{"url": "https://example.com", "title": "Example", "tags": "tag1,tag2"}
{"href": "https://other.com", "description": "Other Site"}
Supports various field names for URL, title, timestamp, and tags.
"""
import json
import os
import sys
from datetime import datetime
from html import unescape
from urllib.parse import urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_jsonl_urls'
def parse_bookmarked_at(link: dict) -> str | None:
"""Parse timestamp from various JSON formats, return ISO 8601."""
from datetime import timezone
def json_date(s: str) -> datetime:
# Try ISO 8601 format
return datetime.strptime(s.split(',', 1)[0], '%Y-%m-%dT%H:%M:%S%z')
def to_iso(dt: datetime) -> str:
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt.isoformat()
try:
if link.get('bookmarked_at'):
# Already in our format, pass through
return link['bookmarked_at']
elif link.get('timestamp'):
# Chrome/Firefox histories use microseconds
return to_iso(datetime.fromtimestamp(link['timestamp'] / 1000000, tz=timezone.utc))
elif link.get('time'):
return to_iso(json_date(link['time']))
elif link.get('created_at'):
return to_iso(json_date(link['created_at']))
elif link.get('created'):
return to_iso(json_date(link['created']))
elif link.get('date'):
return to_iso(json_date(link['date']))
elif link.get('bookmarked'):
return to_iso(json_date(link['bookmarked']))
elif link.get('saved'):
return to_iso(json_date(link['saved']))
except (ValueError, TypeError, KeyError):
pass
return None
def json_object_to_entry(link: dict) -> dict | None:
    """Convert one parsed JSON bookmark object into a urls.jsonl entry.

    Returns None when the object has no recognizable URL field.
    """
    # Accept the common URL field spellings used by various exporters.
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url:
        return None

    entry = {
        'type': 'Snapshot',
        'url': unescape(url),
        'via_extractor': EXTRACTOR_NAME,
    }

    # Title: prefer an explicit title, then description, then name.
    if link.get('title'):
        title = link['title'].strip()
    elif link.get('description'):
        title = link['description'].replace(' — Readability', '').strip()
    elif link.get('name'):
        title = link['name'].strip()
    else:
        title = None
    if title:
        entry['title'] = unescape(title)

    # Timestamp (ISO 8601), if any of the known fields parse.
    bookmarked_at = parse_bookmarked_at(link)
    if bookmarked_at:
        entry['bookmarked_at'] = bookmarked_at

    # Tags may arrive as a list or a string; normalize to comma-separated.
    tags = link.get('tags', '')
    if isinstance(tags, list):
        tags = ','.join(tags)
    elif isinstance(tags, str) and tags and ',' not in tags:
        # No commas present: treat whitespace as the tag separator.
        tags = tags.replace(' ', ',')
    if tags:
        entry['tags'] = unescape(tags)
    return entry
def fetch_content(url: str) -> str:
    """Return the text contents of a file:// or http(s):// URL.

    Remote fetches honor the TIMEOUT and USER_AGENT environment variables.
    Undecodable bytes are replaced rather than raising.
    """
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='JSONL file URL to parse')
def main(url: str):
    """Parse JSONL bookmark file and extract URLs.

    Reads one JSON object per line, converts each to a Snapshot entry,
    writes Tag records followed by Snapshot records to urls.jsonl, and
    exits 1 when the input cannot be fetched or yields no URLs.
    """
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # One JSON object per line; blank and malformed lines are skipped.
    entries = []
    for raw_line in content.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        entry = json_object_to_entry(record)
        if entry:
            entries.append(entry)

    if not entries:
        click.echo('No URLs found', err=True)
        sys.exit(1)

    # Gather the unique tag names referenced by any entry.
    unique_tags = set()
    for entry in entries:
        tag_field = entry.get('tags')
        if tag_field:
            unique_tags.update(t.strip() for t in tag_field.split(',') if t.strip())

    # Tag records go first so consumers can register tags before snapshots.
    with open('urls.jsonl', 'w') as f:
        for tag_name in sorted(unique_tags):
            f.write(json.dumps({'type': 'Tag', 'name': tag_name}) + '\n')
        for entry in entries:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(entries)} URLs, {len(unique_tags)} tags')
    sys.exit(0)
# Run the CLI when executed directly (e.g. by the plugin runner).
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""Unit tests for parse_jsonl_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.py'), None)
class TestParseJsonlUrls:
    """Test the parse_jsonl_urls extractor CLI."""
    # Each test invokes the extractor script as a subprocess with cwd set to
    # a temp dir, then inspects the urls.jsonl it writes there.

    def test_extracts_urls_from_jsonl(self, tmp_path):
        """Test extracting URLs from JSONL bookmark file."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text(
            '{"url": "https://example.com", "title": "Example"}\n'
            '{"url": "https://foo.bar/page", "title": "Foo Bar"}\n'
            '{"url": "https://test.org", "title": "Test Org"}\n'
        )
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3
        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}
        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles

    def test_supports_href_field(self, tmp_path):
        """Test that 'href' field is recognized as URL."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"href": "https://example.com", "title": "Test"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'

    def test_supports_description_as_title(self, tmp_path):
        """Test that 'description' field is used as title fallback."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com", "description": "A description"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['title'] == 'A description'

    def test_parses_various_timestamp_formats(self, tmp_path):
        """Test parsing of different timestamp field names."""
        input_file = tmp_path / 'bookmarks.jsonl'
        # 'timestamp' is interpreted as microseconds since the epoch.
        input_file.write_text('{"url": "https://example.com", "timestamp": 1609459200000000}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_parses_tags_as_string(self, tmp_path):
        """Test parsing tags as comma-separated string."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com", "tags": "tech,news,reading"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        # Parser converts tags to separate Tag objects in the output
        content = output_file.read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_parses_tags_as_list(self, tmp_path):
        """Test parsing tags as JSON array."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com", "tags": ["tech", "news"]}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        # Parser converts tags to separate Tag objects in the output
        content = output_file.read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_skips_malformed_lines(self, tmp_path):
        """Test that malformed JSON lines are skipped."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text(
            '{"url": "https://valid.com"}\n'
            'not valid json\n'
            '{"url": "https://also-valid.com"}\n'
        )
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        # Only the two parseable lines survive.
        assert len(lines) == 2

    def test_skips_entries_without_url(self, tmp_path):
        """Test that entries without URL field are skipped."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text(
            '{"url": "https://valid.com"}\n'
            '{"title": "No URL here"}\n'
            '{"url": "https://also-valid.com"}\n'
        )
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Test that script exits with code 1 when no URLs found."""
        input_file = tmp_path / 'empty.jsonl'
        input_file.write_text('{"title": "No URL"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'No URLs found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/bookmarks.jsonl'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com/page?a=1&amp;b=2", "title": "Test &amp; Title"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_skips_empty_lines(self, tmp_path):
        """Test that empty lines are skipped."""
        input_file = tmp_path / 'bookmarks.jsonl'
        # Includes an empty line and a whitespace-only line between records.
        input_file.write_text(
            '{"url": "https://example.com"}\n'
            '\n'
            '   \n'
            '{"url": "https://other.com"}\n'
        )
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2

    def test_output_includes_required_fields(self, tmp_path):
        """Test that output includes required fields."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'via_extractor' in entry
# Support running this test module directly, outside of a pytest invocation.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
Parse Netscape bookmark HTML files and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads Netscape-format bookmark exports (produced by all major browsers).
Usage: ./on_Snapshot__53_parse_netscape_urls.py --url=<url>
Output: Appends discovered URLs to urls.jsonl in current directory
Examples:
./on_Snapshot__53_parse_netscape_urls.py --url=file:///path/to/bookmarks.html
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from html import unescape
from urllib.parse import urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_netscape_urls'
# Regex pattern for Netscape bookmark format
# Example: <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" TAGS="tag1,tag2">example title</A>
NETSCAPE_PATTERN = re.compile(
r'<a\s+href="([^"]+)"\s+add_date="(\d+)"(?:\s+[^>]*?tags="([^"]*)")?[^>]*>([^<]+)</a>',
re.UNICODE | re.IGNORECASE
)
def fetch_content(url: str) -> str:
    """Return the text contents of a file:// or http(s):// URL.

    Remote fetches honor the TIMEOUT and USER_AGENT environment variables.
    Undecodable bytes are replaced rather than raising.
    """
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
def main(url: str):
    """Parse Netscape bookmark HTML and extract URLs.

    Scans the input line by line with NETSCAPE_PATTERN, writes Tag records
    followed by Snapshot records to urls.jsonl, and exits 1 when the input
    cannot be fetched or contains no bookmarks.
    """
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    entries = []
    unique_tags = set()
    for line in content.splitlines():
        match = NETSCAPE_PATTERN.search(line)
        if not match:
            continue
        href = match.group(1)
        add_date = match.group(2)
        tags_str = match.group(3) or ''
        title = match.group(4).strip()

        entry = {
            'type': 'Snapshot',
            'url': unescape(href),
            'via_extractor': EXTRACTOR_NAME,
        }
        if title:
            entry['title'] = unescape(title)
        if tags_str:
            entry['tags'] = tags_str
            unique_tags.update(t.strip() for t in tags_str.split(',') if t.strip())
        try:
            # ADD_DATE is a unix timestamp; store it as ISO 8601 UTC.
            entry['bookmarked_at'] = datetime.fromtimestamp(float(add_date), tz=timezone.utc).isoformat()
        except (ValueError, OSError):
            pass
        entries.append(entry)

    if not entries:
        click.echo('No bookmarks found', err=True)
        sys.exit(1)

    # Tag records go first so consumers can register tags before snapshots.
    with open('urls.jsonl', 'w') as f:
        for tag_name in sorted(unique_tags):
            f.write(json.dumps({'type': 'Tag', 'name': tag_name}) + '\n')
        for entry in entries:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(entries)} URLs, {len(unique_tags)} tags')
    sys.exit(0)
# Run the CLI when executed directly (e.g. by the plugin runner).
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""Unit tests for parse_netscape_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)
class TestParseNetscapeUrls:
    """Test the parse_netscape_urls extractor CLI."""
    # Each test invokes the extractor script as a subprocess with cwd set to
    # a temp dir, then inspects the urls.jsonl it writes there.

    def test_extracts_urls_from_netscape_bookmarks(self, tmp_path):
        """Test extracting URLs from Netscape bookmark HTML format."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Example Site</A>
<DT><A HREF="https://foo.bar/page" ADD_DATE="1609545600">Foo Bar</A>
<DT><A HREF="https://test.org" ADD_DATE="1609632000">Test Org</A>
</DL><p>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3
        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}
        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example Site' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles

    def test_parses_add_date_timestamps(self, tmp_path):
        """Test that ADD_DATE timestamps are parsed correctly."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Test</A>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_handles_query_params_in_urls(self, tmp_path):
        """Test that URLs with query parameters are preserved."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com/search?q=test+query&page=1" ADD_DATE="1609459200">Search</A>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert 'q=test+query' in entry['url']
        assert 'page=1' in entry['url']

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com/page?a=1&amp;b=2" ADD_DATE="1609459200">Test &amp; Title</A>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_exits_1_when_no_bookmarks_found(self, tmp_path):
        """Test that script exits with code 1 when no bookmarks found."""
        input_file = tmp_path / 'empty.html'
        input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
</DL><p>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'No bookmarks found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/bookmarks.html'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_nested_folders(self, tmp_path):
        """Test parsing bookmarks in nested folder structure."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
<DT><H3>Folder 1</H3>
<DL><p>
<DT><A HREF="https://example.com/nested1" ADD_DATE="1609459200">Nested 1</A>
<DT><H3>Subfolder</H3>
<DL><p>
<DT><A HREF="https://example.com/nested2" ADD_DATE="1609459200">Nested 2</A>
</DL><p>
</DL><p>
<DT><A HREF="https://example.com/top" ADD_DATE="1609459200">Top Level</A>
</DL><p>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}
        # Folder nesting is flattened: bookmarks at every depth are found.
        assert 'https://example.com/nested1' in urls
        assert 'https://example.com/nested2' in urls
        assert 'https://example.com/top' in urls

    def test_case_insensitive_parsing(self, tmp_path):
        """Test that parsing is case-insensitive for HTML tags."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<dt><a HREF="https://example.com" ADD_DATE="1609459200">Test</a>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
# Support running this test module directly, outside of a pytest invocation.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Parse RSS/Atom feeds and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads feed content from a URL and extracts article URLs.
Usage: ./on_Snapshot__51_parse_rss_urls.py --url=<url>
Output: Appends discovered URLs to urls.jsonl in current directory
Examples:
./on_Snapshot__51_parse_rss_urls.py --url=https://example.com/feed.rss
./on_Snapshot__51_parse_rss_urls.py --url=file:///path/to/feed.xml
"""
import json
import os
import sys
from datetime import datetime, timezone
from html import unescape
from time import mktime
from urllib.parse import urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_rss_urls'
try:
import feedparser
except ImportError:
feedparser = None
def fetch_content(url: str) -> str:
    """Return the text contents of a file:// or http(s):// URL.

    Remote fetches honor the TIMEOUT and USER_AGENT environment variables.
    Undecodable bytes are replaced rather than raising.
    """
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
def main(url: str):
    """Parse RSS/Atom feed and extract article URLs.

    Uses feedparser to read the feed, emits Tag records followed by
    Snapshot records to urls.jsonl, and exits 1 when feedparser is
    missing, the feed cannot be fetched, or it yields no URLs.
    """
    if feedparser is None:
        click.echo('feedparser library not installed', err=True)
        sys.exit(1)
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    feed = feedparser.parse(content)
    if not feed.entries:
        click.echo('No entries found in feed', err=True)
        sys.exit(1)

    entries = []
    for item in feed.entries:
        link = getattr(item, 'link', None)
        if not link:
            continue

        # Prefer the published date, falling back to the updated date.
        published = None
        if getattr(item, 'published_parsed', None):
            published = datetime.fromtimestamp(mktime(item.published_parsed), tz=timezone.utc).isoformat()
        elif getattr(item, 'updated_parsed', None):
            published = datetime.fromtimestamp(mktime(item.updated_parsed), tz=timezone.utc).isoformat()

        # Join the feed's category terms into a comma-separated tag string.
        tags = ''
        if getattr(item, 'tags', None):
            try:
                tags = ','.join(t.term for t in item.tags if hasattr(t, 'term'))
            except (AttributeError, TypeError):
                pass

        entry = {
            'type': 'Snapshot',
            'url': unescape(link),
            'via_extractor': EXTRACTOR_NAME,
        }
        title = getattr(item, 'title', None)
        if title:
            entry['title'] = unescape(title)
        if published:
            entry['bookmarked_at'] = published
        if tags:
            entry['tags'] = tags
        entries.append(entry)

    if not entries:
        click.echo('No valid URLs found in feed entries', err=True)
        sys.exit(1)

    # Gather the unique tag names referenced by any entry.
    unique_tags = set()
    for entry in entries:
        if entry.get('tags'):
            unique_tags.update(t.strip() for t in entry['tags'].split(',') if t.strip())

    # Tag records go first so consumers can register tags before snapshots.
    with open('urls.jsonl', 'w') as f:
        for tag_name in sorted(unique_tags):
            f.write(json.dumps({'type': 'Tag', 'name': tag_name}) + '\n')
        for entry in entries:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(entries)} URLs, {len(unique_tags)} tags')
    sys.exit(0)
# Run the CLI when executed directly (e.g. by the plugin runner).
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""Unit tests for parse_rss_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
class TestParseRssUrls:
"""Test the parse_rss_urls extractor CLI."""
def test_parses_real_rss_feed(self, tmp_path):
"""Test parsing a real RSS feed from the web."""
# Use httpbin.org which provides a sample RSS feed
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', 'https://news.ycombinator.com/rss'],
cwd=tmp_path,
capture_output=True,
text=True,
timeout=30
)
# HN RSS feed should parse successfully
if result.returncode == 0:
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists(), "Output file not created"
content = output_file.read_text()
assert len(content) > 0, "No URLs extracted from real RSS feed"
# Verify at least one URL was extracted
lines = content.strip().split('\n')
assert len(lines) > 0, "No entries found in RSS feed"
def test_extracts_urls_from_rss_feed(self, tmp_path):
"""Test extracting URLs from an RSS 2.0 feed."""
input_file = tmp_path / 'feed.rss'
input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test Feed</title>
<link>https://example.com</link>
<item>
<title>First Post</title>
<link>https://example.com/post/1</link>
<pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
</item>
<item>
<title>Second Post</title>
<link>https://example.com/post/2</link>
<pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate>
</item>
</channel>
</rss>
''')
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
cwd=tmp_path,
capture_output=True,
text=True,
)
assert result.returncode == 0
assert 'Found 2 URLs' in result.stdout
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists()
lines = output_file.read_text().strip().split('\n')
assert len(lines) == 2
entries = [json.loads(line) for line in lines]
urls = {e['url'] for e in entries}
titles = {e.get('title') for e in entries}
assert 'https://example.com/post/1' in urls
assert 'https://example.com/post/2' in urls
assert 'First Post' in titles
assert 'Second Post' in titles
def test_extracts_urls_from_atom_feed(self, tmp_path):
"""Test extracting URLs from an Atom feed."""
input_file = tmp_path / 'feed.atom'
input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Test Atom Feed</title>
<entry>
<title>Atom Post 1</title>
<link href="https://atom.example.com/entry/1"/>
<updated>2024-01-01T12:00:00Z</updated>
</entry>
<entry>
<title>Atom Post 2</title>
<link href="https://atom.example.com/entry/2"/>
<updated>2024-01-02T12:00:00Z</updated>
</entry>
</feed>
''')
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
cwd=tmp_path,
capture_output=True,
text=True,
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
urls = {json.loads(line)['url'] for line in lines}
assert 'https://atom.example.com/entry/1' in urls
assert 'https://atom.example.com/entry/2' in urls
def test_exits_1_when_no_entries(self, tmp_path):
"""Test that script exits with code 1 when feed has no entries."""
input_file = tmp_path / 'empty.rss'
input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Empty Feed</title>
</channel>
</rss>
''')
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
cwd=tmp_path,
capture_output=True,
text=True,
)
assert result.returncode == 1
assert 'No entries found' in result.stderr
def test_exits_1_when_file_not_found(self, tmp_path):
"""Test that script exits with code 1 when file doesn't exist."""
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/feed.rss'],
cwd=tmp_path,
capture_output=True,
text=True,
)
assert result.returncode == 1
assert 'Failed to fetch' in result.stderr
    def test_handles_html_entities_in_urls(self, tmp_path):
        """XML-escaped URLs (&amp;) must be unescaped back to literal characters."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Entity Test</title>
<link>https://example.com/page?a=1&amp;b=2</link>
</item>
</channel>
</rss>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        # Single entry expected, with &amp; decoded to a literal &
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
    def test_includes_optional_metadata(self, tmp_path):
        """Optional <title> and <pubDate> fields should be carried into the output."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test Title</title>
<link>https://example.com/test</link>
<pubDate>Wed, 15 Jan 2020 10:30:00 GMT</pubDate>
</item>
</channel>
</rss>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/test'
        assert entry['title'] == 'Test Title'
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry


if __name__ == '__main__':
    # Allow running this test module directly without a pytest invocation
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,137 @@
#!/usr/bin/env python3
"""
Parse plain text files and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads text content from a URL (file:// or https://) and extracts all URLs found.
Usage: ./on_Snapshot__52_parse_txt_urls.py --url=<url>
Output: Writes discovered URLs to urls.jsonl in current directory (overwriting any existing file)
Examples:
./on_Snapshot__52_parse_txt_urls.py --url=file:///path/to/urls.txt
./on_Snapshot__52_parse_txt_urls.py --url=https://example.com/urls.txt
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from html import unescape
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen
import rich_click as click
EXTRACTOR_NAME = 'parse_txt_urls'
# URL regex from archivebox/misc/util.py
# https://mathiasbynens.be/demo/url-regex
# NOTE: the whole pattern is wrapped in a zero-width lookahead (?=(...)), so
# re.findall() returns the captured group at every position where a URL starts;
# this allows overlapping candidate matches that fix_url_from_markdown can trim.
URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'  # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'  # followed by allowed alphanum characters
    r'|[-_$@.&+!*\(\),]'  # or allowed symbols (keep hyphen first to match literal hyphen)
    r'|[^\u0000-\u007F])+'  # or allowed unicode bytes
    r'[^\]\[<>"\'\s]+'  # stop parsing at these symbols
    r'))',
    re.IGNORECASE | re.UNICODE,
)
def parens_are_matched(string: str, open_char='(', close_char=')') -> bool:
    """Return True when every close_char pairs with an earlier open_char and none are left open."""
    depth = 0
    for char in string:
        if char == open_char:
            depth += 1
        elif char == close_char:
            depth -= 1
            # A close before its matching open can never be balanced
            if depth < 0:
                return False
    return depth == 0
def fix_url_from_markdown(url_str: str) -> str:
    """
    Cleanup a regex-parsed URL that may contain trailing parens from markdown syntax.
    Example: https://wiki.org/article_(Disambiguation).html?q=1).text -> https://wiki.org/article_(Disambiguation).html?q=1
    """
    candidate = url_str
    # Drop trailing characters one at a time until the parens balance out
    while not parens_are_matched(candidate):
        candidate = candidate[:-1]
    # Only accept the trimmed form if it still matches the URL pattern;
    # otherwise fall back to the untrimmed original
    if re.findall(URL_REGEX, candidate):
        return candidate
    return url_str
def find_all_urls(text: str):
    """Yield every URL found in text, with trailing markdown paren artifacts stripped."""
    for candidate in re.findall(URL_REGEX, text):
        yield fix_url_from_markdown(candidate)
def fetch_content(url: str) -> str:
    """Fetch text content from a URL (supports file:// and http(s)://)."""
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        # Local file: read straight from disk, tolerating bad bytes
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    # Remote URL: honor TIMEOUT / USER_AGENT environment overrides
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
def main(url: str):
    """Parse plain text and extract URLs.

    Exit codes: 0 with urls.jsonl written on success, 1 when the source
    cannot be fetched or contains no URLs.
    """
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)
    urls_found = set()
    for found_url in find_all_urls(content):
        # Decode HTML entities (e.g. &amp; -> &) left over from XML/HTML sources
        cleaned_url = unescape(found_url)
        # Skip the source URL itself
        if cleaned_url != url:
            urls_found.add(cleaned_url)
    if not urls_found:
        click.echo('No URLs found', err=True)
        sys.exit(1)
    # Write urls.jsonl — opened with 'w', so each run OVERWRITES rather than appends
    with open('urls.jsonl', 'w') as f:
        for found_url in sorted(urls_found):
            f.write(json.dumps({
                'type': 'Snapshot',
                'url': found_url,
                'via_extractor': EXTRACTOR_NAME,
            }) + '\n')
    click.echo(f'Found {len(urls_found)} URLs')
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""Unit tests for parse_txt_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
# Resolve the hook by glob so the numeric priority prefix can change without breaking tests;
# falls back to None (rather than raising StopIteration) when the script is missing.
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_txt_urls.py'), None)


class TestParseTxtUrls:
    """Test the parse_txt_urls extractor CLI.

    Every test shells out to the hook script with a file:// input and inspects
    the urls.jsonl it writes into the subprocess's cwd (tmp_path).
    """

    def test_extracts_urls_including_real_example_com(self, tmp_path):
        """Test extracting URLs from plain text including real example.com."""
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('''
https://example.com
https://example.com/page
https://www.iana.org/domains/reserved
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0, f"Failed: {result.stderr}"
        assert 'Found 3 URLs' in result.stdout
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3
        urls = set()
        for line in lines:
            entry = json.loads(line)
            assert 'url' in entry
            urls.add(entry['url'])
        # Verify real URLs are extracted correctly
        assert 'https://example.com' in urls
        assert 'https://example.com/page' in urls
        assert 'https://www.iana.org/domains/reserved' in urls

    def test_extracts_urls_from_mixed_content(self, tmp_path):
        """Test extracting URLs embedded in prose text."""
        input_file = tmp_path / 'mixed.txt'
        input_file.write_text('''
Check out this great article at https://blog.example.com/post
You can also visit http://docs.test.org for more info.
Also see https://github.com/user/repo for the code.
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}
        assert 'https://blog.example.com/post' in urls
        assert 'http://docs.test.org' in urls
        assert 'https://github.com/user/repo' in urls

    def test_handles_markdown_urls(self, tmp_path):
        """Test handling URLs in markdown format with parentheses."""
        input_file = tmp_path / 'markdown.txt'
        input_file.write_text('''
[Example](https://example.com/page)
[Wiki](https://en.wikipedia.org/wiki/Article_(Disambiguation))
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}
        # The closing markdown paren must be trimmed off the plain URL,
        # but balanced parens inside the wiki URL must survive
        assert 'https://example.com/page' in urls
        assert any('wikipedia.org' in u for u in urls)

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Test that script exits with code 1 when no URLs found."""
        input_file = tmp_path / 'empty.txt'
        input_file.write_text('no urls here, just plain text')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'No URLs found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/path.txt'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_deduplicates_urls(self, tmp_path):
        """Test that duplicate URLs are deduplicated."""
        input_file = tmp_path / 'dupes.txt'
        input_file.write_text('''
https://example.com
https://example.com
https://example.com
https://other.com
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        # 3 duplicates collapse to 1, plus the distinct other.com entry
        assert len(lines) == 2

    def test_appends_to_existing_file(self, tmp_path):
        """Test that output creates urls.jsonl with extracted URLs."""
        # NOTE(review): despite the name, the extractor overwrites urls.jsonl
        # ('w' mode); this test only verifies creation, not appending.
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('https://new.com\nhttps://other.com')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2
        urls = {json.loads(line)['url'] for line in lines}
        assert 'https://new.com' in urls
        assert 'https://other.com' in urls

    def test_output_is_valid_json(self, tmp_path):
        """Test that output contains required fields."""
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('https://example.com')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'via_extractor' in entry


if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,295 @@
#!/usr/bin/env node
/**
* Print a URL to PDF using Chrome/Puppeteer.
*
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>
* Output: Writes pdf/output.pdf
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'pdf';
const OUTPUT_DIR = 'pdf';
const OUTPUT_FILE = 'output.pdf';
const CHROME_SESSION_DIR = 'chrome_session';
// Parse command line arguments of the form --key=value or bare --flag.
//   "--key=value"  -> args.key === "value" (the value may itself contain "=")
//   "--flag"       -> args.flag === true
//   "--key="       -> args.key === "" (explicit empty value)
// Hyphens in key names are normalized to underscores (--snapshot-id -> snapshot_id).
function parseArgs() {
  const args = {};
  process.argv.slice(2).forEach(arg => {
    if (arg.startsWith('--')) {
      const [key, ...valueParts] = arg.slice(2).split('=');
      // Bug fix: "--key=" previously produced `true` because '' is falsy under
      // `|| true`, letting an empty --url= slip past main()'s `!url` check.
      // Distinguish "no '=' present" (boolean flag) from "empty value after '='".
      args[key.replace(/-/g, '_')] = valueParts.length > 0 ? valueParts.join('=') : true;
    }
  });
  return args;
}
// Read an environment variable, trimmed; falls back to defaultValue when the
// variable is unset or empty (any falsy value triggers the fallback).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}

// Interpret an environment variable as a boolean: true/1/yes/on vs false/0/no/off
// (case-insensitive); anything else yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(normalized)) return true;
  if (['false', '0', 'no', 'off'].includes(normalized)) return false;
  return defaultValue;
}

// Interpret an environment variable as a base-10 integer; non-numeric input
// falls back to defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const parsed = parseInt(getEnv(name, String(defaultValue)), 10);
  return isNaN(parsed) ? defaultValue : parsed;
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
// True when the staticfile extractor left at least one file in its output dir.
function hasStaticFileOutput() {
  if (!fs.existsSync(STATICFILE_DIR)) return false;
  return fs.readdirSync(STATICFILE_DIR).length > 0;
}
// Get CDP URL from chrome_session if available.
// Returns the trimmed WebSocket endpoint string, or null when no session exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Locate a Chrome/Chromium executable: CHROME_BINARY env var wins, then a
// list of well-known install paths. Returns null when nothing is found.
function findChrome() {
  const fromEnv = getEnv('CHROME_BINARY');
  if (fromEnv && fs.existsSync(fromEnv)) {
    return fromEnv;
  }
  const knownPaths = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];
  for (const candidate of knownPaths) {
    if (candidate.startsWith('/') && fs.existsSync(candidate)) {
      return candidate;
    }
  }
  return null;
}
// Parse a "WIDTH,HEIGHT" resolution string; malformed or missing components
// fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',').map(part => parseInt(part.trim(), 10));
  return { width: parts[0] || 1440, height: parts[1] || 2000 };
}
// Print `url` to pdf/output.pdf.
// Prefers attaching to an existing Chrome session (left by the chrome_session
// extractor) over launching a fresh browser. Returns { success, output } on
// success or { success: false, error } on failure; never throws.
async function printToPdf(url) {
  // CHROME_*-prefixed vars take precedence over the generic fallbacks
  const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  let page = null;
  let connectedToSession = false;
  try {
    // Try to connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (cdpUrl) {
      try {
        browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: { width, height },
        });
        connectedToSession = true;
        // Get existing pages or create new one.
        // NOTE(review): this path never navigates — it assumes the session's
        // first http(s) page is already showing `url`; confirm upstream hook
        // ordering guarantees that.
        const pages = await browser.pages();
        page = pages.find(p => p.url().startsWith('http')) || pages[0];
        if (!page) {
          page = await browser.newPage();
        }
        // Set viewport on the page
        await page.setViewport({ width, height });
      } catch (e) {
        console.error(`Failed to connect to CDP session: ${e.message}`);
        browser = null; // fall through to launching our own browser
      }
    }
    // Fall back to launching new browser
    if (!browser) {
      const executablePath = findChrome();
      if (!executablePath) {
        return { success: false, error: 'Chrome binary not found' };
      }
      browser = await puppeteer.launch({
        executablePath,
        headless: headless ? 'new' : false,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          `--window-size=${width},${height}`,
          ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ],
        defaultViewport: { width, height },
      });
      page = await browser.newPage();
      // Navigate to URL (only if we launched fresh browser)
      if (userAgent) {
        await page.setUserAgent(userAgent);
      }
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout,
      });
    }
    // Print to PDF
    await page.pdf({
      path: outputPath,
      format: 'A4',
      printBackground: true,
      margin: {
        top: '0.5in',
        right: '0.5in',
        bottom: '0.5in',
        left: '0.5in',
      },
    });
    // page.pdf() writes the file itself; verify it actually landed on disk
    if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
      return { success: true, output: outputPath };
    } else {
      return { success: false, error: 'PDF file not created' };
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Only close browser if we launched it (not if we connected to session)
    if (browser && !connectedToSession) {
      await browser.close();
    }
  }
}
// CLI entry point: validates args, runs the PDF extraction, and prints the
// orchestrator protocol lines (START_TS/END_TS/DURATION/OUTPUT/STATUS/ERROR
// plus a machine-readable RESULT_JSON line). Exits 0 on success/skip, 1 otherwise.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if staticfile extractor already handled this (permanent skip)
    if (hasStaticFileOutput()) {
      console.log(`Skipping PDF - staticfile extractor already downloaded this`);
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - staticfile already handled
    } else {
      const result = await printToPdf(url);
      if (result.success) {
        status = 'succeeded';
        output = result.output;
        const size = fs.statSync(output).size;
        console.log(`PDF saved (${size} bytes)`);
      } else {
        status = 'failed';
        error = result.error;
      }
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (one KEY=value line each, parsed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level safety net: any unhandled rejection becomes a clean exit-1
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
Install a binary using pip package manager.
Usage: on_Dependency__install_using_pip_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, PipProvider, BinProviderOverrides
# Fix pydantic forward reference issue
PipProvider.model_rebuild()
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using pip.

    Exits 0 (a skip, not an error) when pip is not in the allowed provider list,
    exits 1 on actual install failures. On success, writes one InstalledBinary
    JSONL record to stdout; all human-readable logging goes to stderr so stdout
    stays machine-parseable.
    """
    if bin_providers != '*' and 'pip' not in bin_providers.split(','):
        click.echo(f"pip provider not allowed for {bin_name}", err=True)
        sys.exit(0)  # not an error: a different provider hook will handle it
    # Use abx-pkg PipProvider to install binary
    provider = PipProvider()
    if not provider.INSTALLER_BIN:
        click.echo("pip not available on this system", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via pip...", err=True)
    # NOTE(review): custom_cmd is accepted but never used below — presumably it
    # should override the default install invocation; confirm intended behavior.
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"pip install failed: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after pip install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'pip',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,29 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_READABILITY": {
"type": "boolean",
"default": true,
"description": "Enable Readability text extraction"
},
"READABILITY_BINARY": {
"type": "string",
"default": "readability-extractor",
"description": "Path to readability-extractor binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"READABILITY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Readability in seconds"
}
}
}

View File

@@ -0,0 +1,219 @@
#!/usr/bin/env python3
"""
Extract article content using Mozilla's Readability.
Usage: on_Snapshot__<NN>_readability.py --url=<url> --snapshot-id=<uuid>
Output: Creates readability/ directory with content.html, content.txt, article.json
Environment variables:
READABILITY_BINARY: Path to readability-cli binary
TIMEOUT: Timeout in seconds (default: 60)
Note: Requires readability-cli: npm install -g readability-cli
This extractor looks for HTML source from other extractors (wget, singlefile, dom)
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'readability'
BIN_NAME = 'readability-cli'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'readability'
def get_env(name: str, default: str = '') -> str:
    """Return the named environment variable, whitespace-stripped, or default when unset."""
    return os.environ.get(name, default).strip()


def get_env_int(name: str, default: int = 0) -> int:
    """Return the named environment variable parsed as an int, or default when unset/invalid."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def find_readability() -> str | None:
    """Locate the readability CLI: READABILITY_BINARY env override first, then PATH."""
    configured = get_env('READABILITY_BINARY')
    if configured and os.path.isfile(configured):
        return configured
    # The npm package installs the binary as 'readable' in some versions,
    # so probe both names on PATH
    for candidate in ('readability-cli', 'readable'):
        found = shutil.which(candidate)
        if found:
            return found
    return None
def get_version(binary: str) -> str:
    """Best-effort `--version` probe of the binary; returns '' on any failure."""
    try:
        proc = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
        # Truncate to keep downstream VERSION= log lines bounded
        return proc.stdout.strip()[:64]
    except Exception:
        return ''
def find_html_source() -> str | None:
"""Find HTML content from other extractors in the snapshot directory."""
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
search_patterns = [
'singlefile/singlefile.html',
'singlefile/*.html',
'dom/output.html',
'dom/*.html',
'wget/**/*.html',
'wget/**/*.htm',
]
cwd = Path.cwd()
for pattern in search_patterns:
matches = list(cwd.glob(pattern))
for match in matches:
if match.is_file() and match.stat().st_size > 0:
return str(match)
return None
def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Extract article using Readability.

    Never fetches `url` itself — it operates on HTML already saved by an
    earlier extractor (singlefile/dom/wget).
    Returns: (success, output_path, error_message)
    """
    timeout = get_env_int('TIMEOUT', 60)
    # Find HTML source
    html_source = find_html_source()
    if not html_source:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    try:
        # Run readability-cli; --json makes it emit metadata + content as one JSON doc
        cmd = [binary, '--json', html_source]
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
        if result.returncode != 0:
            stderr = result.stderr.decode('utf-8', errors='replace')
            return False, None, f'readability-cli failed: {stderr[:200]}'
        # Parse JSON output
        try:
            result_json = json.loads(result.stdout)
        except json.JSONDecodeError:
            return False, None, 'readability-cli returned invalid JSON'
        # Extract and save content
        # readability-cli v2.x uses hyphenated field names; note the inner pop()
        # runs unconditionally, so BOTH the hyphenated and camelCase keys are
        # removed from result_json (neither leaks into article.json)
        text_content = result_json.pop('text-content', result_json.pop('textContent', ''))
        html_content = result_json.pop('html-content', result_json.pop('content', ''))
        if not text_content and not html_content:
            return False, None, 'No content extracted'
        (output_dir / 'content.html').write_text(html_content, encoding='utf-8')
        (output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
        # Remaining keys (title, byline, ...) become the article metadata file
        (output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')
        return True, OUTPUT_DIR, ''
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to extract article from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract article content using Mozilla's Readability.

    Emits the orchestrator protocol on stdout (START_TS/END_TS/DURATION/CMD/
    VERSION/OUTPUT/STATUS plus a RESULT_JSON line); errors go to stderr.
    Exits 0 on success, 1 on failure or missing dependency.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    try:
        # Find binary
        binary = find_readability()
        if not binary:
            # DEPENDENCY_NEEDED/BIN_PROVIDERS lines tell the orchestrator how
            # to install the missing tool before retrying this hook
            print(f'ERROR: readability-cli binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        # Run extraction
        success, output, error = extract_readability(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            text_file = Path(output) / 'content.txt'
            html_file = Path(output) / 'content.html'
            text_len = text_file.stat().st_size if text_file.exists() else 0
            html_len = html_file.stat().st_size if html_file.exists() else 0
            print(f'Readability extracted: {text_len} chars text, {html_len} chars HTML')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if binary:
        print(f'CMD={binary} --json <html>')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,254 @@
"""
Integration tests for readability plugin
Tests verify:
1. Plugin reports missing dependency correctly
2. readability-cli can be installed via npm (note: package name != binary name)
3. Extraction works against real example.com content
"""
import json
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
TEST_URL = 'https://example.com'
def create_example_html(tmpdir: Path) -> Path:
    """Write an example.com-like page under tmpdir/singlefile/ and return its path.

    The fixture carries enough paragraph text for Readability to accept it
    as a real article rather than rejecting it as boilerplate.
    """
    out_dir = tmpdir / 'singlefile'
    out_dir.mkdir()
    out_file = out_dir / 'singlefile.html'
    out_file.write_text('''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Example Domain</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<article>
<header>
<h1>Example Domain</h1>
</header>
<div class="content">
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p>Example domains are maintained by the Internet Assigned Numbers Authority (IANA)
to provide a well-known address for documentation purposes. This helps authors create
examples that readers can understand without confusion about actual domain ownership.</p>
<p>The practice of using example domains dates back to the early days of the internet.
These reserved domains ensure that example code and documentation doesn't accidentally
point to real, active websites that might change or disappear over time.</p>
<p>For more information about example domains and their history, you can visit the
IANA website. They maintain several example domains including example.com, example.net,
and example.org, all specifically reserved for this purpose.</p>
<p><a href="https://www.iana.org/domains/example">More information about example domains...</a></p>
</div>
</article>
</body>
</html>
''')
    return out_file
def test_hook_script_exists():
    """Verify hook script exists (glob at import time resolved to a real path)."""
    assert READABILITY_HOOK.exists(), f"Hook script not found: {READABILITY_HOOK}"
def test_reports_missing_dependency_when_not_installed():
    """Test that script reports DEPENDENCY_NEEDED when readability-cli is not found."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Create HTML source so it doesn't fail on missing HTML
        create_example_html(tmpdir)
        # Run with empty PATH so binary won't be found
        env = {'PATH': '/nonexistent', 'HOME': str(tmpdir)}
        result = subprocess.run(
            [sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env
        )
        # Should fail and report missing dependency
        assert result.returncode != 0, "Should exit non-zero when dependency missing"
        combined = result.stdout + result.stderr
        assert 'DEPENDENCY_NEEDED' in combined, "Should output DEPENDENCY_NEEDED"
        assert 'readability-cli' in combined or 'BIN_NAME' in combined, "Should mention readability-cli"
def test_can_install_readability_via_npm():
    """Test that readability-cli can be installed via npm and binary becomes available.

    Note: The npm package 'readability-cli' installs a binary named 'readable',
    so we test the full installation flow using npm install directly.
    Requires network access and a writable global npm prefix.
    """
    # Check npm is available
    if not shutil.which('npm'):
        pytest.skip("npm not available on this system")
    # Install readability-cli package via npm
    # The orchestrator/dependency hooks would call this via npm provider
    result = subprocess.run(
        ['npm', 'install', '-g', 'readability-cli'],
        capture_output=True,
        text=True,
        timeout=300
    )
    assert result.returncode == 0, f"npm install failed: {result.stderr}"
    # Verify the 'readable' binary is now available
    # (readability-cli package installs as 'readable' not 'readability-cli')
    result = subprocess.run(['which', 'readable'], capture_output=True, text=True)
    assert result.returncode == 0, "readable binary not found after npm install"
    binary_path = result.stdout.strip()
    assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
    # Test that it's executable and responds to --version
    result = subprocess.run(
        [binary_path, '--version'],
        capture_output=True,
        text=True,
        timeout=10
    )
    assert result.returncode == 0, f"Binary not executable: {result.stderr}"
def test_extracts_article_after_installation():
    """Test full workflow: ensure readability-cli installed then extract from example.com HTML.

    Uses a local HTML fixture (no network fetch by the hook itself); only the
    npm install step needs network access.
    """
    # Check npm is available
    if not shutil.which('npm'):
        pytest.skip("npm not available on this system")
    # Ensure readability-cli is installed (orchestrator would handle this)
    install_result = subprocess.run(
        ['npm', 'install', '-g', 'readability-cli'],
        capture_output=True,
        text=True,
        timeout=300
    )
    if install_result.returncode != 0:
        pytest.skip(f"Could not install readability-cli: {install_result.stderr}")
    # Now test extraction
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Create example.com HTML for readability to process
        create_example_html(tmpdir)
        # Run readability extraction (should find the installed binary)
        result = subprocess.run(
            [sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=30
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # Verify output directory created
        readability_dir = tmpdir / 'readability'
        assert readability_dir.exists(), "Output directory not created"
        # Verify output files exist
        html_file = readability_dir / 'content.html'
        txt_file = readability_dir / 'content.txt'
        json_file = readability_dir / 'article.json'
        assert html_file.exists(), "content.html not created"
        assert txt_file.exists(), "content.txt not created"
        assert json_file.exists(), "article.json not created"
        # Verify HTML content contains REAL example.com text
        html_content = html_file.read_text()
        assert len(html_content) > 100, f"HTML content too short: {len(html_content)} bytes"
        assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
        assert ('illustrative examples' in html_content.lower() or
                'use in' in html_content.lower() or
                'literature' in html_content.lower()), \
            "Missing example.com description in HTML"
        # Verify text content contains REAL example.com text
        txt_content = txt_file.read_text()
        assert len(txt_content) > 50, f"Text content too short: {len(txt_content)} bytes"
        assert 'example' in txt_content.lower(), "Missing 'example' in text"
        # Verify JSON metadata
        json_data = json.loads(json_file.read_text())
        assert isinstance(json_data, dict), "article.json should be a dict"
        # Verify stdout contains expected output
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'OUTPUT=readability' in result.stdout, "Should report output directory"
def test_fails_gracefully_without_html_source():
    """Test that extraction fails gracefully when no HTML source is available."""
    # The readability hook needs npm-installed readability-cli; skip if we
    # cannot satisfy that prerequisite on this machine.
    if shutil.which('npm') is None:
        pytest.skip("npm not available on this system")
    install_result = subprocess.run(
        ['npm', 'install', '-g', 'readability-cli'],
        capture_output=True,
        text=True,
        timeout=300,
    )
    if install_result.returncode != 0:
        pytest.skip("Could not install readability-cli")
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        # Deliberately provide no HTML source files in the working directory.
        result = subprocess.run(
            [sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=30,
        )
        assert result.returncode != 0, "Should fail without HTML source"
        combined_output = result.stdout + result.stderr
        lowered = combined_output.lower()
        reported = (
            'no html source' in lowered
            or 'not found' in lowered
            or 'ERROR=' in combined_output
        )
        assert reported, "Should report missing HTML source"
# Allow running this test module directly (without the pytest CLI wrapper).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env node
/**
* Track complete redirect chains for a URL.
*
* Captures:
* - HTTP redirects (301, 302, 303, 307, 308)
* - Meta refresh redirects
* - JavaScript redirects (basic detection)
* - Full redirect chain with timestamps
*
* Usage: on_Snapshot__15_redirects.js --url=<url> --snapshot-id=<uuid>
* Output: Writes redirects/redirects.json
*
* Environment variables:
* SAVE_REDIRECTS: Enable redirect tracking (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'redirects';          // reported in RESULT_JSON
const OUTPUT_DIR = 'redirects';              // created relative to the snapshot cwd
const OUTPUT_FILE = 'redirects.json';        // redirect chain is written here
const CHROME_SESSION_DIR = 'chrome_session'; // where chrome_session publishes cdp_url.txt
// Parse `--key=value` / `--flag` command line arguments into an object.
// Dashes in key names become underscores; bare flags get the value `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, falling back to defaultValue, always trimmed.
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an environment variable as a boolean flag; unrecognized or
// missing values yield defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Read the Chrome DevTools endpoint published by the chrome_session
// extractor; returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Track redirect chain
/**
 * Collect HTTP / meta-refresh / JavaScript redirect evidence for `url` using
 * the page already loaded by the chrome_session extractor, and write the
 * result to redirects/redirects.json.
 *
 * @param {string} url - The original (pre-redirect) URL being archived.
 * @returns {Promise<{success: boolean, output?: string, redirectData?: object, error?: string}>}
 */
async function trackRedirects(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  const redirectChain = [];
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Track all responses to capture redirects
    // NOTE(review): this listener is attached after chrome_session has already
    // navigated the page, so it only observes traffic occurring after this
    // point; the original navigation's 3xx hops may be missed — confirm intent.
    page.on('response', async (response) => {
      const status = response.status();
      const responseUrl = response.url();
      const headers = response.headers();
      // Check if it's a redirect
      if (status >= 300 && status < 400) {
        redirectChain.push({
          timestamp: new Date().toISOString(),
          url: responseUrl,
          status,
          statusText: response.statusText(),
          location: headers['location'] || headers['Location'] || '',
          type: 'http',
        });
      }
    });
    // Get the current URL (which is the final destination after redirects)
    const finalUrl = page.url();
    // Check for meta refresh redirects
    const metaRefresh = await page.evaluate(() => {
      const meta = document.querySelector('meta[http-equiv="refresh"]');
      if (meta) {
        const content = meta.getAttribute('content') || '';
        const match = content.match(/url=['"]?([^'"]+)['"]?/i);
        return {
          content,
          url: match ? match[1] : null,
        };
      }
      return null;
    });
    if (metaRefresh && metaRefresh.url) {
      redirectChain.push({
        timestamp: new Date().toISOString(),
        url: finalUrl,
        status: null,
        statusText: 'Meta Refresh',
        location: metaRefresh.url,
        type: 'meta_refresh',
        content: metaRefresh.content,
      });
    }
    // Check for JavaScript redirects (basic detection)
    // NOTE(review): this regex scan of the serialized DOM only detects
    // literal assignment patterns; dynamic redirects will not be found.
    const jsRedirect = await page.evaluate(() => {
      // Check for common JavaScript redirect patterns
      const html = document.documentElement.outerHTML;
      const patterns = [
        /window\.location\s*=\s*['"]([^'"]+)['"]/i,
        /window\.location\.href\s*=\s*['"]([^'"]+)['"]/i,
        /window\.location\.replace\s*\(\s*['"]([^'"]+)['"]\s*\)/i,
        /document\.location\s*=\s*['"]([^'"]+)['"]/i,
      ];
      for (const pattern of patterns) {
        const match = html.match(pattern);
        if (match) {
          return {
            pattern: pattern.toString(),
            url: match[1],
          };
        }
      }
      return null;
    });
    if (jsRedirect && jsRedirect.url) {
      redirectChain.push({
        timestamp: new Date().toISOString(),
        url: finalUrl,
        status: null,
        statusText: 'JavaScript Redirect',
        location: jsRedirect.url,
        type: 'javascript',
        pattern: jsRedirect.pattern,
      });
    }
    const redirectData = {
      original_url: url,
      final_url: finalUrl,
      redirect_count: redirectChain.length,
      redirects: redirectChain,
      is_redirect: redirectChain.length > 0,
    };
    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(redirectData, null, 2));
    return { success: true, output: outputPath, redirectData };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect only — the shared session browser must stay alive for
    // other extractors.
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entrypoint: parse --url/--snapshot-id, honor SAVE_REDIRECTS, run
 * trackRedirects(), and emit the START_TS/END_TS/STATUS/RESULT_JSON
 * key=value contract on stdout for the orchestrator to parse.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__15_redirects.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled
    if (!getEnvBool('SAVE_REDIRECTS', true)) {
      console.log('Skipping redirects (SAVE_REDIRECTS=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await trackRedirects(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const redirectCount = result.redirectData.redirect_count;
      const finalUrl = result.redirectData.final_url;
      if (redirectCount > 0) {
        console.log(`Tracked ${redirectCount} redirect(s) to: ${finalUrl}`);
      } else {
        console.log('No redirects detected');
      }
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  // Exit code signals success/failure to the hook runner.
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level entry: surface any unhandled rejection as a fatal non-zero exit.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,381 @@
#!/usr/bin/env node
/**
* Archive all network responses during page load.
*
* Connects to Chrome session and captures ALL network responses (XHR, images, scripts, etc.)
* Saves them in an organized directory structure with both timestamped unique files
* and URL-organized symlinks.
*
* Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<uuid>
* Output: Creates responses/ directory with:
* - all/<timestamp>__<METHOD>__<URL>.<ext>: Timestamped unique files
* - <type>/<domain>/<path>/: URL-organized symlinks by resource type
* - index.jsonl: Searchable index of all responses
*
* Environment variables:
* SAVE_RESPONSES: Enable response archiving (default: true)
* RESPONSES_TIMEOUT: Timeout in seconds (default: 120)
* RESPONSES_TYPES: Comma-separated resource types to save (default: all)
* Options: script,stylesheet,font,image,media,xhr,websocket,document
*/
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'responses';          // reported in RESULT_JSON
const OUTPUT_DIR = 'responses';              // created relative to the snapshot cwd
const CHROME_SESSION_DIR = 'chrome_session'; // where chrome_session publishes cdp_url.txt
// Resource types to capture (by default, capture everything)
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
// Parse `--key=value` / `--flag` command line arguments into an object.
// Dashes in key names become underscores; bare flags get the value `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, falling back to defaultValue, always trimmed.
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an environment variable as a boolean flag; unrecognized or
// missing values yield defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Parse an environment variable as a base-10 integer, with fallback.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Read the Chrome DevTools endpoint published by the chrome_session
// extractor; returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Map a Content-Type header value to a file extension ('' when unknown).
// Parameters after ';' (e.g. charset) are ignored.
function getExtensionFromMimeType(mimeType) {
  const base = (mimeType || '').split(';')[0].trim().toLowerCase();
  const table = new Map([
    ['text/html', 'html'],
    ['text/css', 'css'],
    ['text/javascript', 'js'],
    ['application/javascript', 'js'],
    ['application/x-javascript', 'js'],
    ['application/json', 'json'],
    ['application/xml', 'xml'],
    ['text/xml', 'xml'],
    ['image/png', 'png'],
    ['image/jpeg', 'jpg'],
    ['image/gif', 'gif'],
    ['image/svg+xml', 'svg'],
    ['image/webp', 'webp'],
    ['font/woff', 'woff'],
    ['font/woff2', 'woff2'],
    ['font/ttf', 'ttf'],
    ['font/otf', 'otf'],
    ['application/font-woff', 'woff'],
    ['application/font-woff2', 'woff2'],
    ['video/mp4', 'mp4'],
    ['video/webm', 'webm'],
    ['audio/mpeg', 'mp3'],
    ['audio/ogg', 'ogg'],
  ]);
  return table.get(base) || '';
}
// Extract a lowercase file extension from a URL's pathname.
// Returns '' for extensionless paths or unparseable URLs.
function getExtensionFromUrl(url) {
  try {
    const { pathname } = new URL(url);
    const found = pathname.match(/\.([a-z0-9]+)$/i);
    if (found === null) {
      return '';
    }
    return found[1].toLowerCase();
  } catch (err) {
    return '';
  }
}
// Replace filesystem-unsafe characters with '_' and cap the length.
function sanitizeFilename(str, maxLen = 200) {
  const safe = str.replace(/[^a-zA-Z0-9._-]/g, '_');
  return safe.slice(0, maxLen);
}
// Create symlink (handle errors gracefully)
/**
 * Create (or replace) a relative symlink at linkPath pointing to target.
 * Best-effort: any failure is logged and swallowed so a single bad link
 * never aborts the archiving run.
 *
 * @param {string} target - Existing file the link should resolve to.
 * @param {string} linkPath - Where to create the symlink.
 */
async function createSymlink(target, linkPath) {
  try {
    // Create parent directory (no-op when it already exists)
    const dir = path.dirname(linkPath);
    fs.mkdirSync(dir, { recursive: true });
    // Remove existing symlink/file if present.
    // BUG FIX: fs.existsSync() follows symlinks and returns false for a
    // dangling link, so stale broken links were never removed and
    // symlinkSync() then threw EEXIST. rmSync with force removes regular
    // files and (broken) symlinks alike, and is a no-op when nothing exists.
    fs.rmSync(linkPath, { force: true });
    // Create relative symlink so the archive stays relocatable
    const relativePath = path.relative(dir, target);
    fs.symlinkSync(relativePath, linkPath);
  } catch (e) {
    // Ignore symlink errors (file conflicts, permissions, etc.)
    console.error(`Failed to create symlink: ${e.message}`);
  }
}
// Archive responses by intercepting network traffic
/**
 * Capture network responses from the shared Chrome session and archive them
 * under responses/: one timestamped unique file per response in all/, a
 * per-resource-type symlink tree organized by URL, and an index.jsonl line
 * per saved response.
 *
 * @param {string} originalUrl - URL being archived.
 *   NOTE(review): this parameter is currently unused in the body — confirm.
 * @returns {Promise<{success: boolean, output?: string, savedCount?: number, indexPath?: string, error?: string}>}
 */
async function archiveResponses(originalUrl) {
  // NOTE(review): `timeout` is computed but never used below — the capture
  // window is the fixed 2s sleep near the end. Confirm intent.
  const timeout = (getEnvInt('RESPONSES_TIMEOUT') || getEnvInt('TIMEOUT', 120)) * 1000;
  const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
  const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
  // Create output directories
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const allDir = path.join(OUTPUT_DIR, 'all');
  if (!fs.existsSync(allDir)) {
    fs.mkdirSync(allDir, { recursive: true });
  }
  // Create index file
  const indexPath = path.join(OUTPUT_DIR, 'index.jsonl');
  fs.writeFileSync(indexPath, ''); // Clear existing
  let browser = null;
  let savedCount = 0;
  const savedResponses = [];
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Enable request interception
    await page.setRequestInterception(false); // Don't block requests
    // Listen for responses
    // NOTE(review): attached after chrome_session loaded the page, so only
    // traffic occurring during the 2s window below is captured.
    page.on('response', async (response) => {
      try {
        const request = response.request();
        const url = response.url();
        const resourceType = request.resourceType().toLowerCase();
        const method = request.method();
        const status = response.status();
        // Skip redirects and errors
        if (status >= 300 && status < 400) return;
        if (status >= 400 && status < 600) return;
        // Check if we should save this resource type
        if (typesToSave.length && !typesToSave.includes(resourceType)) {
          return;
        }
        // Get response body
        let bodyBuffer = null;
        try {
          bodyBuffer = await response.buffer();
        } catch (e) {
          // Some responses can't be captured (already consumed, etc.)
          return;
        }
        if (!bodyBuffer || bodyBuffer.length === 0) {
          return;
        }
        // Determine file extension (MIME type first, URL path as fallback)
        const mimeType = response.headers()['content-type'] || '';
        let extension = getExtensionFromMimeType(mimeType) || getExtensionFromUrl(url);
        // Create timestamp-based unique filename
        const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace(/\..+/, '');
        const urlHash = sanitizeFilename(encodeURIComponent(url).slice(0, 64));
        const uniqueFilename = `${timestamp}__${method}__${urlHash}${extension ? '.' + extension : ''}`;
        const uniquePath = path.join(allDir, uniqueFilename);
        // Save to unique file
        fs.writeFileSync(uniquePath, bodyBuffer);
        // Create URL-organized symlink
        try {
          const urlObj = new URL(url);
          const hostname = urlObj.hostname;
          const pathname = urlObj.pathname || '/';
          const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : '');
          const dirPath = path.dirname(pathname);
          // Create symlink: responses/<type>/<hostname>/<path>/<filename>
          const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath);
          const symlinkPath = path.join(symlinkDir, filename);
          await createSymlink(uniquePath, symlinkPath);
        } catch (e) {
          // URL parsing or symlink creation failed, skip
        }
        // Calculate SHA256 of both the body and the URL for the index
        const sha256 = crypto.createHash('sha256').update(bodyBuffer).digest('hex');
        const urlSha256 = crypto.createHash('sha256').update(url).digest('hex');
        // Write to index
        const indexEntry = {
          ts: timestamp,
          method,
          url: method === 'DATA' ? url.slice(0, 128) : url, // Truncate data: URLs
          urlSha256,
          status,
          resourceType,
          mimeType: mimeType.split(';')[0],
          responseSha256: sha256,
          path: './' + path.relative(OUTPUT_DIR, uniquePath),
          extension,
        };
        fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n');
        // NOTE(review): savedResponses is accumulated but not returned/used.
        savedResponses.push(indexEntry);
        savedCount++;
      } catch (e) {
        // Log but don't fail the whole extraction
        console.error(`Error capturing response: ${e.message}`);
      }
    });
    // Wait a bit to ensure we capture responses
    // (chrome_session already loaded the page, just capture any remaining traffic)
    await new Promise(resolve => setTimeout(resolve, 2000));
    return {
      success: true,
      output: OUTPUT_DIR,
      savedCount,
      indexPath,
    };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect only — the shared session browser must stay alive.
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entrypoint: parse --url/--snapshot-id, honor SAVE_RESPONSES, run
 * archiveResponses(), and emit the START_TS/END_TS/STATUS/RESULT_JSON
 * key=value contract on stdout for the orchestrator to parse.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let savedCount = 0;
  try {
    // Check if enabled
    if (!getEnvBool('SAVE_RESPONSES', true)) {
      console.log('Skipping responses (SAVE_RESPONSES=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await archiveResponses(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      savedCount = result.savedCount || 0;
      console.log(`Saved ${savedCount} network responses to ${output}/`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    saved_count: savedCount,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  // Exit code signals success/failure to the hook runner.
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level entry: surface any unhandled rejection as a fatal non-zero exit.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,90 @@
#!/bin/bash
# Run all plugin tests
#
# Usage: ./run_all_tests.sh
set -e
echo "=========================================="
echo "Running All Plugin Tests"
echo "=========================================="
echo ""
# Color codes
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Track results
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0

# Run one node test file and update the pass/fail counters.
# $1: path to the test file, e.g. ublock/tests/test_ublock.js
run_test_suite() {
    local test_file="$1"
    local test_name
    # BUG FIX: the suite name is the plugin directory, two levels up from the
    # test file ("<plugin>/tests/test_x.js"); a single dirname always yielded
    # the literal name "tests". Quote substitutions to survive spaces.
    test_name=$(basename "$(dirname "$(dirname "$test_file")")")
    echo -e "${YELLOW}[RUNNING]${NC} $test_name tests..."
    if node --test "$test_file" 2>&1; then
        echo -e "${GREEN}[PASSED]${NC} $test_name tests"
        PASSED_TESTS=$((PASSED_TESTS + 1))
    else
        echo -e "${RED}[FAILED]${NC} $test_name tests"
        FAILED_TESTS=$((FAILED_TESTS + 1))
    fi
    TOTAL_TESTS=$((TOTAL_TESTS + 1))
    echo ""
}

# Find and run all test files
echo "Finding test files..."
echo ""
# Chrome extensions utils tests
if [ -f "chrome_extensions/tests/test_chrome_extension_utils.js" ]; then
    run_test_suite "chrome_extensions/tests/test_chrome_extension_utils.js"
fi
# Captcha2 tests
if [ -f "captcha2/tests/test_captcha2_install.js" ]; then
    run_test_suite "captcha2/tests/test_captcha2_install.js"
fi
if [ -f "captcha2/tests/test_captcha2_config.js" ]; then
    run_test_suite "captcha2/tests/test_captcha2_config.js"
fi
# I Still Don't Care About Cookies tests
if [ -f "istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js" ]; then
    run_test_suite "istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js"
fi
# uBlock tests
if [ -f "ublock/tests/test_ublock.js" ]; then
    run_test_suite "ublock/tests/test_ublock.js"
fi
# SingleFile tests
if [ -f "singlefile/tests/test_singlefile.js" ]; then
    run_test_suite "singlefile/tests/test_singlefile.js"
fi
# Print summary
echo "=========================================="
echo "Test Summary"
echo "=========================================="
echo -e "Total test suites: $TOTAL_TESTS"
echo -e "${GREEN}Passed:${NC} $PASSED_TESTS"
echo -e "${RED}Failed:${NC} $FAILED_TESTS"
echo ""
if [ "$FAILED_TESTS" -eq 0 ]; then
    echo -e "${GREEN}✓ All tests passed!${NC}"
    exit 0
else
    echo -e "${RED}✗ Some tests failed${NC}"
    exit 1
fi

29
archivebox/plugins/run_tests.sh Executable file
View File

@@ -0,0 +1,29 @@
#!/bin/bash
# Run all plugin tests
#
# Usage: ./run_tests.sh [plugin_name]
#
# Examples:
#   ./run_tests.sh              # Run all tests
#   ./run_tests.sh captcha2     # Run only captcha2 tests
#   ./run_tests.sh chrome_*     # Run all chrome tests
set -e
echo "=========================================="
echo "Running ArchiveBox Plugin Tests"
echo "=========================================="
echo ""
# With an argument, run only that plugin's tests/ directory; otherwise run
# every */tests/test_*.py discovered by the shell glob.
if [ -n "$1" ]; then
    echo "Running tests for: $1"
    python -m pytest "$1"/tests/ -v
else
    echo "Running all plugin tests..."
    python -m pytest */tests/test_*.py -v
fi
echo ""
echo "=========================================="
echo "Tests Complete"
echo "=========================================="

View File

@@ -0,0 +1,291 @@
#!/usr/bin/env node
/**
* Take a screenshot of a URL using Chrome/Puppeteer.
*
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>
* Output: Writes screenshot/screenshot.png
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'screenshot';         // reported in RESULT_JSON
const OUTPUT_DIR = 'screenshot';             // created relative to the snapshot cwd
const OUTPUT_FILE = 'screenshot.png';        // full-page PNG is written here
const CHROME_SESSION_DIR = 'chrome_session'; // where chrome_session publishes cdp_url.txt
// Parse `--key=value` / `--flag` command line arguments into an object.
// Dashes in key names become underscores; bare flags get the value `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, falling back to defaultValue, always trimmed.
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an environment variable as a boolean flag; unrecognized or
// missing values yield defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Parse an environment variable as a base-10 integer, with fallback.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Check if staticfile extractor already downloaded this URL
// (presence of a non-empty staticfile/ directory in the snapshot cwd).
const STATICFILE_DIR = 'staticfile';
function hasStaticFileOutput() {
  if (!fs.existsSync(STATICFILE_DIR)) {
    return false;
  }
  return fs.readdirSync(STATICFILE_DIR).length > 0;
}
// Read the Chrome DevTools endpoint published by the chrome_session
// extractor, if any; returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Find Chrome binary
/**
 * Locate a Chrome/Chromium executable.
 * Order: CHROME_BINARY env override (if it exists on disk), then well-known
 * absolute install paths, then bare binary names resolved against $PATH.
 *
 * @returns {string|null} Path to a Chrome binary, or null when none found.
 */
function findChrome() {
  const chromeBinary = getEnv('CHROME_BINARY');
  if (chromeBinary && fs.existsSync(chromeBinary)) {
    return chromeBinary;
  }
  const candidates = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    // Common paths
    'google-chrome',
    'chromium',
  ];
  // BUG FIX: bare names like 'google-chrome' were previously dead entries
  // (the loop only checked candidates starting with '/'); resolve them
  // against $PATH so they can actually be found.
  const pathDirs = (process.env.PATH || '').split(path.delimiter).filter(Boolean);
  for (const candidate of candidates) {
    if (candidate.startsWith('/')) {
      if (fs.existsSync(candidate)) {
        return candidate;
      }
    } else {
      for (const dir of pathDirs) {
        const resolved = path.join(dir, candidate);
        if (fs.existsSync(resolved)) {
          return resolved;
        }
      }
    }
  }
  return null;
}
// Parse a "WIDTH,HEIGHT" resolution string into numbers, substituting the
// defaults 1440x2000 for any missing or unparseable component.
function parseResolution(resolution) {
  const [rawWidth, rawHeight] = resolution.split(',');
  const width = parseInt((rawWidth || '').trim(), 10);
  const height = parseInt((rawHeight || '').trim(), 10);
  return { width: width || 1440, height: height || 2000 };
}
/**
 * Capture a full-page PNG of `url` to screenshot/screenshot.png.
 * Prefers attaching to the shared chrome_session CDP endpoint (the page is
 * already loaded there); falls back to launching a fresh Chrome and
 * navigating itself.
 *
 * @param {string} url - URL to capture.
 * @returns {Promise<{success: boolean, output?: string, error?: string}>}
 */
async function takeScreenshot(url) {
  const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  let page = null;
  let connectedToSession = false;
  try {
    // Try to connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (cdpUrl) {
      try {
        browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: { width, height },
        });
        connectedToSession = true;
        // Get existing pages or create new one
        const pages = await browser.pages();
        page = pages.find(p => p.url().startsWith('http')) || pages[0];
        if (!page) {
          page = await browser.newPage();
        }
        // Set viewport on the page
        await page.setViewport({ width, height });
      } catch (e) {
        console.error(`Failed to connect to CDP session: ${e.message}`);
        browser = null;
        // BUG FIX: reset the flag here, otherwise a fallback browser launched
        // below would never be closed in the finally block (process leak).
        connectedToSession = false;
      }
    }
    // Fall back to launching new browser
    if (!browser) {
      const executablePath = findChrome();
      if (!executablePath) {
        return { success: false, error: 'Chrome binary not found' };
      }
      browser = await puppeteer.launch({
        executablePath,
        headless: headless ? 'new' : false,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          `--window-size=${width},${height}`,
          ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ],
        defaultViewport: { width, height },
      });
      page = await browser.newPage();
      // Navigate to URL (only if we launched fresh browser)
      // NOTE(review): userAgent is only applied on this fallback path, never
      // to an attached session page — confirm intent.
      if (userAgent) {
        await page.setUserAgent(userAgent);
      }
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout,
      });
    }
    // Take screenshot
    await page.screenshot({
      path: outputPath,
      fullPage: true,
    });
    if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
      return { success: true, output: outputPath };
    } else {
      return { success: false, error: 'Screenshot file not created' };
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Only close browser if we launched it (not if we connected to session)
    if (browser && !connectedToSession) {
      await browser.close();
    }
  }
}
/**
 * CLI entrypoint: parse --url/--snapshot-id, permanently skip when the
 * staticfile extractor already handled the URL, otherwise run
 * takeScreenshot() and emit the START_TS/END_TS/STATUS/RESULT_JSON
 * key=value contract on stdout.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if staticfile extractor already handled this (permanent skip)
    if (hasStaticFileOutput()) {
      console.log(`Skipping screenshot - staticfile extractor already downloaded this`);
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - staticfile already handled
    } else {
      const result = await takeScreenshot(url);
      if (result.success) {
        status = 'succeeded';
        output = result.output;
        const size = fs.statSync(output).size;
        console.log(`Screenshot saved (${size} bytes)`);
      } else {
        status = 'failed';
        error = result.error;
      }
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  // Exit code signals success/failure to the hook runner.
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level entry: surface any unhandled rejection as a fatal non-zero exit.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,24 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"RIPGREP_BINARY": {
"type": "string",
"default": "rg",
"description": "Path to ripgrep binary"
},
"RIPGREP_IGNORE_EXTENSIONS": {
"type": "string",
"default": "css,js,orig,svg",
"description": "Comma-separated file extensions to ignore"
},
"SEARCH_BACKEND_TIMEOUT": {
"type": "integer",
"default": 90,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Search timeout in seconds"
}
}
}

View File

@@ -0,0 +1,80 @@
"""
Ripgrep search backend - searches files directly without indexing.
This backend doesn't maintain an index - it searches archived files directly
using ripgrep (rg). This is simpler but slower for large archives.
Environment variables:
RIPGREP_BINARY: Path to ripgrep binary (default: rg)
RIPGREP_IGNORE_EXTENSIONS: Comma-separated extensions to ignore (default: css,js,orig,svg)
SEARCH_BACKEND_TIMEOUT: Search timeout in seconds (default: 90)
"""
import os
import subprocess
import shutil
from pathlib import Path
from typing import List, Iterable
from django.conf import settings
# Config with old var names for backwards compatibility
RIPGREP_BINARY = os.environ.get('RIPGREP_BINARY', 'rg').strip()  # binary name or absolute path
RIPGREP_IGNORE_EXTENSIONS = os.environ.get('RIPGREP_IGNORE_EXTENSIONS', 'css,js,orig,svg').strip()  # comma-separated extensions to skip
SEARCH_BACKEND_TIMEOUT = int(os.environ.get('SEARCH_BACKEND_TIMEOUT', '90'))  # seconds
def search(query: str) -> List[str]:
    """Search archived files for ``query`` using ripgrep.

    Returns a list of snapshot IDs (the top-level directory names under
    ARCHIVE_DIR) whose files contain a regex match. Best-effort: a search
    timeout or a failure to launch the subprocess yields an empty result
    rather than raising.

    Raises:
        RuntimeError: if the ripgrep binary cannot be located.
    """
    rg_binary = shutil.which(RIPGREP_BINARY) or RIPGREP_BINARY
    if not rg_binary or not Path(rg_binary).exists():
        raise RuntimeError(f'ripgrep binary not found ({RIPGREP_BINARY}). Install with: apt install ripgrep')
    archive_dir = Path(settings.ARCHIVE_DIR)
    if not archive_dir.exists():
        return []
    # Build ignore pattern from config, e.g. '*.{css,js,orig,svg}'
    ignore_pattern = f'*.{{{RIPGREP_IGNORE_EXTENSIONS}}}'
    cmd = [
        rg_binary,
        f'--type-add=ignore:{ignore_pattern}',
        '--type-not=ignore',
        '--files-with-matches',
        '--no-messages',
        '--ignore-case',
        '--regexp',
        query,
        str(archive_dir),
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=SEARCH_BACKEND_TIMEOUT)
    except subprocess.TimeoutExpired:
        return []
    except OSError:
        # e.g. binary disappeared between the existence check and execution;
        # previously a blanket `except Exception` hid programming errors too.
        return []
    # Extract snapshot IDs from file paths
    # Paths look like: archive/<snapshot_id>/<extractor>/file.txt
    snapshot_ids = set()
    for line in result.stdout.splitlines():
        if not line:
            continue
        try:
            relative = Path(line).relative_to(archive_dir)
            snapshot_ids.add(relative.parts[0])
        except (ValueError, IndexError):
            # Line outside archive_dir or an empty relative path — skip it.
            continue
    return list(snapshot_ids)
def flush(snapshot_ids: Iterable[str]) -> None:
    """No-op for ripgrep - it searches files directly."""
    # This backend keeps no index, so there is nothing to invalidate.
    pass

View File

@@ -0,0 +1,37 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SEARCH_BACKEND_HOST_NAME": {
"type": "string",
"default": "127.0.0.1",
"x-aliases": ["SONIC_HOST"],
"description": "Sonic server hostname"
},
"SEARCH_BACKEND_PORT": {
"type": "integer",
"default": 1491,
"minimum": 1,
"maximum": 65535,
"x-aliases": ["SONIC_PORT"],
"description": "Sonic server port"
},
"SEARCH_BACKEND_PASSWORD": {
"type": "string",
"default": "SecretPassword",
"x-aliases": ["SONIC_PASSWORD"],
"description": "Sonic server password"
},
"SONIC_COLLECTION": {
"type": "string",
"default": "archivebox",
"description": "Sonic collection name"
},
"SONIC_BUCKET": {
"type": "string",
"default": "snapshots",
"description": "Sonic bucket name"
}
}
}

View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Sonic search backend - indexes snapshot content in Sonic server.
This hook runs after all extractors and indexes text content in Sonic.
Only runs if SEARCH_BACKEND_ENGINE=sonic.
Usage: on_Snapshot__91_index_sonic.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SEARCH_BACKEND_ENGINE: Must be 'sonic' for this hook to run
USE_INDEXING_BACKEND: Enable search indexing (default: true)
SEARCH_BACKEND_HOST_NAME: Sonic server host (default: 127.0.0.1)
SEARCH_BACKEND_PORT: Sonic server port (default: 1491)
SEARCH_BACKEND_PASSWORD: Sonic server password (default: SecretPassword)
SONIC_COLLECTION: Collection name (default: archivebox)
SONIC_BUCKET: Bucket name (default: snapshots)
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sonic'
OUTPUT_DIR = 'search_index'
# Text file patterns to index
INDEXABLE_FILES = [
('readability', 'content.txt'),
('readability', 'content.html'),
('mercury', 'content.txt'),
('mercury', 'content.html'),
('htmltotext', 'output.txt'),
('singlefile', 'singlefile.html'),
('dom', 'output.html'),
('wget', '**/*.html'),
('wget', '**/*.htm'),
('title', 'title.txt'),
]
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable `name` (or `default` when unset), stripped of surrounding whitespace."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean environment variable; unrecognized values fall back to `default`."""
    normalized = get_env(name, '').lower()
    if normalized in ('true', '1', 'yes', 'on'):
        return True
    return False if normalized in ('false', '0', 'no', 'off') else default
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer environment variable, falling back to `default` on invalid input."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def strip_html_tags(html: str) -> str:
    """Remove HTML tags and decode common entities, keeping readable text.

    Scripts and styles are dropped entirely; remaining tags are replaced
    with spaces and runs of whitespace are collapsed to single spaces.
    """
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<[^>]+>', ' ', html)
    # Decode entities; '&amp;' must be decoded LAST so that double-escaped
    # sequences like '&amp;lt;' become '&lt;' instead of being decoded twice
    # down to '<' (the previous ordering had this bug).
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    html = re.sub(r'\s+', ' ', html)
    return html.strip()
def find_indexable_content() -> list[tuple[str, str]]:
    """Collect (source-label, text) pairs from extractor output files.

    Walks INDEXABLE_FILES in priority order relative to the current working
    directory, reading each existing non-empty file; HTML files are reduced
    to plain text via strip_html_tags() first.
    """
    found: list[tuple[str, str]] = []
    base_dir = Path.cwd()
    for extractor, pattern in INDEXABLE_FILES:
        source_dir = base_dir / extractor
        if not source_dir.exists():
            continue
        if '*' in pattern:
            candidates = list(source_dir.glob(pattern))
        else:
            candidate = source_dir / pattern
            candidates = [candidate] if candidate.exists() else []
        for candidate in candidates:
            if not (candidate.is_file() and candidate.stat().st_size > 0):
                continue
            try:
                text = candidate.read_text(encoding='utf-8', errors='ignore')
            except Exception:
                # Unreadable file: skip rather than fail the whole pass.
                continue
            if not text.strip():
                continue
            if candidate.suffix in ('.html', '.htm'):
                text = strip_html_tags(text)
            found.append((f'{extractor}/{candidate.name}', text))
    return found
def get_sonic_config() -> dict:
    """Assemble the Sonic connection settings from environment variables."""
    config: dict = {}
    config['host'] = get_env('SEARCH_BACKEND_HOST_NAME', '127.0.0.1')
    config['port'] = get_env_int('SEARCH_BACKEND_PORT', 1491)
    config['password'] = get_env('SEARCH_BACKEND_PASSWORD', 'SecretPassword')
    config['collection'] = get_env('SONIC_COLLECTION', 'archivebox')
    config['bucket'] = get_env('SONIC_BUCKET', 'snapshots')
    return config
def index_in_sonic(snapshot_id: str, texts: list[str]) -> None:
    """Push snapshot text into the Sonic ingest bucket, replacing any prior entry.

    Raises RuntimeError when the sonic-client package is not installed.
    """
    try:
        from sonic import IngestClient
    except ImportError:
        raise RuntimeError('sonic-client not installed. Run: pip install sonic-client')
    cfg = get_sonic_config()
    with IngestClient(cfg['host'], cfg['port'], cfg['password']) as ingest:
        # Best-effort flush of any previously indexed content for this snapshot.
        try:
            ingest.flush_object(cfg['collection'], cfg['bucket'], snapshot_id)
        except Exception:
            pass
        # Sonic rejects oversized pushes, so send the text in fixed-size chunks.
        full_text = ' '.join(texts)
        chunk_size = 10000
        offset = 0
        while offset < len(full_text):
            ingest.push(cfg['collection'], cfg['bucket'], snapshot_id, full_text[offset:offset + chunk_size])
            offset += chunk_size
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Index snapshot content in Sonic.

    Collects text produced by earlier extractors in the current snapshot
    directory and pushes it to the Sonic server. Progress and the final
    outcome are reported as START_TS/END_TS/STATUS/RESULT_JSON lines on
    stdout (parsed by the hook runner). Exits 0 on success or skip, 1 on
    failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    indexed_sources = []
    try:
        # Check if this backend is enabled (permanent skips - don't retry).
        # Note: sys.exit raises SystemExit, which is NOT caught by the
        # `except Exception` below, so the skip paths exit cleanly.
        backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
        if backend != 'sonic':
            print(f'Skipping Sonic indexing (SEARCH_BACKEND_ENGINE={backend})')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0) # Permanent skip - different backend selected
        if not get_env_bool('USE_INDEXING_BACKEND', True):
            print('Skipping indexing (USE_INDEXING_BACKEND=False)')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0) # Permanent skip - indexing disabled
        else:
            contents = find_indexable_content()
            indexed_sources = [source for source, _ in contents]
            if not contents:
                # No extractor output yet: treated as a skip, not a failure.
                status = 'skipped'
                print('No indexable content found')
            else:
                texts = [content for _, content in contents]
                index_in_sonic(snapshot_id, texts)
                status = 'succeeded'
                output = OUTPUT_DIR
                print(f'Sonic indexed {len(texts)} documents')
                print(f'Sources: {", ".join(indexed_sources)}')
    except Exception as e:
        # Any unexpected error (connection refused, bad config, ...) marks the run failed.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    # Structured footer parsed by the hook runner.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'indexed_sources': indexed_sources,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,50 @@
"""
Sonic search backend - search and flush operations.
This module provides the search interface for the Sonic backend.
"""
import os
from typing import List, Iterable
def get_sonic_config() -> dict:
    """Build the Sonic connection settings dict from environment variables."""
    env = os.environ
    return {
        'host': env.get('SEARCH_BACKEND_HOST_NAME', '127.0.0.1').strip(),
        'port': int(env.get('SEARCH_BACKEND_PORT', '1491')),
        'password': env.get('SEARCH_BACKEND_PASSWORD', 'SecretPassword').strip(),
        'collection': env.get('SONIC_COLLECTION', 'archivebox').strip(),
        'bucket': env.get('SONIC_BUCKET', 'snapshots').strip(),
    }
def search(query: str) -> List[str]:
    """Query Sonic for snapshot ids whose content matches `query` (up to 100 results).

    Raises RuntimeError when the sonic-client package is not installed.
    """
    try:
        from sonic import SearchClient
    except ImportError:
        raise RuntimeError('sonic-client not installed. Run: pip install sonic-client')
    cfg = get_sonic_config()
    with SearchClient(cfg['host'], cfg['port'], cfg['password']) as client:
        return client.query(cfg['collection'], cfg['bucket'], query, limit=100)
def flush(snapshot_ids: Iterable[str]) -> None:
    """Best-effort removal of each given snapshot from the Sonic index."""
    try:
        from sonic import IngestClient
    except ImportError:
        raise RuntimeError('sonic-client not installed. Run: pip install sonic-client')
    cfg = get_sonic_config()
    with IngestClient(cfg['host'], cfg['port'], cfg['password']) as ingest:
        for sid in snapshot_ids:
            # Ignore per-snapshot errors so one failure doesn't stop the rest.
            try:
                ingest.flush_object(cfg['collection'], cfg['bucket'], sid)
            except Exception:
                pass

View File

@@ -0,0 +1,24 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SQLITEFTS_DB": {
"type": "string",
"default": "search.sqlite3",
"description": "SQLite FTS database filename"
},
"FTS_SEPARATE_DATABASE": {
"type": "boolean",
"default": true,
"x-aliases": ["SQLITEFTS_SEPARATE_DATABASE"],
"description": "Use separate database file for FTS index"
},
"FTS_TOKENIZERS": {
"type": "string",
"default": "porter unicode61 remove_diacritics 2",
"x-aliases": ["SQLITEFTS_TOKENIZERS"],
"description": "FTS5 tokenizer configuration"
}
}
}

View File

@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
SQLite FTS5 search backend - indexes snapshot content for full-text search.
This hook runs after all extractors and indexes text content in SQLite FTS5.
Only runs if SEARCH_BACKEND_ENGINE=sqlite.
Usage: on_Snapshot__90_index_sqlite.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SEARCH_BACKEND_ENGINE: Must be 'sqlite' for this hook to run
USE_INDEXING_BACKEND: Enable search indexing (default: true)
SQLITEFTS_DB: Database filename (default: search.sqlite3)
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
"""
import json
import os
import re
import sqlite3
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sqlite'
OUTPUT_DIR = 'search_index'
# Text file patterns to index, in priority order
INDEXABLE_FILES = [
('readability', 'content.txt'),
('readability', 'content.html'),
('mercury', 'content.txt'),
('mercury', 'content.html'),
('htmltotext', 'output.txt'),
('singlefile', 'singlefile.html'),
('dom', 'output.html'),
('wget', '**/*.html'),
('wget', '**/*.htm'),
('title', 'title.txt'),
]
def get_env(name: str, default: str = '') -> str:
    """Fetch an environment variable (or `default`) with surrounding whitespace stripped."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var ('true/1/yes/on' or 'false/0/no/off'); otherwise `default`."""
    lookup = {
        'true': True, '1': True, 'yes': True, 'on': True,
        'false': False, '0': False, 'no': False, 'off': False,
    }
    return lookup.get(get_env(name, '').lower(), default)
def strip_html_tags(html: str) -> str:
    """Remove HTML tags and decode common entities, keeping readable text.

    Scripts and styles are dropped entirely; remaining tags are replaced
    with spaces and runs of whitespace are collapsed to single spaces.
    """
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<[^>]+>', ' ', html)
    # Decode entities; '&amp;' must be decoded LAST so that double-escaped
    # sequences like '&amp;lt;' become '&lt;' instead of being decoded twice
    # down to '<' (the previous ordering had this bug).
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    html = re.sub(r'\s+', ' ', html)
    return html.strip()
def find_indexable_content() -> list[tuple[str, str]]:
    """Collect (source-label, text) pairs from extractor output files.

    Walks INDEXABLE_FILES in priority order relative to the current working
    directory, reading each existing non-empty file; HTML files are reduced
    to plain text via strip_html_tags() first.
    """
    found: list[tuple[str, str]] = []
    base_dir = Path.cwd()
    for extractor, pattern in INDEXABLE_FILES:
        source_dir = base_dir / extractor
        if not source_dir.exists():
            continue
        if '*' in pattern:
            candidates = list(source_dir.glob(pattern))
        else:
            candidate = source_dir / pattern
            candidates = [candidate] if candidate.exists() else []
        for candidate in candidates:
            if not (candidate.is_file() and candidate.stat().st_size > 0):
                continue
            try:
                text = candidate.read_text(encoding='utf-8', errors='ignore')
            except Exception:
                # Unreadable file: skip rather than fail the whole pass.
                continue
            if not text.strip():
                continue
            if candidate.suffix in ('.html', '.htm'):
                text = strip_html_tags(text)
            found.append((f'{extractor}/{candidate.name}', text))
    return found
def get_db_path() -> Path:
    """Resolve the absolute path of the FTS search index database.

    DATA_DIR defaults to two levels above the snapshot working directory.
    """
    data_dir = Path(get_env('DATA_DIR', str(Path.cwd().parent.parent)))
    return data_dir / get_env('SQLITEFTS_DB', 'search.sqlite3')
def index_in_sqlite(snapshot_id: str, texts: list[str]) -> None:
    """Replace the FTS5 index entry for `snapshot_id` with the joined `texts`.

    Creates the FTS5 virtual table on first use.

    Raises:
        ValueError: if FTS_TOKENIZERS contains characters outside the safe set.
    """
    db_path = get_db_path()
    tokenizers = get_env('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2')
    # The tokenizer string must be interpolated into the CREATE statement
    # (FTS5 options cannot be bound as parameters), so restrict it to a safe
    # character set to prevent SQL injection via the environment.
    if not re.fullmatch(r'[A-Za-z0-9_ ]+', tokenizers):
        raise ValueError(f'Invalid FTS_TOKENIZERS value: {tokenizers!r}')
    conn = sqlite3.connect(str(db_path))
    try:
        # Create FTS5 table if needed
        conn.execute(f'''
            CREATE VIRTUAL TABLE IF NOT EXISTS search_index
            USING fts5(snapshot_id, content, tokenize='{tokenizers}')
        ''')
        # Delete-then-insert keeps exactly one row per snapshot.
        conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (snapshot_id,))
        content = '\n\n'.join(texts)
        conn.execute(
            'INSERT INTO search_index (snapshot_id, content) VALUES (?, ?)',
            (snapshot_id, content)
        )
        conn.commit()
    finally:
        conn.close()
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Index snapshot content in SQLite FTS5.

    Collects text produced by earlier extractors in the current snapshot
    directory and stores it in the FTS5 database. Progress and the final
    outcome are reported as START_TS/END_TS/STATUS/RESULT_JSON lines on
    stdout (parsed by the hook runner). Exits 0 on success or skip, 1 on
    failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    indexed_sources = []
    try:
        # Check if this backend is enabled (permanent skips - don't retry).
        # Note: sys.exit raises SystemExit, which is NOT caught by the
        # `except Exception` below, so the skip paths exit cleanly.
        backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
        if backend != 'sqlite':
            print(f'Skipping SQLite indexing (SEARCH_BACKEND_ENGINE={backend})')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0) # Permanent skip - different backend selected
        if not get_env_bool('USE_INDEXING_BACKEND', True):
            print('Skipping indexing (USE_INDEXING_BACKEND=False)')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0) # Permanent skip - indexing disabled
        else:
            contents = find_indexable_content()
            indexed_sources = [source for source, _ in contents]
            if not contents:
                # No extractor output yet: treated as a skip, not a failure.
                status = 'skipped'
                print('No indexable content found')
            else:
                texts = [content for _, content in contents]
                index_in_sqlite(snapshot_id, texts)
                status = 'succeeded'
                output = OUTPUT_DIR
                print(f'SQLite FTS indexed {len(texts)} documents')
                print(f'Sources: {", ".join(indexed_sources)}')
    except Exception as e:
        # Any unexpected error (locked database, bad config, ...) marks the run failed.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    # Structured footer parsed by the hook runner.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'indexed_sources': indexed_sources,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,65 @@
"""
SQLite FTS5 search backend - search and flush operations.
This module provides the search interface for the SQLite FTS backend.
Environment variables:
SQLITEFTS_DB: Database filename (default: search.sqlite3)
FTS_SEPARATE_DATABASE: Use separate database file (default: true)
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
"""
import os
import sqlite3
from pathlib import Path
from typing import List, Iterable
from django.conf import settings
# Config with old var names for backwards compatibility
SQLITEFTS_DB = os.environ.get('SQLITEFTS_DB', 'search.sqlite3').strip()
FTS_SEPARATE_DATABASE = os.environ.get('FTS_SEPARATE_DATABASE', 'true').lower() in ('true', '1', 'yes')
FTS_TOKENIZERS = os.environ.get('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2').strip()
def get_db_path() -> Path:
    """Return the absolute path of the FTS index database.

    Combines Django's DATA_DIR setting with the SQLITEFTS_DB filename.
    """
    return Path(settings.DATA_DIR) / SQLITEFTS_DB
def search(query: str) -> List[str]:
    """Return snapshot ids whose indexed content matches the FTS5 `query`.

    Returns an empty list when the database or the FTS table does not exist yet.
    """
    db_path = get_db_path()
    if not db_path.exists():
        return []
    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            'SELECT DISTINCT snapshot_id FROM search_index WHERE search_index MATCH ?',
            (query,)
        ).fetchall()
    except sqlite3.OperationalError:
        # FTS table has not been created yet (no content indexed).
        return []
    finally:
        conn.close()
    return [snapshot_id for (snapshot_id,) in rows]
def flush(snapshot_ids: Iterable[str]) -> None:
    """Delete the given snapshots from the FTS index (no-op if the index is absent)."""
    db_path = get_db_path()
    if not db_path.exists():
        return
    conn = sqlite3.connect(str(db_path))
    try:
        try:
            for sid in snapshot_ids:
                conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (sid,))
            conn.commit()
        except sqlite3.OperationalError:
            # Table was never created; nothing to flush.
            pass
    finally:
        conn.close()

View File

@@ -0,0 +1,219 @@
#!/usr/bin/env node
/**
* Extract SEO metadata from a URL.
*
* Extracts all <meta> tags including:
* - og:* (Open Graph)
* - twitter:*
* - description, keywords, author
* - Any other meta tags
*
* Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>
* Output: Writes seo/seo.json
*
* Environment variables:
* SAVE_SEO: Enable SEO extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'seo';
const OUTPUT_DIR = 'seo';
const OUTPUT_FILE = 'seo.json';
const CHROME_SESSION_DIR = 'chrome_session';
// Parse `--key=value` command line arguments into an object.
// Dashes in key names become underscores; flags without a value are `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, trimmed; `defaultValue` when unset or empty.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Interpret an env var as a boolean; unrecognized values yield `defaultValue`.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Read the CDP websocket URL written by the chrome_session extractor,
// or null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
/**
 * Extract SEO metadata from the page already open in the shared Chrome session.
 *
 * Reads every <meta> tag (og:*, twitter:*, description, ...), the canonical
 * <link> and the <html lang> attribute, and writes them to seo/seo.json.
 *
 * NOTE(review): the `url` parameter is currently unused — whatever page is
 * open in the Chrome session is inspected instead; confirm this is intended.
 *
 * @param {string} url - URL being archived (unused, see note above)
 * @returns {Promise<{success: boolean, output?: string, seoData?: Object, error?: string}>}
 */
async function extractSeo(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  try {
    // Connect to existing Chrome session (started by the chrome_session extractor)
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page: prefer the first http(s) tab, fall back to any open tab
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Extract all meta tags (this callback runs inside the page context)
    const seoData = await page.evaluate(() => {
      const metaTags = Array.from(document.querySelectorAll('meta'));
      const seo = {
        url: window.location.href,
        title: document.title || '',
      };
      // Process each meta tag
      metaTags.forEach(tag => {
        // Get the key (name or property attribute)
        const key = tag.getAttribute('name') || tag.getAttribute('property') || '';
        const content = tag.getAttribute('content') || '';
        if (key && content) {
          // Store by key; later duplicates overwrite earlier ones
          seo[key] = content;
        }
      });
      // Also get canonical URL if present
      const canonical = document.querySelector('link[rel="canonical"]');
      if (canonical) {
        seo.canonical = canonical.getAttribute('href');
      }
      // Get language
      const htmlLang = document.documentElement.lang;
      if (htmlLang) {
        seo.language = htmlLang;
      }
      return seo;
    });
    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(seoData, null, 2));
    return { success: true, output: outputPath, seoData };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect (not close) so the shared Chrome session stays alive for other extractors
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entry point: parse args, run the SEO extractor, and report the outcome
 * via START_TS/END_TS/STATUS/RESULT_JSON lines on stdout (parsed by the hook
 * runner). Exits 0 on success or skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled (permanent skip when SAVE_SEO is off)
    if (!getEnvBool('SAVE_SEO', true)) {
      console.log('Skipping SEO (SAVE_SEO=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await extractSeo(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const metaCount = Object.keys(result.seoData).length - 2; // Subtract url and title
      console.log(`SEO metadata extracted: ${metaCount} meta tags`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    // Unexpected errors (connection failure, bad CDP URL, ...) mark the run failed
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (structured footer parsed by the hook runner)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level launch; catches anything main() itself failed to handle.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,53 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_SINGLEFILE": {
"type": "boolean",
"default": true,
"description": "Enable SingleFile archiving"
},
"SINGLEFILE_BINARY": {
"type": "string",
"default": "single-file",
"x-aliases": ["SINGLE_FILE_BINARY"],
"description": "Path to single-file binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"SINGLEFILE_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for SingleFile in seconds"
},
"SINGLEFILE_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
},
"SINGLEFILE_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"SINGLEFILE_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"description": "Default single-file arguments"
},
"SINGLEFILE_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for single-file"
}
}
}

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Validation hook for single-file binary.
Runs at crawl start to verify single-file (npm package) is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from single-file binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout:
return result.stdout.strip().split('\n')[0][:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
# For scripts, hash the script content
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_singlefile() -> dict | None:
    """Locate the single-file binary.

    Checks, in order: the SINGLEFILE_BINARY env var, $PATH (both common
    spellings), then well-known npm install locations. Returns a dict with
    name/abspath/version/sha256/binprovider, or None when nothing is found.
    """
    def describe(abspath: str, provider: str) -> dict:
        # Shared result shape for every discovery method.
        return {
            'name': 'single-file',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': provider,
        }

    env_path = os.environ.get('SINGLEFILE_BINARY', '')
    if env_path and Path(env_path).is_file():
        return describe(env_path, 'env')

    for candidate_name in ('single-file', 'singlefile'):
        resolved = shutil.which(candidate_name)
        if resolved:
            return describe(resolved, 'npm')

    well_known = [
        Path.home() / '.npm-global/bin/single-file',
        Path.home() / 'node_modules/.bin/single-file',
        Path('/usr/local/bin/single-file'),
        Path('/usr/local/lib/node_modules/.bin/single-file'),
    ]
    for candidate in well_known:
        if candidate.is_file():
            return describe(str(candidate), 'npm')
    return None
def main():
    """Report the discovered single-file binary as JSONL records on stdout.

    When found: emits an InstalledBinary record plus Machine config updates
    and exits 0. When missing: emits a Dependency record requesting an npm
    (or env-provided) install and exits 1.
    """
    result = find_singlefile()
    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Persist the resolved path so later hooks don't need to re-discover it.
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/SINGLEFILE_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/SINGLEFILE_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Not found: ask the orchestrator to install it for us.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'single-file',
            'bin_providers': 'npm,env',
        }))
        print(f"single-file binary not found", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,270 @@
#!/usr/bin/env node
/**
* SingleFile Extension Plugin
*
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
* Falls back to single-file-cli if the extension is not available.
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* This extension automatically:
* - Saves complete web pages as single HTML files
* - Inlines all resources (CSS, JS, images, fonts)
* - Preserves page fidelity better than wget/curl
* - Works with SPAs and dynamically loaded content
*/
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec } = require('child_process');
const execAsync = promisify(exec);
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
name: 'singlefile',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
const OUTPUT_DIR = 'singlefile';
const OUTPUT_FILE = 'singlefile.html';
/**
 * Download/install the SingleFile Chrome extension into EXTENSIONS_DIR.
 *
 * @returns {Promise<Object|null>} extension metadata, or null on failure
 */
async function installSinglefileExtension() {
  console.log('[*] Installing SingleFile extension...');
  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!installed) {
    console.error('[❌] Failed to install SingleFile extension');
    return null;
  }
  console.log('[+] SingleFile extension installed');
  console.log('[+] Web pages will be saved as single HTML files');
  return installed;
}
/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>}
 */
function wait(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Save a page using the SingleFile extension.
 *
 * Triggers the extension's toolbar action on the foreground tab, then polls
 * CHROME_DOWNLOADS_DIR for a new .html file whose header references this
 * page's URL, and moves it to singlefile/singlefile.html.
 *
 * NOTE(review): matching relies on the extension writing a `url: <url>`
 * marker before the first `meta charset` in the saved HTML — confirm this
 * holds across SingleFile extension versions.
 *
 * @param {Object} page - Puppeteer page object
 * @param {Object} extension - Extension metadata with dispatchAction method
 * @param {Object} options - Additional options (currently unused)
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithExtension(page, extension, options = {}) {
  if (!extension || !extension.version) {
    throw new Error('SingleFile extension not found or not loaded');
  }
  const url = await page.url();
  // Check for unsupported URL schemes (nothing meaningful to save for these)
  const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
  const scheme = url.split(':')[0];
  if (URL_SCHEMES_IGNORED.includes(scheme)) {
    console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
    return null;
  }
  // Ensure downloads directory exists
  await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
  // Get list of existing files to ignore (so we only consider NEW downloads)
  const files_before = new Set(
    (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
      .filter(fn => fn.endsWith('.html'))
  );
  // Ensure output directory exists
  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
  console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
  // Bring page to front (extension action button acts on foreground tab)
  await page.bringToFront();
  // Trigger the extension's action (toolbar button click)
  await extension.dispatchAction();
  // Wait for file to appear in downloads directory (poll up to ~30s total)
  const check_delay = 3000; // 3 seconds
  const max_tries = 10;
  let files_new = [];
  for (let attempt = 0; attempt < max_tries; attempt++) {
    await wait(check_delay);
    const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
      .filter(fn => fn.endsWith('.html'));
    files_new = files_after.filter(file => !files_before.has(file));
    if (files_new.length === 0) {
      continue;
    }
    // Find the matching file by checking if it contains the URL in the HTML header
    for (const file of files_new) {
      const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
      const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
      const dl_header = dl_text.split('meta charset')[0];
      if (dl_header.includes(`url: ${url}`)) {
        console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
        await fs.promises.rename(dl_path, out_path);
        return out_path;
      }
    }
  }
  console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
  console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
  return null;
}
/**
 * Save a page using single-file-cli (fallback method).
 *
 * Spawns the CLI directly via execFile (no shell), so URLs, paths and
 * user-agent strings containing spaces or shell metacharacters can neither
 * break the command line nor inject extra shell commands — the previous
 * `exec(cmd.join(' '))` approach had both problems.
 *
 * @param {string} url - URL to archive
 * @param {Object} options - { userAgent, cookiesFile, ignoreSSL, timeout }
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
  // Local require keeps this fix self-contained.
  const { execFile } = require('child_process');
  const execFileAsync = promisify(execFile);
  console.log('[*] Falling back to single-file-cli...');
  // Find single-file binary on $PATH
  let binary = null;
  try {
    const { stdout } = await execFileAsync('which', ['single-file']);
    binary = stdout.trim();
  } catch (err) {
    console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
    return null;
  }
  // Ensure output directory exists
  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Build the argument list (flags first, then positional url/output)
  const cliArgs = ['--browser-headless'];
  if (options.userAgent) {
    cliArgs.push('--browser-user-agent', options.userAgent);
  }
  if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
    cliArgs.push('--browser-cookies-file', options.cookiesFile);
  }
  if (options.ignoreSSL) {
    cliArgs.push('--browser-ignore-insecure-certs');
  }
  cliArgs.push(url, out_path);
  // Execute without a shell; the child is killed after `timeout` ms
  try {
    const timeout = options.timeout || 120000;
    await execFileAsync(binary, cliArgs, { timeout });
    if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
      console.log(`[+] SingleFile saved via CLI: ${out_path}`);
      return out_path;
    }
    console.error('[❌] SingleFile CLI completed but no output file found');
    return null;
  } catch (err) {
    console.error(`[❌] SingleFile CLI error: ${err.message}`);
    return null;
  }
}
/**
 * Main entry point - install extension before archiving.
 *
 * Uses a JSON cache file in EXTENSIONS_DIR to skip re-downloading when the
 * previously unpacked extension still has its manifest; otherwise installs
 * the extension and writes the metadata cache for chrome_session to load.
 *
 * @returns {Promise<Object|null>} extension metadata, or null on failure
 */
async function main() {
  // Check if extension is already cached
  const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        console.log('[*] SingleFile extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Cache file corrupted, re-install
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }
  // Install extension
  const extension = await installSinglefileExtension();
  // Export extension metadata for chrome_session to load
  if (extension) {
    // Write extension info to a cache file that chrome_session can read
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
      cacheFile,
      JSON.stringify(extension, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }
  return extension;
}
// Export functions for use by other plugins
// (chrome_session consumes EXTENSION metadata; other hooks may reuse the save helpers)
module.exports = {
  EXTENSION,
  installSinglefileExtension,
  saveSinglefileWithExtension,
  saveSinglefileWithCLI,
};
// Run if executed directly
// As a CLI this performs only the install/caching step (main); the actual
// archiving happens later through the exported save helpers.
if (require.main === module) {
  main().then(() => {
    console.log('[✓] SingleFile extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] SingleFile extension setup failed:', err);
    process.exit(1);
  });
}

View File

@@ -0,0 +1,328 @@
#!/usr/bin/env python3
"""
Archive a URL using SingleFile.
Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
Output: Writes singlefile.html to $PWD
Environment variables:
SINGLEFILE_BINARY: Path to SingleFile binary
SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120)
SINGLEFILE_USER_AGENT: User agent string (optional)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
SINGLEFILE_COOKIES_FILE: Path to cookies file (optional)
SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated)
# Feature toggle
SAVE_SINGLEFILE: Enable SingleFile archiving (default: True)
# Chrome binary (SingleFile needs Chrome)
CHROME_BINARY: Path to Chrome/Chromium binary
# Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'singlefile'    # name reported in RESULT_JSON
BIN_NAME = 'single-file'         # binary name advertised via DEPENDENCY_NEEDED
BIN_PROVIDERS = 'npm,env'        # acceptable sources for the binary
OUTPUT_DIR = 'singlefile'        # output directory, relative to cwd
OUTPUT_FILE = 'singlefile.html'  # archived page filename inside OUTPUT_DIR
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripping surrounding whitespace."""
    raw = os.environ.get(name, default)
    return raw.strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret an environment variable as a boolean flag.

    Recognises true/1/yes/on and false/0/no/off (case-insensitive);
    anything else (including unset) falls back to ``default``.
    """
    flag = get_env(name, '').lower()
    if flag in ('true', '1', 'yes', 'on'):
        return True
    if flag in ('false', '0', 'no', 'off'):
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Interpret an environment variable as an int, returning ``default`` on junk."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
# Directory written by the staticfile extractor; its presence (non-empty)
# means this URL was a direct file download rather than a web page.
STATICFILE_DIR = 'staticfile'


def has_staticfile_output() -> bool:
    """Return True if the staticfile extractor already produced output here."""
    candidate = Path(STATICFILE_DIR)
    if not candidate.exists():
        return False
    return next(candidate.iterdir(), None) is not None
# Chrome binary search paths
# Candidate executable names/paths probed by find_chrome(), in priority order.
CHROMIUM_BINARY_NAMES_LINUX = [
    'chromium', 'chromium-browser', 'chromium-browser-beta',
    'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
]
CHROME_BINARY_NAMES_LINUX = [
    'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
    'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
]
# macOS entries are absolute paths (checked with isfile rather than $PATH lookup)
CHROME_BINARY_NAMES_MACOS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
]
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
# Combined probe list; Chrome is preferred over Chromium
ALL_CHROME_BINARIES = (
    CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
    CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
)
def find_singlefile() -> str | None:
    """Locate the SingleFile binary.

    Prefers an explicit SINGLEFILE_BINARY env path (must be an existing file),
    then falls back to searching $PATH for `single-file` / `singlefile`.
    Returns None when nothing is found.
    """
    configured = get_env('SINGLEFILE_BINARY')
    if configured and os.path.isfile(configured):
        return configured
    for candidate in ('single-file', 'singlefile'):
        located = shutil.which(candidate)
        if located:
            return located
    return None
def find_chrome() -> str | None:
    """Locate a Chrome/Chromium binary.

    Prefers an explicit CHROME_BINARY env path, then probes ALL_CHROME_BINARIES:
    entries containing '/' (macOS app bundles) are checked directly as files,
    bare names are resolved via $PATH. Returns None when nothing is found.
    """
    configured = get_env('CHROME_BINARY')
    if configured and os.path.isfile(configured):
        return configured
    for candidate in ALL_CHROME_BINARIES:
        if '/' in candidate:
            if os.path.isfile(candidate):
                return candidate
            continue
        located = shutil.which(candidate)
        if located:
            return located
    return None
def get_version(binary: str) -> str:
    """Return the first 64 chars of `<binary> --version` stdout, or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
        return proc.stdout.strip()[:64]
    except Exception:
        # Missing binary, timeout, permission error, etc. — version is best-effort
        return ''
CHROME_SESSION_DIR = 'chrome_session'
def get_cdp_url() -> str | None:
"""Get CDP URL from chrome_session if available."""
cdp_file = Path(CHROME_SESSION_DIR) / 'cdp_url.txt'
if cdp_file.exists():
return cdp_file.read_text().strip()
return None
def get_port_from_cdp_url(cdp_url: str) -> str | None:
"""Extract port from CDP WebSocket URL (ws://127.0.0.1:PORT/...)."""
import re
match = re.search(r':(\d+)/', cdp_url)
if match:
return match.group(1)
return None
def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using SingleFile.

    If a Chrome session exists (from chrome_session extractor), connects to it
    via CDP. Otherwise launches a new Chrome instance (via the located binary).

    Args:
        url: The URL to archive.
        binary: Path to the single-file executable.

    Returns: (success, output_path, error_message)
    """
    # Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
    user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
    check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '')
    chrome = find_chrome()
    cmd = [binary]
    # Try to use existing Chrome session via CDP
    cdp_url = get_cdp_url()
    if cdp_url:
        # SingleFile can connect to existing browser via WebSocket
        # Extract port from CDP URL (ws://127.0.0.1:PORT/...)
        port = get_port_from_cdp_url(cdp_url)
        if port:
            cmd.extend(['--browser-server', f'http://127.0.0.1:{port}'])
    elif chrome:
        cmd.extend(['--browser-executable-path', chrome])
    # Common options
    cmd.extend([
        '--browser-headless',
    ])
    # SSL handling
    if not check_ssl:
        cmd.append('--browser-ignore-insecure-certs')
    if user_agent:
        cmd.extend(['--browser-user-agent', user_agent])
    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--browser-cookies-file', cookies_file])
    if extra_args:
        # NOTE(review): naive whitespace split — extra args containing quoted
        # spaces will be mangled; confirm callers only pass simple flags.
        cmd.extend(extra_args.split())
    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / OUTPUT_FILE
    cmd.extend([url, str(output_path)])
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
        # Success is judged by the output file, not the exit code
        if output_path.exists() and output_path.stat().st_size > 0:
            return True, str(output_path), ''
        else:
            # Map common Chrome network errors to short human-readable messages
            stderr = result.stderr.decode('utf-8', errors='replace')
            if 'ERR_NAME_NOT_RESOLVED' in stderr:
                return False, None, 'DNS resolution failed'
            if 'ERR_CONNECTION_REFUSED' in stderr:
                return False, None, 'Connection refused'
            return False, None, f'SingleFile failed: {stderr[:200]}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using SingleFile.

    Emits the orchestrator protocol on stdout (START_TS/END_TS/STATUS/
    RESULT_JSON key=value lines) and exits 0 on success or skip, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    # Defaults reported if anything below fails early
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''
    try:
        # Check if SingleFile is enabled
        if not get_env_bool('SAVE_SINGLEFILE', True):
            print('Skipping SingleFile (SAVE_SINGLEFILE=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)
        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping SingleFile - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - staticfile already handled
        # Find binary
        binary = find_singlefile()
        if not binary:
            # Emit machine-readable dependency hints so the orchestrator can install it
            print(f'ERROR: SingleFile binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=npm install -g single-file-cli', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
        # Run extraction
        success, output, error = save_singlefile(url, binary)
        status = 'succeeded' if success else 'failed'
        if success and output:
            size = Path(output).stat().st_size
            print(f'SingleFile saved ({size} bytes)')
    except Exception as e:
        # NOTE: sys.exit() raises SystemExit, which is not an Exception
        # subclass, so the deliberate exits above are not swallowed here.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,110 @@
"""
Integration tests - archive example.com with SingleFile and verify output
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
TEST_URL = "https://example.com"

# Check if single-file CLI is available so the network tests below can be
# skipped cleanly on machines without it.
try:
    result = subprocess.run(
        ["which", "single-file"],
        capture_output=True,
        timeout=5
    )
    SINGLEFILE_CLI_AVAILABLE = result.returncode == 0
except (OSError, subprocess.SubprocessError):
    # The previous bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # only genuine lookup failures should mean "CLI unavailable".
    SINGLEFILE_CLI_AVAILABLE = False
@pytest.mark.skipif(
    not SINGLEFILE_CLI_AVAILABLE,
    reason="single-file CLI not installed (npm install -g single-file-cli)"
)
def test_archives_example_com():
    """Archive example.com and verify output contains expected content"""
    # NOTE: live network test — requires outbound HTTPS and a working browser.
    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = Path(tmpdir) / "singlefile"
        output_dir.mkdir()
        output_file = output_dir / "singlefile.html"
        # Run single-file CLI
        result = subprocess.run(
            [
                "single-file",
                "--browser-headless",
                TEST_URL,
                str(output_file)
            ],
            capture_output=True,
            text=True,
            timeout=120
        )
        assert result.returncode == 0, f"Archive failed: {result.stderr}"
        # Verify output exists
        assert output_file.exists(), "Output file not created"
        # Read and verify content
        html_content = output_file.read_text()
        file_size = output_file.stat().st_size
        # Should be substantial (embedded resources)
        assert file_size > 900, f"Output too small: {file_size} bytes"
        # Verify HTML structure (SingleFile minifies, so <head> tag may be omitted)
        assert "<html" in html_content.lower()
        assert "<body" in html_content.lower()
        assert "<title>" in html_content.lower() or "title>" in html_content.lower()
        # Verify example.com content is actually present
        assert "example domain" in html_content.lower(), "Missing 'Example Domain' title"
        assert "this domain is" in html_content.lower(), "Missing example.com description text"
        assert "iana.org" in html_content.lower(), "Missing IANA link"
        # Verify it's not just empty/error page
        # NOTE(review): duplicates the earlier size check with the same threshold.
        assert file_size > 900, f"File too small: {file_size} bytes"
@pytest.mark.skipif(not SINGLEFILE_CLI_AVAILABLE, reason="single-file CLI not installed")
def test_different_urls_produce_different_outputs():
    """Verify different URLs produce different archived content"""
    # NOTE: live network test — archives two sites and compares the results.
    with tempfile.TemporaryDirectory() as tmpdir:
        outputs = {}
        for url in ["https://example.com", "https://example.org"]:
            # e.g. https://example.com -> example_com.html
            output_file = Path(tmpdir) / f"{url.replace('https://', '').replace('.', '_')}.html"
            result = subprocess.run(
                ["single-file", "--browser-headless", url, str(output_file)],
                capture_output=True,
                timeout=120
            )
            if result.returncode == 0 and output_file.exists():
                outputs[url] = output_file.read_text()
        assert len(outputs) == 2, "Should archive both URLs"
        # Verify outputs differ
        urls = list(outputs.keys())
        assert outputs[urls[0]] != outputs[urls[1]], "Different URLs should produce different outputs"
        # Each should contain its domain
        assert "example.com" in outputs[urls[0]]
        assert "example.org" in outputs[urls[1]]

View File

@@ -0,0 +1,385 @@
/**
* Unit tests for singlefile plugin
*
* Run with: node --test tests/test_singlefile.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
// Shared scratch directories, created in before() and removed in after()
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
const TEST_DOWNLOADS_DIR = path.join(TEST_DIR, 'chrome_downloads');
describe('singlefile plugin', () => {
  // Create/remove the fixture root around the whole suite
  before(() => {
    if (!fs.existsSync(TEST_DIR)) {
      fs.mkdirSync(TEST_DIR, { recursive: true });
    }
  });
  after(() => {
    if (fs.existsSync(TEST_DIR)) {
      fs.rmSync(TEST_DIR, { recursive: true, force: true });
    }
  });
  // Loads the real module; note require() caches it between tests
  describe('EXTENSION metadata', () => {
    it('should have correct webstore_id', () => {
      const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
      assert.strictEqual(EXTENSION.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
    });
    it('should have correct name', () => {
      const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
      assert.strictEqual(EXTENSION.name, 'singlefile');
    });
  });
  describe('installSinglefileExtension', () => {
    beforeEach(() => {
      process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
      if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
        fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
      }
    });
    afterEach(() => {
      if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
        fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
      }
      delete process.env.CHROME_EXTENSIONS_DIR;
    });
    it('should use cached extension if available', async () => {
      const { installSinglefileExtension } = require('../on_Snapshot__04_singlefile.js');
      // Create fake cache: the record file plus an unpacked dir with a manifest
      const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'singlefile.extension.json');
      const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_singlefile');
      fs.mkdirSync(fakeExtensionDir, { recursive: true });
      fs.writeFileSync(
        path.join(fakeExtensionDir, 'manifest.json'),
        JSON.stringify({ version: '1.22.90' })
      );
      const fakeCache = {
        webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
        name: 'singlefile',
        unpacked_path: fakeExtensionDir,
        version: '1.22.90'
      };
      fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
      const result = await installSinglefileExtension();
      assert.notStrictEqual(result, null);
      assert.strictEqual(result.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
    });
  });
  // NOTE(review): most cases below assert on literals defined inside the test
  // itself rather than calling plugin code — they document intent but cannot
  // fail if the implementation changes.
  describe('saveSinglefileWithExtension', () => {
    beforeEach(() => {
      process.env.CHROME_DOWNLOADS_DIR = TEST_DOWNLOADS_DIR;
      if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
        fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
      }
    });
    afterEach(() => {
      if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
        fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
      }
      delete process.env.CHROME_DOWNLOADS_DIR;
    });
    it('should require extension and version to be present', () => {
      const mockExtension = {
        name: 'singlefile',
        version: '1.22.96',
        id: 'test_id'
      };
      assert.ok(mockExtension.version);
      assert.ok(mockExtension.id);
    });
    it('should filter unsupported URL schemes', () => {
      const unsupportedSchemes = [
        'about:',
        'chrome:',
        'chrome-extension:',
        'data:',
        'javascript:',
        'blob:'
      ];
      unsupportedSchemes.forEach(scheme => {
        const testUrl = scheme + 'something';
        const urlScheme = testUrl.split(':')[0];
        assert.ok(unsupportedSchemes.some(s => s.startsWith(urlScheme)));
      });
    });
    it('should wait for file to appear in downloads directory', async () => {
      const checkDelay = 3000; // 3 seconds
      const maxTries = 10;
      // Total max wait time
      const maxWaitTime = checkDelay * maxTries;
      assert.strictEqual(maxWaitTime, 30000); // 30 seconds
    });
    it('should find downloaded file by checking URL in HTML header', () => {
      const testUrl = 'https://example.com';
      const mockHtml = `<!-- url: ${testUrl} --><html><head><meta charset="utf-8"></head></html>`;
      // Should be able to extract URL from header
      const headerPart = mockHtml.split('meta charset')[0];
      assert.ok(headerPart.includes(`url: ${testUrl}`));
    });
    it('should move file from downloads to output directory', () => {
      const downloadPath = path.join(TEST_DOWNLOADS_DIR, 'temp_file.html');
      const outputDir = 'singlefile';
      const outputFile = 'singlefile.html';
      const outputPath = path.join(outputDir, outputFile);
      // Verify paths are different
      assert.notStrictEqual(downloadPath, outputPath);
    });
  });
  describe('saveSinglefileWithCLI', () => {
    it('should use single-file-cli as fallback', () => {
      const cliCommand = 'single-file';
      // Should check for CLI availability
      assert.strictEqual(typeof cliCommand, 'string');
      assert.ok(cliCommand.length > 0);
    });
    it('should pass correct arguments to CLI', () => {
      const args = [
        '--browser-headless',
        'https://example.com',
        'singlefile/singlefile.html'
      ];
      assert.ok(args.includes('--browser-headless'));
      assert.ok(args.some(arg => arg.startsWith('http')));
    });
    it('should handle optional CLI arguments', () => {
      const options = {
        userAgent: 'Mozilla/5.0...',
        cookiesFile: '/path/to/cookies.txt',
        ignoreSSL: true
      };
      // Optional args should be conditionally added
      if (options.userAgent) {
        assert.ok(options.userAgent.length > 0);
      }
      if (options.ignoreSSL) {
        assert.strictEqual(options.ignoreSSL, true);
      }
    });
  });
  // Hook filenames encode run order: on_Snapshot__NN_<name>.js
  describe('priority and execution order', () => {
    it('should have priority 04 (early)', () => {
      const filename = 'on_Snapshot__04_singlefile.js';
      const match = filename.match(/on_Snapshot__(\d+)_/);
      assert.ok(match);
      const priority = parseInt(match[1]);
      assert.strictEqual(priority, 4);
    });
    it('should run before chrome_session (priority 20)', () => {
      const extensionPriority = 4;
      const chromeSessionPriority = 20;
      assert.ok(extensionPriority < chromeSessionPriority);
    });
    it('should install extensions in correct order', () => {
      const priorities = {
        captcha2: 1,
        istilldontcareaboutcookies: 2,
        ublock: 3,
        singlefile: 4
      };
      // Should be in ascending order
      assert.ok(priorities.captcha2 < priorities.istilldontcareaboutcookies);
      assert.ok(priorities.istilldontcareaboutcookies < priorities.ublock);
      assert.ok(priorities.ublock < priorities.singlefile);
    });
  });
  describe('output structure', () => {
    it('should define output directory and file', () => {
      const OUTPUT_DIR = 'singlefile';
      const OUTPUT_FILE = 'singlefile.html';
      assert.strictEqual(OUTPUT_DIR, 'singlefile');
      assert.strictEqual(OUTPUT_FILE, 'singlefile.html');
    });
    it('should create output directory if not exists', () => {
      const outputDir = path.join(TEST_DIR, 'singlefile');
      // Should create directory
      if (!fs.existsSync(outputDir)) {
        fs.mkdirSync(outputDir, { recursive: true });
      }
      assert.ok(fs.existsSync(outputDir));
      // Cleanup
      fs.rmSync(outputDir, { recursive: true });
    });
  });
  describe('extension vs CLI fallback', () => {
    it('should prefer extension over CLI', () => {
      const preferenceOrder = [
        'extension',
        'cli'
      ];
      assert.strictEqual(preferenceOrder[0], 'extension');
      assert.strictEqual(preferenceOrder[1], 'cli');
    });
    it('should fallback to CLI if extension unavailable', () => {
      const extensionAvailable = false;
      const cliAvailable = true;
      let method;
      if (extensionAvailable) {
        method = 'extension';
      } else if (cliAvailable) {
        method = 'cli';
      }
      assert.strictEqual(method, 'cli');
    });
    it('should use extension if available', () => {
      const extensionAvailable = true;
      let method;
      if (extensionAvailable) {
        method = 'extension';
      } else {
        method = 'cli';
      }
      assert.strictEqual(method, 'extension');
    });
  });
  describe('file matching and validation', () => {
    beforeEach(() => {
      if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
        fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
      }
    });
    afterEach(() => {
      if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
        fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
      }
    });
    it('should filter HTML files from downloads', () => {
      // Create mock download files
      const files = [
        'example.html',
        'test.pdf',
        'image.png',
        'page.html'
      ];
      const htmlFiles = files.filter(f => f.endsWith('.html'));
      assert.strictEqual(htmlFiles.length, 2);
      assert.ok(htmlFiles.includes('example.html'));
      assert.ok(htmlFiles.includes('page.html'));
    });
    it('should match URL in HTML header comment', () => {
      const testUrl = 'https://example.com/page';
      const htmlContent = `<!--
Page saved with SingleFile
url: ${testUrl}
saved date: 2024-01-01
-->
<html>...</html>`;
      const headerSection = htmlContent.split('meta charset')[0] || htmlContent.split('<html>')[0];
      assert.ok(headerSection.includes(`url: ${testUrl}`));
    });
    it('should handle multiple new files in downloads', () => {
      const filesBefore = new Set(['old1.html', 'old2.html']);
      const filesAfter = ['old1.html', 'old2.html', 'new1.html', 'new2.html'];
      const filesNew = filesAfter.filter(f => !filesBefore.has(f));
      assert.strictEqual(filesNew.length, 2);
      assert.ok(filesNew.includes('new1.html'));
      assert.ok(filesNew.includes('new2.html'));
    });
  });
  // NOTE(review): these cases also assert on local literals only; they do not
  // exercise the plugin's actual timeout/error paths.
  describe('error handling', () => {
    it('should timeout after max wait time', () => {
      const checkDelay = 3000; // ms
      const maxTries = 10;
      const timeoutMs = checkDelay * maxTries;
      assert.strictEqual(timeoutMs, 30000); // 30 seconds
    });
    it('should handle missing extension gracefully', () => {
      const extension = null;
      if (!extension || !extension.version) {
        // Should throw error
        assert.ok(true);
      }
    });
    it('should handle file not found after waiting', () => {
      const filesNew = [];
      const maxWaitReached = true;
      if (filesNew.length === 0 && maxWaitReached) {
        // Should return null
        const result = null;
        assert.strictEqual(result, null);
      }
    });
  });
});

View File

@@ -0,0 +1,141 @@
"""
Unit tests for singlefile plugin
Tests invoke the plugin hook as an external process and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
def test_install_script_exists():
    """Sanity check: the Node install hook ships alongside this test package."""
    missing_msg = f"Install script not found: {INSTALL_SCRIPT}"
    assert INSTALL_SCRIPT.exists(), missing_msg
def test_extension_metadata():
    """Test that SingleFile extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
        # Load the module in node and dump its EXTENSION constant as JSON
        result = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env
        )
        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
        metadata = json.loads(result.stdout)
        # Chrome Web Store ID of the SingleFile extension
        assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert metadata["name"] == "singlefile"
def test_install_creates_cache():
    """Test that install creates extension cache"""
    # NOTE: runs the installer via node — may download the extension (network).
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Check output mentions installation
        assert "SingleFile" in result.stdout or "singlefile" in result.stdout
        # Check cache file was created
        cache_file = ext_dir / "singlefile.extension.json"
        assert cache_file.exists(), "Cache file should be created"
        # Verify cache content
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert cache_data["name"] == "singlefile"
def test_install_uses_existing_cache():
    """Test that install uses existing cache when available"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Create a fake unpacked extension with a manifest...
        fake_extension_dir = ext_dir / "mpiodijhokgodhhofbcjdecpffjipkle__singlefile"
        fake_extension_dir.mkdir(parents=True)
        manifest = {"version": "1.22.96", "name": "SingleFile"}
        (fake_extension_dir / "manifest.json").write_text(json.dumps(manifest))
        # ...and the cache record the installer actually reads. Without
        # singlefile.extension.json the installer ignores the unpacked dir and
        # re-installs, so this test would never exercise the cache path.
        cache_record = {
            "webstore_id": "mpiodijhokgodhhofbcjdecpffjipkle",
            "name": "singlefile",
            "unpacked_path": str(fake_extension_dir),
            "version": "1.22.96",
        }
        (ext_dir / "singlefile.extension.json").write_text(json.dumps(cache_record))
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        # Should use cache or install successfully
        assert result.returncode == 0
def test_no_configuration_required():
    """Test that SingleFile works without configuration"""
    # Unlike e.g. captcha plugins, SingleFile needs no API keys or secrets.
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # No API keys needed
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should work without API keys
        assert result.returncode == 0
def test_priority_order():
    """The install hook's filename must encode priority 04 (runs early)."""
    hook_name = INSTALL_SCRIPT.name
    assert "04" in hook_name, "SingleFile should have priority 04"
    assert hook_name.startswith("on_Snapshot__04_"), "Should follow priority naming convention"
def test_output_directory_structure():
    """The hook source should reference the singlefile output dir and an HTML artifact."""
    script_content = INSTALL_SCRIPT.read_text()
    lowered = script_content.lower()
    # Should mention singlefile output directory
    assert "singlefile" in lowered
    # Should mention HTML output
    assert ".html" in script_content or "html" in lowered

View File

@@ -0,0 +1,243 @@
#!/usr/bin/env node
/**
* Extract SSL/TLS certificate details from a URL.
*
* Connects to Chrome session and retrieves security details including:
* - Protocol (TLS 1.2, TLS 1.3, etc.)
* - Cipher suite
* - Certificate issuer, validity period
* - Security state
*
* Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<uuid>
* Output: Writes ssl/ssl.json
*
* Environment variables:
* SAVE_SSL: Enable SSL extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'ssl';                 // name reported in RESULT_JSON
const OUTPUT_DIR = 'ssl';                     // output directory, relative to cwd
const OUTPUT_FILE = 'ssl.json';               // JSON report filename
const CHROME_SESSION_DIR = 'chrome_session';  // where chrome_session writes cdp_url.txt
// Parse command line arguments
/**
 * Parse `--key=value` CLI flags from process.argv into an object.
 * Dashes in keys become underscores; a bare `--flag` maps to `true`.
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...valueParts] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = valueParts.join('=') || true;
  }
  return parsed;
}
// Environment helpers
/** Read an env var, trimmed; falls back to defaultValue when unset/empty. */
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}

/** Read a boolean env var (true/1/yes/on, false/0/no/off); otherwise defaultValue. */
function getEnvBool(name, defaultValue = false) {
  const flag = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(flag)) return true;
  if (['false', '0', 'no', 'off'].includes(flag)) return false;
  return defaultValue;
}
// Get CDP URL from chrome_session
// Returns the trimmed WebSocket endpoint, or null when no session was recorded.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) return null;
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Extract SSL details
/**
 * Connect to the shared Chrome session and collect TLS/security details for
 * `url` via the CDP Security domain. Writes ssl/ssl.json on success.
 *
 * @param {string} url - The URL being archived (must be https)
 * @returns {Promise<{success: boolean, output?: string, sslInfo?: Object, error?: string}>}
 */
async function extractSsl(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Only extract SSL for HTTPS URLs
  if (!url.startsWith('https://')) {
    return { success: false, error: 'URL is not HTTPS' };
  }
  let browser = null;
  let sslInfo = {};
  try {
    // Connect to the Chrome session left by the chrome_session extractor
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page: prefer one already navigated to an http(s) URL
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Get CDP client for low-level access
    const client = await page.target().createCDPSession();
    // Enable Security domain
    await client.send('Security.enable');
    // Get security details from the loaded page
    const securityState = await client.send('Security.getSecurityState');
    sslInfo = {
      url,
      securityState: securityState.securityState,
      schemeIsCryptographic: securityState.schemeIsCryptographic,
      summary: securityState.summary || '',
    };
    // Try to get detailed certificate info if available
    if (securityState.securityStateIssueIds && securityState.securityStateIssueIds.length > 0) {
      sslInfo.issues = securityState.securityStateIssueIds;
    }
    // Get response security details from navigation
    // NOTE(review): this listener is attached AFTER the page already loaded
    // and nothing here triggers another navigation, so mainResponse appears
    // to stay null and the certificate-detail branch below never runs —
    // confirm whether a page.goto()/reload was intended before the listener.
    let mainResponse = null;
    page.on('response', async (response) => {
      if (response.url() === url || response.request().isNavigationRequest()) {
        mainResponse = response;
      }
    });
    // If we have security details from response
    if (mainResponse) {
      try {
        const securityDetails = await mainResponse.securityDetails();
        if (securityDetails) {
          sslInfo.protocol = securityDetails.protocol();
          sslInfo.subjectName = securityDetails.subjectName();
          sslInfo.issuer = securityDetails.issuer();
          sslInfo.validFrom = securityDetails.validFrom();
          sslInfo.validTo = securityDetails.validTo();
          sslInfo.certificateId = securityDetails.subjectName();
          const sanList = securityDetails.sanList();
          if (sanList && sanList.length > 0) {
            sslInfo.subjectAlternativeNames = sanList;
          }
        }
      } catch (e) {
        // Security details not available
      }
    }
    await client.detach();
    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2));
    return { success: true, output: outputPath, sslInfo };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect (not close) so the shared browser keeps serving other extractors
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entry point: parse args, run extractSsl, and emit the orchestrator
 * protocol (START_TS/END_TS/STATUS/RESULT_JSON lines) on stdout.
 * Exits 0 on success or skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled
    if (!getEnvBool('SAVE_SSL', true)) {
      console.log('Skipping SSL (SAVE_SSL=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await extractSsl(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const protocol = result.sslInfo?.protocol || 'unknown';
      console.log(`SSL details extracted: ${protocol}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level invocation: fail with a non-zero exit code on any unhandled error
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""
Download static files (PDFs, images, archives, etc.) directly.
This extractor runs AFTER chrome_session and checks the Content-Type header
from chrome_session/response_headers.json to determine if the URL points to
a static file that should be downloaded directly.
Other extractors check for the presence of this extractor's output directory
to know if they should skip (since Chrome-based extractors can't meaningfully
process static files like PDFs, images, etc.).
Usage: on_Snapshot__21_staticfile.py --url=<url> --snapshot-id=<uuid>
Output: Downloads file to staticfile/<filename>
Environment variables:
STATICFILE_TIMEOUT: Timeout in seconds (default: 300)
STATICFILE_MAX_SIZE: Maximum file size in bytes (default: 1GB)
USER_AGENT: User agent string (optional)
CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
"""
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse, unquote
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'staticfile'
OUTPUT_DIR = 'staticfile'          # downloads land in ./staticfile/ relative to cwd
CHROME_SESSION_DIR = 'chrome_session'  # where the chrome_session hook wrote its output
# Content-Types that indicate static files
# These can't be meaningfully processed by Chrome-based extractors
STATIC_CONTENT_TYPES = {
    # Documents
    'application/pdf',
    'application/msword',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/vnd.ms-excel',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'application/vnd.ms-powerpoint',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'application/rtf',
    'application/epub+zip',
    # Images
    'image/png',
    'image/jpeg',
    'image/gif',
    'image/webp',
    'image/svg+xml',
    'image/x-icon',
    'image/bmp',
    'image/tiff',
    'image/avif',
    'image/heic',
    'image/heif',
    # Audio
    'audio/mpeg',
    'audio/mp3',
    'audio/wav',
    'audio/flac',
    'audio/aac',
    'audio/ogg',
    'audio/webm',
    'audio/m4a',
    'audio/opus',
    # Video
    'video/mp4',
    'video/webm',
    'video/x-matroska',
    'video/avi',
    'video/quicktime',
    'video/x-ms-wmv',
    'video/x-flv',
    # Archives
    'application/zip',
    'application/x-tar',
    'application/gzip',
    'application/x-bzip2',
    'application/x-xz',
    'application/x-7z-compressed',
    'application/x-rar-compressed',
    'application/vnd.rar',
    # Data
    'application/json',
    'application/xml',
    'text/csv',
    'text/xml',
    'application/x-yaml',
    # Executables/Binaries
    'application/octet-stream',  # Generic binary
    'application/x-executable',
    'application/x-msdos-program',
    'application/x-apple-diskimage',
    'application/vnd.debian.binary-package',
    'application/x-rpm',
    # Other
    'application/x-bittorrent',
    'application/wasm',
}
# Also check Content-Type prefixes for categories
# NOTE(review): 'application/x-' is very broad — it also matches types like
# application/x-www-form-urlencoded that are not downloadable files; confirm
# this over-matching is intentional before relying on it.
STATIC_CONTENT_TYPE_PREFIXES = (
    'image/',
    'audio/',
    'video/',
    'application/zip',
    'application/x-',
)
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable `name`, stripped of surrounding whitespace.

    Falls back to `default` (also stripped) when the variable is unset.
    """
    value = os.environ.get(name, default)
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean environment variable.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive, whitespace
    ignored); any other value, including unset, yields `default`.
    """
    raw = os.environ.get(name, '').strip().lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    if raw in ('false', '0', 'no', 'off'):
        return False
    return default
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer environment variable, returning `default` when unset
    or when the value is not a valid integer."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def get_content_type_from_chrome_session() -> str | None:
    """Read the Content-Type recorded by the chrome_session extractor.

    Returns the bare MIME type (lowercase, parameters such as charset removed),
    or None when the headers file is missing or unreadable.
    """
    headers_path = Path(CHROME_SESSION_DIR) / 'response_headers.json'
    if not headers_path.exists():
        return None
    try:
        headers = json.loads(headers_path.read_text())
        # Headers might be nested or flat depending on chrome_session format
        raw = headers.get('content-type') or headers.get('Content-Type') or ''
        # Drop parameters like "; charset=utf-8" and normalize case.
        return raw.split(';')[0].strip().lower()
    except Exception:
        return None
def is_static_content_type(content_type: str) -> bool:
    """Return True when the MIME type should be downloaded directly rather
    than rendered in Chrome (exact match or category-prefix match)."""
    if not content_type:
        return False
    if content_type in STATIC_CONTENT_TYPES:
        return True
    return any(content_type.startswith(prefix) for prefix in STATIC_CONTENT_TYPE_PREFIXES)
def get_filename_from_url(url: str) -> str:
    """Derive a safe local filename from the URL path.

    Uses the last path segment (percent-decoded), falls back to
    'downloaded_file' for empty paths, replaces path separators, and caps the
    result at 200 characters.
    """
    decoded_path = unquote(urlparse(url).path)
    name = decoded_path.rsplit('/', 1)[-1] or 'downloaded_file'
    # Neutralize any separator characters that survived percent-decoding.
    for sep in ('/', '\\'):
        name = name.replace(sep, '_')
    return name[:200]
def download_file(url: str) -> tuple[bool, str | None, str]:
    """
    Download a static file into OUTPUT_DIR.

    Streams the response in 8 KB chunks so large files never have to fit in
    memory, and aborts (removing the partial file) when STATICFILE_MAX_SIZE
    is exceeded.

    Returns: (success, output_path, error_message)
    """
    import re
    import requests
    timeout = get_env_int('STATICFILE_TIMEOUT', 300)
    max_size = get_env_int('STATICFILE_MAX_SIZE', 1024 * 1024 * 1024)  # 1GB default
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
    headers = {'User-Agent': user_agent}
    try:
        # Stream download to handle large files
        response = requests.get(
            url,
            headers=headers,
            timeout=timeout,
            stream=True,
            verify=check_ssl,
            allow_redirects=True,
        )
        response.raise_for_status()
        # Fast-path rejection when the server declares the size up front.
        content_length = response.headers.get('content-length')
        if content_length and int(content_length) > max_size:
            return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'
        # Create output directory
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(exist_ok=True)
        # Determine filename (URL-derived fallback is already sanitized)
        filename = get_filename_from_url(url)
        # Prefer the server-provided filename from Content-Disposition, but
        # sanitize it: a hostile `filename=../../x` header must not be able to
        # escape OUTPUT_DIR, so strip any path components and cap the length.
        content_disp = response.headers.get('content-disposition', '')
        if 'filename=' in content_disp:
            match = re.search(r'filename[*]?=["\']?([^"\';\n]+)', content_disp)
            if match:
                candidate = match.group(1).strip()
                candidate = candidate.replace('\\', '/').rsplit('/', 1)[-1]
                if candidate:
                    filename = candidate[:200]
        output_path = output_dir / filename
        # Download in chunks, enforcing the size cap as we go
        downloaded_size = 0
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    downloaded_size += len(chunk)
                    if downloaded_size > max_size:
                        f.close()
                        output_path.unlink()  # remove the partial download
                        return False, None, f'File too large: exceeded {max_size} bytes'
                    f.write(chunk)
        return True, str(output_path), ''
    except requests.exceptions.Timeout:
        return False, None, f'Timed out after {timeout} seconds'
    except requests.exceptions.SSLError as e:
        return False, None, f'SSL error: {e}'
    except requests.exceptions.RequestException as e:
        return False, None, f'Download failed: {e}'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to download')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download static files based on Content-Type from chrome_session.

    Emits the START_TS/END_TS/STATUS/RESULT_JSON key=value protocol on stdout.
    Exits 0 on success or skip, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    # Check Content-Type from chrome_session's response headers
    content_type = get_content_type_from_chrome_session()
    # If chrome_session didn't run or no Content-Type, skip
    if not content_type:
        print(f'No Content-Type found (chrome_session may not have run)')
        print(f'START_TS={start_ts.isoformat()}')
        print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
        print(f'STATUS=skipped')
        print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
        sys.exit(0)  # Permanent skip - can't determine content type
    # If not a static file type, skip (this is the normal case for HTML pages)
    if not is_static_content_type(content_type):
        print(f'Not a static file (Content-Type: {content_type})')
        print(f'START_TS={start_ts.isoformat()}')
        print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
        print(f'STATUS=skipped')
        print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id, "content_type": content_type})}')
        sys.exit(0)  # Permanent skip - not a static file
    try:
        # Download the file
        print(f'Static file detected (Content-Type: {content_type}), downloading...')
        success, output, error = download_file(url)
        status = 'succeeded' if success else 'failed'
        if success and output:
            size = Path(output).stat().st_size
            print(f'Static file downloaded ({size} bytes): {output}')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result (machine-readable summary mirroring the lines above)
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'content_type': content_type,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,262 @@
#!/usr/bin/env node
/**
* Extract the title of a URL.
*
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP
* to get the page title (which includes JS-rendered content).
* Otherwise falls back to fetching the URL and parsing HTML.
*
* Usage: on_Snapshot__10_title.js --url=<url> --snapshot-id=<uuid>
* Output: Writes title/title.txt
*
* Environment variables:
* TIMEOUT: Timeout in seconds (default: 30)
* USER_AGENT: User agent string (optional)
*/
const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');
// Extractor metadata
// Paths are resolved relative to the process cwd (the tests below run the hook
// with cwd set to the snapshot's working directory).
const EXTRACTOR_NAME = 'title';
const OUTPUT_DIR = 'title';
const OUTPUT_FILE = 'title.txt';
const CHROME_SESSION_DIR = 'chrome_session';  // written by the chrome_session hook
// Parse `--key=value` CLI flags into an object.
// `--snapshot-id` style keys become snake_case (`snapshot_id`); a bare
// `--flag` (no value) is stored as `true`; non-flag tokens are ignored.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, trimmed, with a fallback default.
// Note: an env var set to the empty string falls back to defaultValue
// (|| semantics), matching how the other hooks read configuration.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Read an integer environment variable; non-numeric or unset values
// fall back to defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const raw = (process.env[name] || String(defaultValue)).trim();
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Return the DevTools websocket URL written by the chrome_session hook,
// or null when no session file exists (forces the HTTP fallback path).
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Extract a page title from raw HTML.
// Prefers the <title> tag, then og:title, then twitter:title; returns null
// when none is found. Meta attribute order is not fixed in real-world HTML,
// so both `property=... content=...` and `content=... property=...` are
// accepted (the original only matched the former).
function extractTitleFromHtml(html) {
  // Try <title> tag
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  if (titleMatch) {
    return titleMatch[1].trim();
  }
  // Helper: find <meta attr="value" content="..."> in either attribute order.
  const metaContent = (attrName, attrValue) => {
    const forward = new RegExp(
      `<meta[^>]+${attrName}=["']${attrValue}["'][^>]+content=["']([^"']+)["']`, 'i');
    const reversed = new RegExp(
      `<meta[^>]+content=["']([^"']+)["'][^>]+${attrName}=["']${attrValue}["']`, 'i');
    const m = html.match(forward) || html.match(reversed);
    return m ? m[1].trim() : null;
  };
  // Try og:title, then twitter:title
  return metaContent('property', 'og:title') || metaContent('name', 'twitter:title');
}
// Fetch URL and extract title (fallback method when no Chrome session exists).
// Buffers at most the first 64KB of the body (titles live in <head>); follows
// redirects up to `redirectsLeft` hops (default 5) to avoid redirect loops.
// Returns a Promise resolving to the title string, rejecting on failure.
//
// Fixes vs. the original: when the 64KB cap was hit, the request was destroyed
// without attempting extraction, so a title already present in the buffer was
// lost and the promise settled via the 'error' handler; redirects were also
// followed without any depth limit.
function fetchTitle(url, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    const timeout = getEnvInt('TIMEOUT', 30) * 1000;
    const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
    const client = url.startsWith('https') ? https : http;
    const req = client.get(url, {
      headers: { 'User-Agent': userAgent },
      timeout,
    }, (res) => {
      // Handle redirects (bounded)
      if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        if (redirectsLeft <= 0) {
          reject(new Error('Too many redirects'));
          return;
        }
        fetchTitle(res.headers.location, redirectsLeft - 1).then(resolve).catch(reject);
        return;
      }
      let data = '';
      res.on('data', chunk => {
        data += chunk;
        // Only need first 64KB to find title
        if (data.length > 65536) {
          // Settle from what we already buffered before aborting the request;
          // a later 'error' from destroy() is ignored once settled.
          const title = extractTitleFromHtml(data);
          if (title) {
            resolve(title);
          } else {
            reject(new Error('No title found in first 64KB of HTML'));
          }
          req.destroy();
        }
      });
      res.on('end', () => {
        const title = extractTitleFromHtml(data);
        if (title) {
          resolve(title);
        } else {
          reject(new Error('No title found in HTML'));
        }
      });
    });
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
  });
}
// Read the title of the already-loaded page in the shared Chrome session
// via the CDP websocket endpoint. Throws if the session has no pages.
async function getTitleFromCdp(cdpUrl) {
  const puppeteer = require('puppeteer-core');
  const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
  try {
    // Prefer a page that has actually navigated somewhere.
    const pages = await browser.pages();
    const target = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!target) {
      throw new Error('No page found in Chrome session');
    }
    const title = await target.title();
    if (title) {
      return title;
    }
    // Empty <title>: fall back to metadata / first heading in the DOM.
    return await target.evaluate(() => {
      return document.title ||
        document.querySelector('meta[property="og:title"]')?.content ||
        document.querySelector('meta[name="twitter:title"]')?.content ||
        document.querySelector('h1')?.textContent?.trim();
    });
  } finally {
    // Disconnect without closing the shared browser (other extractors use it).
    browser.disconnect();
  }
}
// Extract the page title, preferring the live Chrome session (captures
// JS-rendered titles) and falling back to a plain HTTP fetch.
// Returns { success, output, title, method } on success or
// { success: false, error } on failure.
async function extractTitle(url) {
  // Ensure output directory exists (no-op if already present).
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Try Chrome session first
  const cdpUrl = getCdpUrl();
  if (cdpUrl) {
    try {
      const cdpTitle = await getTitleFromCdp(cdpUrl);
      if (cdpTitle) {
        fs.writeFileSync(outputPath, cdpTitle, 'utf8');
        return { success: true, output: outputPath, title: cdpTitle, method: 'cdp' };
      }
    } catch (e) {
      console.error(`CDP title extraction failed: ${e.message}, falling back to HTTP`);
    }
  }
  // Fallback to HTTP fetch
  try {
    const httpTitle = await fetchTitle(url);
    fs.writeFileSync(outputPath, httpTitle, 'utf8');
    return { success: true, output: outputPath, title: httpTitle, method: 'http' };
  } catch (e) {
    return { success: false, error: e.message };
  }
}
/**
 * CLI entry point: parse --url/--snapshot-id, run the title extractor, and
 * report via the START_TS/END_TS/STATUS/RESULT_JSON stdout protocol.
 * Exits 0 on success, 1 on failure or bad arguments.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__10_title.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    const result = await extractTitle(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      // `method` is 'cdp' (Chrome session) or 'http' (fallback fetch)
      console.log(`Title extracted (${result.method}): ${result.title}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000; // seconds, from Date subtraction (ms)
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result (machine-readable summary; mirrors the KEY=VALUE lines above)
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level runner: any unhandled rejection becomes a fatal non-zero exit.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,241 @@
"""
Integration tests for title plugin
Tests verify:
1. Plugin script exists
2. Node.js is available
3. Title extraction works for real example.com
4. Output file contains actual page title
5. Handles various title sources (<title>, og:title, twitter:title)
6. Config options work (TIMEOUT, USER_AGENT)
7. Fallback to HTTP when chrome_session not available
"""
import shutil
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
# NOTE(review): the hook's own usage string says `on_Snapshot__10_title.js`,
# but this test expects `on_Snapshot__32_title.js` — confirm which priority
# number is current, or test_hook_script_exists will fail.
TITLE_HOOK = PLUGIN_DIR / 'on_Snapshot__32_title.js'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """Verify hook script exists."""
    assert TITLE_HOOK.exists(), f"Hook script not found: {TITLE_HOOK}"


def test_extracts_title_from_example_com():
    """Test full workflow: extract title from real example.com."""
    # NOTE: integration test — requires node and live network access to example.com.
    # Check node is available
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run title extraction (cwd=tmpdir so the hook writes title/ there)
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # Verify output in stdout
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'Title extracted' in result.stdout, "Should report completion"
        # Verify output directory created
        title_dir = tmpdir / 'title'
        assert title_dir.exists(), "Output directory not created"
        # Verify output file exists
        title_file = title_dir / 'title.txt'
        assert title_file.exists(), "title.txt not created"
        # Verify title contains REAL example.com title
        title_text = title_file.read_text().strip()
        assert len(title_text) > 0, "Title should not be empty"
        assert 'example' in title_text.lower(), "Title should contain 'example'"
        # example.com has title "Example Domain"
        assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"
        # Verify RESULT_JSON is present
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
def test_falls_back_to_http_when_chrome_session_unavailable():
    """Test that title plugin falls back to HTTP when chrome_session unavailable."""
    # NOTE: requires node and live network access to example.com.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Don't create chrome_session directory - force HTTP fallback
        # Run title extraction
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        # Verify output exists and has real title
        output_title_file = tmpdir / 'title' / 'title.txt'
        assert output_title_file.exists(), "Output title.txt not created"
        title_text = output_title_file.read_text().strip()
        assert 'example' in title_text.lower()


def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set very short timeout (but example.com should still succeed)
        import os
        env = os.environ.copy()
        env['TIMEOUT'] = '5'
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    # NOTE(review): this only asserts on the success path; a non-zero exit is
    # silently accepted, so a UA-handling regression would not fail this test.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set custom user agent
        import os
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should succeed (example.com doesn't block)
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout


def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    # NOTE(review): assertions are skipped when the hook fails or writes no
    # output, so this is a smoke test rather than a strict check.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode == 0:
            output_title_file = tmpdir / 'title' / 'title.txt'
            if output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert len(title_text) > 0, "Title should not be empty"
                assert 'example' in title_text.lower()
def test_handles_404_gracefully():
    """Test that title plugin handles 404 pages.
    Note: example.com returns valid HTML even for 404 pages, so extraction may succeed
    with the generic "Example Domain" title.
    """
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # May succeed or fail depending on server behavior
        # example.com returns "Example Domain" even for 404s
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"


def test_handles_redirects():
    """Test that title plugin handles redirects correctly."""
    # NOTE: requires live network; http://example.com redirecting to https is
    # assumed — assertions only run on the success path.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # http://example.com redirects to https://example.com
        result = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=http://example.com', '--snapshot-id=testredirect'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # Should succeed and follow redirect
        if result.returncode == 0:
            output_title_file = tmpdir / 'title' / 'title.txt'
            if output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert 'example' in title_text.lower()


if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env node
/**
* uBlock Origin Extension Plugin
*
* Installs and configures the uBlock Origin Chrome extension for ad blocking
* and privacy protection during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
*
* Priority: 03 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* This extension automatically:
* - Blocks ads, trackers, and malware domains
* - Reduces page load time and bandwidth usage
* - Improves privacy during archiving
* - Removes clutter from archived pages
* - Uses efficient blocking with filter lists
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {
  webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',  // Chrome Web Store ID for uBlock Origin
  name: 'ublock',
};
// Get extensions directory from environment or use default
// (falls back to DATA_DIR/personas/<ACTIVE_PERSONA>/chrome_extensions,
// with './data' and 'Default' as last-resort defaults)
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
  path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Install the uBlock Origin extension via the shared extension utilities.
 * Returns the extension metadata object, or null when installation fails.
 */
async function installUblockExtension() {
  console.log('[*] Installing uBlock Origin extension...');
  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (installed) {
    console.log('[+] uBlock Origin extension installed');
    console.log('[+] Ads and trackers will be blocked during archiving');
    return installed;
  }
  console.error('[❌] Failed to install uBlock Origin extension');
  return null;
}
/**
 * Note: uBlock Origin works automatically with default filter lists.
 * No configuration needed - blocks ads, trackers, and malware domains out of the box.
 */
/**
 * Main entry point - install the extension before archiving.
 * Reuses a cached install when its manifest is still on disk; otherwise
 * installs fresh and persists metadata for the chrome_session hook.
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');
  // Fast path: reuse a previously-installed copy if its manifest still exists.
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      if (fs.existsSync(path.join(cached.unpacked_path, 'manifest.json'))) {
        console.log('[*] uBlock Origin extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Unreadable/invalid cache file: fall through and reinstall from scratch.
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }
  // Install extension
  const extension = await installUblockExtension();
  if (!extension) {
    return extension;
  }
  // Persist metadata so the chrome_session hook can load the unpacked extension.
  await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
  await fs.promises.writeFile(
    cacheFile,
    JSON.stringify(extension, null, 2)
  );
  console.log(`[+] Extension metadata written to ${cacheFile}`);
  return extension;
}
// Export functions for use by other plugins
// (main itself is intentionally not exported; it is the CLI entry point below)
module.exports = {
  EXTENSION,
  installUblockExtension,
};
// Run if executed directly
if (require.main === module) {
  main().then(() => {
    console.log('[✓] uBlock Origin extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] uBlock Origin extension setup failed:', err);
    process.exit(1);
  });
}

View File

@@ -0,0 +1,321 @@
/**
* Unit tests for ublock plugin
*
* Run with: node --test tests/test_ublock.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
// Created in the suite's before() hook and removed in after(); per-test
// extension dirs are managed by beforeEach/afterEach below.
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
describe('ublock plugin', () => {
before(() => {
if (!fs.existsSync(TEST_DIR)) {
fs.mkdirSync(TEST_DIR, { recursive: true });
}
});
after(() => {
if (fs.existsSync(TEST_DIR)) {
fs.rmSync(TEST_DIR, { recursive: true, force: true });
}
});
describe('EXTENSION metadata', () => {
it('should have correct webstore_id for uBlock Origin', () => {
const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
assert.strictEqual(EXTENSION.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
});
it('should have correct name', () => {
const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
assert.strictEqual(EXTENSION.name, 'ublock');
});
});
describe('installUblockExtension', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should use cached extension if available', async () => {
const { installUblockExtension } = require('../on_Snapshot__03_ublock.js');
// Create fake cache
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_ublock');
fs.mkdirSync(fakeExtensionDir, { recursive: true });
fs.writeFileSync(
path.join(fakeExtensionDir, 'manifest.json'),
JSON.stringify({ version: '1.67.0' })
);
const fakeCache = {
webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
name: 'ublock',
unpacked_path: fakeExtensionDir,
version: '1.67.0'
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const result = await installUblockExtension();
assert.notStrictEqual(result, null);
assert.strictEqual(result.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
});
it('should not require any configuration', async () => {
// uBlock Origin works out of the box with default filter lists
const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
assert.ok(EXTENSION);
// No config fields should be required
});
it('should have large download size (filter lists)', () => {
// uBlock Origin is typically larger than other extensions
// due to included filter lists (usually 3-5 MB)
const typicalSize = 4 * 1024 * 1024; // ~4 MB
const minExpectedSize = 2 * 1024 * 1024; // Minimum 2 MB
// Just verify we understand the expected size
assert.ok(typicalSize > minExpectedSize);
});
});
describe('cache file creation', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should create cache file with correct structure', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
const mockExtension = {
webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
name: 'ublock',
version: '1.68.0',
unpacked_path: path.join(TEST_EXTENSIONS_DIR, 'test_ublock'),
crx_path: path.join(TEST_EXTENSIONS_DIR, 'test_ublock.crx')
};
await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));
assert.ok(fs.existsSync(cacheFile));
const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
assert.strictEqual(cache.name, 'ublock');
assert.strictEqual(cache.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
});
});
describe('extension functionality', () => {
it('should work automatically with default filter lists', () => {
const features = {
automaticBlocking: true,
requiresConfiguration: false,
requiresApiKey: false,
defaultFilterLists: true,
blocksAds: true,
blocksTrackers: true,
blocksMalware: true
};
assert.strictEqual(features.automaticBlocking, true);
assert.strictEqual(features.requiresConfiguration, false);
assert.strictEqual(features.requiresApiKey, false);
assert.strictEqual(features.defaultFilterLists, true);
});
it('should not require runtime configuration', () => {
// uBlock Origin works purely via filter lists and content scripts
// No API keys or runtime configuration needed
const requiresRuntimeConfig = false;
const requiresApiKey = false;
assert.strictEqual(requiresRuntimeConfig, false);
assert.strictEqual(requiresApiKey, false);
});
it('should support standard filter list formats', () => {
const supportedFormats = [
'EasyList',
'EasyPrivacy',
'Malware Domains',
'Peter Lowe\'s List',
'uBlock Origin filters'
];
assert.ok(supportedFormats.length > 0);
// Should support multiple filter list formats
});
});
describe('priority and execution order', () => {
it('should have priority 03 (early)', () => {
const filename = 'on_Snapshot__03_ublock.js';
const match = filename.match(/on_Snapshot__(\d+)_/);
assert.ok(match);
const priority = parseInt(match[1]);
assert.strictEqual(priority, 3);
});
it('should run before chrome_session (priority 20)', () => {
const extensionPriority = 3;
const chromeSessionPriority = 20;
assert.ok(extensionPriority < chromeSessionPriority);
});
it('should run after cookie dismissal extension', () => {
const ublockPriority = 3;
const cookiesPriority = 2;
assert.ok(ublockPriority > cookiesPriority);
});
});
describe('performance considerations', () => {
  it('should benefit from caching due to large size', () => {
    // uBlock Origin's large size makes caching especially important.
    const averageDownloadTime = 10; // seconds
    const averageCacheCheckTime = 0.01; // seconds
    const speedup = averageDownloadTime / averageCacheCheckTime;
    // A cache hit should be at least two orders of magnitude faster.
    assert.ok(speedup > 100);
  });
  it('should not impact page load time significantly', () => {
    // While the extension is large on disk, its blocking is efficient.
    const traits = { efficientBlocking: true, minimalOverhead: true };
    assert.strictEqual(traits.efficientBlocking, true);
    assert.strictEqual(traits.minimalOverhead, true);
  });
});
describe('error handling', () => {
  beforeEach(() => {
    process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
    if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
  });
  afterEach(() => {
    if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
    delete process.env.CHROME_EXTENSIONS_DIR;
  });
  it('should handle corrupted cache gracefully', async () => {
    const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
    // Create corrupted cache
    fs.writeFileSync(cacheFile, 'invalid json content');
    const { installUblockExtension } = require('../on_Snapshot__03_ublock.js');
    // Mock loadOrInstallExtension to avoid actual download
    const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
    const originalFunc = extensionUtils.loadOrInstallExtension;
    extensionUtils.loadOrInstallExtension = async () => ({
      webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
      name: 'ublock',
      version: '1.68.0'
    });
    try {
      const result = await installUblockExtension();
      assert.notStrictEqual(result, null);
    } finally {
      // Restore the real implementation even if the call above throws,
      // so the mock cannot leak into other tests.
      extensionUtils.loadOrInstallExtension = originalFunc;
    }
  });
  it('should handle download timeout gracefully', () => {
    // For large extension like uBlock, timeout handling is important
    const timeoutSeconds = 120; // 2 minutes
    const minTimeout = 30; // Should allow at least 30 seconds
    assert.ok(timeoutSeconds > minTimeout);
  });
});
describe('filter list validation', () => {
  it('should have valid filter list format', () => {
    // Representative Adblock-syntax entries: network + cosmetic rules.
    const sampleFilters = [
      '||ads.example.com^',
      '||tracker.example.com^$third-party',
      '##.advertisement'
    ];
    // Every entry must be a non-empty string.
    for (const filter of sampleFilters) {
      assert.ok(typeof filter === 'string');
      assert.ok(filter.length > 0);
    }
  });
  it('should support cosmetic filters', () => {
    const cosmeticFilter = '##.banner-ad';
    // Cosmetic (element-hiding) rules are prefixed with ##.
    assert.ok(cosmeticFilter.startsWith('##'));
  });
  it('should support network filters', () => {
    const networkFilter = '||ads.example.com^';
    // Network rules use || anchors and/or ^ separators.
    assert.ok(networkFilter.includes('||') || networkFilter.includes('^'));
  });
});
});

View File

@@ -0,0 +1,148 @@
"""
Unit tests for ublock plugin
Tests invoke the plugin hook as an external process and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__03_ublock.js"
def test_install_script_exists():
    """The on_Snapshot install hook must be present in the plugin directory."""
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_extension_metadata():
    """Test that uBlock Origin extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = dict(os.environ)
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
        # Load the hook in node and dump its exported EXTENSION metadata.
        node_expr = f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"
        proc = subprocess.run(
            ["node", "-e", node_expr],
            capture_output=True,
            text=True,
            env=env,
        )
        assert proc.returncode == 0, f"Failed to load extension metadata: {proc.stderr}"
        metadata = json.loads(proc.stdout)
        assert metadata["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm"
        assert metadata["name"] == "ublock"
def test_install_creates_cache():
    """Test that install creates extension cache"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = dict(os.environ, CHROME_EXTENSIONS_DIR=str(ext_dir))
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=120,  # uBlock is large, may take longer to download
        )
        # The hook should mention the extension in its log output.
        assert "uBlock" in proc.stdout or "ublock" in proc.stdout
        # Installing must leave a metadata cache file behind.
        cache_path = ext_dir / "ublock.extension.json"
        assert cache_path.exists(), "Cache file should be created"
        cached = json.loads(cache_path.read_text())
        assert cached["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm"
        assert cached["name"] == "ublock"
def test_install_uses_existing_cache():
    """Test that install uses existing cache when available"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_root = Path(tmpdir) / "chrome_extensions"
        # Pre-populate an unpacked extension dir so no download is needed.
        unpacked = ext_root / "cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock"
        unpacked.mkdir(parents=True)
        (unpacked / "manifest.json").write_text(
            json.dumps({"version": "1.68.0", "name": "uBlock Origin"})
        )
        env = dict(os.environ, CHROME_EXTENSIONS_DIR=str(ext_root))
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        # Should use cache or install successfully
        assert proc.returncode == 0
def test_no_configuration_required():
    """Test that uBlock Origin works without configuration"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = dict(os.environ, CHROME_EXTENSIONS_DIR=str(ext_dir))
        # No API keys needed - works with default filter lists
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        # Should not require any API keys
        combined_output = proc.stdout + proc.stderr
        assert "API" not in combined_output or proc.returncode == 0
def test_large_extension_size():
    """Test that uBlock Origin is downloaded successfully despite large size"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = dict(os.environ, CHROME_EXTENSIONS_DIR=str(ext_dir))
        subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        # If extension was downloaded, verify it's substantial size
        crx_file = ext_dir / "cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock.crx"
        if crx_file.exists():
            # uBlock Origin with filter lists is typically 2-5 MB
            size_bytes = crx_file.stat().st_size
            assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"

View File

@@ -0,0 +1,80 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_WGET": {
"type": "boolean",
"default": true,
"description": "Enable wget archiving"
},
"SAVE_WARC": {
"type": "boolean",
"default": true,
"description": "Save WARC archive file"
},
"SAVE_WGET_REQUISITES": {
"type": "boolean",
"default": true,
"description": "Download page requisites (CSS, JS, images)"
},
"WGET_BINARY": {
"type": "string",
"default": "wget",
"description": "Path to wget binary"
},
"WGET_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for wget in seconds"
},
"WGET_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string for wget"
},
"WGET_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"x-aliases": ["CHECK_SSL_VALIDITY"],
"description": "Whether to verify SSL certificates"
},
"WGET_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"WGET_RESTRICT_FILE_NAMES": {
"type": "string",
"default": "windows",
"enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"],
"x-fallback": "RESTRICT_FILE_NAMES",
"description": "Filename restriction mode"
},
"WGET_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
"--no-verbose",
"--adjust-extension",
"--convert-links",
"--force-directories",
"--backup-converted",
"--span-hosts",
"--no-parent",
"-e", "robots=off"
],
"description": "Default wget arguments"
},
"WGET_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for wget (space-separated)"
}
}
}

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Validation hook for wget binary.
Runs at crawl start to verify wget is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# wget version string: "GNU Wget 1.24.5 built on ..."
first_line = result.stdout.strip().split('\n')[0]
# Extract version number
parts = first_line.split()
for i, part in enumerate(parts):
if part.lower() == 'wget' and i + 1 < len(parts):
return parts[i + 1]
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_wget() -> dict | None:
    """Locate the wget binary and describe it.

    Resolution order:
      1. abx-pkg's EnvProvider (richest metadata), when abx_pkg is importable
      2. an explicit WGET_BINARY env override (user config wins over PATH,
         matching the precedence used by the wget extractor hook itself)
      3. whatever `wget` is first on PATH

    Returns a dict with name/abspath/version/sha256/binprovider keys,
    or None if no usable binary was found.
    """
    # Try abx-pkg first
    try:
        from abx_pkg import Binary, EnvProvider

        class WgetBinary(Binary):
            name: str = 'wget'
            binproviders_supported = [EnvProvider()]

        loaded = WgetBinary().load()
        if loaded and loaded.abspath:
            return {
                'name': 'wget',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        pass  # abx_pkg not installed; fall back to a plain env/PATH search
    except Exception:
        pass  # abx_pkg lookup failed; fall back rather than crash validation
    # Fallback: prefer the explicit WGET_BINARY override, then PATH lookup.
    abspath = os.environ.get('WGET_BINARY', '') or shutil.which('wget')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'wget',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }
    return None
def main():
    """Validate the wget binary and emit JSONL records for the orchestrator.

    On success: prints an InstalledBinary record plus Machine config updates
    (WGET_BINARY, and WGET_VERSION when the version is known), then exits 0.
    On failure: prints a Dependency request record and exits 1.
    """
    result = find_wget()
    if result and result.get('abspath'):
        # Output InstalledBinary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Output Machine config update
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/WGET_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/WGET_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Output Dependency request so a provider hook can install wget
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'wget',
            'bin_providers': 'apt,brew,env',
        }))
        # Exit non-zero to indicate binary not found
        # (plain string: the original f-string had no placeholders)
        print("wget binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Validate and compute derived wget config values.
This hook runs early in the Crawl lifecycle to:
1. Validate config values with warnings (not hard errors)
2. Compute derived values (USE_WGET from SAVE_WGET/SAVE_WARC)
3. Check binary availability and version
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- InstalledBinary JSONL records to stdout when binaries are found
"""
import json
import os
import shutil
import subprocess
import sys
from abx_pkg import Binary, EnvProvider
# Read config from environment (already validated by JSONSchema)
def get_env(name: str, default: str = '') -> str:
    """Return the stripped value of env var ``name``, or ``default``."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized values yield ``default``."""
    value = get_env(name, '').lower()
    if value in ('true', '1', 'yes', 'on'):
        return True
    if value in ('false', '0', 'no', 'off'):
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var; malformed values yield ``default``."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def output_installed_binary(binary: Binary, name: str):
    """Print an InstalledBinary JSONL record for ``binary`` to stdout."""
    record = {
        'type': 'InstalledBinary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        # MACHINE_ID is injected by the orchestrator; empty when run standalone.
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
def main():
    """Validate wget config, compute derived values, and report binaries.

    Emits:
      - ``COMPUTED:KEY=VALUE`` lines on stdout (parsed by hooks.py into env)
      - InstalledBinary JSONL records on stdout when the binary is found
      - ``WARNING:`` / ``ERROR:`` lines on stderr
    Exits 1 only when a hard error occurred (binary missing while needed).
    """
    warnings = []
    errors = []
    computed = {}
    # Get config values
    save_wget = get_env_bool('SAVE_WGET', True)
    save_warc = get_env_bool('SAVE_WARC', True)
    # WGET_TIMEOUT takes precedence; fall back to the global TIMEOUT.
    wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    wget_binary = get_env('WGET_BINARY', 'wget')
    # Compute derived values: wget runs if either artifact is wanted.
    use_wget = save_wget or save_warc
    computed['USE_WGET'] = str(use_wget).lower()
    # Validate timeout with warning (not error)
    if use_wget and wget_timeout < 20:
        warnings.append(
            f"WGET_TIMEOUT={wget_timeout} is very low. "
            "wget may fail to archive sites if set to less than ~20 seconds. "
            "Consider setting WGET_TIMEOUT=60 or higher."
        )
    # Check binary availability using abx-pkg
    provider = EnvProvider()
    try:
        binary = Binary(name=wget_binary, binproviders=[provider]).load()
        binary_path = str(binary.abspath) if binary.abspath else ''
    except Exception:
        # Any abx-pkg failure is treated the same as "binary not found".
        binary = None
        binary_path = ''
    if not binary_path:
        if use_wget:
            errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set SAVE_WGET=false.")
        computed['WGET_BINARY'] = ''
    else:
        computed['WGET_BINARY'] = binary_path
        wget_version = str(binary.version) if binary.version else 'unknown'
        computed['WGET_VERSION'] = wget_version
        # Output InstalledBinary JSONL record
        output_installed_binary(binary, name='wget')
    # Check for compression support (--compression=auto is not accepted by
    # all wget builds, so probe the binary rather than assuming).
    if computed.get('WGET_BINARY'):
        try:
            result = subprocess.run(
                [computed['WGET_BINARY'], '--compression=auto', '--help'],
                capture_output=True, timeout=5
            )
            computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
        except Exception:
            computed['WGET_AUTO_COMPRESSION'] = 'false'
    # Output results
    # Format: KEY=VALUE lines that hooks.py will parse and add to env
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")
    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)
    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)
    # Exit with error if any hard errors
    sys.exit(1 if errors else 0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,325 @@
#!/usr/bin/env python3
"""
Archive a URL using wget.
Usage: on_Snapshot__wget.py --url=<url> --snapshot-id=<uuid>
Output: Downloads files to $PWD
Environment variables:
WGET_BINARY: Path to wget binary (optional, falls back to PATH)
WGET_TIMEOUT: Timeout in seconds (default: 60)
WGET_USER_AGENT: User agent string
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
WGET_COOKIES_FILE: Path to cookies file (optional)
WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows)
WGET_EXTRA_ARGS: Extra arguments for wget (space-separated)
# Wget feature toggles
SAVE_WGET: Enable wget archiving (default: True)
SAVE_WARC: Save WARC file (default: True)
SAVE_WGET_REQUISITES: Download page requisites (default: True)
# Fallback to ARCHIVING_CONFIG values if WGET_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
RESTRICT_FILE_NAMES: Fallback filename restriction
"""
import json
import os
import re
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'wget'
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
STATICFILE_DIR = 'staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
staticfile_dir = Path(STATICFILE_DIR)
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
def find_wget() -> str | None:
"""Find wget binary."""
wget = get_env('WGET_BINARY')
if wget and os.path.isfile(wget):
return wget
return shutil.which('wget')
def get_version(binary: str) -> str:
    """Return the first line of ``binary --version`` (truncated), or ''."""
    try:
        proc = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
    except Exception:
        return ''
    return proc.stdout.split('\n')[0].strip()[:64]
def check_wget_compression(binary: str) -> bool:
    """Probe whether ``binary`` accepts the --compression=auto flag."""
    try:
        proc = subprocess.run(
            [binary, '--compression=auto', '--help'],
            capture_output=True,
            timeout=5,
        )
    except Exception:
        return False
    return proc.returncode == 0
# Default wget args (from old WGET_CONFIG)
WGET_DEFAULT_ARGS = [
    '--no-verbose',
    '--adjust-extension',
    '--convert-links',
    '--force-directories',
    '--backup-converted',
    '--span-hosts',
    '--no-parent',
    '-e', 'robots=off',
]


def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using wget.

    Builds the wget command from WGET_* env config (falling back to the
    generic ARCHIVING_CONFIG names), runs it in the current directory, and
    decides success by inspecting the filesystem rather than wget's exit
    code (which is nonzero when any single requisite fails).

    Returns: (success, output_path, error_message) where output_path is the
    main HTML file when one was downloaded.
    """
    # Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows')
    extra_args = get_env('WGET_EXTRA_ARGS', '')
    # Feature toggles
    save_warc = get_env_bool('SAVE_WARC', True)
    save_requisites = get_env_bool('SAVE_WGET_REQUISITES', True)
    # Check for compression support (not all wget builds accept the flag)
    supports_compression = check_wget_compression(binary)
    # Build wget command (later options take precedence)
    cmd = [
        binary,
        *WGET_DEFAULT_ARGS,
        f'--timeout={timeout}',
        '--tries=2',
    ]
    if user_agent:
        cmd.append(f'--user-agent={user_agent}')
    if restrict_names:
        cmd.append(f'--restrict-file-names={restrict_names}')
    if save_requisites:
        cmd.append('--page-requisites')
    if save_warc:
        # WARC output goes in a warc/ subdir, named by the current epoch second.
        warc_dir = Path('warc')
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
        cmd.append(f'--warc-file={warc_path}')
    else:
        # Without WARC, use timestamping to avoid re-downloading unchanged files.
        cmd.append('--timestamping')
    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--load-cookies', cookies_file])
    if supports_compression:
        cmd.append('--compression=auto')
    if not check_ssl:
        cmd.extend(['--no-check-certificate', '--no-hsts'])
    if extra_args:
        cmd.extend(extra_args.split())
    cmd.append(url)
    # Run wget
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            timeout=timeout * 2,  # Allow extra time for large downloads
        )
        # Find downloaded files (everything except the WARC output)
        downloaded_files = [
            f for f in Path('.').rglob('*')
            if f.is_file() and f.name != '.gitkeep' and not str(f).startswith('warc/')
        ]
        if not downloaded_files:
            # Nothing on disk: classify the failure from wget's output.
            stderr = result.stderr.decode('utf-8', errors='replace')
            stdout = result.stdout.decode('utf-8', errors='replace')
            combined = stderr + stdout
            if '403' in combined or 'Forbidden' in combined:
                return False, None, '403 Forbidden (try changing USER_AGENT)'
            elif '404' in combined or 'Not Found' in combined:
                return False, None, '404 Not Found'
            elif '500' in combined:
                return False, None, '500 Internal Server Error'
            else:
                return False, None, f'No files downloaded: {stderr[:200]}'
        # Find main HTML file (.htm/.html/.shtml, any case)
        html_files = [
            f for f in downloaded_files
            if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', str(f))
        ]
        output_path = str(html_files[0]) if html_files else str(downloaded_files[0])
        return True, output_path, ''
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout * 2} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using wget.

    Prints a line-oriented protocol to stdout (START_TS / END_TS / STATUS /
    CMD / VERSION / OUTPUT) plus a final RESULT_JSON summary line.
    Exits 0 on success or skip, 1 on failure or missing binary.
    """
    # Defaults for the summary; overwritten as the run progresses.
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''
    try:
        # Check if wget is enabled
        if not get_env_bool('SAVE_WGET', True):
            print('Skipping wget (SAVE_WGET=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)
        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping wget - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - staticfile already handled
        # Find binary; if absent, emit the dependency-request protocol lines
        # so the orchestrator can trigger an install and retry.
        binary = find_wget()
        if not binary:
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=apt install wget OR brew install wget', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        cmd_str = f'{binary} ... {url}'
        # Run extraction
        success, output, error = save_wget(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            # Count downloaded files
            files = list(Path('.').rglob('*'))
            file_count = len([f for f in files if f.is_file()])
            print(f'wget completed: {file_count} files downloaded')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,369 @@
"""
Integration tests for wget plugin
Tests verify:
1. Plugin reports missing dependency correctly
2. wget can be installed via brew/apt provider hooks
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
4. Extraction works against real example.com
5. Output files contain actual page content
6. Skip cases work (SAVE_WGET=False, staticfile present)
7. Failure cases handled (404, network errors)
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
import uuid
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The wget Snapshot hook script must exist in the plugin directory."""
    assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
def test_reports_missing_dependency_when_not_installed():
    """Test that script reports DEPENDENCY_NEEDED when wget is not found."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Run with empty PATH so binary won't be found
        env = {'PATH': '/nonexistent', 'HOME': str(workdir)}
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
        )
        # Should fail and report missing dependency
        assert proc.returncode != 0, "Should exit non-zero when dependency missing"
        combined = proc.stdout + proc.stderr
        assert 'DEPENDENCY_NEEDED' in combined, "Should output DEPENDENCY_NEEDED"
        assert 'wget' in combined.lower(), "Should mention wget"
        assert 'BIN_PROVIDERS' in combined, "Should report available providers (apt,brew,env)"
def test_can_install_wget_via_provider():
    """Test that wget can be installed via brew/apt provider hooks.

    Skips entirely when neither package manager is present on the host.
    """
    # Determine which provider to use
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
        provider_name = 'brew'
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
        provider_name = 'apt'
    else:
        pytest.skip("Neither brew nor apt available on this system")
    assert provider_hook.exists(), f"Provider hook not found: {provider_hook}"
    # Test installation via provider hook
    dependency_id = str(uuid.uuid4())
    result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', dependency_id,
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300  # Installation can take time
    )
    # Should succeed (wget installs successfully or is already installed)
    assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}"
    # Should output InstalledBinary JSONL record
    assert 'InstalledBinary' in result.stdout or 'wget' in result.stderr, \
        f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}"
    # Parse JSONL if present; non-JSON log lines are skipped.
    # NOTE(review): this only checks the first InstalledBinary record, and it
    # assumes binprovider is 'brew' or 'apt' — confirm a pre-installed wget
    # cannot be reported with binprovider='env' here.
    if result.stdout.strip():
        for line in result.stdout.strip().split('\n'):
            try:
                record = json.loads(line)
                if record.get('type') == 'InstalledBinary':
                    assert record['name'] == 'wget'
                    assert record['binprovider'] in ['brew', 'apt']
                    assert record['abspath'], "Should have binary path"
                    assert Path(record['abspath']).exists(), f"Binary should exist at {record['abspath']}"
                    break
            except json.JSONDecodeError:
                continue
    # Verify wget is now available
    result = subprocess.run(['which', 'wget'], capture_output=True, text=True)
    assert result.returncode == 0, "wget should be available after installation"
def test_archives_example_com():
    """Test full workflow: ensure wget installed then archive example.com.

    End-to-end: install wget via a provider hook (idempotent), run the
    Snapshot hook against the live https://example.com, then verify both
    the line protocol on stdout and the downloaded HTML content on disk.
    Requires network access.
    """
    # First ensure wget is installed via provider
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
    else:
        pytest.skip("Neither brew nor apt available")
    # Run installation (idempotent - will succeed if already installed)
    install_result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', str(uuid.uuid4()),
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300
    )
    if install_result.returncode != 0:
        pytest.skip(f"Could not install wget: {install_result.stderr}")
    # Now test archiving
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run wget extraction
        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=120
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # Verify output in stdout
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'wget completed' in result.stdout, "Should report completion"
        # Verify files were downloaded
        downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm'))
        assert len(downloaded_files) > 0, "No HTML files downloaded"
        # Find main HTML file (should contain example.com)
        main_html = None
        for html_file in downloaded_files:
            content = html_file.read_text(errors='ignore')
            if 'example domain' in content.lower():
                main_html = html_file
                break
        assert main_html is not None, "Could not find main HTML file with example.com content"
        # Verify HTML content contains REAL example.com text
        html_content = main_html.read_text(errors='ignore')
        assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
        assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
        assert ('this domain' in html_content.lower() or
                'illustrative examples' in html_content.lower()), \
            "Missing example.com description text"
        assert ('iana' in html_content.lower() or
                'more information' in html_content.lower()), \
            "Missing IANA reference"
        # Verify RESULT_JSON is present and valid
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
        for line in result.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.replace('RESULT_JSON=', ''))
                assert result_json['extractor'] == 'wget'
                assert result_json['status'] == 'succeeded'
                assert result_json['url'] == TEST_URL
                assert result_json['snapshot_id'] == 'test789'
                assert 'duration' in result_json
                assert result_json['duration'] >= 0
                break
def test_config_save_wget_false_skips():
    """Test that SAVE_WGET=False causes skip."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Disable the extractor via config env var.
        env = dict(os.environ, SAVE_WGET='False')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        # Should succeed but skip
        assert proc.returncode == 0, f"Should exit 0 when skipping: {proc.stderr}"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
        assert 'SAVE_WGET=False' in proc.stdout, "Should mention SAVE_WGET=False"
def test_config_save_warc():
    """Test that SAVE_WARC=True creates WARC files."""
    # Ensure wget is available
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Set SAVE_WARC=True explicitly
        env = dict(os.environ, SAVE_WARC='True')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testwarc'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        if proc.returncode == 0:
            # Look for WARC files in warc/ subdirectory
            warc_dir = workdir / 'warc'
            if warc_dir.exists():
                warc_files = [f for f in warc_dir.rglob('*') if f.is_file()]
                assert len(warc_files) > 0, "WARC file not created when SAVE_WARC=True"
def test_staticfile_present_skips():
    """Test that wget skips when staticfile already downloaded."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Create staticfile directory with content to simulate staticfile extractor ran
        (workdir / 'staticfile').mkdir()
        (workdir / 'staticfile' / 'index.html').write_text('<html>test</html>')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=30,
        )
        # Should skip
        assert proc.returncode == 0, "Should exit 0 when skipping"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
        assert 'staticfile' in proc.stdout.lower(), "Should mention staticfile"
def test_handles_404_gracefully():
    """Test that wget fails gracefully on 404."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Try to download non-existent page
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', 'https://example.com/nonexistent-page-404', '--snapshot-id', 'test404'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )
        # Should fail
        assert proc.returncode != 0, "Should fail on 404"
        combined = proc.stdout + proc.stderr
        assert '404' in combined or 'Not Found' in combined or 'No files downloaded' in combined, \
            "Should report 404 or no files downloaded"
def test_config_timeout_honored():
    """Test that WGET_TIMEOUT config is respected."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Set very short timeout
        env = dict(os.environ, WGET_TIMEOUT='5')
        # This should still succeed for example.com (it's fast)
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        # Verify it completed (success or fail, but didn't hang)
        assert proc.returncode in (0, 1), "Should complete (success or fail)"
def test_config_user_agent():
    """Test that WGET_USER_AGENT config is used."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Set custom user agent
        env = dict(os.environ, WGET_USER_AGENT='TestBot/1.0')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        # Should succeed (example.com doesn't block)
        if proc.returncode == 0:
            assert 'STATUS=succeeded' in proc.stdout
if __name__ == '__main__':
    # Allow running this integration test module directly (without the pytest CLI).
    pytest.main([__file__, '-v'])