logging and admin ui improvements

2026-04-04 23:07:56 +10:00 · 2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions
--- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
+++ b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'accessibility';
-const OUTPUT_DIR = 'accessibility';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'accessibility.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {

 // Extract accessibility info
 async function extractAccessibility(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    let browser = null;
--- a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
+++ b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
@@ -24,7 +24,7 @@ import rich_click as click

 # Extractor metadata
 EXTRACTOR_NAME = 'archive_org'
-OUTPUT_DIR = 'archive_org'
+OUTPUT_DIR = '.'
 OUTPUT_FILE = 'archive.org.txt'


--- a/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py
+++ b/archivebox/plugins/chrome_cleanup/on_Snapshot__45_chrome_cleanup.py
@@ -26,7 +26,7 @@ import rich_click as click

 # Extractor metadata
 EXTRACTOR_NAME = 'chrome_cleanup'
-CHROME_SESSION_DIR = 'chrome_session'
+CHROME_SESSION_DIR = '../chrome_session'


 def get_env(name: str, default: str = '') -> str:
--- a/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js
+++ b/archivebox/plugins/chrome_navigate/on_Snapshot__30_chrome_navigate.js
@@ -31,7 +31,7 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'chrome_navigate';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
--- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
+++ b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'consolelog';
-const OUTPUT_DIR = 'consolelog';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'console.jsonl';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -86,10 +86,7 @@ async function serializeArgs(args) {
 async function captureConsoleLogs(url) {
    const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;

-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Clear existing file
--- a/archivebox/plugins/dom/on_Snapshot__36_dom.js
+++ b/archivebox/plugins/dom/on_Snapshot__36_dom.js
@@ -24,9 +24,9 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'dom';
-const OUTPUT_DIR = 'dom';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'output.html';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -58,7 +58,7 @@ function getEnvInt(name, defaultValue = 0) {
 }

 // Check if staticfile extractor already downloaded this URL
-const STATICFILE_DIR = 'staticfile';
+const STATICFILE_DIR = '../staticfile';
 function hasStaticFileOutput() {
    return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
 }
@@ -114,10 +114,7 @@ async function dumpDom(url) {

    const { width, height } = parseResolution(resolution);

-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    let browser = null;
--- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
+++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
@@ -26,7 +26,7 @@ import rich_click as click

 # Extractor metadata
 EXTRACTOR_NAME = 'favicon'
-OUTPUT_DIR = 'favicon'
+OUTPUT_DIR = '.'
 OUTPUT_FILE = 'favicon.ico'


--- a/archivebox/plugins/git/on_Snapshot__12_git.py
+++ b/archivebox/plugins/git/on_Snapshot__12_git.py
@@ -26,7 +26,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'git'
 BIN_NAME = 'git'
 BIN_PROVIDERS = 'apt,brew,env'
-OUTPUT_DIR = 'repo'
+OUTPUT_DIR = '.'


 def get_env(name: str, default: str = '') -> str:
--- a/archivebox/plugins/headers/on_Snapshot__33_headers.js
+++ b/archivebox/plugins/headers/on_Snapshot__33_headers.js
@@ -22,9 +22,9 @@ const http = require('http');

 // Extractor metadata
 const EXTRACTOR_NAME = 'headers';
-const OUTPUT_DIR = 'headers';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'headers.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';
 const CHROME_HEADERS_FILE = 'response_headers.json';

 // Parse command line arguments
@@ -110,10 +110,7 @@ function fetchHeaders(url) {
 }

 async function extractHeaders(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Try Chrome session first
--- a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
+++ b/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
@@ -28,7 +28,7 @@ import rich_click as click

 # Extractor metadata
 EXTRACTOR_NAME = 'htmltotext'
-OUTPUT_DIR = 'htmltotext'
+OUTPUT_DIR = '.'
 OUTPUT_FILE = 'htmltotext.txt'


@@ -114,9 +114,8 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
    if not text or len(text) < 10:
        return False, None, 'No meaningful text extracted from HTML'

-    # Create output directory and write output
+    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / OUTPUT_FILE
    output_path.write_text(text, encoding='utf-8')

--- a/archivebox/plugins/media/on_Snapshot__51_media.py
+++ b/archivebox/plugins/media/on_Snapshot__51_media.py
@@ -39,7 +39,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'media'
 BIN_NAME = 'yt-dlp'
 BIN_PROVIDERS = 'pip,apt,brew,env'
-OUTPUT_DIR = 'media'
+OUTPUT_DIR = '.'


 def get_env(name: str, default: str = '') -> str:
@@ -62,7 +62,7 @@ def get_env_int(name: str, default: int = 0) -> int:
        return default


-STATICFILE_DIR = 'staticfile'
+STATICFILE_DIR = '../staticfile'

 def has_staticfile_output() -> bool:
    """Check if staticfile extractor already downloaded this URL."""
@@ -129,9 +129,8 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
    extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
    media_max_size = get_env('MEDIA_MAX_SIZE', '750m')

-    # Create output directory
+    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)

    # Build command (later options take precedence)
    cmd = [
--- a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py
+++ b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py
@@ -27,7 +27,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'mercury'
 BIN_NAME = 'postlight-parser'
 BIN_PROVIDERS = 'npm,env'
-OUTPUT_DIR = 'mercury'
+OUTPUT_DIR = '.'


 def get_env(name: str, default: str = '') -> str:
@@ -72,9 +72,8 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    timeout = get_env_int('TIMEOUT', 60)

-    # Create output directory
+    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)

    try:
        # Get text version
--- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
+++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
@@ -24,10 +24,10 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'parse_dom_outlinks';
-const OUTPUT_DIR = 'parse_dom_outlinks';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'outlinks.json';
 const URLS_FILE = 'urls.jsonl';  // For crawl system
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -64,10 +64,7 @@ function getCdpUrl() {

 // Extract outlinks
 async function extractOutlinks(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    let browser = null;
--- a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js
+++ b/archivebox/plugins/pdf/on_Snapshot__35_pdf.js
@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'pdf';
-const OUTPUT_DIR = 'pdf';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'output.pdf';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
 }

 // Check if staticfile extractor already downloaded this URL
-const STATICFILE_DIR = 'staticfile';
+const STATICFILE_DIR = '../staticfile';
 function hasStaticFileOutput() {
    return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
 }
@@ -113,10 +113,7 @@ async function printToPdf(url) {

    const { width, height } = parseResolution(resolution);

-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    let browser = null;
--- a/archivebox/plugins/readability/on_Snapshot__52_readability.py
+++ b/archivebox/plugins/readability/on_Snapshot__52_readability.py
@@ -29,7 +29,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'readability'
 BIN_NAME = 'readability-extractor'
 BIN_PROVIDERS = 'npm,env'
-OUTPUT_DIR = 'readability'
+OUTPUT_DIR = '.'


 def get_env(name: str, default: str = '') -> str:
@@ -101,9 +101,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
    if not html_source:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'

-    # Create output directory
+    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)

    try:
        # Run readability-extractor (outputs JSON by default)
--- a/archivebox/plugins/redirects/on_Snapshot__22_redirects.js
+++ b/archivebox/plugins/redirects/on_Snapshot__22_redirects.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'redirects';
-const OUTPUT_DIR = 'redirects';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'redirects.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {

 // Track redirect chain
 async function trackRedirects(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    let browser = null;
--- a/archivebox/plugins/responses/on_Snapshot__24_responses.js
+++ b/archivebox/plugins/responses/on_Snapshot__24_responses.js
@@ -26,8 +26,8 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'responses';
-const OUTPUT_DIR = 'responses';
-const CHROME_SESSION_DIR = 'chrome_session';
+const OUTPUT_DIR = '.';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Resource types to capture (by default, capture everything)
 const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
@@ -149,10 +149,8 @@ async function archiveResponses(originalUrl) {
    const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
    const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());

-    // Create output directories
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
+    // Create subdirectories for organizing responses
    const allDir = path.join(OUTPUT_DIR, 'all');
    if (!fs.existsSync(allDir)) {
        fs.mkdirSync(allDir, { recursive: true });
--- a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
+++ b/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'screenshot';
-const OUTPUT_DIR = 'screenshot';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'screenshot.png';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
 }

 // Check if staticfile extractor already downloaded this URL
-const STATICFILE_DIR = 'staticfile';
+const STATICFILE_DIR = '../staticfile';
 function hasStaticFileOutput() {
    return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
 }
@@ -116,10 +116,7 @@ async function takeScreenshot(url) {

    const { width, height } = parseResolution(resolution);

-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    let browser = null;
--- a/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py
+++ b/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Validation hook for ripgrep binary.
+
+Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'.
+Outputs JSONL for InstalledBinary and Machine config updates.
+"""
+
+import os
+import sys
+import json
+import shutil
+import hashlib
+import subprocess
+from pathlib import Path
+
+
+def get_binary_version(abspath: str) -> str | None:
+    """Run `<abspath> --version` and return the parsed version; None if it fails, prints nothing, or raises."""
+    try:
+        result = subprocess.run(
+            [abspath, '--version'],
+            capture_output=True,
+            text=True,
+            timeout=5,  # seconds; a hung binary raises and is handled by the except below
+        )
+        if result.returncode == 0 and result.stdout:
+            # ripgrep version string: "ripgrep 14.1.0" -> take the token after "ripgrep"
+            first_line = result.stdout.strip().split('\n')[0]
+            parts = first_line.split()
+            for i, part in enumerate(parts):
+                if part.lower() == 'ripgrep' and i + 1 < len(parts):
+                    return parts[i + 1]
+            # Otherwise accept the first token that starts with a digit and contains a dot
+            for part in parts:
+                if part[0].isdigit() and '.' in part:
+                    return part
+            return first_line[:32]  # last resort: first output line, capped at 32 chars
+    except Exception:
+        pass  # best-effort probe: any error falls through to None
+    return None
+
+
+def get_binary_hash(abspath: str) -> str | None:
+    """Return the hex SHA256 digest of the file at abspath, or None if it cannot be opened or read."""
+    try:
+        with open(abspath, 'rb') as f:
+            return hashlib.sha256(f.read()).hexdigest()  # whole file is read into memory at once
+    except Exception:
+        return None
+
+
+def find_ripgrep() -> dict | None:
+    """Locate ripgrep via RIPGREP_BINARY (a path or a name) or 'rg' on PATH; return its info dict or None."""
+    ripgrep_env = os.environ.get('RIPGREP_BINARY', '')
+    if ripgrep_env and '/' in ripgrep_env and Path(ripgrep_env).is_file():
+        # Env var is a path (contains a slash) to an existing file: use it directly
+        abspath = ripgrep_env
+    else:
+        # Otherwise treat the env var as a binary name, falling back to 'rg' on PATH
+        abspath = (shutil.which(ripgrep_env) if ripgrep_env else None) or shutil.which('rg')
+
+    if abspath and Path(abspath).is_file():
+        # Anchor relative results (e.g. RIPGREP_BINARY=./bin/rg): main() emits this as a Machine config value
+        abspath = os.path.abspath(abspath)
+        return {
+            'name': 'rg',
+            'abspath': abspath,
+            'version': get_binary_version(abspath),
+            'sha256': get_binary_hash(abspath),
+            'binprovider': 'env',
+        }
+
+    return None
+
+
+def main():
+    """Print JSONL for the ripgrep binary (or a Dependency request); exit 0 on success/no-op, 1 if not found."""
+
+    # Check if ripgrep search backend is enabled
+    search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').lower()
+
+    if search_backend != 'ripgrep':
+        # No-op: ripgrep is not the active search backend
+        sys.exit(0)
+
+    result = find_ripgrep()
+
+    if result and result.get('abspath'):
+        # Output InstalledBinary
+        print(json.dumps({
+            'type': 'InstalledBinary',
+            'name': result['name'],
+            'abspath': result['abspath'],
+            'version': result['version'],
+            'sha256': result['sha256'],
+            'binprovider': result['binprovider'],
+        }))
+
+        # Output Machine config update
+        print(json.dumps({
+            'type': 'Machine',
+            '_method': 'update',
+            'key': 'config/RIPGREP_BINARY',
+            'value': result['abspath'],
+        }))
+
+        if result['version']:  # version may be None when the --version probe failed
+            print(json.dumps({
+                'type': 'Machine',
+                '_method': 'update',
+                'key': 'config/RIPGREP_VERSION',
+                'value': result['version'],
+            }))
+
+        sys.exit(0)
+    else:
+        # Output Dependency request
+        print(json.dumps({
+            'type': 'Dependency',
+            'bin_name': 'rg',
+            'bin_providers': 'apt,brew,cargo,env',
+        }))
+
+        # Exit non-zero to indicate binary not found
+        print("ripgrep binary not found", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
--- a/archivebox/plugins/search_backend_ripgrep/tests/__init__.py
+++ b/archivebox/plugins/search_backend_ripgrep/tests/__init__.py
--- a/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
+++ b/archivebox/plugins/search_backend_ripgrep/tests/test_ripgrep_detection.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+"""
+Tests for ripgrep binary detection and archivebox install functionality.
+
+Guards against regressions in:
+1. Machine.config overrides not being used in version command
+2. Ripgrep hook not resolving binary names via shutil.which()
+3. SEARCH_BACKEND_ENGINE not being passed to hook environment
+"""
+
+import os
+import sys
+import json
+import shutil
+import tempfile
+import subprocess
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+
+def test_ripgrep_hook_detects_binary_from_path():
+    """Run the hook as a subprocess with RIPGREP_BINARY='rg' and check it resolves the name to a full path."""
+    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+
+    # Skip if rg is not installed
+    if not shutil.which('rg'):
+        pytest.skip("ripgrep (rg) not installed")
+
+    # Set SEARCH_BACKEND_ENGINE to enable the hook
+    env = os.environ.copy()
+    env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
+    env['RIPGREP_BINARY'] = 'rg'  # Just the name, not the full path (this was the bug)
+
+    result = subprocess.run(
+        [sys.executable, str(hook_path)],
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=10,
+    )
+
+    assert result.returncode == 0, f"Hook failed: {result.stderr}"
+
+    # Parse JSONL output: one JSON object per non-empty stdout line
+    lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
+    assert len(lines) >= 2, "Expected at least 2 JSONL lines (InstalledBinary + Machine config)"
+
+    installed_binary = json.loads(lines[0])
+    assert installed_binary['type'] == 'InstalledBinary'
+    assert installed_binary['name'] == 'rg'
+    assert '/' in installed_binary['abspath'], "Expected full path, not just binary name"
+    assert Path(installed_binary['abspath']).is_file(), "Binary path should exist"
+    assert installed_binary['version'], "Version should be detected"
+
+    machine_config = json.loads(lines[1])
+    assert machine_config['type'] == 'Machine'
+    assert machine_config['key'] == 'config/RIPGREP_BINARY'
+    assert '/' in machine_config['value'], "Machine config should store full path"
+
+
+def test_ripgrep_hook_skips_when_backend_not_ripgrep():
+    """Run the hook with SEARCH_BACKEND_ENGINE='sqlite' and check it exits 0 with empty stdout."""
+    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+
+    env = os.environ.copy()
+    env['SEARCH_BACKEND_ENGINE'] = 'sqlite'  # Different backend
+
+    result = subprocess.run(
+        [sys.executable, str(hook_path)],
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=10,
+    )
+
+    assert result.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
+    assert result.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"
+
+
+def test_ripgrep_hook_handles_absolute_path():
+    """Run the hook with RIPGREP_BINARY set to the path found by shutil.which and check it is echoed back."""
+    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
+
+    rg_path = shutil.which('rg')
+    if not rg_path:
+        pytest.skip("ripgrep (rg) not installed")
+
+    env = os.environ.copy()
+    env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
+    env['RIPGREP_BINARY'] = rg_path  # Full absolute path
+
+    result = subprocess.run(
+        [sys.executable, str(hook_path)],
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=10,
+    )
+
+    assert result.returncode == 0, f"Hook failed: {result.stderr}"
+    assert result.stdout.strip(), "Hook should produce output"
+
+    installed_binary = json.loads(result.stdout.strip().split('\n')[0])  # first JSONL record
+    assert installed_binary['abspath'] == rg_path
+
+
+@pytest.mark.django_db
+def test_machine_config_overrides_base_config():
+    """
+    Test that Machine.config overrides take precedence over base config.
+
+    Guards against regression where archivebox version was showing binaries
+    as "not installed" even though they were detected and stored in Machine.config.
+    """
+    from machine.models import Machine, InstalledBinary
+
+    machine = Machine.current()
+
+    # Simulate a hook detecting chrome and storing it with a different path than base config
+    detected_chrome_path = '/custom/path/to/chrome'
+    machine.config['CHROME_BINARY'] = detected_chrome_path
+    machine.config['CHROME_VERSION'] = '143.0.7499.170'
+    machine.save()
+
+    # Create InstalledBinary record
+    InstalledBinary.objects.create(
+        machine=machine,
+        name='chrome',
+        abspath=detected_chrome_path,
+        version='143.0.7499.170',
+        binprovider='env',
+    )
+
+    # Verify Machine.config takes precedence
+    from archivebox.config.configset import get_config
+    config = get_config()
+
+    # Machine.config should override the base config value
+    assert machine.config.get('CHROME_BINARY') == detected_chrome_path
+
+    # The version command should use Machine.config, not base config (which might hold just 'chromium').
+    # NOTE(review): the precedence is re-implemented inline below; the version command itself is not invoked.
+    bin_value = machine.config.get('CHROME_BINARY') or config.get('CHROME_BINARY', '')
+    assert bin_value == detected_chrome_path, \
+        "Machine.config override should take precedence over base config"
+
+
+@pytest.mark.django_db
+def test_search_backend_engine_passed_to_hooks():
+    """
+    Test that build_hook_environment() exposes SEARCH_BACKEND_ENGINE with the configured value.
+
+    Guards against regression where hooks couldn't determine which search backend was active.
+    """
+    from pathlib import Path  # NOTE(review): redundant; Path is imported at module level and unused in this test
+    from archivebox.hooks import build_hook_environment
+    from archivebox.config.configset import get_config
+
+    config = get_config()
+    search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')
+
+    env = build_hook_environment(overrides=None)
+
+    assert 'SEARCH_BACKEND_ENGINE' in env, \
+        "SEARCH_BACKEND_ENGINE must be in hook environment"
+    assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \
+        f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
+
+
+@pytest.mark.django_db
+def test_install_creates_installedbinary_records():
+    """
+    Test that an install-style crawl creates InstalledBinary records for detected binaries.
+
+    Integration test: ticks a queued Crawl through CrawlMachine and inspects the resulting DB rows.
+    """
+    from machine.models import Machine, InstalledBinary
+    from crawls.models import Seed, Crawl
+    from crawls.statemachines import CrawlMachine
+    from archivebox.base_models.models import get_or_create_system_user_pk
+
+    machine = Machine.current()
+    initial_binary_count = InstalledBinary.objects.filter(machine=machine).count()
+
+    # Create an install crawl (like archivebox install does)
+    created_by_id = get_or_create_system_user_pk()
+    seed, _ = Seed.objects.get_or_create(
+        uri='archivebox://test-install',
+        label='Test dependency detection',
+        created_by_id=created_by_id,
+        defaults={'extractor': 'auto'},
+    )
+
+    crawl = Crawl.objects.create(
+        seed=seed,
+        max_depth=0,
+        created_by_id=created_by_id,
+        status='queued',
+    )
+
+    # Run the crawl state machine (this triggers hooks)
+    sm = CrawlMachine(crawl)
+    sm.send('tick')  # queued -> started (runs hooks)
+
+    # Verify InstalledBinary records were created (count must grow relative to the baseline)
+    final_binary_count = InstalledBinary.objects.filter(machine=machine).count()
+    assert final_binary_count > initial_binary_count, \
+        "archivebox install should create InstalledBinary records"
+
+    # Verify at least some common binaries were detected (any one of them is enough)
+    common_binaries = ['git', 'wget', 'node']
+    detected = []
+    for bin_name in common_binaries:
+        if InstalledBinary.objects.filter(machine=machine, name=bin_name).exists():
+            detected.append(bin_name)
+
+    assert detected, f"At least one of {common_binaries} should be detected"
+
+    # Verify detected binaries have valid paths and versions
+    for binary in InstalledBinary.objects.filter(machine=machine):
+        if binary.abspath:  # Only check non-empty paths
+            assert '/' in binary.abspath, \
+                f"{binary.name} should have full path, not just name: {binary.abspath}"
+            # Version might be empty for some binaries, that's ok
+
+
+@pytest.mark.django_db
+def test_ripgrep_only_detected_when_backend_enabled():
+    """
+    Run an install-style crawl under each backend setting; 'rg' must be recorded only for 'ripgrep'.
+    Guards against ripgrep being installed/detected when not needed.
+    NOTE(review): patch() swaps configset.get_config only; confirm the hook environment is built from it.
+    """
+    from machine.models import Machine, InstalledBinary
+    from crawls.models import Seed, Crawl
+    from crawls.statemachines import CrawlMachine
+    from archivebox.base_models.models import get_or_create_system_user_pk
+    from django.conf import settings  # NOTE(review): not referenced anywhere in this test
+
+    if not shutil.which('rg'):
+        pytest.skip("ripgrep (rg) not installed")
+
+    machine = Machine.current()
+
+    # Clear any existing ripgrep records
+    InstalledBinary.objects.filter(machine=machine, name='rg').delete()
+
+    # Test 1: With ripgrep backend - should be detected
+    with patch('archivebox.config.configset.get_config') as mock_config:
+        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}
+
+        created_by_id = get_or_create_system_user_pk()  # also reused by the second scenario below
+        seed = Seed.objects.create(
+            uri='archivebox://test-rg-enabled',
+            label='Test ripgrep detection enabled',
+            created_by_id=created_by_id,
+            extractor='auto',
+        )
+
+        crawl = Crawl.objects.create(
+            seed=seed,
+            max_depth=0,
+            created_by_id=created_by_id,
+            status='queued',
+        )
+
+        sm = CrawlMachine(crawl)
+        sm.send('tick')
+
+        # Ripgrep should be detected
+        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
+        assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"
+
+    # Clear records again so the second scenario starts from a clean slate
+    InstalledBinary.objects.filter(machine=machine, name='rg').delete()
+
+    # Test 2: With different backend - should NOT be detected
+    with patch('archivebox.config.configset.get_config') as mock_config:
+        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}
+
+        seed2 = Seed.objects.create(
+            uri='archivebox://test-rg-disabled',
+            label='Test ripgrep detection disabled',
+            created_by_id=created_by_id,
+            extractor='auto',
+        )
+
+        crawl2 = Crawl.objects.create(
+            seed=seed2,
+            max_depth=0,
+            created_by_id=created_by_id,
+            status='queued',
+        )
+
+        sm2 = CrawlMachine(crawl2)
+        sm2.send('tick')
+
+        # Ripgrep should NOT be detected
+        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
+        assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"
+
+
+if __name__ == '__main__':  # allow running this test module directly as a script
+    pytest.main([__file__, '-v'])
--- a/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py
+++ b/archivebox/plugins/search_backend_sonic/on_Snapshot__91_index_sonic.py
@@ -29,7 +29,7 @@ import rich_click as click

 # Extractor metadata
 EXTRACTOR_NAME = 'index_sonic'
-OUTPUT_DIR = 'search_index'
+OUTPUT_DIR = '.'

 # Text file patterns to index
 INDEXABLE_FILES = [
--- a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py
+++ b/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py
@@ -27,7 +27,7 @@ import rich_click as click

 # Extractor metadata
 EXTRACTOR_NAME = 'index_sqlite'
-OUTPUT_DIR = 'search_index'
+OUTPUT_DIR = '.'

 # Text file patterns to index, in priority order
 INDEXABLE_FILES = [
--- a/archivebox/plugins/seo/on_Snapshot__38_seo.js
+++ b/archivebox/plugins/seo/on_Snapshot__38_seo.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'seo';
-const OUTPUT_DIR = 'seo';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'seo.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {

 // Extract SEO metadata
 async function extractSeo(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    let browser = null;
--- a/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
+++ b/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
@@ -40,7 +40,7 @@ const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
 const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');

-const OUTPUT_DIR = 'singlefile';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'singlefile.html';

 /**
@@ -102,8 +102,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
            .filter(fn => fn.endsWith('.html'))
    );

-    // Ensure output directory exists
-    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
+    // Output directory is current directory (hook already runs in output dir)
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
@@ -170,8 +169,7 @@ async function saveSinglefileWithCLI(url, options = {}) {
        return null;
    }

-    // Ensure output directory exists
-    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
+    // Output directory is current directory (hook already runs in output dir)
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Build command
--- a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
+++ b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
@@ -41,7 +41,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'singlefile'
 BIN_NAME = 'single-file'
 BIN_PROVIDERS = 'npm,env'
-OUTPUT_DIR = 'singlefile'
+OUTPUT_DIR = '.'
 OUTPUT_FILE = 'singlefile.html'


@@ -65,7 +65,7 @@ def get_env_int(name: str, default: int = 0) -> int:
        return default


-STATICFILE_DIR = 'staticfile'
+STATICFILE_DIR = '../staticfile'

 def has_staticfile_output() -> bool:
    """Check if staticfile extractor already downloaded this URL."""
@@ -135,7 +135,7 @@ def get_version(binary: str) -> str:
        return ''


-CHROME_SESSION_DIR = 'chrome_session'
+CHROME_SESSION_DIR = '../chrome_session'


 def get_cdp_url() -> str | None:
@@ -203,9 +203,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
    if extra_args:
        cmd.extend(extra_args.split())

-    # Create output directory
+    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
-    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / OUTPUT_FILE

    cmd.extend([url, str(output_path)])
@@ -274,7 +273,7 @@ def main(url: str, snapshot_id: str):
            sys.exit(1)

        version = get_version(binary)
-        cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
+        cmd_str = f'{binary} {url} {OUTPUT_FILE}'

        # Run extraction
        success, output, error = save_singlefile(url, binary)
--- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.js
+++ b/archivebox/plugins/ssl/on_Snapshot__23_ssl.js
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');

 // Extractor metadata
 const EXTRACTOR_NAME = 'ssl';
-const OUTPUT_DIR = 'ssl';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'ssl.json';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {

 // Extract SSL details
 async function extractSsl(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Only extract SSL for HTTPS URLs
--- a/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py
+++ b/archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py
@@ -31,8 +31,8 @@ import rich_click as click

 # Extractor metadata
 EXTRACTOR_NAME = 'staticfile'
-OUTPUT_DIR = 'staticfile'
-CHROME_SESSION_DIR = 'chrome_session'
+OUTPUT_DIR = '.'
+CHROME_SESSION_DIR = '../chrome_session'

 # Content-Types that indicate static files
 # These can't be meaningfully processed by Chrome-based extractors
@@ -214,9 +214,8 @@ def download_file(url: str) -> tuple[bool, str | None, str]:
        if content_length and int(content_length) > max_size:
            return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'

-        # Create output directory
+        # Output directory is current directory (hook already runs in output dir)
        output_dir = Path(OUTPUT_DIR)
-        output_dir.mkdir(exist_ok=True)

        # Determine filename
        filename = get_filename_from_url(url)
--- a/archivebox/plugins/title/on_Snapshot__32_title.js
+++ b/archivebox/plugins/title/on_Snapshot__32_title.js
@@ -21,9 +21,9 @@ const http = require('http');

 // Extractor metadata
 const EXTRACTOR_NAME = 'title';
-const OUTPUT_DIR = 'title';
+const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'title.txt';
-const CHROME_SESSION_DIR = 'chrome_session';
+const CHROME_SESSION_DIR = '../chrome_session';

 // Parse command line arguments
 function parseArgs() {
@@ -162,10 +162,7 @@ async function getTitleFromCdp(cdpUrl) {
 }

 async function extractTitle(url) {
-    // Create output directory
-    if (!fs.existsSync(OUTPUT_DIR)) {
-        fs.mkdirSync(OUTPUT_DIR, { recursive: true });
-    }
+    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Try Chrome session first
--- a/archivebox/plugins/wget/on_Snapshot__50_wget.py
+++ b/archivebox/plugins/wget/on_Snapshot__50_wget.py
@@ -43,7 +43,7 @@ import rich_click as click
 EXTRACTOR_NAME = 'wget'
 BIN_NAME = 'wget'
 BIN_PROVIDERS = 'apt,brew,env'
-OUTPUT_DIR = 'wget'
+OUTPUT_DIR = '.'


 def get_env(name: str, default: str = '') -> str:
@@ -66,7 +66,7 @@ def get_env_int(name: str, default: int = 0) -> int:
        return default


-STATICFILE_DIR = 'staticfile'
+STATICFILE_DIR = '../staticfile'

 def has_staticfile_output() -> bool:
    """Check if staticfile extractor already downloaded this URL."""