logging and admin ui improvements

This commit is contained in:
Nick Sweeting
2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'accessibility';
const OUTPUT_DIR = 'accessibility';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'accessibility.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract accessibility info
async function extractAccessibility(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -24,7 +24,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'archive_org'
OUTPUT_DIR = 'archive_org'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'archive.org.txt'

View File

@@ -26,7 +26,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'chrome_cleanup'
CHROME_SESSION_DIR = 'chrome_session'
CHROME_SESSION_DIR = '../chrome_session'
def get_env(name: str, default: str = '') -> str:

View File

@@ -31,7 +31,7 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'consolelog';
const OUTPUT_DIR = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -86,10 +86,7 @@ async function serializeArgs(args) {
async function captureConsoleLogs(url) {
const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Clear existing file

View File

@@ -24,9 +24,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'dom';
const OUTPUT_DIR = 'dom';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.html';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -58,7 +58,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -114,10 +114,7 @@ async function dumpDom(url) {
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -26,7 +26,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'favicon'
OUTPUT_DIR = 'favicon'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'favicon.ico'

View File

@@ -26,7 +26,7 @@ import rich_click as click
EXTRACTOR_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'repo'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:

View File

@@ -22,9 +22,9 @@ const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'headers';
const OUTPUT_DIR = 'headers';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'headers.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
const CHROME_HEADERS_FILE = 'response_headers.json';
// Parse command line arguments
@@ -110,10 +110,7 @@ function fetchHeaders(url) {
}
async function extractHeaders(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Try Chrome session first

View File

@@ -28,7 +28,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'htmltotext'
OUTPUT_DIR = 'htmltotext'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'htmltotext.txt'
@@ -114,9 +114,8 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
if not text or len(text) < 10:
return False, None, 'No meaningful text extracted from HTML'
# Create output directory and write output
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
output_path = output_dir / OUTPUT_FILE
output_path.write_text(text, encoding='utf-8')

View File

@@ -39,7 +39,7 @@ import rich_click as click
EXTRACTOR_NAME = 'media'
BIN_NAME = 'yt-dlp'
BIN_PROVIDERS = 'pip,apt,brew,env'
OUTPUT_DIR = 'media'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -62,7 +62,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
@@ -129,9 +129,8 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
# Build command (later options take precedence)
cmd = [

View File

@@ -27,7 +27,7 @@ import rich_click as click
EXTRACTOR_NAME = 'mercury'
BIN_NAME = 'postlight-parser'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'mercury'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -72,9 +72,8 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
timeout = get_env_int('TIMEOUT', 60)
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
try:
# Get text version

View File

@@ -24,10 +24,10 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'parse_dom_outlinks';
const OUTPUT_DIR = 'parse_dom_outlinks';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'outlinks.json';
const URLS_FILE = 'urls.jsonl'; // For crawl system
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -64,10 +64,7 @@ function getCdpUrl() {
// Extract outlinks
async function extractOutlinks(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'pdf';
const OUTPUT_DIR = 'pdf';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.pdf';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -113,10 +113,7 @@ async function printToPdf(url) {
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -29,7 +29,7 @@ import rich_click as click
EXTRACTOR_NAME = 'readability'
BIN_NAME = 'readability-extractor'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'readability'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -101,9 +101,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
if not html_source:
return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
try:
# Run readability-extractor (outputs JSON by default)

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'redirects';
const OUTPUT_DIR = 'redirects';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'redirects.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Track redirect chain
async function trackRedirects(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -26,8 +26,8 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'responses';
const OUTPUT_DIR = 'responses';
const CHROME_SESSION_DIR = 'chrome_session';
const OUTPUT_DIR = '.';
const CHROME_SESSION_DIR = '../chrome_session';
// Resource types to capture (by default, capture everything)
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
@@ -149,10 +149,8 @@ async function archiveResponses(originalUrl) {
const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
// Create output directories
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
// Create subdirectories for organizing responses
const allDir = path.join(OUTPUT_DIR, 'all');
if (!fs.existsSync(allDir)) {
fs.mkdirSync(allDir, { recursive: true });

View File

@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'screenshot';
const OUTPUT_DIR = 'screenshot';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'screenshot.png';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
const STATICFILE_DIR = '../staticfile';
function hasStaticFileOutput() {
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
}
@@ -116,10 +116,7 @@ async function takeScreenshot(url) {
const { width, height } = parseResolution(resolution);
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Validation hook for ripgrep binary.
Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from ripgrep binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# ripgrep version string: "ripgrep 14.1.0"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
for i, part in enumerate(parts):
if part.lower() == 'ripgrep' and i + 1 < len(parts):
return parts[i + 1]
# Try to find version number pattern
for part in parts:
if part[0].isdigit() and '.' in part:
return part
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_ripgrep() -> dict | None:
    """Locate the ripgrep binary and describe it.

    Resolution order:
      1. RIPGREP_BINARY env var containing a '/' and pointing at an
         existing file -> used verbatim.
      2. RIPGREP_BINARY env var treated as a bare binary name, resolved
         on PATH via shutil.which().
      3. Plain 'rg' resolved on PATH.

    Returns a dict with name/abspath/version/sha256/binprovider, or None
    when no usable binary was found.
    """
    configured = os.environ.get('RIPGREP_BINARY', '')

    if configured and '/' in configured and Path(configured).is_file():
        # Explicit path supplied via env var - trust it as-is
        abspath = configured
    else:
        # Treat the env var (if any) as a binary name and search PATH,
        # falling back to the default name 'rg'
        abspath = shutil.which(configured) if configured else None
        if not abspath:
            abspath = shutil.which('rg')

    if not (abspath and Path(abspath).is_file()):
        return None

    return {
        'name': 'rg',
        'abspath': abspath,
        'version': get_binary_version(abspath),
        'sha256': get_binary_hash(abspath),
        'binprovider': 'env',
    }
def main():
    """Validate the ripgrep binary and emit JSONL records on stdout.

    Behavior:
      - Exits 0 silently when SEARCH_BACKEND_ENGINE != 'ripgrep' (this
        hook is a no-op for every other search backend).
      - On success: prints an InstalledBinary record, then a Machine
        config update for RIPGREP_BINARY, then (when a version was
        detected) one for RIPGREP_VERSION, and exits 0.
        NOTE: downstream consumers parse these lines positionally, so the
        InstalledBinary-before-Machine ordering must be preserved.
      - On failure: prints a Dependency request record, logs to stderr,
        and exits 1 so the caller knows the binary is missing.
    """
    # Check if ripgrep search backend is enabled
    search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').lower()
    if search_backend != 'ripgrep':
        # No-op: ripgrep is not the active search backend
        sys.exit(0)
    result = find_ripgrep()
    if result and result.get('abspath'):
        # Output InstalledBinary record describing the resolved binary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Output Machine config update so future runs use the resolved absolute path
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/RIPGREP_BINARY',
            'value': result['abspath'],
        }))
        # Only record a version when detection succeeded (may be None)
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/RIPGREP_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Output Dependency request so an installer/provider can supply the binary
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'rg',
            'bin_providers': 'apt,brew,cargo,env',
        }))
        # Exit non-zero to indicate binary not found
        print(f"ripgrep binary not found", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
Tests for ripgrep binary detection and archivebox install functionality.
Guards against regressions in:
1. Machine.config overrides not being used in version command
2. Ripgrep hook not resolving binary names via shutil.which()
3. SEARCH_BACKEND_ENGINE not being passed to hook environment
"""
import os
import sys
import json
import shutil
import tempfile
import subprocess
from pathlib import Path
from unittest.mock import patch, MagicMock
import pytest
def test_ripgrep_hook_detects_binary_from_path():
    """Test that ripgrep hook finds binary using shutil.which() when env var is just a name.

    Regression guard: RIPGREP_BINARY='rg' (a bare name with no slash) must
    be resolved to an absolute path via PATH lookup instead of being
    rejected or echoed back unresolved.
    """
    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
    # Skip if rg is not installed
    if not shutil.which('rg'):
        pytest.skip("ripgrep (rg) not installed")
    # Set SEARCH_BACKEND_ENGINE to enable the hook (it no-ops otherwise)
    env = os.environ.copy()
    env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
    env['RIPGREP_BINARY'] = 'rg'  # Just the name, not the full path (this was the bug)
    result = subprocess.run(
        [sys.executable, str(hook_path)],
        capture_output=True,
        text=True,
        env=env,
        timeout=10,
    )
    assert result.returncode == 0, f"Hook failed: {result.stderr}"
    # Parse JSONL output: hook emits InstalledBinary first, then Machine config updates
    lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
    assert len(lines) >= 2, "Expected at least 2 JSONL lines (InstalledBinary + Machine config)"
    installed_binary = json.loads(lines[0])
    assert installed_binary['type'] == 'InstalledBinary'
    assert installed_binary['name'] == 'rg'
    # The bare name must have been resolved to an existing absolute path
    assert '/' in installed_binary['abspath'], "Expected full path, not just binary name"
    assert Path(installed_binary['abspath']).is_file(), "Binary path should exist"
    assert installed_binary['version'], "Version should be detected"
    machine_config = json.loads(lines[1])
    assert machine_config['type'] == 'Machine'
    assert machine_config['key'] == 'config/RIPGREP_BINARY'
    assert '/' in machine_config['value'], "Machine config should store full path"
def test_ripgrep_hook_skips_when_backend_not_ripgrep():
    """The hook must be a silent no-op when the active search backend is not ripgrep."""
    script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
    # Any backend other than 'ripgrep' should make the hook exit 0 with no output
    hook_env = {**os.environ, 'SEARCH_BACKEND_ENGINE': 'sqlite'}
    proc = subprocess.run(
        [sys.executable, str(script)],
        capture_output=True,
        text=True,
        env=hook_env,
        timeout=10,
    )
    assert proc.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
    assert proc.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"
def test_ripgrep_hook_handles_absolute_path():
    """Test that ripgrep hook works when RIPGREP_BINARY is an absolute path.

    The hook should accept the given path verbatim (no PATH resolution)
    and report it back unchanged in the InstalledBinary record.
    """
    hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
    rg_path = shutil.which('rg')
    if not rg_path:
        pytest.skip("ripgrep (rg) not installed")
    env = os.environ.copy()
    env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
    env['RIPGREP_BINARY'] = rg_path  # Full absolute path
    result = subprocess.run(
        [sys.executable, str(hook_path)],
        capture_output=True,
        text=True,
        env=env,
        timeout=10,
    )
    assert result.returncode == 0, f"Hook failed: {result.stderr}"
    assert result.stdout.strip(), "Hook should produce output"
    # First JSONL line is the InstalledBinary record; abspath must round-trip exactly
    installed_binary = json.loads(result.stdout.strip().split('\n')[0])
    assert installed_binary['abspath'] == rg_path
@pytest.mark.django_db
def test_machine_config_overrides_base_config():
    """
    Test that Machine.config overrides take precedence over base config.
    Guards against regression where archivebox version was showing binaries
    as "not installed" even though they were detected and stored in Machine.config.
    """
    from machine.models import Machine, InstalledBinary
    machine = Machine.current()
    # Simulate a hook detecting chrome and storing it with a different path than base config
    detected_chrome_path = '/custom/path/to/chrome'
    machine.config['CHROME_BINARY'] = detected_chrome_path
    machine.config['CHROME_VERSION'] = '143.0.7499.170'
    machine.save()
    # Create InstalledBinary record mirroring what a detection hook would persist
    InstalledBinary.objects.create(
        machine=machine,
        name='chrome',
        abspath=detected_chrome_path,
        version='143.0.7499.170',
        binprovider='env',
    )
    # Verify Machine.config takes precedence
    from archivebox.config.configset import get_config
    config = get_config()
    # Machine.config should override the base config value
    assert machine.config.get('CHROME_BINARY') == detected_chrome_path
    # The version command should use Machine.config, not base config
    # (Base config might have 'chromium' while Machine.config has the full path)
    # NOTE(review): this assertion re-implements the precedence rule locally
    # (`machine.config.get(...) or config.get(...)`) rather than invoking the
    # actual version command, so it only guards the intended lookup order,
    # not the command's real code path — consider an integration check too.
    bin_value = machine.config.get('CHROME_BINARY') or config.get('CHROME_BINARY', '')
    assert bin_value == detected_chrome_path, \
        "Machine.config override should take precedence over base config"
@pytest.mark.django_db
def test_search_backend_engine_passed_to_hooks():
    """
    Test that SEARCH_BACKEND_ENGINE is passed to hook environment.
    Guards against regression where hooks couldn't determine which search backend was active.
    """
    from pathlib import Path  # NOTE(review): unused in this test body — candidate for removal
    from archivebox.hooks import build_hook_environment
    from archivebox.config.configset import get_config
    config = get_config()
    # Default mirrors the project default backend ('ripgrep') when unset
    search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')
    env = build_hook_environment(overrides=None)
    assert 'SEARCH_BACKEND_ENGINE' in env, \
        "SEARCH_BACKEND_ENGINE must be in hook environment"
    # The hook env value must match whatever the config currently resolves to
    assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \
        f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
@pytest.mark.django_db
def test_install_creates_installedbinary_records():
    """
    Test that archivebox install creates InstalledBinary records for detected binaries.
    This is an integration test that verifies the full install flow.

    NOTE(review): results depend on which binaries exist on the CI host;
    the common-binaries assertion below tolerates partial availability.
    """
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk
    machine = Machine.current()
    initial_binary_count = InstalledBinary.objects.filter(machine=machine).count()
    # Create an install crawl (like archivebox install does)
    created_by_id = get_or_create_system_user_pk()
    seed, _ = Seed.objects.get_or_create(
        uri='archivebox://test-install',
        label='Test dependency detection',
        created_by_id=created_by_id,
        defaults={'extractor': 'auto'},
    )
    crawl = Crawl.objects.create(
        seed=seed,
        max_depth=0,
        created_by_id=created_by_id,
        status='queued',
    )
    # Run the crawl state machine (this triggers hooks)
    sm = CrawlMachine(crawl)
    sm.send('tick')  # queued -> started (runs hooks)
    # Verify InstalledBinary records were created
    final_binary_count = InstalledBinary.objects.filter(machine=machine).count()
    assert final_binary_count > initial_binary_count, \
        "archivebox install should create InstalledBinary records"
    # Verify at least some common binaries were detected (any one suffices)
    common_binaries = ['git', 'wget', 'node']
    detected = []
    for bin_name in common_binaries:
        if InstalledBinary.objects.filter(machine=machine, name=bin_name).exists():
            detected.append(bin_name)
    assert detected, f"At least one of {common_binaries} should be detected"
    # Verify detected binaries have valid paths and versions
    for binary in InstalledBinary.objects.filter(machine=machine):
        if binary.abspath:  # Only check non-empty paths
            assert '/' in binary.abspath, \
                f"{binary.name} should have full path, not just name: {binary.abspath}"
            # Version might be empty for some binaries, that's ok
@pytest.mark.django_db
def test_ripgrep_only_detected_when_backend_enabled():
    """
    Test that ripgrep is only detected when SEARCH_BACKEND_ENGINE='ripgrep'.
    Guards against ripgrep being installed/detected when not needed.

    NOTE(review): patching `archivebox.config.configset.get_config` affects
    only in-process callers; if the crawl hooks run as subprocesses with an
    environment built elsewhere, this mock may not reach them — confirm the
    hook execution path actually consults the patched function.
    """
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk
    from django.conf import settings  # NOTE(review): unused — candidate for removal
    if not shutil.which('rg'):
        pytest.skip("ripgrep (rg) not installed")
    machine = Machine.current()
    # Clear any existing ripgrep records so detection state starts clean
    InstalledBinary.objects.filter(machine=machine, name='rg').delete()
    # Test 1: With ripgrep backend - should be detected
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}
        created_by_id = get_or_create_system_user_pk()
        seed = Seed.objects.create(
            uri='archivebox://test-rg-enabled',
            label='Test ripgrep detection enabled',
            created_by_id=created_by_id,
            extractor='auto',
        )
        crawl = Crawl.objects.create(
            seed=seed,
            max_depth=0,
            created_by_id=created_by_id,
            status='queued',
        )
        sm = CrawlMachine(crawl)
        sm.send('tick')
        # Ripgrep should be detected
        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
        assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"
    # Clear records again before the negative case
    InstalledBinary.objects.filter(machine=machine, name='rg').delete()
    # Test 2: With different backend - should NOT be detected
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}
        # created_by_id is reused from the first block (same function scope)
        seed2 = Seed.objects.create(
            uri='archivebox://test-rg-disabled',
            label='Test ripgrep detection disabled',
            created_by_id=created_by_id,
            extractor='auto',
        )
        crawl2 = Crawl.objects.create(
            seed=seed2,
            max_depth=0,
            created_by_id=created_by_id,
            status='queued',
        )
        sm2 = CrawlMachine(crawl2)
        sm2.send('tick')
        # Ripgrep should NOT be detected
        rg_detected = InstalledBinary.objects.filter(machine=machine, name='rg').exists()
        assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -29,7 +29,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sonic'
OUTPUT_DIR = 'search_index'
OUTPUT_DIR = '.'
# Text file patterns to index
INDEXABLE_FILES = [

View File

@@ -27,7 +27,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sqlite'
OUTPUT_DIR = 'search_index'
OUTPUT_DIR = '.'
# Text file patterns to index, in priority order
INDEXABLE_FILES = [

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'seo';
const OUTPUT_DIR = 'seo';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'seo.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract SEO metadata
async function extractSeo(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
let browser = null;

View File

@@ -40,7 +40,7 @@ const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
const OUTPUT_DIR = 'singlefile';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'singlefile.html';
/**
@@ -102,8 +102,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
.filter(fn => fn.endsWith('.html'))
);
// Ensure output directory exists
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
@@ -170,8 +169,7 @@ async function saveSinglefileWithCLI(url, options = {}) {
return null;
}
// Ensure output directory exists
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
// Output directory is current directory (hook already runs in output dir)
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Build command

View File

@@ -41,7 +41,7 @@ import rich_click as click
EXTRACTOR_NAME = 'singlefile'
BIN_NAME = 'single-file'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'singlefile'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'singlefile.html'
@@ -65,7 +65,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
@@ -135,7 +135,7 @@ def get_version(binary: str) -> str:
return ''
CHROME_SESSION_DIR = 'chrome_session'
CHROME_SESSION_DIR = '../chrome_session'
def get_cdp_url() -> str | None:
@@ -203,9 +203,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
if extra_args:
cmd.extend(extra_args.split())
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
output_path = output_dir / OUTPUT_FILE
cmd.extend([url, str(output_path)])
@@ -274,7 +273,7 @@ def main(url: str, snapshot_id: str):
sys.exit(1)
version = get_version(binary)
cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
cmd_str = f'{binary} {url} {OUTPUT_FILE}'
# Run extraction
success, output, error = save_singlefile(url, binary)

View File

@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'ssl';
const OUTPUT_DIR = 'ssl';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -60,10 +60,7 @@ function getCdpUrl() {
// Extract SSL details
async function extractSsl(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Only extract SSL for HTTPS URLs

View File

@@ -31,8 +31,8 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'staticfile'
OUTPUT_DIR = 'staticfile'
CHROME_SESSION_DIR = 'chrome_session'
OUTPUT_DIR = '.'
CHROME_SESSION_DIR = '../chrome_session'
# Content-Types that indicate static files
# These can't be meaningfully processed by Chrome-based extractors
@@ -214,9 +214,8 @@ def download_file(url: str) -> tuple[bool, str | None, str]:
if content_length and int(content_length) > max_size:
return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'
# Create output directory
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True)
# Determine filename
filename = get_filename_from_url(url)

View File

@@ -21,9 +21,9 @@ const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'title';
const OUTPUT_DIR = 'title';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'title.txt';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_SESSION_DIR = '../chrome_session';
// Parse command line arguments
function parseArgs() {
@@ -162,10 +162,7 @@ async function getTitleFromCdp(cdpUrl) {
}
async function extractTitle(url) {
// Create output directory
if (!fs.existsSync(OUTPUT_DIR)) {
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
}
// Output directory is current directory (hook already runs in output dir)
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
// Try Chrome session first

View File

@@ -43,7 +43,7 @@ import rich_click as click
EXTRACTOR_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'wget'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -66,7 +66,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
STATICFILE_DIR = 'staticfile'
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""