mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 23:07:56 +10:00
logging and admin ui improvements
This commit is contained in:
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'accessibility';
|
||||
const OUTPUT_DIR = 'accessibility';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'accessibility.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -60,10 +60,7 @@ function getCdpUrl() {
|
||||
|
||||
// Extract accessibility info
|
||||
async function extractAccessibility(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -24,7 +24,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'archive_org'
|
||||
OUTPUT_DIR = 'archive_org'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'archive.org.txt'
|
||||
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'chrome_cleanup'
|
||||
CHROME_SESSION_DIR = 'chrome_session'
|
||||
CHROME_SESSION_DIR = '../chrome_session'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
|
||||
@@ -31,7 +31,7 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'chrome_navigate';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
|
||||
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'consolelog';
|
||||
const OUTPUT_DIR = 'consolelog';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'console.jsonl';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -86,10 +86,7 @@ async function serializeArgs(args) {
|
||||
async function captureConsoleLogs(url) {
|
||||
const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Clear existing file
|
||||
|
||||
@@ -24,9 +24,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'dom';
|
||||
const OUTPUT_DIR = 'dom';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'output.html';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -58,7 +58,7 @@ function getEnvInt(name, defaultValue = 0) {
|
||||
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = 'staticfile';
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
|
||||
}
|
||||
@@ -114,10 +114,7 @@ async function dumpDom(url) {
|
||||
|
||||
const { width, height } = parseResolution(resolution);
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -26,7 +26,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'favicon'
|
||||
OUTPUT_DIR = 'favicon'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'favicon.ico'
|
||||
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'git'
|
||||
BIN_NAME = 'git'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = 'repo'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
|
||||
@@ -22,9 +22,9 @@ const http = require('http');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'headers';
|
||||
const OUTPUT_DIR = 'headers';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'headers.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
const CHROME_HEADERS_FILE = 'response_headers.json';
|
||||
|
||||
// Parse command line arguments
|
||||
@@ -110,10 +110,7 @@ function fetchHeaders(url) {
|
||||
}
|
||||
|
||||
async function extractHeaders(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Try Chrome session first
|
||||
|
||||
@@ -28,7 +28,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'htmltotext'
|
||||
OUTPUT_DIR = 'htmltotext'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'htmltotext.txt'
|
||||
|
||||
|
||||
@@ -114,9 +114,8 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
|
||||
if not text or len(text) < 10:
|
||||
return False, None, 'No meaningful text extracted from HTML'
|
||||
|
||||
# Create output directory and write output
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
output_path = output_dir / OUTPUT_FILE
|
||||
output_path.write_text(text, encoding='utf-8')
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'media'
|
||||
BIN_NAME = 'yt-dlp'
|
||||
BIN_PROVIDERS = 'pip,apt,brew,env'
|
||||
OUTPUT_DIR = 'media'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -62,7 +62,7 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
STATICFILE_DIR = 'staticfile'
|
||||
STATICFILE_DIR = '../staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
"""Check if staticfile extractor already downloaded this URL."""
|
||||
@@ -129,9 +129,8 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
|
||||
media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
|
||||
|
||||
# Create output directory
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Build command (later options take precedence)
|
||||
cmd = [
|
||||
|
||||
@@ -27,7 +27,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'mercury'
|
||||
BIN_NAME = 'postlight-parser'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'mercury'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -72,9 +72,8 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
timeout = get_env_int('TIMEOUT', 60)
|
||||
|
||||
# Create output directory
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
# Get text version
|
||||
|
||||
@@ -24,10 +24,10 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'parse_dom_outlinks';
|
||||
const OUTPUT_DIR = 'parse_dom_outlinks';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'outlinks.json';
|
||||
const URLS_FILE = 'urls.jsonl'; // For crawl system
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -64,10 +64,7 @@ function getCdpUrl() {
|
||||
|
||||
// Extract outlinks
|
||||
async function extractOutlinks(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'pdf';
|
||||
const OUTPUT_DIR = 'pdf';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'output.pdf';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
|
||||
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = 'staticfile';
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
|
||||
}
|
||||
@@ -113,10 +113,7 @@ async function printToPdf(url) {
|
||||
|
||||
const { width, height } = parseResolution(resolution);
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -29,7 +29,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'readability'
|
||||
BIN_NAME = 'readability-extractor'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'readability'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -101,9 +101,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
if not html_source:
|
||||
return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
|
||||
|
||||
# Create output directory
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
# Run readability-extractor (outputs JSON by default)
|
||||
|
||||
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'redirects';
|
||||
const OUTPUT_DIR = 'redirects';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'redirects.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -60,10 +60,7 @@ function getCdpUrl() {
|
||||
|
||||
// Track redirect chain
|
||||
async function trackRedirects(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -26,8 +26,8 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'responses';
|
||||
const OUTPUT_DIR = 'responses';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const OUTPUT_DIR = '.';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Resource types to capture (by default, capture everything)
|
||||
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
|
||||
@@ -149,10 +149,8 @@ async function archiveResponses(originalUrl) {
|
||||
const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
|
||||
const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
|
||||
|
||||
// Create output directories
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
// Create subdirectories for organizing responses
|
||||
const allDir = path.join(OUTPUT_DIR, 'all');
|
||||
if (!fs.existsSync(allDir)) {
|
||||
fs.mkdirSync(allDir, { recursive: true });
|
||||
|
||||
@@ -23,9 +23,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'screenshot';
|
||||
const OUTPUT_DIR = 'screenshot';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'screenshot.png';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -57,7 +57,7 @@ function getEnvInt(name, defaultValue = 0) {
|
||||
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = 'staticfile';
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
return fs.existsSync(STATICFILE_DIR) && fs.readdirSync(STATICFILE_DIR).length > 0;
|
||||
}
|
||||
@@ -116,10 +116,7 @@ async function takeScreenshot(url) {
|
||||
|
||||
const { width, height } = parseResolution(resolution);
|
||||
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
131
archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py
Executable file
131
archivebox/plugins/search_backend_ripgrep/on_Crawl__00_validate_ripgrep.py
Executable file
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for ripgrep binary.
|
||||
|
||||
Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from ripgrep binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# ripgrep version string: "ripgrep 14.1.0"
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
parts = first_line.split()
|
||||
for i, part in enumerate(parts):
|
||||
if part.lower() == 'ripgrep' and i + 1 < len(parts):
|
||||
return parts[i + 1]
|
||||
# Try to find version number pattern
|
||||
for part in parts:
|
||||
if part[0].isdigit() and '.' in part:
|
||||
return part
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_ripgrep() -> dict | None:
|
||||
"""Find ripgrep binary using shutil.which or env var."""
|
||||
# Check env var first - if it's an absolute path and exists, use it
|
||||
ripgrep_env = os.environ.get('RIPGREP_BINARY', '')
|
||||
if ripgrep_env and '/' in ripgrep_env and Path(ripgrep_env).is_file():
|
||||
abspath = ripgrep_env
|
||||
else:
|
||||
# Otherwise try shutil.which with the env var as the binary name
|
||||
abspath = shutil.which(ripgrep_env) if ripgrep_env else None
|
||||
if not abspath:
|
||||
abspath = shutil.which('rg')
|
||||
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'rg',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _emit_record(record: dict) -> None:
    """Print one record as a single JSON line on stdout."""
    print(json.dumps(record))


def main():
    """Validate the ripgrep binary and report the result as JSONL.

    Does nothing (exit code 0, no output) unless ``SEARCH_BACKEND_ENGINE``
    is ``ripgrep`` (case-insensitive, surrounding whitespace ignored).

    When ripgrep is found, prints an ``InstalledBinary`` record and
    ``Machine`` config updates for ``config/RIPGREP_BINARY`` and, if a
    version was detected, ``config/RIPGREP_VERSION``, then exits 0.

    When ripgrep is not found, prints a ``Dependency`` record, writes a
    message to stderr and exits 1.
    """
    search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').strip().lower()
    if search_backend != 'ripgrep':
        # No-op: ripgrep is not the active search backend
        sys.exit(0)

    result = find_ripgrep()

    if not (result and result.get('abspath')):
        # Ask for the dependency, then signal failure to the caller.
        _emit_record({
            'type': 'Dependency',
            'bin_name': 'rg',
            'bin_providers': 'apt,brew,cargo,env',
        })
        print("ripgrep binary not found", file=sys.stderr)
        sys.exit(1)

    _emit_record({
        'type': 'InstalledBinary',
        'name': result['name'],
        'abspath': result['abspath'],
        'version': result['version'],
        'sha256': result['sha256'],
        'binprovider': result['binprovider'],
    })

    _emit_record({
        'type': 'Machine',
        '_method': 'update',
        'key': 'config/RIPGREP_BINARY',
        'value': result['abspath'],
    })

    # The version may be unknown; only record it when detection succeeded.
    if result['version']:
        _emit_record({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/RIPGREP_VERSION',
            'value': result['version'],
        })

    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,306 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for ripgrep binary detection and archivebox install functionality.
|
||||
|
||||
Guards against regressions in:
|
||||
1. Machine.config overrides not being used in version command
|
||||
2. Ripgrep hook not resolving binary names via shutil.which()
|
||||
3. SEARCH_BACKEND_ENGINE not being passed to hook environment
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_ripgrep_hook_detects_binary_from_path():
    """A bare ``RIPGREP_BINARY`` name must be resolved to a full path by the hook."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    # Nothing to detect on machines without ripgrep.
    if not shutil.which('rg'):
        pytest.skip("ripgrep (rg) not installed")

    # Enable the hook and hand it only the command name, not a path
    # (the configuration that used to break detection).
    hook_env = os.environ.copy()
    hook_env.update({
        'SEARCH_BACKEND_ENGINE': 'ripgrep',
        'RIPGREP_BINARY': 'rg',
    })

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        env=hook_env,
        timeout=10,
        text=True,
        capture_output=True,
    )
    assert proc.returncode == 0, f"Hook failed: {proc.stderr}"

    # One JSON document per non-blank stdout line.
    jsonl = [raw for raw in proc.stdout.strip().split('\n') if raw.strip()]
    assert len(jsonl) >= 2, "Expected at least 2 JSONL lines (InstalledBinary + Machine config)"

    first = json.loads(jsonl[0])
    assert first['type'] == 'InstalledBinary'
    assert first['name'] == 'rg'
    assert '/' in first['abspath'], "Expected full path, not just binary name"
    assert Path(first['abspath']).is_file(), "Binary path should exist"
    assert first['version'], "Version should be detected"

    second = json.loads(jsonl[1])
    assert second['type'] == 'Machine'
    assert second['key'] == 'config/RIPGREP_BINARY'
    assert '/' in second['value'], "Machine config should store full path"
|
||||
|
||||
|
||||
def test_ripgrep_hook_skips_when_backend_not_ripgrep():
    """With a non-ripgrep search backend the hook must exit 0 and print nothing."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    # Select some other backend so the hook takes its no-op path.
    hook_env = {**os.environ, 'SEARCH_BACKEND_ENGINE': 'sqlite'}

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        env=hook_env,
        timeout=10,
        text=True,
        capture_output=True,
    )

    assert proc.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
    assert proc.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"
|
||||
|
||||
|
||||
def test_ripgrep_hook_handles_absolute_path():
    """An absolute ``RIPGREP_BINARY`` must be reported back unchanged by the hook."""
    hook_script = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

    located_rg = shutil.which('rg')
    if not located_rg:
        pytest.skip("ripgrep (rg) not installed")

    # Point the hook straight at the resolved binary location.
    hook_env = os.environ.copy()
    hook_env.update({
        'SEARCH_BACKEND_ENGINE': 'ripgrep',
        'RIPGREP_BINARY': located_rg,
    })

    proc = subprocess.run(
        [sys.executable, str(hook_script)],
        env=hook_env,
        timeout=10,
        text=True,
        capture_output=True,
    )

    assert proc.returncode == 0, f"Hook failed: {proc.stderr}"
    assert proc.stdout.strip(), "Hook should produce output"

    # The first JSONL line is the InstalledBinary record.
    first_line = proc.stdout.strip().split('\n')[0]
    record = json.loads(first_line)
    assert record['abspath'] == located_rg
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_machine_config_overrides_base_config():
    """
    Values stored in Machine.config must win over the base config.

    Regression guard: `archivebox version` reported binaries as
    "not installed" even though they had been detected and saved to
    Machine.config.
    """
    from machine.models import Machine, InstalledBinary

    chrome_path = '/custom/path/to/chrome'
    chrome_version = '143.0.7499.170'

    # Pretend a hook detected chrome at a location the base config does not know.
    machine = Machine.current()
    machine.config['CHROME_BINARY'] = chrome_path
    machine.config['CHROME_VERSION'] = chrome_version
    machine.save()

    # ...and recorded the matching InstalledBinary row.
    InstalledBinary.objects.create(
        machine=machine,
        name='chrome',
        abspath=chrome_path,
        version=chrome_version,
        binprovider='env',
    )

    from archivebox.config.configset import get_config
    base_config = get_config()

    # The override itself must be stored as written.
    assert machine.config.get('CHROME_BINARY') == chrome_path

    # Same lookup order as the version command: Machine.config first, then
    # the base config (which might only hold a bare name such as 'chromium').
    resolved = machine.config.get('CHROME_BINARY') or base_config.get('CHROME_BINARY', '')
    assert resolved == chrome_path, \
        "Machine.config override should take precedence over base config"
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_search_backend_engine_passed_to_hooks():
    """
    SEARCH_BACKEND_ENGINE must be present in the environment built for hooks.

    Regression guard: hooks could not tell which search backend was active.
    """
    from archivebox.hooks import build_hook_environment
    from archivebox.config.configset import get_config

    # Expected value: whatever the config reports, defaulting to 'ripgrep'.
    expected_backend = get_config().get('SEARCH_BACKEND_ENGINE', 'ripgrep')

    env = build_hook_environment(overrides=None)

    assert 'SEARCH_BACKEND_ENGINE' in env, \
        "SEARCH_BACKEND_ENGINE must be in hook environment"
    assert env['SEARCH_BACKEND_ENGINE'] == expected_backend, \
        f"Expected SEARCH_BACKEND_ENGINE={expected_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_install_creates_installedbinary_records():
    """
    Running an install-style crawl must create InstalledBinary records.

    Integration test covering the full install flow.
    """
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk

    machine = Machine.current()

    def count_binaries():
        # Fresh query each time so the before/after counts are independent.
        return InstalledBinary.objects.filter(machine=machine).count()

    count_before = count_binaries()

    # Build the same kind of crawl that `archivebox install` creates.
    system_user_pk = get_or_create_system_user_pk()
    seed, _ = Seed.objects.get_or_create(
        uri='archivebox://test-install',
        label='Test dependency detection',
        created_by_id=system_user_pk,
        defaults={'extractor': 'auto'},
    )
    crawl = Crawl.objects.create(
        seed=seed,
        max_depth=0,
        created_by_id=system_user_pk,
        status='queued',
    )

    # One tick moves the crawl queued -> started, which runs the hooks.
    CrawlMachine(crawl).send('tick')

    count_after = count_binaries()
    assert count_after > count_before, \
        "archivebox install should create InstalledBinary records"

    # At least one widely available binary should have been picked up.
    common_binaries = ['git', 'wget', 'node']
    detected = [
        bin_name
        for bin_name in common_binaries
        if InstalledBinary.objects.filter(machine=machine, name=bin_name).exists()
    ]
    assert detected, f"At least one of {common_binaries} should be detected"

    # Every recorded path must be a full path, never a bare command name.
    for binary in InstalledBinary.objects.filter(machine=machine):
        if not binary.abspath:
            continue  # Only check non-empty paths
        assert '/' in binary.abspath, \
            f"{binary.name} should have full path, not just name: {binary.abspath}"
        # Version might be empty for some binaries, that's ok
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_ripgrep_only_detected_when_backend_enabled():
    """
    Ripgrep must only be detected when SEARCH_BACKEND_ENGINE='ripgrep'.

    Guards against ripgrep being installed/detected when it is not needed.
    """
    from machine.models import Machine, InstalledBinary
    from crawls.models import Seed, Crawl
    from crawls.statemachines import CrawlMachine
    from archivebox.base_models.models import get_or_create_system_user_pk

    if not shutil.which('rg'):
        pytest.skip("ripgrep (rg) not installed")

    machine = Machine.current()

    def rg_records():
        """Fresh queryset of this machine's ripgrep InstalledBinary rows."""
        return InstalledBinary.objects.filter(machine=machine, name='rg')

    def run_crawl(uri, label, created_by_id):
        """Create a queued depth-0 crawl for ``uri`` and tick its state machine once."""
        seed = Seed.objects.create(
            uri=uri,
            label=label,
            created_by_id=created_by_id,
            extractor='auto',
        )
        crawl = Crawl.objects.create(
            seed=seed,
            max_depth=0,
            created_by_id=created_by_id,
            status='queued',
        )
        CrawlMachine(crawl).send('tick')

    # Start from a clean slate: no ripgrep records for this machine.
    rg_records().delete()

    # Test 1: With ripgrep backend - should be detected
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}

        created_by_id = get_or_create_system_user_pk()
        run_crawl('archivebox://test-rg-enabled', 'Test ripgrep detection enabled', created_by_id)

        assert rg_records().exists(), "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"

    # Clear records again so the second scenario cannot see the first one's row.
    rg_records().delete()

    # Test 2: With different backend - should NOT be detected
    with patch('archivebox.config.configset.get_config') as mock_config:
        mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}

        run_crawl('archivebox://test-rg-disabled', 'Test ripgrep detection disabled', created_by_id)

        assert not rg_records().exists(), "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -29,7 +29,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'index_sonic'
|
||||
OUTPUT_DIR = 'search_index'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
# Text file patterns to index
|
||||
INDEXABLE_FILES = [
|
||||
|
||||
@@ -27,7 +27,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'index_sqlite'
|
||||
OUTPUT_DIR = 'search_index'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
# Text file patterns to index, in priority order
|
||||
INDEXABLE_FILES = [
|
||||
|
||||
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'seo';
|
||||
const OUTPUT_DIR = 'seo';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'seo.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -60,10 +60,7 @@ function getCdpUrl() {
|
||||
|
||||
// Extract SEO metadata
|
||||
async function extractSeo(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
|
||||
@@ -40,7 +40,7 @@ const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
|
||||
|
||||
const OUTPUT_DIR = 'singlefile';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'singlefile.html';
|
||||
|
||||
/**
|
||||
@@ -102,8 +102,7 @@ async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
.filter(fn => fn.endsWith('.html'))
|
||||
);
|
||||
|
||||
// Ensure output directory exists
|
||||
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
|
||||
@@ -170,8 +169,7 @@ async function saveSinglefileWithCLI(url, options = {}) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Ensure output directory exists
|
||||
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Build command
|
||||
|
||||
@@ -41,7 +41,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'singlefile'
|
||||
BIN_NAME = 'single-file'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'singlefile'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'singlefile.html'
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
STATICFILE_DIR = 'staticfile'
|
||||
STATICFILE_DIR = '../staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
"""Check if staticfile extractor already downloaded this URL."""
|
||||
@@ -135,7 +135,7 @@ def get_version(binary: str) -> str:
|
||||
return ''
|
||||
|
||||
|
||||
CHROME_SESSION_DIR = 'chrome_session'
|
||||
CHROME_SESSION_DIR = '../chrome_session'
|
||||
|
||||
|
||||
def get_cdp_url() -> str | None:
|
||||
@@ -203,9 +203,8 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
|
||||
# Create output directory
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
output_path = output_dir / OUTPUT_FILE
|
||||
|
||||
cmd.extend([url, str(output_path)])
|
||||
@@ -274,7 +273,7 @@ def main(url: str, snapshot_id: str):
|
||||
sys.exit(1)
|
||||
|
||||
version = get_version(binary)
|
||||
cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
|
||||
cmd_str = f'{binary} {url} {OUTPUT_FILE}'
|
||||
|
||||
# Run extraction
|
||||
success, output, error = save_singlefile(url, binary)
|
||||
|
||||
@@ -21,9 +21,9 @@ const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'ssl';
|
||||
const OUTPUT_DIR = 'ssl';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'ssl.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -60,10 +60,7 @@ function getCdpUrl() {
|
||||
|
||||
// Extract SSL details
|
||||
async function extractSsl(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Only extract SSL for HTTPS URLs
|
||||
|
||||
@@ -31,8 +31,8 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'staticfile'
|
||||
OUTPUT_DIR = 'staticfile'
|
||||
CHROME_SESSION_DIR = 'chrome_session'
|
||||
OUTPUT_DIR = '.'
|
||||
CHROME_SESSION_DIR = '../chrome_session'
|
||||
|
||||
# Content-Types that indicate static files
|
||||
# These can't be meaningfully processed by Chrome-based extractors
|
||||
@@ -214,9 +214,8 @@ def download_file(url: str) -> tuple[bool, str | None, str]:
|
||||
if content_length and int(content_length) > max_size:
|
||||
return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'
|
||||
|
||||
# Create output directory
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Determine filename
|
||||
filename = get_filename_from_url(url)
|
||||
|
||||
@@ -21,9 +21,9 @@ const http = require('http');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'title';
|
||||
const OUTPUT_DIR = 'title';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'title.txt';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
@@ -162,10 +162,7 @@ async function getTitleFromCdp(cdpUrl) {
|
||||
}
|
||||
|
||||
async function extractTitle(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Try Chrome session first
|
||||
|
||||
@@ -43,7 +43,7 @@ import rich_click as click
|
||||
EXTRACTOR_NAME = 'wget'
|
||||
BIN_NAME = 'wget'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = 'wget'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -66,7 +66,7 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
STATICFILE_DIR = 'staticfile'
|
||||
STATICFILE_DIR = '../staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
"""Check if staticfile extractor already downloaded this URL."""
|
||||
|
||||
Reference in New Issue
Block a user