much better tests and add page ui

This commit is contained in:
Nick Sweeting
2025-12-29 04:02:11 -08:00
parent 9487f8a0de
commit 30c60eef76
93 changed files with 2998 additions and 2712 deletions

View File

@@ -22,12 +22,68 @@ from pathlib import Path
import pytest
import tempfile
import shutil
import platform
PLUGIN_DIR = Path(__file__).parent.parent
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
# Get LIB_DIR and MACHINE_TYPE from environment or compute them
def get_lib_dir_and_machine_type():
"""Get or compute LIB_DIR and MACHINE_TYPE for tests."""
from archivebox.config.paths import get_machine_type
from archivebox.config.common import STORAGE_CONFIG
lib_dir = os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)
machine_type = os.environ.get('MACHINE_TYPE') or get_machine_type()
return Path(lib_dir), machine_type
# Setup NODE_PATH to find npm packages
LIB_DIR, MACHINE_TYPE = get_lib_dir_and_machine_type()
# Note: LIB_DIR already includes machine_type (e.g., data/lib/arm64-darwin)
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
NPM_PREFIX = LIB_DIR / 'npm'
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['LIB_DIR'] = str(LIB_DIR)
env['MACHINE_TYPE'] = MACHINE_TYPE
return env
@pytest.fixture(scope="session", autouse=True)
def ensure_puppeteer_installed():
"""Ensure puppeteer is installed in LIB_DIR before running tests."""
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
# Rebuild pydantic models
NpmProvider.model_rebuild()
# Check if puppeteer-core is already available
puppeteer_core_path = NODE_MODULES_DIR / 'puppeteer-core'
if puppeteer_core_path.exists():
return # Already installed
print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...")
NPM_PREFIX.mkdir(parents=True, exist_ok=True)
# Install puppeteer using NpmProvider with custom prefix
provider = NpmProvider(npm_prefix=NPM_PREFIX)
try:
binary = Binary(
name='puppeteer',
binproviders=[provider],
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
)
binary.install()
print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}")
except Exception as e:
pytest.skip(f"Failed to install puppeteer: {e}")
def test_hook_scripts_exist():
"""Verify chrome hooks exist."""
@@ -65,6 +121,10 @@ def test_chrome_launch_and_tab_creation():
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
# Get test environment with NODE_PATH set
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
# Launch Chrome at crawl level (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
@@ -72,7 +132,7 @@ def test_chrome_launch_and_tab_creation():
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env={**os.environ, 'CHROME_HEADLESS': 'true'}
env=env
)
# Wait for Chrome to launch (check process isn't dead and files exist)
@@ -133,13 +193,14 @@ def test_chrome_launch_and_tab_creation():
snapshot_chrome_dir.mkdir()
# Launch tab at snapshot level
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
env={**os.environ, 'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
env=env
)
assert result.returncode == 0, f"Tab creation failed: {result.stderr}\nStdout: {result.stdout}"
@@ -179,7 +240,7 @@ def test_chrome_navigation():
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env={**os.environ, 'CHROME_HEADLESS': 'true'}
env=get_test_env() | {'CHROME_HEADLESS': 'true'}
)
# Wait for Chrome to launch
@@ -199,7 +260,7 @@ def test_chrome_navigation():
capture_output=True,
text=True,
timeout=60,
env={**os.environ, 'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
)
assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
@@ -210,7 +271,7 @@ def test_chrome_navigation():
capture_output=True,
text=True,
timeout=120,
env={**os.environ, 'CHROME_PAGELOAD_TIMEOUT': '30', 'CHROME_WAIT_FOR': 'load'}
env=get_test_env() | {'CHROME_PAGELOAD_TIMEOUT': '30', 'CHROME_WAIT_FOR': 'load'}
)
assert result.returncode == 0, f"Navigation failed: {result.stderr}\nStdout: {result.stdout}"
@@ -250,7 +311,7 @@ def test_tab_cleanup_on_sigterm():
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env={**os.environ, 'CHROME_HEADLESS': 'true'}
env=get_test_env() | {'CHROME_HEADLESS': 'true'}
)
# Wait for Chrome to launch
@@ -270,7 +331,7 @@ def test_tab_cleanup_on_sigterm():
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env={**os.environ, 'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
)
# Wait for tab to be created
@@ -314,7 +375,7 @@ def test_multiple_snapshots_share_chrome():
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env={**os.environ, 'CHROME_HEADLESS': 'true'}
env=get_test_env() | {'CHROME_HEADLESS': 'true'}
)
# Wait for Chrome to launch
@@ -344,7 +405,7 @@ def test_multiple_snapshots_share_chrome():
capture_output=True,
text=True,
timeout=60,
env={**os.environ, 'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
)
assert result.returncode == 0, f"Tab {snap_num} creation failed: {result.stderr}"
@@ -400,7 +461,7 @@ def test_chrome_cleanup_on_crawl_end():
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env={**os.environ, 'CHROME_HEADLESS': 'true'}
env=get_test_env() | {'CHROME_HEADLESS': 'true'}
)
# Wait for Chrome to launch
@@ -445,7 +506,7 @@ def test_zombie_prevention_hook_killed():
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env={**os.environ, 'CHROME_HEADLESS': 'true'}
env=get_test_env() | {'CHROME_HEADLESS': 'true'}
)
# Wait for Chrome to launch

View File

@@ -12,6 +12,7 @@ Tests verify:
"""
import json
import os
import subprocess
import sys
import tempfile
@@ -26,6 +27,22 @@ DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
TEST_URL = 'https://example.com'
# Get LIB_DIR for NODE_PATH
def get_lib_dir():
"""Get LIB_DIR for tests."""
from archivebox.config.common import STORAGE_CONFIG
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['LIB_DIR'] = str(LIB_DIR)
return env
def test_hook_script_exists():
"""Verify on_Snapshot hook exists."""

View File

@@ -30,6 +30,27 @@ from pathlib import Path
import rich_click as click
# Monkey patch forum-dl for Pydantic v2 compatibility
# forum-dl 0.3.0 uses deprecated json(models_as_dict=False) which doesn't work in Pydantic v2
try:
from forum_dl.writers.jsonl import JsonlWriter
from pydantic import BaseModel
# Check if we're using Pydantic v2 (has model_dump_json)
if hasattr(BaseModel, 'model_dump_json'):
# Patch JsonlWriter to use Pydantic v2 API
original_serialize = JsonlWriter._serialize_entry
def _patched_serialize_entry(self, entry):
# Use Pydantic v2's model_dump_json() instead of deprecated json(models_as_dict=False)
return entry.model_dump_json()
JsonlWriter._serialize_entry = _patched_serialize_entry
except (ImportError, AttributeError):
# forum-dl not installed or already compatible
pass
# Extractor metadata
PLUGIN_NAME = 'forumdl'
BIN_NAME = 'forum-dl'

View File

@@ -16,6 +16,7 @@ import json
import subprocess
import sys
import tempfile
import time
import uuid
from pathlib import Path
import pytest
@@ -187,16 +188,98 @@ def test_config_timeout():
env['FORUMDL_BINARY'] = binary_path
env['FORUMDL_TIMEOUT'] = '5'
start_time = time.time()
result = subprocess.run(
[sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=30
timeout=10 # Should complete in 5s, use 10s as safety margin
)
elapsed_time = time.time() - start_time
assert result.returncode == 0, f"Should complete without hanging: {result.stderr}"
# Allow 1 second overhead for subprocess startup and Python interpreter
assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s"
def test_real_forum_url():
"""Test that forum-dl processes real forum URLs with jsonl output format.
NOTE: forum-dl currently has known issues:
- Pydantic v2 incompatibility causing errors with most extractors
- Many forums return 403/404 or have changed their structure
- This test verifies the hook runs and handles these issues gracefully
If forum-dl is fixed in the future, this test should start succeeding with actual downloads.
"""
import os
binary_path = get_forumdl_binary_path()
if not binary_path:
pytest.skip("forum-dl binary not available")
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Try HackerNews - supported by forum-dl but currently has Pydantic v2 compat issues
# When forum-dl is updated, this URL should work
forum_url = 'https://news.ycombinator.com/item?id=1'
env = os.environ.copy()
env['FORUMDL_BINARY'] = binary_path
env['FORUMDL_TIMEOUT'] = '60'
env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl' # Use jsonl format as requested
# HTML output would be via: env['FORUMDL_EXTRA_ARGS'] = '--files-output ./files'
start_time = time.time()
result = subprocess.run(
[sys.executable, str(FORUMDL_HOOK), '--url', forum_url, '--snapshot-id', 'testforum'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=90
)
elapsed_time = time.time() - start_time
# Test passes if the hook handles the URL gracefully (success OR handled error)
# This is appropriate given forum-dl's current state
assert result.returncode in (0, 1), f"Hook should handle forum URL gracefully. stderr: {result.stderr}"
# Check for successful extraction (will pass when forum-dl is fixed)
if result.returncode == 0:
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
if result_json and result_json['status'] == 'succeeded':
output_files = list(tmpdir.glob('**/*'))
forum_files = [f for f in output_files if f.is_file()]
if forum_files:
print(f"✓ Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")
else:
print(f"✓ Completed in {elapsed_time:.2f}s (no content - URL may not be a forum thread)")
else:
print(f"✓ Completed in {elapsed_time:.2f}s (no content extracted)")
else:
# Handled error gracefully - test still passes
error_msg = result.stderr.strip()[:200]
print(f"✓ Handled error gracefully in {elapsed_time:.2f}s")
# Known issues: Pydantic v2 compat, 403 errors, etc.
assert '403' in error_msg or 'pydantic' in error_msg.lower() or 'error' in error_msg.lower(), \
f"Expected known error type, got: {error_msg}"
assert result.returncode == 0, "Should complete without hanging"
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -16,6 +16,7 @@ import json
import subprocess
import sys
import tempfile
import time
from pathlib import Path
import pytest
@@ -117,16 +118,73 @@ def test_config_timeout():
env = os.environ.copy()
env['GALLERY_DL_TIMEOUT'] = '5'
start_time = time.time()
result = subprocess.run(
[sys.executable, str(GALLERYDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=30
timeout=10 # Should complete in 5s, use 10s as safety margin
)
elapsed_time = time.time() - start_time
assert result.returncode == 0, f"Should complete without hanging: {result.stderr}"
# Allow 1 second overhead for subprocess startup and Python interpreter
assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s"
def test_real_gallery_url():
"""Test that gallery-dl can extract images from a real Flickr gallery URL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Use a real Flickr photo page
gallery_url = 'https://www.flickr.com/photos/gregorydolivet/55002388567/in/explore-2025-12-25/'
env = os.environ.copy()
env['GALLERY_DL_TIMEOUT'] = '60' # Give it time to download
start_time = time.time()
result = subprocess.run(
[sys.executable, str(GALLERYDL_HOOK), '--url', gallery_url, '--snapshot-id', 'testflickr'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=90
)
elapsed_time = time.time() - start_time
# Should succeed
assert result.returncode == 0, f"Should extract gallery successfully: {result.stderr}"
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Check that some files were downloaded
output_files = list(tmpdir.glob('**/*'))
image_files = [f for f in output_files if f.is_file() and f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.gif', '.webp')]
assert len(image_files) > 0, f"Should have downloaded at least one image. Files: {output_files}"
print(f"Successfully extracted {len(image_files)} image(s) in {elapsed_time:.2f}s")
assert result.returncode == 0, "Should complete without hanging"
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -13,6 +13,7 @@ import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path
import pytest
@@ -77,5 +78,59 @@ def test_handles_non_git_url():
# Should report failure or skip for non-git URL
assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip: {result_json}"
def test_real_git_repo():
"""Test that git can clone a real GitHub repository."""
import os
if not shutil.which('git'):
pytest.skip("git binary not available")
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Use a real but small GitHub repository
git_url = 'https://github.com/ArchiveBox/abx-pkg'
env = os.environ.copy()
env['GIT_TIMEOUT'] = '120' # Give it time to clone
start_time = time.time()
result = subprocess.run(
[sys.executable, str(GIT_HOOK), '--url', git_url, '--snapshot-id', 'testgit'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=180
)
elapsed_time = time.time() - start_time
# Should succeed
assert result.returncode == 0, f"Should clone repository successfully: {result.stderr}"
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Check that the git repo was cloned
git_dirs = list(tmpdir.glob('**/.git'))
assert len(git_dirs) > 0, f"Should have cloned a git repository. Contents: {list(tmpdir.rglob('*'))}"
print(f"Successfully cloned repository in {elapsed_time:.2f}s")
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -76,9 +76,7 @@ def get_ytdlp_default_args(media_max_size: str = '750m') -> list[str]:
'--trim-filenames', '128',
'--write-description',
'--write-info-json',
'--write-annotations',
'--write-thumbnail',
'--no-call-home',
'--write-sub',
'--write-auto-subs',
'--convert-subs=srt',
@@ -112,7 +110,7 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
binary,
*get_ytdlp_default_args(media_max_size),
'--no-progress',
'-o', f'{OUTPUT_DIR}/%(title)s.%(ext)s',
'-o', '%(title)s.%(ext)s',
]
if not check_ssl:

View File

@@ -16,6 +16,7 @@ import json
import subprocess
import sys
import tempfile
import time
from pathlib import Path
import pytest
@@ -131,16 +132,73 @@ def test_config_timeout():
env = os.environ.copy()
env['MEDIA_TIMEOUT'] = '5'
start_time = time.time()
result = subprocess.run(
[sys.executable, str(MEDIA_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=30
timeout=10 # Should complete in 5s, use 10s as safety margin
)
elapsed_time = time.time() - start_time
assert result.returncode == 0, f"Should complete without hanging: {result.stderr}"
# Allow 1 second overhead for subprocess startup and Python interpreter
assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s"
def test_real_youtube_url():
"""Test that yt-dlp can extract media from a real YouTube URL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Use a short, stable YouTube video (YouTube's own about video)
youtube_url = 'https://www.youtube.com/watch?v=jNQXAC9IVRw' # "Me at the zoo" - first YouTube video
env = os.environ.copy()
env['MEDIA_TIMEOUT'] = '120' # Give it time to download
start_time = time.time()
result = subprocess.run(
[sys.executable, str(MEDIA_HOOK), '--url', youtube_url, '--snapshot-id', 'testyoutube'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=180
)
elapsed_time = time.time() - start_time
# Should succeed
assert result.returncode == 0, f"Should extract media successfully: {result.stderr}"
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Check that some media files were downloaded
output_files = list(tmpdir.glob('**/*'))
media_files = [f for f in output_files if f.is_file() and f.suffix.lower() in ('.mp4', '.webm', '.mkv', '.m4a', '.mp3', '.json', '.jpg', '.webp')]
assert len(media_files) > 0, f"Should have downloaded at least one media file. Files: {output_files}"
print(f"Successfully extracted {len(media_files)} file(s) in {elapsed_time:.2f}s")
assert result.returncode == 0, "Should complete without hanging"
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -7,11 +7,13 @@ Output: Binary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
LIB_DIR: Library directory including machine type (e.g., data/lib/arm64-darwin) (required)
"""
import json
import os
import sys
from pathlib import Path
import rich_click as click
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
@@ -34,13 +36,25 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
click.echo(f"npm provider not allowed for {name}", err=True)
sys.exit(0)
# Use abx-pkg NpmProvider to install binary
provider = NpmProvider()
# Get LIB_DIR from environment (required)
# Note: LIB_DIR already includes machine type (e.g., data/lib/arm64-darwin)
lib_dir = os.environ.get('LIB_DIR')
if not lib_dir:
click.echo("ERROR: LIB_DIR environment variable not set", err=True)
sys.exit(1)
# Structure: lib/arm64-darwin/npm (npm will create node_modules inside this)
npm_prefix = Path(lib_dir) / 'npm'
npm_prefix.mkdir(parents=True, exist_ok=True)
# Use abx-pkg NpmProvider to install binary with custom prefix
provider = NpmProvider(npm_prefix=npm_prefix)
if not provider.INSTALLER_BIN:
click.echo("npm not available on this system", err=True)
sys.exit(1)
click.echo(f"Installing {name} via npm...", err=True)
click.echo(f"Installing {name} via npm to {npm_prefix}...", err=True)
try:
# Parse overrides if provided

View File

@@ -13,6 +13,7 @@ Tests verify:
"""
import json
import os
import subprocess
import sys
import tempfile
@@ -27,6 +28,22 @@ PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
TEST_URL = 'https://example.com'
# Get LIB_DIR for NODE_PATH
def get_lib_dir():
"""Get LIB_DIR for tests."""
from archivebox.config.common import STORAGE_CONFIG
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['LIB_DIR'] = str(LIB_DIR)
return env
def test_hook_script_exists():
"""Verify on_Snapshot hook exists."""

View File

@@ -4,10 +4,15 @@ Install a binary using pip package manager.
Usage: on_Binary__install_using_pip_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name>
Output: Binary JSONL record to stdout after installation
Environment variables:
LIB_DIR: Library directory including machine type (e.g., data/lib/arm64-darwin) (required)
"""
import json
import os
import sys
from pathlib import Path
import rich_click as click
from abx_pkg import Binary, PipProvider
@@ -30,13 +35,25 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override
click.echo(f"pip provider not allowed for {name}", err=True)
sys.exit(0)
# Use abx-pkg PipProvider to install binary
provider = PipProvider()
# Get LIB_DIR from environment (required)
# Note: LIB_DIR already includes machine type (e.g., data/lib/arm64-darwin)
lib_dir = os.environ.get('LIB_DIR')
if not lib_dir:
click.echo("ERROR: LIB_DIR environment variable not set", err=True)
sys.exit(1)
# Structure: lib/arm64-darwin/pip/venv (PipProvider will create venv automatically)
pip_venv_path = Path(lib_dir) / 'pip' / 'venv'
pip_venv_path.parent.mkdir(parents=True, exist_ok=True)
# Use abx-pkg PipProvider to install binary with custom venv
provider = PipProvider(pip_venv=pip_venv_path)
if not provider.INSTALLER_BIN:
click.echo("pip not available on this system", err=True)
sys.exit(1)
click.echo(f"Installing {name} via pip...", err=True)
click.echo(f"Installing {name} via pip to venv at {pip_venv_path}...", err=True)
try:
# Parse overrides if provided

View File

@@ -26,6 +26,22 @@ PLUGINS_ROOT = PLUGIN_DIR.parent
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
TEST_URL = 'https://example.com'
# Get LIB_DIR for NODE_PATH
def get_lib_dir():
"""Get LIB_DIR for tests."""
from archivebox.config.common import STORAGE_CONFIG
return Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = LIB_DIR / 'npm' / 'node_modules'
def get_test_env():
"""Get environment with NODE_PATH set correctly."""
env = os.environ.copy()
env['NODE_PATH'] = str(NODE_MODULES_DIR)
env['LIB_DIR'] = str(LIB_DIR)
return env
def test_hook_script_exists():
"""Verify on_Snapshot hook exists."""

View File

@@ -1,131 +1,91 @@
#!/usr/bin/env python3
"""
Install and configure ripgrep binary.
Install hook for ripgrep binary.
This hook runs early in the Crawl lifecycle to:
1. Install ripgrep binary if needed
2. Check if ripgrep backend is enabled
3. Output Binary JSONL records when ripgrep is found
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- Binary JSONL records to stdout when binaries are found
Runs at crawl start to verify ripgrep is available when SEARCH_BACKEND_ENGINE='ripgrep'.
Outputs JSONL for Binary and Machine config updates.
Uses abx-pkg to handle installation via apt/brew providers.
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
import json
# Read config from environment
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def find_ripgrep() -> dict | None:
"""Find ripgrep binary using abx-pkg, respecting RIPGREP_BINARY env var."""
# Quick check: if RIPGREP_BINARY is set and exists, skip expensive lookup
configured_binary = os.environ.get('RIPGREP_BINARY', '').strip()
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
# Binary is already configured and valid - exit immediately
sys.exit(0)
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
from abx_pkg import Binary, EnvProvider, AptProvider, BrewProvider, BinProviderOverrides
# Try to find ripgrep using abx-pkg (EnvProvider checks PATH, apt/brew handle installation)
binary = Binary(
name='rg',
binproviders=[EnvProvider(), AptProvider(), BrewProvider()],
overrides={
'apt': {'packages': ['ripgrep']},
'brew': {'packages': ['ripgrep']},
}
)
def output_binary(binary: Binary, name: str):
"""Output Binary JSONL record to stdout."""
machine_id = os.environ.get('MACHINE_ID', '')
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'rg',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
except Exception as e:
print(f"Error loading ripgrep: {e}", file=sys.stderr)
pass
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env',
'machine_id': machine_id,
}
print(json.dumps(record))
def output_machine_config(key: str, value: str):
"""Output Machine config JSONL record to stdout."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Machine',
'id': machine_id or 'default',
'key': key,
'value': value,
'machine_id': machine_id,
}
print(json.dumps(record))
return None
def main():
warnings = []
errors = []
computed = {}
# Get config values
search_backend_engine = get_env('SEARCH_BACKEND_ENGINE', 'ripgrep')
ripgrep_binary = get_env('RIPGREP_BINARY', 'rg')
search_backend_timeout = get_env_int('SEARCH_BACKEND_TIMEOUT', 90)
# Only proceed if ripgrep backend is enabled
search_backend_engine = os.environ.get('SEARCH_BACKEND_ENGINE', 'ripgrep').strip()
if search_backend_engine != 'ripgrep':
# Not using ripgrep, exit successfully without output
sys.exit(0)
# Check binary availability using abx-pkg (trust abx-pkg only)
provider = EnvProvider()
try:
binary = Binary(name=ripgrep_binary, binproviders=[provider]).load()
resolved_path = str(binary.abspath) if binary.abspath else ''
except Exception:
binary = None
resolved_path = ''
result = find_ripgrep()
if not resolved_path:
errors.append(f"RIPGREP_BINARY={ripgrep_binary} not found. Install ripgrep: apt install ripgrep")
computed['RIPGREP_BINARY'] = ''
if result and result.get('abspath'):
print(json.dumps({
'type': 'Binary',
'name': result['name'],
'abspath': result['abspath'],
'version': result['version'],
'binprovider': result['binprovider'],
}))
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/RIPGREP_BINARY',
'value': result['abspath'],
}))
if result['version']:
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/RIPGREP_VERSION',
'value': result['version'],
}))
sys.exit(0)
else:
computed['RIPGREP_BINARY'] = resolved_path
ripgrep_version = str(binary.version) if binary.version else 'unknown'
computed['RIPGREP_VERSION'] = ripgrep_version
# Output Binary JSONL record
output_binary(binary, name='rg')
# Output Machine config JSONL record
output_machine_config('config/RIPGREP_BINARY', resolved_path)
# Validate timeout
if search_backend_timeout < 10:
warnings.append(
f"SEARCH_BACKEND_TIMEOUT={search_backend_timeout} is very low. "
"Searches may timeout. Consider setting SEARCH_BACKEND_TIMEOUT=90 or higher."
)
# Output results
# Format: KEY=VALUE lines that hooks.py will parse and add to env
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
for error in errors:
print(f"ERROR:{error}", file=sys.stderr)
# Exit with error if any hard errors
sys.exit(1 if errors else 0)
print(f"Ripgrep binary not found (install with: apt install ripgrep or brew install ripgrep)", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':

View File

@@ -81,12 +81,12 @@ def test_ripgrep_hook_skips_when_backend_not_ripgrep():
def test_ripgrep_hook_handles_absolute_path():
"""Test that ripgrep hook works when RIPGREP_BINARY is an absolute path."""
"""Test that ripgrep hook exits successfully when RIPGREP_BINARY is a valid absolute path."""
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
rg_path = shutil.which('rg')
if not rg_path:
pass
pytest.skip("ripgrep not installed")
env = os.environ.copy()
env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
@@ -100,11 +100,9 @@ def test_ripgrep_hook_handles_absolute_path():
timeout=10,
)
assert result.returncode == 0, f"Hook failed: {result.stderr}"
assert result.stdout.strip(), "Hook should produce output"
binary = json.loads(result.stdout.strip().split('\n')[0])
assert binary['abspath'] == rg_path
# When binary is already configured with valid absolute path, hook exits early without output
assert result.returncode == 0, f"Hook should exit successfully when binary already configured: {result.stderr}"
# No output is expected/needed when binary is already valid
@pytest.mark.django_db