way better plugin hooks system wip

Nick Sweeting
2025-12-28 03:39:59 -08:00
parent a38624a4dd
commit 50e527ec65
156 changed files with 10275 additions and 7149 deletions

View File

@@ -0,0 +1 @@
{"type": "Binary", "name": "wget", "binproviders": "apt,brew,pip,env"}
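The one-line record above is the declarative form of a binary dependency: a Binary named wget that can be satisfied by the apt, brew, pip, or env providers. Below is a minimal sketch of how a hook runner might resolve such a declaration into a Binary JSONL record, reusing the abx-pkg calls that appear in the Python install hook deleted further down; the resolve_declaration() helper and the env-only provider wiring are illustrative assumptions, not code from this commit.

# Illustrative sketch only (not part of this commit): resolve a declarative
# {"type": "Binary", ...} record to an installed binary and emit it as JSONL,
# using the same abx-pkg calls as the deleted Python install hook below.
import json

from abx_pkg import Binary, EnvProvider


def resolve_declaration(declaration: dict) -> dict | None:
    """Resolve a Binary declaration via the env provider (hypothetical helper)."""
    # Only $PATH lookup is wired up here; apt/brew/pip installs are assumed to be
    # handled by the on_Binary__install_using_*_provider.py hooks referenced in the tests.
    binary = Binary(name=declaration['name'], binproviders=[EnvProvider()])
    loaded = binary.load()
    if not (loaded and loaded.abspath):
        return None
    return {
        'type': 'Binary',
        'name': declaration['name'],
        'abspath': str(loaded.abspath),
        'version': str(loaded.version) if loaded.version else '',
        'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
    }


if __name__ == '__main__':
    decl = {"type": "Binary", "name": "wget", "binproviders": "apt,brew,pip,env"}
    print(json.dumps(resolve_declaration(decl)))  # one JSONL line, or "null" if wget is not found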

View File

@@ -1,107 +0,0 @@
#!/usr/bin/env python3
"""
Install hook for wget binary.
Runs at crawl start to verify wget is available.
Outputs JSONL for InstalledBinary and Machine config updates.
Respects WGET_BINARY env var for custom binary paths.
"""
import os
import sys
import json
from pathlib import Path


def find_wget() -> dict | None:
    """Find wget binary using abx-pkg, respecting WGET_BINARY env var."""
    try:
        from abx_pkg import Binary, EnvProvider

        # Check if user has configured a custom binary
        configured_binary = os.environ.get('WGET_BINARY', '').strip()
        if configured_binary:
            # User specified a custom binary path or name
            if '/' in configured_binary:
                # Absolute path - extract name from path
                bin_name = Path(configured_binary).name
            else:
                # Just a binary name
                bin_name = configured_binary
        else:
            # Default to 'wget'
            bin_name = 'wget'

        binary = Binary(name=bin_name, binproviders=[EnvProvider()])
        loaded = binary.load()
        if loaded and loaded.abspath:
            return {
                'name': bin_name,
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except Exception:
        pass
    return None


def main():
    """Find wget binary and output JSONL."""
    # Determine binary name from config
    configured_binary = os.environ.get('WGET_BINARY', '').strip()
    if configured_binary and '/' in configured_binary:
        bin_name = Path(configured_binary).name
    elif configured_binary:
        bin_name = configured_binary
    else:
        bin_name = 'wget'

    result = find_wget()

    if result and result.get('abspath'):
        # Output InstalledBinary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))

        # Output Machine config update
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/WGET_BINARY',
            'value': result['abspath'],
        }))

        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/WGET_VERSION',
                'value': result['version'],
            }))

        sys.exit(0)
    else:
        # Output Dependency request (uses configured bin_name)
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': bin_name,
            'bin_providers': 'apt,brew,env',
        }))
        # Exit non-zero to indicate binary not found
        print(f"{bin_name} binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -9,7 +9,7 @@ This hook runs early in the Crawl lifecycle to:
 Output:
 - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
-- InstalledBinary JSONL records to stdout when binaries are found
+- Binary JSONL records to stdout when binaries are found
 """
 import json
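The docstring above describes a two-channel stdout contract: COMPUTED:KEY=VALUE lines feed computed config into the environment for later hooks, while bare JSON lines are collected as JSONL records. The real parser in hooks.py is not part of this diff, so the following is only a hypothetical sketch of the consumer side; parse_hook_stdout() and the sample values are illustrative.

# Hypothetical sketch of the hooks.py side of the contract described above.
import json


def parse_hook_stdout(stdout: str) -> tuple[dict, list]:
    """Split hook stdout into computed env vars and JSONL records (illustrative)."""
    computed_env: dict = {}
    records: list = []
    for line in stdout.splitlines():
        line = line.strip()
        if line.startswith('COMPUTED:') and '=' in line:
            key, _, value = line[len('COMPUTED:'):].partition('=')
            computed_env[key] = value
        elif line.startswith('{'):
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                pass  # ignore non-JSONL noise
    return computed_env, records


# Example (values are made up):
#   parse_hook_stdout('COMPUTED:WGET_VERSION=1.21.4\n{"type": "Binary", "name": "wget"}')
#   -> ({'WGET_VERSION': '1.21.4'}, [{'type': 'Binary', 'name': 'wget'}])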
@@ -40,12 +40,12 @@ def get_env_int(name: str, default: int = 0) -> int:
         return default


-def output_installed_binary(binary: Binary, name: str):
-    """Output InstalledBinary JSONL record to stdout."""
+def output_binary(binary: Binary, name: str):
+    """Output Binary JSONL record to stdout."""
     machine_id = os.environ.get('MACHINE_ID', '')
     record = {
-        'type': 'InstalledBinary',
+        'type': 'Binary',
         'name': name,
         'abspath': str(binary.abspath),
         'version': str(binary.version) if binary.version else '',
@@ -97,8 +97,8 @@ def main():
     wget_version = str(binary.version) if binary.version else 'unknown'
     computed['WGET_VERSION'] = wget_version

-    # Output InstalledBinary JSONL record
-    output_installed_binary(binary, name='wget')
+    # Output Binary JSONL record
+    output_binary(binary, name='wget')

     # Check for compression support
     if computed.get('WGET_BINARY'):

View File

@@ -30,7 +30,6 @@ Environment variables:
 import json
 import os
 import re
-import shutil
 import subprocess
 import sys
 from datetime import datetime, timezone
@@ -74,36 +73,6 @@ def has_staticfile_output() -> bool:
     return staticfile_dir.exists() and any(staticfile_dir.iterdir())

-def find_wget() -> str | None:
-    """Find wget binary."""
-    wget = get_env('WGET_BINARY')
-    if wget and os.path.isfile(wget):
-        return wget
-    return shutil.which('wget')
-
-def get_version(binary: str) -> str:
-    """Get wget version."""
-    try:
-        result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
-        return result.stdout.split('\n')[0].strip()[:64]
-    except Exception:
-        return ''
-
-def check_wget_compression(binary: str) -> bool:
-    """Check if wget supports --compression=auto."""
-    try:
-        result = subprocess.run(
-            [binary, '--compression=auto', '--help'],
-            capture_output=True,
-            timeout=5
-        )
-        return result.returncode == 0
-    except Exception:
-        return False
-
 # Default wget args (from old WGET_CONFIG)
 WGET_DEFAULT_ARGS = [
     '--no-verbose',
@@ -135,9 +104,6 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
     save_warc = get_env_bool('SAVE_WARC', True)
     save_requisites = get_env_bool('SAVE_WGET_REQUISITES', True)

-    # Check for compression support
-    supports_compression = check_wget_compression(binary)
-
     # Build wget command (later options take precedence)
     cmd = [
         binary,
@@ -166,9 +132,6 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
     if cookies_file and Path(cookies_file).is_file():
         cmd.extend(['--load-cookies', cookies_file])

-    if supports_compression:
-        cmd.append('--compression=auto')
-
     if not check_ssl:
         cmd.extend(['--no-check-certificate', '--no-hsts'])
@@ -230,13 +193,9 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
 def main(url: str, snapshot_id: str):
     """Archive a URL using wget."""
     start_ts = datetime.now(timezone.utc)
-    version = ''
     output = None
     status = 'failed'
     error = ''
-    binary = None
-    cmd_str = ''

     try:
         # Check if wget is enabled
@@ -251,35 +210,17 @@ def main(url: str, snapshot_id: str):
             print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
             sys.exit(0)

-        # Find binary
-        binary = find_wget()
-        if not binary:
-            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
-            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
-            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
-            print(f'INSTALL_HINT=apt install wget OR brew install wget', file=sys.stderr)
-            sys.exit(1)
-
-        version = get_version(binary)
-        cmd_str = f'{binary} ... {url}'
+        # Get binary from environment
+        binary = get_env('WGET_BINARY', 'wget')

         # Run extraction
         success, output, error = save_wget(url, binary)
         status = 'succeeded' if success else 'failed'

-        if success:
-            # Count downloaded files
-            files = list(Path('.').rglob('*'))
-            file_count = len([f for f in files if f.is_file()])
-            print(f'wget completed: {file_count} files downloaded')
-
     except Exception as e:
         error = f'{type(e).__name__}: {e}'
         status = 'failed'

     # Calculate duration
     end_ts = datetime.now(timezone.utc)

     if error:
         print(f'ERROR: {error}', file=sys.stderr)
@@ -289,10 +230,6 @@ def main(url: str, snapshot_id: str):
         'status': status,
         'output_str': output or error or '',
     }
-    if binary:
-        result['cmd'] = [binary, '--no-verbose', url]
-    if version:
-        result['cmd_version'] = version

     print(json.dumps(result))
     sys.exit(0 if status == 'succeeded' else 1)
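With find_wget(), version probing, and the ERROR/DEPENDENCY_NEEDED stderr protocol gone, the extractor's contract reduces to: read WGET_BINARY from the environment, print a single ArchiveResult JSONL record with type, status, and output_str, and exit 0 for succeeded/skipped or 1 for failed. A sketch of a caller consuming that contract follows; run_wget_hook() and its timeout are assumptions for illustration, while the --url/--snapshot-id flags and the record shape come from this diff.

# Illustrative consumer of the simplified hook contract (not part of this commit).
import json
import subprocess
import sys


def run_wget_hook(hook_path: str, url: str, snapshot_id: str, cwd: str) -> dict:
    """Run the wget Snapshot hook and return its ArchiveResult record (sketch)."""
    proc = subprocess.run(
        [sys.executable, hook_path, '--url', url, '--snapshot-id', snapshot_id],
        cwd=cwd, capture_output=True, text=True, timeout=300,  # timeout is a guess
    )
    # Default to a failed result if the hook emitted no ArchiveResult JSONL
    result = {'type': 'ArchiveResult', 'status': 'failed', 'output_str': proc.stderr.strip()}
    for line in proc.stdout.splitlines():
        line = line.strip()
        if not line.startswith('{'):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'ArchiveResult':
            result = record
    return result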

View File

@@ -26,9 +26,9 @@ import pytest
 PLUGIN_DIR = Path(__file__).parent.parent
 PLUGINS_ROOT = PLUGIN_DIR.parent
 WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
-WGET_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_wget.py'
-BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
-APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
+WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py'
+BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Binary__install_using_brew_provider.py'
+APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Binary__install_using_apt_provider.py'

 TEST_URL = 'https://example.com'
@@ -37,10 +37,10 @@ def test_hook_script_exists():
     assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"


-def test_wget_validate_hook():
-    """Test wget validate hook checks for wget binary."""
+def test_wget_install_hook():
+    """Test wget install hook checks for wget binary."""
     result = subprocess.run(
-        [sys.executable, str(WGET_VALIDATE_HOOK)],
+        [sys.executable, str(WGET_INSTALL_HOOK)],
         capture_output=True,
         text=True,
         timeout=30
@@ -48,20 +48,20 @@ def test_wget_validate_hook():
     # Hook exits 0 if binary found, 1 if not found (with Dependency record)
     if result.returncode == 0:
-        # Binary found - verify InstalledBinary JSONL output
+        # Binary found - verify Binary JSONL output
         found_binary = False
         for line in result.stdout.strip().split('\n'):
             if line.strip():
                 try:
                     record = json.loads(line)
-                    if record.get('type') == 'InstalledBinary':
+                    if record.get('type') == 'Binary':
                         assert record['name'] == 'wget'
                         assert record['abspath']
                         found_binary = True
                         break
                 except json.JSONDecodeError:
                     pass
-        assert found_binary, "Should output InstalledBinary record when binary found"
+        assert found_binary, "Should output Binary record when binary found"
     else:
         # Binary not found - verify Dependency JSONL output
         found_dependency = False
@@ -150,8 +150,8 @@ def test_can_install_wget_via_provider():
     # Should succeed (wget installs successfully or is already installed)
     assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}"

-    # Should output InstalledBinary JSONL record
-    assert 'InstalledBinary' in result.stdout or 'wget' in result.stderr, \
+    # Should output Binary JSONL record
+    assert 'Binary' in result.stdout or 'wget' in result.stderr, \
         f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}"

     # Parse JSONL if present
@@ -159,7 +159,7 @@ def test_can_install_wget_via_provider():
     for line in result.stdout.strip().split('\n'):
         try:
             record = json.loads(line)
-            if record.get('type') == 'InstalledBinary':
+            if record.get('type') == 'Binary':
                 assert record['name'] == 'wget'
                 assert record['binprovider'] in ['brew', 'apt']
                 assert record['abspath'], "Should have binary path"
@@ -216,9 +216,21 @@ def test_archives_example_com():
         assert result.returncode == 0, f"Extraction failed: {result.stderr}"

-        # Verify output in stdout
-        assert 'STATUS=succeeded' in result.stdout, "Should report success"
-        assert 'wget completed' in result.stdout, "Should report completion"
+        # Parse clean JSONL output
+        result_json = None
+        for line in result.stdout.strip().split('\n'):
+            line = line.strip()
+            if line.startswith('{'):
+                try:
+                    record = json.loads(line)
+                    if record.get('type') == 'ArchiveResult':
+                        result_json = record
+                        break
+                except json.JSONDecodeError:
+                    pass
+        assert result_json, "Should have ArchiveResult JSONL output"
+        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

         # Verify files were downloaded
         downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm'))
@@ -245,23 +257,9 @@ def test_archives_example_com():
                 'more information' in html_content.lower()), \
             "Missing IANA reference"

-        # Verify RESULT_JSON is present and valid
-        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
-        for line in result.stdout.split('\n'):
-            if line.startswith('RESULT_JSON='):
-                result_json = json.loads(line.replace('RESULT_JSON=', ''))
-                assert result_json['extractor'] == 'wget'
-                assert result_json['status'] == 'succeeded'
-                assert result_json['url'] == TEST_URL
-                assert result_json['snapshot_id'] == 'test789'
-                assert 'duration' in result_json
-                assert result_json['duration'] >= 0
-                break
-

 def test_config_save_wget_false_skips():
-    """Test that SAVE_WGET=False causes skip."""
+    """Test that SAVE_WGET=False exits without emitting JSONL."""
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -279,10 +277,15 @@ def test_config_save_wget_false_skips():
             timeout=30
         )

-        # Should succeed but skip
-        assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}"
-        assert 'STATUS=skipped' in result.stdout, "Should report skipped status"
-        assert 'SAVE_WGET=False' in result.stdout, "Should mention SAVE_WGET=False"
+        # Should exit 0 when feature disabled
+        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
+
+        # Feature disabled - no JSONL emission, just logs to stderr
+        assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
+
+        # Should NOT emit any JSONL
+        jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
+        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"


 def test_config_save_warc():
@@ -323,23 +326,44 @@ def test_staticfile_present_skips():
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)

-        # Create staticfile directory with content to simulate staticfile extractor ran
+        # Create directory structure like real ArchiveBox:
+        #   tmpdir/
+        #     staticfile/   <- staticfile extractor output
+        #     wget/         <- wget extractor runs here, looks for ../staticfile
         staticfile_dir = tmpdir / 'staticfile'
         staticfile_dir.mkdir()
         (staticfile_dir / 'index.html').write_text('<html>test</html>')
+        wget_dir = tmpdir / 'wget'
+        wget_dir.mkdir()

         result = subprocess.run(
             [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'],
-            cwd=tmpdir,
+            cwd=wget_dir,  # Run from wget subdirectory
             capture_output=True,
             text=True,
             timeout=30
         )

-        # Should skip
-        assert result.returncode == 0, "Should exit 0 when skipping"
-        assert 'STATUS=skipped' in result.stdout, "Should report skipped status"
-        assert 'staticfile' in result.stdout.lower(), "Should mention staticfile"
+        # Should skip with permanent skip JSONL
+        assert result.returncode == 0, "Should exit 0 when permanently skipping"
+
+        # Parse clean JSONL output
+        result_json = None
+        for line in result.stdout.strip().split('\n'):
+            line = line.strip()
+            if line.startswith('{'):
+                try:
+                    record = json.loads(line)
+                    if record.get('type') == 'ArchiveResult':
+                        result_json = record
+                        break
+                except json.JSONDecodeError:
+                    pass
+
+        assert result_json, "Should emit ArchiveResult JSONL for permanent skip"
+        assert result_json['status'] == 'skipped', f"Should have status='skipped': {result_json}"
+        assert 'staticfile' in result_json.get('output_str', '').lower(), "Should mention staticfile in output_str"


 def test_handles_404_gracefully():
@@ -418,7 +442,21 @@ def test_config_user_agent():
         # Should succeed (example.com doesn't block)
         if result.returncode == 0:
-            assert 'STATUS=succeeded' in result.stdout
+            # Parse clean JSONL output
+            result_json = None
+            for line in result.stdout.strip().split('\n'):
+                line = line.strip()
+                if line.startswith('{'):
+                    try:
+                        record = json.loads(line)
+                        if record.get('type') == 'ArchiveResult':
+                            result_json = record
+                            break
+                    except json.JSONDecodeError:
+                        pass
+            assert result_json, "Should have ArchiveResult JSONL output"
+            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"


 if __name__ == '__main__':