way better plugin hooks system wip

Nick Sweeting
2025-12-28 03:39:59 -08:00
parent a38624a4dd
commit 50e527ec65
156 changed files with 10275 additions and 7149 deletions

View File

@@ -0,0 +1 @@
{"type": "Binary", "name": "wget", "binproviders": "apt,brew,pip,env"}
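The one-line record above is the declarative form of a binary dependency: a Binary named wget that can be satisfied by the apt, brew, pip, or env providers. Below is a minimal sketch of how a hook runner might resolve such a declaration into a Binary JSONL record, reusing the abx-pkg calls that appear in the Python install hook deleted further down; the resolve_declaration() helper and the env-only provider wiring are illustrative assumptions, not code from this commit.

# Illustrative sketch only (not part of this commit): resolve a declarative
# {"type": "Binary", ...} record to an installed binary and emit it as JSONL,
# using the same abx-pkg calls as the deleted Python install hook below.
import json

from abx_pkg import Binary, EnvProvider


def resolve_declaration(declaration: dict) -> dict | None:
    """Resolve a Binary declaration via the env provider (hypothetical helper)."""
    # Only $PATH lookup is wired up here; apt/brew/pip installs are assumed to be
    # handled by the on_Binary__install_using_*_provider.py hooks referenced in the tests.
    binary = Binary(name=declaration['name'], binproviders=[EnvProvider()])
    loaded = binary.load()
    if not (loaded and loaded.abspath):
        return None
    return {
        'type': 'Binary',
        'name': declaration['name'],
        'abspath': str(loaded.abspath),
        'version': str(loaded.version) if loaded.version else '',
        'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
    }


if __name__ == '__main__':
    decl = {"type": "Binary", "name": "wget", "binproviders": "apt,brew,pip,env"}
    print(json.dumps(resolve_declaration(decl)))  # one JSONL line, or "null" if wget is not found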

View File

@@ -1,107 +0,0 @@
#!/usr/bin/env python3
"""
Install hook for wget binary.
Runs at crawl start to verify wget is available.
Outputs JSONL for InstalledBinary and Machine config updates.
Respects WGET_BINARY env var for custom binary paths.
"""
import os
import sys
import json
from pathlib import Path


def find_wget() -> dict | None:
    """Find wget binary using abx-pkg, respecting WGET_BINARY env var."""
    try:
        from abx_pkg import Binary, EnvProvider

        # Check if user has configured a custom binary
        configured_binary = os.environ.get('WGET_BINARY', '').strip()
        if configured_binary:
            # User specified a custom binary path or name
            if '/' in configured_binary:
                # Absolute path - extract name from path
                bin_name = Path(configured_binary).name
            else:
                # Just a binary name
                bin_name = configured_binary
        else:
            # Default to 'wget'
            bin_name = 'wget'

        binary = Binary(name=bin_name, binproviders=[EnvProvider()])
        loaded = binary.load()
        if loaded and loaded.abspath:
            return {
                'name': bin_name,
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except Exception:
        pass
    return None


def main():
    """Find wget binary and output JSONL."""
    # Determine binary name from config
    configured_binary = os.environ.get('WGET_BINARY', '').strip()
    if configured_binary and '/' in configured_binary:
        bin_name = Path(configured_binary).name
    elif configured_binary:
        bin_name = configured_binary
    else:
        bin_name = 'wget'

    result = find_wget()

    if result and result.get('abspath'):
        # Output InstalledBinary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))

        # Output Machine config update
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/WGET_BINARY',
            'value': result['abspath'],
        }))

        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/WGET_VERSION',
                'value': result['version'],
            }))

        sys.exit(0)
    else:
        # Output Dependency request (uses configured bin_name)
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': bin_name,
            'bin_providers': 'apt,brew,env',
        }))
        # Exit non-zero to indicate binary not found
        print(f"{bin_name} binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -9,7 +9,7 @@ This hook runs early in the Crawl lifecycle to:
 Output:
 - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
-- InstalledBinary JSONL records to stdout when binaries are found
+- Binary JSONL records to stdout when binaries are found
 """
 import json
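The docstring above describes a two-channel stdout contract: COMPUTED:KEY=VALUE lines feed computed config into the environment for later hooks, while bare JSON lines are collected as JSONL records. The real parser in hooks.py is not part of this diff, so the following is only a hypothetical sketch of the consumer side; parse_hook_stdout() and the sample values are illustrative.

# Hypothetical sketch of the hooks.py side of the contract described above.
import json


def parse_hook_stdout(stdout: str) -> tuple[dict, list]:
    """Split hook stdout into computed env vars and JSONL records (illustrative)."""
    computed_env: dict = {}
    records: list = []
    for line in stdout.splitlines():
        line = line.strip()
        if line.startswith('COMPUTED:') and '=' in line:
            key, _, value = line[len('COMPUTED:'):].partition('=')
            computed_env[key] = value
        elif line.startswith('{'):
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                pass  # ignore non-JSONL noise
    return computed_env, records


# Example (values are made up):
#   parse_hook_stdout('COMPUTED:WGET_VERSION=1.21.4\n{"type": "Binary", "name": "wget"}')
#   -> ({'WGET_VERSION': '1.21.4'}, [{'type': 'Binary', 'name': 'wget'}])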
@@ -40,12 +40,12 @@ def get_env_int(name: str, default: int = 0) -> int:
         return default


-def output_installed_binary(binary: Binary, name: str):
-    """Output InstalledBinary JSONL record to stdout."""
+def output_binary(binary: Binary, name: str):
+    """Output Binary JSONL record to stdout."""
     machine_id = os.environ.get('MACHINE_ID', '')
     record = {
-        'type': 'InstalledBinary',
+        'type': 'Binary',
         'name': name,
         'abspath': str(binary.abspath),
         'version': str(binary.version) if binary.version else '',
@@ -97,8 +97,8 @@ def main():
     wget_version = str(binary.version) if binary.version else 'unknown'
     computed['WGET_VERSION'] = wget_version

-    # Output InstalledBinary JSONL record
-    output_installed_binary(binary, name='wget')
+    # Output Binary JSONL record
+    output_binary(binary, name='wget')

     # Check for compression support
     if computed.get('WGET_BINARY'):

View File

@@ -30,7 +30,6 @@ Environment variables:
 import json
 import os
 import re
-import shutil
 import subprocess
 import sys
 from datetime import datetime, timezone
@@ -74,36 +73,6 @@ def has_staticfile_output() -> bool:
     return staticfile_dir.exists() and any(staticfile_dir.iterdir())

-def find_wget() -> str | None:
-    """Find wget binary."""
-    wget = get_env('WGET_BINARY')
-    if wget and os.path.isfile(wget):
-        return wget
-    return shutil.which('wget')
-
-def get_version(binary: str) -> str:
-    """Get wget version."""
-    try:
-        result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
-        return result.stdout.split('\n')[0].strip()[:64]
-    except Exception:
-        return ''
-
-def check_wget_compression(binary: str) -> bool:
-    """Check if wget supports --compression=auto."""
-    try:
-        result = subprocess.run(
-            [binary, '--compression=auto', '--help'],
-            capture_output=True,
-            timeout=5
-        )
-        return result.returncode == 0
-    except Exception:
-        return False
-
 # Default wget args (from old WGET_CONFIG)
 WGET_DEFAULT_ARGS = [
     '--no-verbose',
@@ -135,9 +104,6 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
     save_warc = get_env_bool('SAVE_WARC', True)
     save_requisites = get_env_bool('SAVE_WGET_REQUISITES', True)

-    # Check for compression support
-    supports_compression = check_wget_compression(binary)
-
     # Build wget command (later options take precedence)
     cmd = [
         binary,
@@ -166,9 +132,6 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
     if cookies_file and Path(cookies_file).is_file():
         cmd.extend(['--load-cookies', cookies_file])

-    if supports_compression:
-        cmd.append('--compression=auto')
-
     if not check_ssl:
         cmd.extend(['--no-check-certificate', '--no-hsts'])
@@ -230,13 +193,9 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
 def main(url: str, snapshot_id: str):
     """Archive a URL using wget."""
     start_ts = datetime.now(timezone.utc)
-    version = ''
     output = None
     status = 'failed'
     error = ''
-    binary = None
-    cmd_str = ''

     try:
         # Check if wget is enabled
@@ -251,35 +210,17 @@ def main(url: str, snapshot_id: str):
             print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
             sys.exit(0)

-        # Find binary
-        binary = find_wget()
-        if not binary:
-            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
-            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
-            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
-            print(f'INSTALL_HINT=apt install wget OR brew install wget', file=sys.stderr)
-            sys.exit(1)
-
-        version = get_version(binary)
-        cmd_str = f'{binary} ... {url}'
+        # Get binary from environment
+        binary = get_env('WGET_BINARY', 'wget')

         # Run extraction
         success, output, error = save_wget(url, binary)
         status = 'succeeded' if success else 'failed'

-        if success:
-            # Count downloaded files
-            files = list(Path('.').rglob('*'))
-            file_count = len([f for f in files if f.is_file()])
-            print(f'wget completed: {file_count} files downloaded')
-
     except Exception as e:
         error = f'{type(e).__name__}: {e}'
         status = 'failed'

     # Calculate duration
     end_ts = datetime.now(timezone.utc)

     if error:
         print(f'ERROR: {error}', file=sys.stderr)
@@ -289,10 +230,6 @@ def main(url: str, snapshot_id: str):
         'status': status,
         'output_str': output or error or '',
     }
-    if binary:
-        result['cmd'] = [binary, '--no-verbose', url]
-    if version:
-        result['cmd_version'] = version

     print(json.dumps(result))
     sys.exit(0 if status == 'succeeded' else 1)
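With find_wget(), version probing, and the ERROR/DEPENDENCY_NEEDED stderr protocol gone, the extractor's contract reduces to: read WGET_BINARY from the environment, print a single ArchiveResult JSONL record with type, status, and output_str, and exit 0 for succeeded/skipped or 1 for failed. A sketch of a caller consuming that contract follows; run_wget_hook() and its timeout are assumptions for illustration, while the --url/--snapshot-id flags and the record shape come from this diff.

# Illustrative consumer of the simplified hook contract (not part of this commit).
import json
import subprocess
import sys


def run_wget_hook(hook_path: str, url: str, snapshot_id: str, cwd: str) -> dict:
    """Run the wget Snapshot hook and return its ArchiveResult record (sketch)."""
    proc = subprocess.run(
        [sys.executable, hook_path, '--url', url, '--snapshot-id', snapshot_id],
        cwd=cwd, capture_output=True, text=True, timeout=300,  # timeout is a guess
    )
    # Default to a failed result if the hook emitted no ArchiveResult JSONL
    result = {'type': 'ArchiveResult', 'status': 'failed', 'output_str': proc.stderr.strip()}
    for line in proc.stdout.splitlines():
        line = line.strip()
        if not line.startswith('{'):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'ArchiveResult':
            result = record
    return result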

View File

@@ -26,9 +26,9 @@ import pytest
 PLUGIN_DIR = Path(__file__).parent.parent
 PLUGINS_ROOT = PLUGIN_DIR.parent
 WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
-WGET_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_wget.py'
-BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
-APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
+WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py'
+BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Binary__install_using_brew_provider.py'
+APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Binary__install_using_apt_provider.py'

 TEST_URL = 'https://example.com'
@@ -37,10 +37,10 @@ def test_hook_script_exists():
     assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"


-def test_wget_validate_hook():
-    """Test wget validate hook checks for wget binary."""
+def test_wget_install_hook():
+    """Test wget install hook checks for wget binary."""
     result = subprocess.run(
-        [sys.executable, str(WGET_VALIDATE_HOOK)],
+        [sys.executable, str(WGET_INSTALL_HOOK)],
         capture_output=True,
         text=True,
         timeout=30
@@ -48,20 +48,20 @@ def test_wget_validate_hook():
     # Hook exits 0 if binary found, 1 if not found (with Dependency record)
     if result.returncode == 0:
-        # Binary found - verify InstalledBinary JSONL output
+        # Binary found - verify Binary JSONL output
         found_binary = False
         for line in result.stdout.strip().split('\n'):
             if line.strip():
                 try:
                     record = json.loads(line)
-                    if record.get('type') == 'InstalledBinary':
+                    if record.get('type') == 'Binary':
                         assert record['name'] == 'wget'
                         assert record['abspath']
                         found_binary = True
                         break
                 except json.JSONDecodeError:
                     pass
-        assert found_binary, "Should output InstalledBinary record when binary found"
+        assert found_binary, "Should output Binary record when binary found"
     else:
         # Binary not found - verify Dependency JSONL output
         found_dependency = False
@@ -150,8 +150,8 @@ def test_can_install_wget_via_provider():
     # Should succeed (wget installs successfully or is already installed)
     assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}"

-    # Should output InstalledBinary JSONL record
-    assert 'InstalledBinary' in result.stdout or 'wget' in result.stderr, \
+    # Should output Binary JSONL record
+    assert 'Binary' in result.stdout or 'wget' in result.stderr, \
         f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}"

     # Parse JSONL if present
@@ -159,7 +159,7 @@ def test_can_install_wget_via_provider():
     for line in result.stdout.strip().split('\n'):
         try:
             record = json.loads(line)
-            if record.get('type') == 'InstalledBinary':
+            if record.get('type') == 'Binary':
                 assert record['name'] == 'wget'
                 assert record['binprovider'] in ['brew', 'apt']
                 assert record['abspath'], "Should have binary path"
@@ -216,9 +216,21 @@ def test_archives_example_com():
         assert result.returncode == 0, f"Extraction failed: {result.stderr}"

-        # Verify output in stdout
-        assert 'STATUS=succeeded' in result.stdout, "Should report success"
-        assert 'wget completed' in result.stdout, "Should report completion"
+        # Parse clean JSONL output
+        result_json = None
+        for line in result.stdout.strip().split('\n'):
+            line = line.strip()
+            if line.startswith('{'):
+                try:
+                    record = json.loads(line)
+                    if record.get('type') == 'ArchiveResult':
+                        result_json = record
+                        break
+                except json.JSONDecodeError:
+                    pass
+        assert result_json, "Should have ArchiveResult JSONL output"
+        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

         # Verify files were downloaded
         downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm'))
@@ -245,23 +257,9 @@ def test_archives_example_com():
                 'more information' in html_content.lower()), \
             "Missing IANA reference"

-        # Verify RESULT_JSON is present and valid
-        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
-        for line in result.stdout.split('\n'):
-            if line.startswith('RESULT_JSON='):
-                result_json = json.loads(line.replace('RESULT_JSON=', ''))
-                assert result_json['extractor'] == 'wget'
-                assert result_json['status'] == 'succeeded'
-                assert result_json['url'] == TEST_URL
-                assert result_json['snapshot_id'] == 'test789'
-                assert 'duration' in result_json
-                assert result_json['duration'] >= 0
-                break
-

 def test_config_save_wget_false_skips():
-    """Test that SAVE_WGET=False causes skip."""
+    """Test that SAVE_WGET=False exits without emitting JSONL."""
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
@@ -279,10 +277,15 @@ def test_config_save_wget_false_skips():
             timeout=30
         )

-        # Should succeed but skip
-        assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}"
-        assert 'STATUS=skipped' in result.stdout, "Should report skipped status"
-        assert 'SAVE_WGET=False' in result.stdout, "Should mention SAVE_WGET=False"
+        # Should exit 0 when feature disabled
+        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
+
+        # Feature disabled - no JSONL emission, just logs to stderr
+        assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
+
+        # Should NOT emit any JSONL
+        jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
+        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"


 def test_config_save_warc():
@@ -323,23 +326,44 @@ def test_staticfile_present_skips():
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)

-        # Create staticfile directory with content to simulate staticfile extractor ran
+        # Create directory structure like real ArchiveBox:
+        #   tmpdir/
+        #     staticfile/   <- staticfile extractor output
+        #     wget/         <- wget extractor runs here, looks for ../staticfile
         staticfile_dir = tmpdir / 'staticfile'
         staticfile_dir.mkdir()
         (staticfile_dir / 'index.html').write_text('<html>test</html>')
+        wget_dir = tmpdir / 'wget'
+        wget_dir.mkdir()

         result = subprocess.run(
             [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'],
-            cwd=tmpdir,
+            cwd=wget_dir,  # Run from wget subdirectory
             capture_output=True,
             text=True,
             timeout=30
         )

-        # Should skip
-        assert result.returncode == 0, "Should exit 0 when skipping"
-        assert 'STATUS=skipped' in result.stdout, "Should report skipped status"
-        assert 'staticfile' in result.stdout.lower(), "Should mention staticfile"
+        # Should skip with permanent skip JSONL
+        assert result.returncode == 0, "Should exit 0 when permanently skipping"
+
+        # Parse clean JSONL output
+        result_json = None
+        for line in result.stdout.strip().split('\n'):
+            line = line.strip()
+            if line.startswith('{'):
+                try:
+                    record = json.loads(line)
+                    if record.get('type') == 'ArchiveResult':
+                        result_json = record
+                        break
+                except json.JSONDecodeError:
+                    pass
+
+        assert result_json, "Should emit ArchiveResult JSONL for permanent skip"
+        assert result_json['status'] == 'skipped', f"Should have status='skipped': {result_json}"
+        assert 'staticfile' in result_json.get('output_str', '').lower(), "Should mention staticfile in output_str"


 def test_handles_404_gracefully():
@@ -418,7 +442,21 @@ def test_config_user_agent():
         # Should succeed (example.com doesn't block)
         if result.returncode == 0:
-            assert 'STATUS=succeeded' in result.stdout
+            # Parse clean JSONL output
+            result_json = None
+            for line in result.stdout.strip().split('\n'):
+                line = line.strip()
+                if line.startswith('{'):
+                    try:
+                        record = json.loads(line)
+                        if record.get('type') == 'ArchiveResult':
+                            result_json = record
+                            break
+                    except json.JSONDecodeError:
+                        pass
+            assert result_json, "Should have ArchiveResult JSONL output"
+            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"


 if __name__ == '__main__':