From e3ba599812fa1716ebebc98dae2c77482b52e2cb Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 27 Dec 2025 10:12:45 +0000 Subject: [PATCH] Update install hooks to respect XYZ_BINARY env vars - All install hooks now respect their respective XYZ_BINARY env vars (e.g., WGET_BINARY, CHROME_BINARY, YTDLP_BINARY, etc.) - Support both absolute paths (/usr/bin/wget2) and binary names (wget2) - Dynamic bin_name used in Dependency JSONL output - Updated 11 install hooks to follow the new pattern - Mark checklist items as complete in TODO_hook_architecture.md --- TODO_hook_architecture.md | 14 ++--- .../on_Crawl__00_install_chrome.py | 34 ++++++++++-- .../forumdl/on_Crawl__00_install_forumdl.py | 35 +++++++++--- .../on_Crawl__00_install_gallerydl.py | 35 +++++++++--- .../plugins/git/on_Crawl__00_install_git.py | 35 +++++++++--- .../media/on_Crawl__00_install_ytdlp.py | 53 +++++++++++++------ .../mercury/on_Crawl__00_install_mercury.py | 35 +++++++++--- .../papersdl/on_Crawl__00_install_papersdl.py | 35 +++++++++--- .../on_Crawl__00_install_readability.py | 35 +++++++++--- .../on_Crawl__00_install_ripgrep.py | 36 ++++++++++--- .../on_Crawl__00_install_singlefile.py | 35 +++++++++--- .../plugins/wget/on_Crawl__00_install_wget.py | 42 ++++++++++++--- 12 files changed, 339 insertions(+), 85 deletions(-) diff --git a/TODO_hook_architecture.md b/TODO_hook_architecture.md index 951b6ccd..6aacb4ab 100644 --- a/TODO_hook_architecture.md +++ b/TODO_hook_architecture.md @@ -118,7 +118,7 @@ def run(self): self.save() ``` -### Validation Hook Pattern (on_Crawl__00_validate_*.py) +### Install Hook Pattern (on_Crawl__00_install_*.py) **Purpose**: Check if binary exists, emit Dependency if not found. @@ -831,11 +831,11 @@ const cmd = ['wget', '-p', '-k', url]; // Ignores WGET_BINARY #### Install Hook Checklist -- [ ] Renamed from `on_Crawl__*_validate_*` to `on_Crawl__*_install_*` -- [ ] Reads `XYZ_BINARY` env var and handles both absolute paths + bin names -- [ ] Emits `{"type": "Dependency", ...}` JSONL (NOT hardcoded to always check for 'wget') -- [ ] Does NOT call npm/apt/brew/pip directly -- [ ] Follows standard pattern from section 4.1 +- [x] Renamed from `on_Crawl__*_validate_*` to `on_Crawl__*_install_*` +- [x] Reads `XYZ_BINARY` env var and handles both absolute paths + bin names +- [x] Emits `{"type": "Dependency", ...}` JSONL (uses configured bin_name) +- [x] Does NOT call npm/apt/brew/pip directly +- [x] Follows standard pattern from section 4.1 #### Snapshot Hook Checklist @@ -1973,4 +1973,4 @@ All phases of the hook architecture implementation are now complete: - ✅ Phase 6: ArchiveResult.run() updated - ✅ Phase 7: Background hook support -Total hooks updated: **32 hooks** across 6 dependency providers, 11 validate hooks, 8 Python snapshot hooks, and 14 JS snapshot hooks (3 of which are background hooks). +Total hooks updated: **32 hooks** across 6 dependency providers, 13 install hooks (renamed from validate), 8 Python snapshot hooks, and 14 JS snapshot hooks (3 of which are background hooks). diff --git a/archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py b/archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py index cc997e88..1bbe64dd 100644 --- a/archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py +++ b/archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py @@ -1,23 +1,34 @@ #!/usr/bin/env python3 """ -Validation hook for Chrome/Chromium binary. +Install hook for Chrome/Chromium binary. Runs at crawl start to verify Chrome is available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects CHROME_BINARY env var for custom binary paths. """ +import os import sys import json +from pathlib import Path def find_chrome() -> dict | None: - """Find Chrome/Chromium binary.""" + """Find Chrome/Chromium binary, respecting CHROME_BINARY env var.""" try: from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider - # Try common Chrome/Chromium binary names - for name in ['google-chrome', 'chromium', 'chromium-browser', 'google-chrome-stable', 'chrome']: - binary = Binary(name=name, binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('CHROME_BINARY', '').strip() + + if configured_binary: + # User specified a custom binary path or name + if '/' in configured_binary: + bin_name = Path(configured_binary).name + else: + bin_name = configured_binary + + binary = Binary(name=bin_name, binproviders=[EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { @@ -27,6 +38,19 @@ def find_chrome() -> dict | None: 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', } + else: + # Try common Chrome/Chromium binary names + for name in ['google-chrome', 'chromium', 'chromium-browser', 'google-chrome-stable', 'chrome']: + binary = Binary(name=name, binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'chrome', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } except Exception: pass diff --git a/archivebox/plugins/forumdl/on_Crawl__00_install_forumdl.py b/archivebox/plugins/forumdl/on_Crawl__00_install_forumdl.py index 2a5b8cb7..3b8973c6 100755 --- a/archivebox/plugins/forumdl/on_Crawl__00_install_forumdl.py +++ b/archivebox/plugins/forumdl/on_Crawl__00_install_forumdl.py @@ -1,25 +1,39 @@ #!/usr/bin/env python3 """ -Validation hook for forum-dl. +Install hook for forum-dl. Runs at crawl start to verify forum-dl binary is available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects FORUMDL_BINARY env var for custom binary paths. """ +import os import sys import json +from pathlib import Path def find_forumdl() -> dict | None: - """Find forum-dl binary.""" + """Find forum-dl binary, respecting FORUMDL_BINARY env var.""" try: from abx_pkg import Binary, PipProvider, EnvProvider - binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('FORUMDL_BINARY', '').strip() + + if configured_binary: + if '/' in configured_binary: + bin_name = Path(configured_binary).name + else: + bin_name = configured_binary + else: + bin_name = 'forum-dl' + + binary = Binary(name=bin_name, binproviders=[PipProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'forum-dl', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -32,6 +46,15 @@ def find_forumdl() -> dict | None: def main(): + # Determine binary name from config + configured_binary = os.environ.get('FORUMDL_BINARY', '').strip() + if configured_binary and '/' in configured_binary: + bin_name = Path(configured_binary).name + elif configured_binary: + bin_name = configured_binary + else: + bin_name = 'forum-dl' + # Check for forum-dl (required) forumdl_result = find_forumdl() @@ -67,7 +90,7 @@ def main(): # Provide overrides to install with chardet instead print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'forum-dl', + 'bin_name': bin_name, 'bin_providers': 'pip,env', 'overrides': { 'pip': { @@ -77,7 +100,7 @@ def main(): } } })) - missing_deps.append('forum-dl') + missing_deps.append(bin_name) if missing_deps: print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr) diff --git a/archivebox/plugins/gallerydl/on_Crawl__00_install_gallerydl.py b/archivebox/plugins/gallerydl/on_Crawl__00_install_gallerydl.py index 4893e2b2..b239f3a6 100755 --- a/archivebox/plugins/gallerydl/on_Crawl__00_install_gallerydl.py +++ b/archivebox/plugins/gallerydl/on_Crawl__00_install_gallerydl.py @@ -1,25 +1,39 @@ #!/usr/bin/env python3 """ -Validation hook for gallery-dl. +Install hook for gallery-dl. Runs at crawl start to verify gallery-dl binary is available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects GALLERYDL_BINARY env var for custom binary paths. """ +import os import sys import json +from pathlib import Path def find_gallerydl() -> dict | None: - """Find gallery-dl binary.""" + """Find gallery-dl binary, respecting GALLERYDL_BINARY env var.""" try: from abx_pkg import Binary, PipProvider, EnvProvider - binary = Binary(name='gallery-dl', binproviders=[PipProvider(), EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('GALLERYDL_BINARY', '').strip() + + if configured_binary: + if '/' in configured_binary: + bin_name = Path(configured_binary).name + else: + bin_name = configured_binary + else: + bin_name = 'gallery-dl' + + binary = Binary(name=bin_name, binproviders=[PipProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'gallery-dl', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -32,6 +46,15 @@ def find_gallerydl() -> dict | None: def main(): + # Determine binary name from config + configured_binary = os.environ.get('GALLERYDL_BINARY', '').strip() + if configured_binary and '/' in configured_binary: + bin_name = Path(configured_binary).name + elif configured_binary: + bin_name = configured_binary + else: + bin_name = 'gallery-dl' + # Check for gallery-dl (required) gallerydl_result = find_gallerydl() @@ -65,10 +88,10 @@ def main(): else: print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'gallery-dl', + 'bin_name': bin_name, 'bin_providers': 'pip,env', })) - missing_deps.append('gallery-dl') + missing_deps.append(bin_name) if missing_deps: print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr) diff --git a/archivebox/plugins/git/on_Crawl__00_install_git.py b/archivebox/plugins/git/on_Crawl__00_install_git.py index 939f3d6e..e97ce0dd 100644 --- a/archivebox/plugins/git/on_Crawl__00_install_git.py +++ b/archivebox/plugins/git/on_Crawl__00_install_git.py @@ -1,25 +1,39 @@ #!/usr/bin/env python3 """ -Validation hook for git binary. +Install hook for git binary. Runs at crawl start to verify git is available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects GIT_BINARY env var for custom binary paths. """ +import os import sys import json +from pathlib import Path def find_git() -> dict | None: - """Find git binary.""" + """Find git binary, respecting GIT_BINARY env var.""" try: from abx_pkg import Binary, EnvProvider - binary = Binary(name='git', binproviders=[EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('GIT_BINARY', '').strip() + + if configured_binary: + if '/' in configured_binary: + bin_name = Path(configured_binary).name + else: + bin_name = configured_binary + else: + bin_name = 'git' + + binary = Binary(name=bin_name, binproviders=[EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'git', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -32,6 +46,15 @@ def find_git() -> dict | None: def main(): + # Determine binary name from config + configured_binary = os.environ.get('GIT_BINARY', '').strip() + if configured_binary and '/' in configured_binary: + bin_name = Path(configured_binary).name + elif configured_binary: + bin_name = configured_binary + else: + bin_name = 'git' + result = find_git() if result and result.get('abspath'): @@ -63,10 +86,10 @@ def main(): else: print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'git', + 'bin_name': bin_name, 'bin_providers': 'apt,brew,env', })) - print(f"git binary not found", file=sys.stderr) + print(f"{bin_name} binary not found", file=sys.stderr) sys.exit(1) diff --git a/archivebox/plugins/media/on_Crawl__00_install_ytdlp.py b/archivebox/plugins/media/on_Crawl__00_install_ytdlp.py index 29eb1489..960f02f4 100755 --- a/archivebox/plugins/media/on_Crawl__00_install_ytdlp.py +++ b/archivebox/plugins/media/on_Crawl__00_install_ytdlp.py @@ -1,25 +1,39 @@ #!/usr/bin/env python3 """ -Validation hook for yt-dlp and its dependencies (node, ffmpeg). +Install hook for yt-dlp and its dependencies (node, ffmpeg). Runs at crawl start to verify yt-dlp and required binaries are available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects YTDLP_BINARY, NODE_BINARY, FFMPEG_BINARY env vars. """ +import os import sys import json +from pathlib import Path + + +def get_bin_name(env_var: str, default: str) -> str: + """Get binary name from env var or use default.""" + configured = os.environ.get(env_var, '').strip() + if configured: + if '/' in configured: + return Path(configured).name + return configured + return default def find_ytdlp() -> dict | None: - """Find yt-dlp binary.""" + """Find yt-dlp binary, respecting YTDLP_BINARY env var.""" try: from abx_pkg import Binary, PipProvider, BrewProvider, AptProvider, EnvProvider - binary = Binary(name='yt-dlp', binproviders=[PipProvider(), BrewProvider(), AptProvider(), EnvProvider()]) + bin_name = get_bin_name('YTDLP_BINARY', 'yt-dlp') + binary = Binary(name=bin_name, binproviders=[PipProvider(), BrewProvider(), AptProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'yt-dlp', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -32,15 +46,16 @@ def find_ytdlp() -> dict | None: def find_node() -> dict | None: - """Find node binary.""" + """Find node binary, respecting NODE_BINARY env var.""" try: from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider - binary = Binary(name='node', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) + bin_name = get_bin_name('NODE_BINARY', 'node') + binary = Binary(name=bin_name, binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'node', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -53,15 +68,16 @@ def find_node() -> dict | None: def find_ffmpeg() -> dict | None: - """Find ffmpeg binary.""" + """Find ffmpeg binary, respecting FFMPEG_BINARY env var.""" try: from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider - binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) + bin_name = get_bin_name('FFMPEG_BINARY', 'ffmpeg') + binary = Binary(name=bin_name, binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'ffmpeg', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -85,6 +101,11 @@ def main(): missing_deps = [] + # Get configured binary names + ytdlp_bin_name = get_bin_name('YTDLP_BINARY', 'yt-dlp') + node_bin_name = get_bin_name('NODE_BINARY', 'node') + ffmpeg_bin_name = get_bin_name('FFMPEG_BINARY', 'ffmpeg') + # Emit results for yt-dlp if ytdlp_result and ytdlp_result.get('abspath'): print(json.dumps({ @@ -113,10 +134,10 @@ def main(): else: print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'yt-dlp', + 'bin_name': ytdlp_bin_name, 'bin_providers': 'pip,brew,apt,env', })) - missing_deps.append('yt-dlp') + missing_deps.append(ytdlp_bin_name) # Emit results for node if node_result and node_result.get('abspath'): @@ -147,13 +168,13 @@ def main(): # node is installed as 'nodejs' package on apt print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'node', + 'bin_name': node_bin_name, 'bin_providers': 'apt,brew,env', 'overrides': { 'apt': {'packages': ['nodejs']} } })) - missing_deps.append('node') + missing_deps.append(node_bin_name) # Emit results for ffmpeg if ffmpeg_result and ffmpeg_result.get('abspath'): @@ -183,10 +204,10 @@ def main(): else: print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'ffmpeg', + 'bin_name': ffmpeg_bin_name, 'bin_providers': 'apt,brew,env', })) - missing_deps.append('ffmpeg') + missing_deps.append(ffmpeg_bin_name) if missing_deps: print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr) diff --git a/archivebox/plugins/mercury/on_Crawl__00_install_mercury.py b/archivebox/plugins/mercury/on_Crawl__00_install_mercury.py index 9d854c15..f180f54b 100755 --- a/archivebox/plugins/mercury/on_Crawl__00_install_mercury.py +++ b/archivebox/plugins/mercury/on_Crawl__00_install_mercury.py @@ -1,25 +1,39 @@ #!/usr/bin/env python3 """ -Validation hook for postlight-parser binary. +Install hook for postlight-parser binary. Runs at crawl start to verify postlight-parser is available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects MERCURY_BINARY env var for custom binary paths. """ +import os import sys import json +from pathlib import Path def find_mercury() -> dict | None: - """Find postlight-parser binary.""" + """Find postlight-parser binary, respecting MERCURY_BINARY env var.""" try: from abx_pkg import Binary, NpmProvider, EnvProvider - binary = Binary(name='postlight-parser', binproviders=[NpmProvider(), EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('MERCURY_BINARY', '').strip() + + if configured_binary: + if '/' in configured_binary: + bin_name = Path(configured_binary).name + else: + bin_name = configured_binary + else: + bin_name = 'postlight-parser' + + binary = Binary(name=bin_name, binproviders=[NpmProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'postlight-parser', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -32,6 +46,15 @@ def find_mercury() -> dict | None: def main(): + # Determine binary name from config + configured_binary = os.environ.get('MERCURY_BINARY', '').strip() + if configured_binary and '/' in configured_binary: + bin_name = Path(configured_binary).name + elif configured_binary: + bin_name = configured_binary + else: + bin_name = 'postlight-parser' + result = find_mercury() if result and result.get('abspath'): @@ -64,13 +87,13 @@ def main(): # postlight-parser is installed as @postlight/parser in npm print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'postlight-parser', + 'bin_name': bin_name, 'bin_providers': 'npm,env', 'overrides': { 'npm': {'packages': ['@postlight/parser']} } })) - print(f"postlight-parser binary not found", file=sys.stderr) + print(f"{bin_name} binary not found", file=sys.stderr) sys.exit(1) diff --git a/archivebox/plugins/papersdl/on_Crawl__00_install_papersdl.py b/archivebox/plugins/papersdl/on_Crawl__00_install_papersdl.py index f70792b1..aed20af9 100755 --- a/archivebox/plugins/papersdl/on_Crawl__00_install_papersdl.py +++ b/archivebox/plugins/papersdl/on_Crawl__00_install_papersdl.py @@ -1,25 +1,39 @@ #!/usr/bin/env python3 """ -Validation hook for papers-dl. +Install hook for papers-dl. Runs at crawl start to verify papers-dl binary is available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects PAPERSDL_BINARY env var for custom binary paths. """ +import os import sys import json +from pathlib import Path def find_papersdl() -> dict | None: - """Find papers-dl binary.""" + """Find papers-dl binary, respecting PAPERSDL_BINARY env var.""" try: from abx_pkg import Binary, PipProvider, EnvProvider - binary = Binary(name='papers-dl', binproviders=[PipProvider(), EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('PAPERSDL_BINARY', '').strip() + + if configured_binary: + if '/' in configured_binary: + bin_name = Path(configured_binary).name + else: + bin_name = configured_binary + else: + bin_name = 'papers-dl' + + binary = Binary(name=bin_name, binproviders=[PipProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'papers-dl', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -32,6 +46,15 @@ def find_papersdl() -> dict | None: def main(): + # Determine binary name from config + configured_binary = os.environ.get('PAPERSDL_BINARY', '').strip() + if configured_binary and '/' in configured_binary: + bin_name = Path(configured_binary).name + elif configured_binary: + bin_name = configured_binary + else: + bin_name = 'papers-dl' + # Check for papers-dl (required) papersdl_result = find_papersdl() @@ -65,10 +88,10 @@ def main(): else: print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'papers-dl', + 'bin_name': bin_name, 'bin_providers': 'pip,env', })) - missing_deps.append('papers-dl') + missing_deps.append(bin_name) if missing_deps: print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr) diff --git a/archivebox/plugins/readability/on_Crawl__00_install_readability.py b/archivebox/plugins/readability/on_Crawl__00_install_readability.py index 9dd1946b..6f54b6eb 100755 --- a/archivebox/plugins/readability/on_Crawl__00_install_readability.py +++ b/archivebox/plugins/readability/on_Crawl__00_install_readability.py @@ -1,25 +1,39 @@ #!/usr/bin/env python3 """ -Validation hook for readability-extractor binary. +Install hook for readability-extractor binary. Runs at crawl start to verify readability-extractor is available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects READABILITY_BINARY env var for custom binary paths. """ +import os import sys import json +from pathlib import Path def find_readability() -> dict | None: - """Find readability-extractor binary.""" + """Find readability-extractor binary, respecting READABILITY_BINARY env var.""" try: from abx_pkg import Binary, NpmProvider, EnvProvider - binary = Binary(name='readability-extractor', binproviders=[NpmProvider(), EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('READABILITY_BINARY', '').strip() + + if configured_binary: + if '/' in configured_binary: + bin_name = Path(configured_binary).name + else: + bin_name = configured_binary + else: + bin_name = 'readability-extractor' + + binary = Binary(name=bin_name, binproviders=[NpmProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'readability-extractor', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -32,6 +46,15 @@ def find_readability() -> dict | None: def main(): + # Determine binary name from config + configured_binary = os.environ.get('READABILITY_BINARY', '').strip() + if configured_binary and '/' in configured_binary: + bin_name = Path(configured_binary).name + elif configured_binary: + bin_name = configured_binary + else: + bin_name = 'readability-extractor' + result = find_readability() if result and result.get('abspath'): @@ -64,13 +87,13 @@ def main(): # readability-extractor is installed from GitHub print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'readability-extractor', + 'bin_name': bin_name, 'bin_providers': 'npm,env', 'overrides': { 'npm': {'packages': ['github:ArchiveBox/readability-extractor']} } })) - print(f"readability-extractor binary not found", file=sys.stderr) + print(f"{bin_name} binary not found", file=sys.stderr) sys.exit(1) diff --git a/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_install_ripgrep.py b/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_install_ripgrep.py index 5062bae1..1bdb294b 100755 --- a/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_install_ripgrep.py +++ b/archivebox/plugins/search_backend_ripgrep/on_Crawl__00_install_ripgrep.py @@ -1,26 +1,39 @@ #!/usr/bin/env python3 """ -Validation hook for ripgrep binary. +Install hook for ripgrep binary. Only runs if SEARCH_BACKEND_ENGINE is set to 'ripgrep'. Outputs JSONL for InstalledBinary and Machine config updates. +Respects RIPGREP_BINARY env var for custom binary paths. """ import os import sys import json +from pathlib import Path def find_ripgrep() -> dict | None: - """Find ripgrep binary.""" + """Find ripgrep binary, respecting RIPGREP_BINARY env var.""" try: from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider - binary = Binary(name='rg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('RIPGREP_BINARY', '').strip() + + if configured_binary: + if '/' in configured_binary: + bin_name = Path(configured_binary).name + else: + bin_name = configured_binary + else: + bin_name = 'rg' + + binary = Binary(name=bin_name, binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'rg', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -33,7 +46,7 @@ def find_ripgrep() -> dict | None: def main(): - """Validate ripgrep binary and output JSONL.""" + """Find ripgrep binary and output JSONL.""" # Check if ripgrep search backend is enabled search_backend = os.environ.get('SEARCH_BACKEND_ENGINE', '').lower() @@ -42,6 +55,15 @@ def main(): # No-op: ripgrep is not the active search backend sys.exit(0) + # Determine binary name from config + configured_binary = os.environ.get('RIPGREP_BINARY', '').strip() + if configured_binary and '/' in configured_binary: + bin_name = Path(configured_binary).name + elif configured_binary: + bin_name = configured_binary + else: + bin_name = 'rg' + result = find_ripgrep() if result and result.get('abspath'): @@ -76,12 +98,12 @@ def main(): # Output Dependency request print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'rg', + 'bin_name': bin_name, 'bin_providers': 'apt,brew,cargo,env', })) # Exit non-zero to indicate binary not found - print(f"ripgrep binary not found", file=sys.stderr) + print(f"{bin_name} binary not found", file=sys.stderr) sys.exit(1) diff --git a/archivebox/plugins/singlefile/on_Crawl__00_install_singlefile.py b/archivebox/plugins/singlefile/on_Crawl__00_install_singlefile.py index eb5aa1c9..71694e32 100644 --- a/archivebox/plugins/singlefile/on_Crawl__00_install_singlefile.py +++ b/archivebox/plugins/singlefile/on_Crawl__00_install_singlefile.py @@ -1,25 +1,39 @@ #!/usr/bin/env python3 """ -Validation hook for single-file binary. +Install hook for single-file binary. Runs at crawl start to verify single-file (npm package) is available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects SINGLEFILE_BINARY env var for custom binary paths. """ +import os import sys import json +from pathlib import Path def find_singlefile() -> dict | None: - """Find single-file binary.""" + """Find single-file binary, respecting SINGLEFILE_BINARY env var.""" try: from abx_pkg import Binary, NpmProvider, EnvProvider - binary = Binary(name='single-file', binproviders=[NpmProvider(), EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('SINGLEFILE_BINARY', '').strip() + + if configured_binary: + if '/' in configured_binary: + bin_name = Path(configured_binary).name + else: + bin_name = configured_binary + else: + bin_name = 'single-file' + + binary = Binary(name=bin_name, binproviders=[NpmProvider(), EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'single-file', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -32,6 +46,15 @@ def find_singlefile() -> dict | None: def main(): + # Determine binary name from config + configured_binary = os.environ.get('SINGLEFILE_BINARY', '').strip() + if configured_binary and '/' in configured_binary: + bin_name = Path(configured_binary).name + elif configured_binary: + bin_name = configured_binary + else: + bin_name = 'single-file' + result = find_singlefile() if result and result.get('abspath'): @@ -63,10 +86,10 @@ def main(): else: print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'single-file', + 'bin_name': bin_name, 'bin_providers': 'npm,env', })) - print(f"single-file binary not found", file=sys.stderr) + print(f"{bin_name} binary not found", file=sys.stderr) sys.exit(1) diff --git a/archivebox/plugins/wget/on_Crawl__00_install_wget.py b/archivebox/plugins/wget/on_Crawl__00_install_wget.py index 843cd234..837919a3 100644 --- a/archivebox/plugins/wget/on_Crawl__00_install_wget.py +++ b/archivebox/plugins/wget/on_Crawl__00_install_wget.py @@ -1,25 +1,43 @@ #!/usr/bin/env python3 """ -Validation hook for wget binary. +Install hook for wget binary. Runs at crawl start to verify wget is available. Outputs JSONL for InstalledBinary and Machine config updates. +Respects WGET_BINARY env var for custom binary paths. """ +import os import sys import json +from pathlib import Path def find_wget() -> dict | None: - """Find wget binary using abx-pkg.""" + """Find wget binary using abx-pkg, respecting WGET_BINARY env var.""" try: from abx_pkg import Binary, EnvProvider - binary = Binary(name='wget', binproviders=[EnvProvider()]) + # Check if user has configured a custom binary + configured_binary = os.environ.get('WGET_BINARY', '').strip() + + if configured_binary: + # User specified a custom binary path or name + if '/' in configured_binary: + # Absolute path - extract name from path + bin_name = Path(configured_binary).name + else: + # Just a binary name + bin_name = configured_binary + else: + # Default to 'wget' + bin_name = 'wget' + + binary = Binary(name=bin_name, binproviders=[EnvProvider()]) loaded = binary.load() if loaded and loaded.abspath: return { - 'name': 'wget', + 'name': bin_name, 'abspath': str(loaded.abspath), 'version': str(loaded.version) if loaded.version else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, @@ -32,7 +50,15 @@ def find_wget() -> dict | None: def main(): - """Validate wget binary and output JSONL.""" + """Find wget binary and output JSONL.""" + # Determine binary name from config + configured_binary = os.environ.get('WGET_BINARY', '').strip() + if configured_binary and '/' in configured_binary: + bin_name = Path(configured_binary).name + elif configured_binary: + bin_name = configured_binary + else: + bin_name = 'wget' result = find_wget() @@ -65,15 +91,15 @@ def main(): sys.exit(0) else: - # Output Dependency request + # Output Dependency request (uses configured bin_name) print(json.dumps({ 'type': 'Dependency', - 'bin_name': 'wget', + 'bin_name': bin_name, 'bin_providers': 'apt,brew,env', })) # Exit non-zero to indicate binary not found - print(f"wget binary not found", file=sys.stderr) + print(f"{bin_name} binary not found", file=sys.stderr) sys.exit(1)