mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 14:57:56 +10:00
Merge remote-tracking branch 'origin/dev' into claude/improve-test-suite-xm6Bh
This commit is contained in:
@@ -25,7 +25,8 @@ AptProvider.model_rebuild()
|
||||
@click.option('--bin-name', required=True, help="Binary name to install")
|
||||
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
|
||||
@click.option('--custom-cmd', default=None, help="Custom install command (overrides default)")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
|
||||
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
|
||||
"""Install binary using apt package manager."""
|
||||
|
||||
# Check if apt provider is allowed
|
||||
@@ -42,7 +43,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
|
||||
click.echo(f"Installing {bin_name} via apt...", err=True)
|
||||
|
||||
try:
|
||||
binary = Binary(name=bin_name, binproviders=[provider]).install()
|
||||
# Parse overrides if provided
|
||||
overrides_dict = None
|
||||
if overrides:
|
||||
try:
|
||||
overrides_dict = json.loads(overrides)
|
||||
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
|
||||
except json.JSONDecodeError:
|
||||
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
|
||||
|
||||
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
|
||||
except Exception as e:
|
||||
click.echo(f"apt install failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -25,7 +25,8 @@ BrewProvider.model_rebuild()
|
||||
@click.option('--bin-name', required=True, help="Binary name to install")
|
||||
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
|
||||
@click.option('--custom-cmd', default=None, help="Custom install command")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
|
||||
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
|
||||
"""Install binary using Homebrew."""
|
||||
|
||||
if bin_providers != '*' and 'brew' not in bin_providers.split(','):
|
||||
@@ -41,7 +42,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
|
||||
click.echo(f"Installing {bin_name} via brew...", err=True)
|
||||
|
||||
try:
|
||||
binary = Binary(name=bin_name, binproviders=[provider]).install()
|
||||
# Parse overrides if provided
|
||||
overrides_dict = None
|
||||
if overrides:
|
||||
try:
|
||||
overrides_dict = json.loads(overrides)
|
||||
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
|
||||
except json.JSONDecodeError:
|
||||
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
|
||||
|
||||
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
|
||||
except Exception as e:
|
||||
click.echo(f"brew install failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -6,103 +6,29 @@ Runs at crawl start to verify Chrome is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Common Chrome/Chromium binary names and paths
|
||||
CHROME_NAMES = [
|
||||
'chromium',
|
||||
'chromium-browser',
|
||||
'google-chrome',
|
||||
'google-chrome-stable',
|
||||
'chrome',
|
||||
]
|
||||
|
||||
CHROME_PATHS = [
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
'/usr/bin/google-chrome',
|
||||
'/usr/bin/google-chrome-stable',
|
||||
'/usr/bin/chromium',
|
||||
'/usr/bin/chromium-browser',
|
||||
'/snap/bin/chromium',
|
||||
'/opt/google/chrome/chrome',
|
||||
]
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from Chrome binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# Chrome version string: "Google Chrome 120.0.6099.109" or "Chromium 120.0.6099.109"
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
parts = first_line.split()
|
||||
# Find version number (looks like 120.0.6099.109)
|
||||
for part in parts:
|
||||
if '.' in part and part[0].isdigit():
|
||||
return part
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_chrome() -> dict | None:
|
||||
"""Find Chrome/Chromium binary."""
|
||||
# Check env var first
|
||||
env_path = os.environ.get('CHROME_BINARY', '')
|
||||
if env_path and Path(env_path).is_file():
|
||||
return {
|
||||
'name': 'chrome',
|
||||
'abspath': env_path,
|
||||
'version': get_binary_version(env_path),
|
||||
'sha256': get_binary_hash(env_path),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
# Try shutil.which for various names
|
||||
for name in CHROME_NAMES:
|
||||
abspath = shutil.which(name)
|
||||
if abspath:
|
||||
return {
|
||||
'name': 'chrome',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
# Check common paths
|
||||
for path in CHROME_PATHS:
|
||||
if Path(path).is_file():
|
||||
return {
|
||||
'name': 'chrome',
|
||||
'abspath': path,
|
||||
'version': get_binary_version(path),
|
||||
'sha256': get_binary_hash(path),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
# Try common Chrome/Chromium binary names
|
||||
for name in ['google-chrome', 'chromium', 'chromium-browser', 'google-chrome-stable', 'chrome']:
|
||||
binary = Binary(name=name, binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'chrome',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -6,39 +6,8 @@ Runs at crawl start to verify forum-dl binary is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, version_flag],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_forumdl() -> dict | None:
|
||||
@@ -46,11 +15,7 @@ def find_forumdl() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||
|
||||
class ForumdlBinary(Binary):
|
||||
name: str = 'forum-dl'
|
||||
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||
|
||||
binary = ForumdlBinary()
|
||||
binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -60,22 +25,9 @@ def find_forumdl() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('forum-dl') or os.environ.get('FORUMDL_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'forum-dl',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -86,7 +38,7 @@ def main():
|
||||
missing_deps = []
|
||||
|
||||
# Emit results for forum-dl
|
||||
if forumdl_result and forumdl_result.get('abspath'):
|
||||
if forumdl_result and forumdl_result.get('abspath') and forumdl_result.get('version'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': forumdl_result['name'],
|
||||
@@ -111,10 +63,19 @@ def main():
|
||||
'value': forumdl_result['version'],
|
||||
}))
|
||||
else:
|
||||
# forum-dl has cchardet dependency that doesn't compile on Python 3.14+
|
||||
# Provide overrides to install with chardet instead
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'forum-dl',
|
||||
'bin_providers': 'pip,env',
|
||||
'overrides': {
|
||||
'pip': {
|
||||
'packages': ['--no-deps', 'forum-dl', 'chardet', 'pydantic', 'beautifulsoup4', 'lxml',
|
||||
'requests', 'urllib3', 'tenacity', 'python-dateutil',
|
||||
'html2text', 'warcio']
|
||||
}
|
||||
}
|
||||
}))
|
||||
missing_deps.append('forum-dl')
|
||||
|
||||
|
||||
@@ -137,6 +137,8 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
return True, None, '' # Not a forum site - success, no output
|
||||
if 'no content' in stderr_lower:
|
||||
return True, None, '' # No forum found - success, no output
|
||||
if 'extractornotfounderror' in stderr_lower:
|
||||
return True, None, '' # No forum extractor for this URL - success, no output
|
||||
if result.returncode == 0:
|
||||
return True, None, '' # forum-dl exited cleanly, just no forum - success
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
@@ -24,6 +25,75 @@ FORUMDL_HOOK = PLUGIN_DIR / 'on_Snapshot__53_forumdl.py'
|
||||
FORUMDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_forumdl.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Module-level cache for installed binary path
|
||||
_forumdl_binary_path = None
|
||||
|
||||
def get_forumdl_binary_path():
|
||||
"""Get the installed forum-dl binary path from cache or by running validation/installation."""
|
||||
global _forumdl_binary_path
|
||||
if _forumdl_binary_path:
|
||||
return _forumdl_binary_path
|
||||
|
||||
# Run validation hook to find or install binary
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FORUMDL_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
# Check if binary was found
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary' and record.get('name') == 'forum-dl':
|
||||
_forumdl_binary_path = record.get('abspath')
|
||||
return _forumdl_binary_path
|
||||
elif record.get('type') == 'Dependency' and record.get('bin_name') == 'forum-dl':
|
||||
# Need to install via pip hook
|
||||
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Dependency__install_using_pip_provider.py'
|
||||
dependency_id = str(uuid.uuid4())
|
||||
|
||||
# Build command with overrides if present
|
||||
cmd = [
|
||||
sys.executable, str(pip_hook),
|
||||
'--dependency-id', dependency_id,
|
||||
'--bin-name', record['bin_name']
|
||||
]
|
||||
if 'overrides' in record:
|
||||
cmd.extend(['--overrides', json.dumps(record['overrides'])])
|
||||
|
||||
install_result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
# Parse InstalledBinary from pip installation
|
||||
for install_line in install_result.stdout.strip().split('\n'):
|
||||
if install_line.strip():
|
||||
try:
|
||||
install_record = json.loads(install_line)
|
||||
if install_record.get('type') == 'InstalledBinary' and install_record.get('name') == 'forum-dl':
|
||||
_forumdl_binary_path = install_record.get('abspath')
|
||||
return _forumdl_binary_path
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Installation failed - print debug info
|
||||
if not _forumdl_binary_path:
|
||||
print(f"\n=== forum-dl installation failed ===", file=sys.stderr)
|
||||
print(f"stdout: {install_result.stdout}", file=sys.stderr)
|
||||
print(f"stderr: {install_result.stderr}", file=sys.stderr)
|
||||
print(f"returncode: {install_result.returncode}", file=sys.stderr)
|
||||
return None
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}"
|
||||
@@ -64,38 +134,40 @@ def test_forumdl_validate_hook():
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify forum-dl is available via abx-pkg."""
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
missing_binaries = []
|
||||
|
||||
# Verify forum-dl is available
|
||||
forumdl_binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()])
|
||||
forumdl_loaded = forumdl_binary.load()
|
||||
if not (forumdl_loaded and forumdl_loaded.abspath):
|
||||
missing_binaries.append('forum-dl')
|
||||
|
||||
if missing_binaries:
|
||||
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||
"""Verify forum-dl is installed by calling the REAL validation and installation hooks."""
|
||||
binary_path = get_forumdl_binary_path()
|
||||
assert binary_path, (
|
||||
"forum-dl must be installed successfully via validation hook and pip provider. "
|
||||
"NOTE: forum-dl has a dependency on cchardet which does not compile on Python 3.14+ "
|
||||
"due to removed longintrepr.h header. This is a known compatibility issue with forum-dl."
|
||||
)
|
||||
assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
|
||||
|
||||
|
||||
def test_handles_non_forum_url():
|
||||
"""Test that forum-dl extractor handles non-forum URLs gracefully via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
import os
|
||||
|
||||
binary_path = get_forumdl_binary_path()
|
||||
assert binary_path, "Binary must be installed for this test"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
env = os.environ.copy()
|
||||
env['FORUMDL_BINARY'] = binary_path
|
||||
|
||||
# Run forum-dl extraction hook on non-forum URL
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# Should exit 0 even for non-forum URL
|
||||
# Should exit 0 even for non-forum URL (graceful handling)
|
||||
assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}"
|
||||
|
||||
# Verify JSONL output
|
||||
@@ -138,8 +210,12 @@ def test_config_timeout():
|
||||
"""Test that FORUMDL_TIMEOUT config is respected."""
|
||||
import os
|
||||
|
||||
binary_path = get_forumdl_binary_path()
|
||||
assert binary_path, "Binary must be installed for this test"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['FORUMDL_BINARY'] = binary_path
|
||||
env['FORUMDL_TIMEOUT'] = '5'
|
||||
|
||||
result = subprocess.run(
|
||||
|
||||
@@ -6,39 +6,8 @@ Runs at crawl start to verify gallery-dl binary is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, version_flag],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_gallerydl() -> dict | None:
|
||||
@@ -46,11 +15,7 @@ def find_gallerydl() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||
|
||||
class GalleryDlBinary(Binary):
|
||||
name: str = 'gallery-dl'
|
||||
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||
|
||||
binary = GalleryDlBinary()
|
||||
binary = Binary(name='gallery-dl', binproviders=[PipProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -60,22 +25,9 @@ def find_gallerydl() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('gallery-dl') or os.environ.get('GALLERYDL_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'gallery-dl',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@@ -6,43 +6,8 @@ Runs at crawl start to verify git is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# git version string: "git version 2.43.0"
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
parts = first_line.split()
|
||||
if len(parts) >= 3 and parts[0] == 'git':
|
||||
return parts[2]
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_git() -> dict | None:
|
||||
@@ -50,11 +15,7 @@ def find_git() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
class GitBinary(Binary):
|
||||
name: str = 'git'
|
||||
binproviders_supported = [EnvProvider()]
|
||||
|
||||
binary = GitBinary()
|
||||
binary = Binary(name='git', binproviders=[EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -64,22 +25,9 @@ def find_git() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('git') or os.environ.get('GIT_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'git',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@@ -6,51 +6,16 @@ Runs at crawl start to verify yt-dlp and required binaries are available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, version_flag],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_ytdlp() -> dict | None:
|
||||
"""Find yt-dlp binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||
from abx_pkg import Binary, PipProvider, BrewProvider, AptProvider, EnvProvider
|
||||
|
||||
class YtdlpBinary(Binary):
|
||||
name: str = 'yt-dlp'
|
||||
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||
|
||||
binary = YtdlpBinary()
|
||||
binary = Binary(name='yt-dlp', binproviders=[PipProvider(), BrewProvider(), AptProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -60,22 +25,9 @@ def find_ytdlp() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('yt-dlp') or os.environ.get('YTDLP_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'yt-dlp',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -84,12 +36,7 @@ def find_node() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
class NodeBinary(Binary):
|
||||
name: str = 'node'
|
||||
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||
overrides: dict = {'apt': {'packages': ['nodejs']}}
|
||||
|
||||
binary = NodeBinary()
|
||||
binary = Binary(name='node', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -99,22 +46,9 @@ def find_node() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('node') or os.environ.get('NODE_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'node',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -123,11 +57,7 @@ def find_ffmpeg() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
class FfmpegBinary(Binary):
|
||||
name: str = 'ffmpeg'
|
||||
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||
|
||||
binary = FfmpegBinary()
|
||||
binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -137,22 +67,9 @@ def find_ffmpeg() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('ffmpeg') or os.environ.get('FFMPEG_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'ffmpeg',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -197,7 +114,7 @@ def main():
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'yt-dlp',
|
||||
'bin_providers': 'pip,env',
|
||||
'bin_providers': 'pip,brew,apt,env',
|
||||
}))
|
||||
missing_deps.append('yt-dlp')
|
||||
|
||||
@@ -227,10 +144,14 @@ def main():
|
||||
'value': node_result['version'],
|
||||
}))
|
||||
else:
|
||||
# node is installed as 'nodejs' package on apt
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'node',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
'overrides': {
|
||||
'apt': {'packages': ['nodejs']}
|
||||
}
|
||||
}))
|
||||
missing_deps.append('node')
|
||||
|
||||
|
||||
@@ -6,39 +6,8 @@ Runs at crawl start to verify postlight-parser is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_mercury() -> dict | None:
|
||||
@@ -46,12 +15,7 @@ def find_mercury() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||
|
||||
class MercuryBinary(Binary):
|
||||
name: str = 'postlight-parser'
|
||||
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||
overrides: dict = {'npm': {'packages': ['@postlight/parser']}}
|
||||
|
||||
binary = MercuryBinary()
|
||||
binary = Binary(name='postlight-parser', binproviders=[NpmProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -61,22 +25,9 @@ def find_mercury() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('postlight-parser') or os.environ.get('MERCURY_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'postlight-parser',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -110,10 +61,14 @@ def main():
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
# postlight-parser is installed as @postlight/parser in npm
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'postlight-parser',
|
||||
'bin_providers': 'npm,env',
|
||||
'overrides': {
|
||||
'npm': {'packages': ['@postlight/parser']}
|
||||
}
|
||||
}))
|
||||
print(f"postlight-parser binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -25,7 +25,8 @@ NpmProvider.model_rebuild()
|
||||
@click.option('--bin-name', required=True, help="Binary name to install")
|
||||
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
|
||||
@click.option('--custom-cmd', default=None, help="Custom install command")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
|
||||
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
|
||||
"""Install binary using npm."""
|
||||
|
||||
if bin_providers != '*' and 'npm' not in bin_providers.split(','):
|
||||
@@ -41,7 +42,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
|
||||
click.echo(f"Installing {bin_name} via npm...", err=True)
|
||||
|
||||
try:
|
||||
binary = Binary(name=bin_name, binproviders=[provider]).install()
|
||||
# Parse overrides if provided
|
||||
overrides_dict = None
|
||||
if overrides:
|
||||
try:
|
||||
overrides_dict = json.loads(overrides)
|
||||
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
|
||||
except json.JSONDecodeError:
|
||||
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
|
||||
|
||||
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
|
||||
except Exception as e:
|
||||
click.echo(f"npm install failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -6,39 +6,8 @@ Runs at crawl start to verify papers-dl binary is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, version_flag],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_papersdl() -> dict | None:
|
||||
@@ -46,11 +15,7 @@ def find_papersdl() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||
|
||||
class PapersdlBinary(Binary):
|
||||
name: str = 'papers-dl'
|
||||
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||
|
||||
binary = PapersdlBinary()
|
||||
binary = Binary(name='papers-dl', binproviders=[PipProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -60,22 +25,9 @@ def find_papersdl() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('papers-dl') or os.environ.get('PAPERSDL_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'papers-dl',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
@@ -24,6 +25,67 @@ PAPERSDL_HOOK = PLUGIN_DIR / 'on_Snapshot__54_papersdl.py'
|
||||
PAPERSDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_papersdl.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Module-level cache for installed binary path
|
||||
_papersdl_binary_path = None
|
||||
|
||||
def get_papersdl_binary_path():
|
||||
"""Get the installed papers-dl binary path from cache or by running validation/installation."""
|
||||
global _papersdl_binary_path
|
||||
if _papersdl_binary_path:
|
||||
return _papersdl_binary_path
|
||||
|
||||
# Run validation hook to find or install binary
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(PAPERSDL_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
# Check if binary was found
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary' and record.get('name') == 'papers-dl':
|
||||
_papersdl_binary_path = record.get('abspath')
|
||||
return _papersdl_binary_path
|
||||
elif record.get('type') == 'Dependency' and record.get('bin_name') == 'papers-dl':
|
||||
# Need to install via pip hook
|
||||
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Dependency__install_using_pip_provider.py'
|
||||
dependency_id = str(uuid.uuid4())
|
||||
|
||||
# Build command with overrides if present
|
||||
cmd = [
|
||||
sys.executable, str(pip_hook),
|
||||
'--dependency-id', dependency_id,
|
||||
'--bin-name', record['bin_name']
|
||||
]
|
||||
if 'overrides' in record:
|
||||
cmd.extend(['--overrides', json.dumps(record['overrides'])])
|
||||
|
||||
install_result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
# Parse InstalledBinary from pip installation
|
||||
for install_line in install_result.stdout.strip().split('\n'):
|
||||
if install_line.strip():
|
||||
try:
|
||||
install_record = json.loads(install_line)
|
||||
if install_record.get('type') == 'InstalledBinary' and install_record.get('name') == 'papers-dl':
|
||||
_papersdl_binary_path = install_record.get('abspath')
|
||||
return _papersdl_binary_path
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert PAPERSDL_HOOK.exists(), f"Hook not found: {PAPERSDL_HOOK}"
|
||||
@@ -64,34 +126,32 @@ def test_papersdl_validate_hook():
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify papers-dl is available via abx-pkg."""
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
missing_binaries = []
|
||||
|
||||
# Verify papers-dl is available
|
||||
papersdl_binary = Binary(name='papers-dl', binproviders=[PipProvider(), EnvProvider()])
|
||||
papersdl_loaded = papersdl_binary.load()
|
||||
if not (papersdl_loaded and papersdl_loaded.abspath):
|
||||
missing_binaries.append('papers-dl')
|
||||
|
||||
if missing_binaries:
|
||||
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||
"""Verify papers-dl is installed by calling the REAL validation and installation hooks."""
|
||||
binary_path = get_papersdl_binary_path()
|
||||
assert binary_path, "papers-dl must be installed successfully via validation hook and pip provider"
|
||||
assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
|
||||
|
||||
|
||||
def test_handles_non_paper_url():
|
||||
"""Test that papers-dl extractor handles non-paper URLs gracefully via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
import os
|
||||
|
||||
binary_path = get_papersdl_binary_path()
|
||||
assert binary_path, "Binary must be installed for this test"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
env = os.environ.copy()
|
||||
env['PAPERSDL_BINARY'] = binary_path
|
||||
|
||||
# Run papers-dl extraction hook on non-paper URL
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
@@ -138,8 +198,12 @@ def test_config_timeout():
|
||||
"""Test that PAPERSDL_TIMEOUT config is respected."""
|
||||
import os
|
||||
|
||||
binary_path = get_papersdl_binary_path()
|
||||
assert binary_path, "Binary must be installed for this test"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['PAPERSDL_BINARY'] = binary_path
|
||||
env['PAPERSDL_TIMEOUT'] = '5'
|
||||
|
||||
result = subprocess.run(
|
||||
|
||||
@@ -25,7 +25,8 @@ PipProvider.model_rebuild()
|
||||
@click.option('--bin-name', required=True, help="Binary name to install")
|
||||
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
|
||||
@click.option('--custom-cmd', default=None, help="Custom install command")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
|
||||
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
|
||||
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
|
||||
"""Install binary using pip."""
|
||||
|
||||
if bin_providers != '*' and 'pip' not in bin_providers.split(','):
|
||||
@@ -41,7 +42,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
|
||||
click.echo(f"Installing {bin_name} via pip...", err=True)
|
||||
|
||||
try:
|
||||
binary = Binary(name=bin_name, binproviders=[provider]).install()
|
||||
# Parse overrides if provided
|
||||
overrides_dict = None
|
||||
if overrides:
|
||||
try:
|
||||
overrides_dict = json.loads(overrides)
|
||||
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
|
||||
except json.JSONDecodeError:
|
||||
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
|
||||
|
||||
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
|
||||
except Exception as e:
|
||||
click.echo(f"pip install failed: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -6,39 +6,8 @@ Runs at crawl start to verify readability-extractor is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_readability() -> dict | None:
|
||||
@@ -46,12 +15,7 @@ def find_readability() -> dict | None:
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||
|
||||
class ReadabilityBinary(Binary):
|
||||
name: str = 'readability-extractor'
|
||||
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||
|
||||
binary = ReadabilityBinary()
|
||||
binary = Binary(name='readability-extractor', binproviders=[NpmProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -61,22 +25,9 @@ def find_readability() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'readability-extractor',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -110,10 +61,14 @@ def main():
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
# readability-extractor is installed from GitHub
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'readability-extractor',
|
||||
'bin_providers': 'npm,env',
|
||||
'overrides': {
|
||||
'npm': {'packages': ['github:ArchiveBox/readability-extractor']}
|
||||
}
|
||||
}))
|
||||
print(f"readability-extractor binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
@@ -9,67 +9,25 @@ Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from ripgrep binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# ripgrep version string: "ripgrep 14.1.0"
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
parts = first_line.split()
|
||||
for i, part in enumerate(parts):
|
||||
if part.lower() == 'ripgrep' and i + 1 < len(parts):
|
||||
return parts[i + 1]
|
||||
# Try to find version number pattern
|
||||
for part in parts:
|
||||
if part[0].isdigit() and '.' in part:
|
||||
return part
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_ripgrep() -> dict | None:
|
||||
"""Find ripgrep binary using shutil.which or env var."""
|
||||
# Check env var first - if it's an absolute path and exists, use it
|
||||
ripgrep_env = os.environ.get('RIPGREP_BINARY', '')
|
||||
if ripgrep_env and '/' in ripgrep_env and Path(ripgrep_env).is_file():
|
||||
abspath = ripgrep_env
|
||||
else:
|
||||
# Otherwise try shutil.which with the env var as the binary name
|
||||
abspath = shutil.which(ripgrep_env) if ripgrep_env else None
|
||||
if not abspath:
|
||||
abspath = shutil.which('rg')
|
||||
"""Find ripgrep binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'rg',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
binary = Binary(name='rg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'rg',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -6,82 +6,27 @@ Runs at crawl start to verify single-file (npm package) is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from single-file binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
return result.stdout.strip().split('\n')[0][:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
# For scripts, hash the script content
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_singlefile() -> dict | None:
|
||||
"""Find single-file binary."""
|
||||
# Check env var first
|
||||
env_path = os.environ.get('SINGLEFILE_BINARY', '')
|
||||
if env_path and Path(env_path).is_file():
|
||||
return {
|
||||
'name': 'single-file',
|
||||
'abspath': env_path,
|
||||
'version': get_binary_version(env_path),
|
||||
'sha256': get_binary_hash(env_path),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||
|
||||
# Try shutil.which
|
||||
for name in ['single-file', 'singlefile']:
|
||||
abspath = shutil.which(name)
|
||||
if abspath:
|
||||
binary = Binary(name='single-file', binproviders=[NpmProvider(), EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'single-file',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'npm',
|
||||
}
|
||||
|
||||
# Check common npm paths
|
||||
npm_paths = [
|
||||
Path.home() / '.npm-global/bin/single-file',
|
||||
Path.home() / 'node_modules/.bin/single-file',
|
||||
Path('/usr/local/bin/single-file'),
|
||||
Path('/usr/local/lib/node_modules/.bin/single-file'),
|
||||
]
|
||||
for path in npm_paths:
|
||||
if path.is_file():
|
||||
return {
|
||||
'name': 'single-file',
|
||||
'abspath': str(path),
|
||||
'version': get_binary_version(str(path)),
|
||||
'sha256': get_binary_hash(str(path)),
|
||||
'binprovider': 'npm',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -6,58 +6,16 @@ Runs at crawl start to verify wget is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# wget version string: "GNU Wget 1.24.5 built on ..."
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
# Extract version number
|
||||
parts = first_line.split()
|
||||
for i, part in enumerate(parts):
|
||||
if part.lower() == 'wget' and i + 1 < len(parts):
|
||||
return parts[i + 1]
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_wget() -> dict | None:
|
||||
"""Find wget binary using abx-pkg or fallback to shutil.which."""
|
||||
# Try abx-pkg first
|
||||
"""Find wget binary using abx-pkg."""
|
||||
try:
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
class WgetBinary(Binary):
|
||||
name: str = 'wget'
|
||||
binproviders_supported = [EnvProvider()]
|
||||
|
||||
binary = WgetBinary()
|
||||
binary = Binary(name='wget', binproviders=[EnvProvider()])
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
@@ -67,22 +25,9 @@ def find_wget() -> dict | None:
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('wget') or os.environ.get('WGET_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'wget',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user