Merge remote-tracking branch 'origin/dev' into claude/improve-test-suite-xm6Bh

This commit is contained in:
Claude
2025-12-27 05:53:06 +00:00
24 changed files with 1101 additions and 894 deletions

View File

@@ -14,7 +14,12 @@
"Bash(mkdir:*)", "Bash(mkdir:*)",
"Bash(chmod:*)", "Bash(chmod:*)",
"Bash(python -m forum_dl:*)", "Bash(python -m forum_dl:*)",
"Bash(archivebox manage migrate:*)" "Bash(archivebox manage migrate:*)",
"Bash(cat:*)",
"Bash(python archivebox/plugins/pip/on_Dependency__install_using_pip_provider.py:*)",
"Bash(forum-dl:*)",
"Bash(pip uninstall:*)",
"Bash(python:*)"
] ]
} }
} }

View File

@@ -1,3 +1,353 @@
# ArchiveBox Hook Architecture
## Core Design Pattern
**CRITICAL**: All hooks must follow this unified architecture. This pattern applies to ALL models: Crawl, Dependency, Snapshot, ArchiveResult, etc.
### The Flow
```
1. Model.run() discovers and executes hooks
2. Hooks emit JSONL to stdout
3. Model.run() parses JSONL and creates DB records
4. New DB records trigger their own Model.run()
5. Cycle repeats
```
**Example Flow:**
```
Crawl.run()
→ runs on_Crawl__* hooks
→ hooks emit JSONL: {type: 'Dependency', bin_name: 'wget', ...}
→ Crawl.run() creates Dependency record in DB
→ Dependency.run() is called automatically
→ runs on_Dependency__* hooks
→ hooks emit JSONL: {type: 'InstalledBinary', name: 'wget', ...}
→ Dependency.run() creates InstalledBinary record in DB
```
### Golden Rules
1. **Model.run() executes hooks directly** - No helper methods in statemachines. Statemachine just calls Model.run().
2. **Hooks emit JSONL** - Any line starting with `{` that has a `type` field creates/updates that model.
```python
print(json.dumps({'type': 'Dependency', 'bin_name': 'wget', ...}))
print(json.dumps({'type': 'InstalledBinary', 'name': 'wget', ...}))
```
3. **JSONL fields = Model fields** - JSONL keys must match Django model field names exactly. No transformation.
```python
# ✅ CORRECT - matches Dependency model
{'type': 'Dependency', 'bin_name': 'wget', 'bin_providers': 'apt,brew', 'overrides': {...}}
# ❌ WRONG - uses different field names
{'type': 'Dependency', 'name': 'wget', 'providers': 'apt,brew', 'custom_cmds': {...}}
```
4. **No hardcoding** - Never hardcode binary names, provider names, or anything else. Use discovery.
```python
# ✅ CORRECT - discovers all on_Dependency hooks dynamically
run_hooks(event_name='Dependency', ...)
# ❌ WRONG - hardcodes provider list
for provider in ['pip', 'npm', 'apt', 'brew']:
run_hooks(event_name=f'Dependency__install_using_{provider}_provider', ...)
```
5. **Trust abx-pkg** - Never use `shutil.which()`, `subprocess.run([bin, '--version'])`, or manual hash calculation.
```python
# ✅ CORRECT - abx-pkg handles everything
from abx_pkg import Binary, PipProvider, EnvProvider
binary = Binary(name='wget', binproviders=[PipProvider(), EnvProvider()]).load()
# binary.abspath, binary.version, binary.sha256 are all populated automatically
# ❌ WRONG - manual detection
abspath = shutil.which('wget')
version = subprocess.run(['wget', '--version'], ...).stdout
```
6. **Hooks check if they can handle requests** - Each hook decides internally if it can handle the dependency.
```python
# In on_Dependency__install_using_pip_provider.py
if bin_providers != '*' and 'pip' not in bin_providers.split(','):
sys.exit(0) # Can't handle this, exit cleanly
```
7. **Minimal transformation** - Statemachine/Model.run() should do minimal JSONL parsing, just create records.
```python
# ✅ CORRECT - simple JSONL parsing
obj = json.loads(line)
if obj.get('type') == 'Dependency':
Dependency.objects.create(**obj)
# ❌ WRONG - complex transformation logic
if obj.get('type') == 'Dependency':
dep = Dependency.objects.create(name=obj['bin_name']) # renaming fields
dep.custom_commands = transform_overrides(obj['overrides']) # transforming data
```
### Pattern Consistency
Follow the same pattern as `ArchiveResult.run()` (archivebox/core/models.py:1030):
```python
def run(self):
"""Execute this Model by running hooks and processing JSONL output."""
# 1. Discover hooks
hook = discover_hook_for_model(self)
# 2. Run hook
results = run_hook(hook, output_dir=..., ...)
# 3. Parse JSONL and update self
for line in results['stdout'].splitlines():
obj = json.loads(line)
if obj.get('type') == self.__class__.__name__:
self.status = obj.get('status')
self.output = obj.get('output')
# ... apply other fields
# 4. Create side-effect records
for line in results['stdout'].splitlines():
obj = json.loads(line)
if obj.get('type') != self.__class__.__name__:
create_record_from_jsonl(obj) # Creates InstalledBinary, etc.
self.save()
```
### Validation Hook Pattern (on_Crawl__00_validate_*.py)
**Purpose**: Check if binary exists, emit Dependency if not found.
```python
#!/usr/bin/env python3
import sys
import json
def find_wget() -> dict | None:
"""Find wget binary using abx-pkg."""
try:
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'wget',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
except Exception:
pass
return None
def main():
    """Emit JSONL describing wget's install state, then exit (0=found, 1=missing)."""
    info = find_wget()
    if not (info and info.get('abspath')):
        # Binary missing: request installation by emitting a Dependency record.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'wget',
            'bin_providers': 'apt,brew,env',
            'overrides': {},  # Empty if no special install requirements
        }))
        print(f"wget binary not found", file=sys.stderr)
        sys.exit(1)

    # Binary found: record it, then point the WGET_BINARY config at its path.
    print(json.dumps({
        'type': 'InstalledBinary',
        'name': info['name'],
        'abspath': info['abspath'],
        'version': info['version'],
        'sha256': info['sha256'],
        'binprovider': info['binprovider'],
    }))
    print(json.dumps({
        'type': 'Machine',
        '_method': 'update',
        'key': 'config/WGET_BINARY',
        'value': info['abspath'],
    }))
    sys.exit(0)

if __name__ == '__main__':
    main()
```
**Rules:**
- ✅ Use `Binary(...).load()` from abx-pkg - handles finding binary, version, hash automatically
- ✅ Emit `InstalledBinary` JSONL if found
- ✅ Emit `Dependency` JSONL if not found
- ✅ Use `overrides` field matching abx-pkg format: `{'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}`
- ❌ NEVER use `shutil.which()`, `subprocess.run()`, manual version detection, or hash calculation
- ❌ NEVER call package managers (apt, brew, pip, npm) directly
### Dependency Installation Pattern (on_Dependency__install_*.py)
**Purpose**: Install binary if not already installed.
```python
#!/usr/bin/env python3
import json
import sys
import rich_click as click
from abx_pkg import Binary, PipProvider
@click.command()
@click.option('--dependency-id', required=True)
@click.option('--bin-name', required=True)
@click.option('--bin-providers', default='*')
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
def main(dependency_id: str, bin_name: str, bin_providers: str, overrides: str | None):
    """Install binary using pip."""
    # This hook only handles dependencies that allow the pip provider.
    can_handle = bin_providers == '*' or 'pip' in bin_providers.split(',')
    if not can_handle:
        click.echo(f"pip provider not allowed for {bin_name}", err=True)
        sys.exit(0)  # Exit cleanly - not an error, just can't handle

    # overrides arrives as a JSON dict keyed by provider; keep only the pip section.
    pip_overrides = None
    if overrides:
        try:
            pip_overrides = json.loads(overrides).get('pip', {})
        except json.JSONDecodeError:
            pip_overrides = None

    provider = PipProvider()
    # abx-pkg performs the actual installation.
    try:
        binary = Binary(
            name=bin_name,
            binproviders=[provider],
            overrides=pip_overrides or {},
        ).install()
    except Exception as e:
        click.echo(f"pip install failed: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        sys.exit(1)

    # Report success so the caller can create the InstalledBinary DB record.
    print(json.dumps({
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'pip',
        'dependency_id': dependency_id,
    }))
    sys.exit(0)

if __name__ == '__main__':
    main()
```
**Rules:**
- ✅ Check `bin_providers` parameter - exit cleanly (code 0) if can't handle
- ✅ Parse `overrides` parameter as full dict, extract your provider's section
- ✅ Use `Binary(...).install()` from abx-pkg - handles actual installation
- ✅ Emit `InstalledBinary` JSONL on success
- ❌ NEVER hardcode provider names in Model.run() or anywhere else
- ❌ NEVER skip the bin_providers check
### Model.run() Pattern
```python
class Dependency(models.Model):
    def run(self):
        """Install this dependency by fanning out to every on_Dependency hook.

        Hooks emit InstalledBinary JSONL records on stdout; each valid record
        is persisted, and the first one that satisfies is_installed is
        returned. Returns None when no hook managed to install the binary.
        """
        import json
        from pathlib import Path
        from django.conf import settings

        # Nothing to do if a usable binary is already recorded.
        if self.is_installed:
            return self.installed_binaries.first()

        from archivebox.hooks import run_hooks

        # Hooks write their artifacts under data/tmp/dependency_<id>/.
        data_dir = Path(getattr(settings, 'DATA_DIR', Path.cwd()))
        output_dir = data_dir / 'tmp' / f'dependency_{self.id}'
        output_dir.mkdir(parents=True, exist_ok=True)

        # Run EVERY on_Dependency hook - each one decides internally whether
        # it can handle this bin_name/bin_providers combination.
        hook_results = run_hooks(
            event_name='Dependency',
            output_dir=output_dir,
            timeout=600,
            dependency_id=str(self.id),
            bin_name=self.bin_name,
            bin_providers=self.bin_providers,
            overrides=json.dumps(self.overrides) if self.overrides else None,
        )

        # Persist each InstalledBinary JSONL record emitted by a successful hook.
        for hook_result in hook_results:
            if hook_result['returncode'] != 0:
                continue
            for raw_line in hook_result['stdout'].strip().split('\n'):
                if not raw_line.strip():
                    continue
                try:
                    record = json.loads(raw_line)
                except json.JSONDecodeError:
                    continue
                if record.get('type') != 'InstalledBinary':
                    continue
                # Required fields must be present and non-empty.
                if not all(record.get(field) for field in ('name', 'abspath', 'version')):
                    continue
                # Fields map 1:1 from JSONL onto the model - no transformation.
                binary, _created = InstalledBinary.objects.update_or_create(
                    machine=Machine.current(),
                    name=record['name'],
                    defaults={
                        'abspath': record['abspath'],
                        'version': record['version'],
                        'sha256': record.get('sha256') or '',
                        'binprovider': record.get('binprovider') or 'env',
                        'dependency': self,
                    },
                )
                if self.is_installed:
                    return binary
        return None
```
**Rules:**
- ✅ Use `run_hooks(event_name='ModelName', ...)` with model name
- ✅ Pass all relevant data as kwargs (will become --cli-args for hooks)
- ✅ Parse JSONL output directly - each line is a potential record
- ✅ Create records using JSONL fields directly - no transformation
- ✅ Let hooks decide if they can handle the request
- ❌ NEVER hardcode hook names or provider lists
- ❌ NEVER create helper methods for hook execution - just call run_hooks()
- ❌ NEVER transform JSONL data - use it as-is
---
# Background Hooks Implementation Plan # Background Hooks Implementation Plan
## Overview ## Overview
@@ -186,11 +536,12 @@ class Migration(migrations.Migration):
--- ---
## Phase 2: Hook Output Format ## Phase 2: Hook Output Format Specification
### Hooks emit single JSON object to stdout ### Hooks emit single JSON object to stdout
**Contract:** **Contract:**
- Hook scripts must be executable (chmod +x) and specify their interpreter at the top with a /usr/bin/env shebang line
- Hook emits ONE JSON object with `type: 'ArchiveResult'` - Hook emits ONE JSON object with `type: 'ArchiveResult'`
- Hook can provide: `status`, `output_str`, `output_json`, `cmd` (optional) - Hook can provide: `status`, `output_str`, `output_json`, `cmd` (optional)
- Hook should NOT set: `output_files`, `output_size`, `output_mimetypes` (runner calculates these) - Hook should NOT set: `output_files`, `output_size`, `output_mimetypes` (runner calculates these)
@@ -203,37 +554,23 @@ class Migration(migrations.Migration):
// Simple string output // Simple string output
console.log(JSON.stringify({ console.log(JSON.stringify({
type: 'ArchiveResult', type: 'ArchiveResult',
status: 'succeeded', output_str: 'This is the page title',
output_str: 'Downloaded index.html (4.2 KB)'
})); }));
// With structured metadata (headers, redirects, etc.) // With structured metadata and optional fields (headers, redirects, etc.)
console.log(JSON.stringify({ console.log(JSON.stringify({
type: 'ArchiveResult', type: 'ArchiveResult',
status: 'succeeded', status: 'succeeded',
output_str: 'Archived https://example.com', output_str: 'Got https://example.com headers',
output_json: { output_json: {'content-type': 'text/html', 'server': 'nginx', 'status-code': 200, 'content-length': 234235},
headers: {'content-type': 'text/html', 'server': 'nginx'},
redirects: [{from: 'http://example.com', to: 'https://example.com'}]
}
})); }));
// With explicit cmd (for binary FK) // With explicit cmd (cmd first arg should match InstalledBinary.bin_abspath or XYZ_BINARY env var so ArchiveResult.run() can FK to the InstalledBinary)
console.log(JSON.stringify({ console.log(JSON.stringify({
type: 'ArchiveResult', type: 'ArchiveResult',
status: 'succeeded', status: 'succeeded',
output_str: 'Archived with wget', output_str: 'Archived with wget',
cmd: ['wget', '-p', '-k', 'https://example.com'] cmd: ['/some/abspath/to/wget', '-p', '-k', 'https://example.com']
}));
// Just structured data (no human-readable string)
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_json: {
title: 'My Page Title',
charset: 'UTF-8'
}
})); }));
// BAD: Don't duplicate ArchiveResult fields in output_json // BAD: Don't duplicate ArchiveResult fields in output_json
@@ -241,16 +578,17 @@ console.log(JSON.stringify({
type: 'ArchiveResult', type: 'ArchiveResult',
status: 'succeeded', status: 'succeeded',
output_json: { output_json: {
status: 'succeeded', // ❌ BAD - duplicates ArchiveResult.status status: 'succeeded', // ❌ BAD - this should be up a level on ArchiveResult.status, not inside output_json
output_files: ['index.html'], // ❌ BAD - runner calculates this title: 'the page title', // ❌ BAD - if the extractor's main output is just a string then it belongs in output_str
custom_data: 'ok' // ✅ GOOD - custom fields only custom_data: 1234, // ✅ GOOD - custom fields only
} },
output_files: {'index.html': {}}, // ❌ BAD - runner calculates this for us, no need to return it manually
})); }));
``` ```
--- ---
## Phase 3: run_hook() is Generic (No HookResult TypedDict) ## Phase 3: Architecture - Generic run_hook()
`run_hook()` is a generic JSONL parser - it doesn't know about ArchiveResult, InstalledBinary, or any specific model. It just: `run_hook()` is a generic JSONL parser - it doesn't know about ArchiveResult, InstalledBinary, or any specific model. It just:
1. Executes the hook script 1. Executes the hook script
@@ -276,8 +614,8 @@ def run_hook(
Each Model.run() method handles its own record types differently: Each Model.run() method handles its own record types differently:
- ArchiveResult.run() extends ArchiveResult records with computed fields - ArchiveResult.run() extends ArchiveResult records with computed fields
- Machine.run() creates InstalledBinary records from hook output - Dependency.run() creates InstalledBinary records from hook output
- etc. - Crawl.run() can create Dependency records, Snapshots, or InstalledBinary records from hook output
Returns: Returns:
List of dicts with 'type' field, each extended with metadata: List of dicts with 'type' field, each extended with metadata:
@@ -285,9 +623,9 @@ def run_hook(
{ {
'type': 'ArchiveResult', 'type': 'ArchiveResult',
'status': 'succeeded', 'status': 'succeeded',
'output_str': '...',
'plugin': 'wget', 'plugin': 'wget',
'plugin_hook': 'archivebox/plugins/wget/on_Snapshot__21_wget.py', 'plugin_hook': 'archivebox/plugins/wget/on_Snapshot__21_wget.py',
'output_str': '...',
# ... other hook-reported fields # ... other hook-reported fields
}, },
{ {
@@ -325,19 +663,241 @@ def create_model_record(record: dict) -> Any:
model_type = record.pop('type') model_type = record.pop('type')
if model_type == 'InstalledBinary': if model_type == 'InstalledBinary':
obj, created = InstalledBinary.objects.get_or_create(**record) obj, created = InstalledBinary.objects.get_or_create(**record) # if model requires custom logic implement InstalledBinary.from_jsonl(**record)
return obj return obj
elif model_type == 'Dependency': elif model_type == 'Dependency':
obj, created = Dependency.objects.get_or_create(**record) obj, created = Dependency.objects.get_or_create(**record)
return obj return obj
# Add more types as needed # ... Snapshot, ArchiveResult, etc. add more types as needed
else: else:
raise ValueError(f"Unknown record type: {model_type}") raise ValueError(f"Unknown record type: {model_type}")
``` ```
--- ---
## Phase 4: Update run_hook() Implementation ## Phase 4: Plugin Audit & Standardization
**CRITICAL:** This phase MUST be done FIRST, before updating core code. Do this manually, one plugin at a time. Do NOT batch-update multiple plugins at once. Do NOT skip any plugins or checks.
**Why First?** Updating plugins to output clean JSONL before changing core code means the transition is safe and incremental. The current run_hook() can continue to work during the plugin updates.
### 4.1 Install Hook Standardization
All plugins should follow a consistent pattern for checking and declaring dependencies.
#### Hook Naming Convention
**RENAME ALL HOOKS:**
- ❌ OLD: `on_Crawl__*_validate_*.{sh,py,js}`
- ✅ NEW: `on_Crawl__*_install_*.{sh,py,js}`
Rationale: "install" is clearer than "validate" for what these hooks actually do.
#### Standard Install Hook Pattern
**ALL install hooks MUST follow this pattern:**
1. ✅ Check if InstalledBinary already exists for the configured binary
2. ✅ If NOT found, emit a Dependency JSONL record, with overrides if you need to customize install process
3. ❌ NEVER directly call npm, apt, brew, pip, or any package manager
4. ✅ Let bin provider plugins handle actual installation
**Example Standard Pattern:**
```python
#!/usr/bin/env python3
"""
Check for wget binary and emit Dependency if not found.
"""
import os
import sys
import json
from pathlib import Path
def main():
    """Emit a Dependency JSONL record so the system can install wget if needed.

    JSONL keys must match the Dependency model field names exactly
    (bin_name, bin_providers, overrides) - no aliases like name/providers.
    Returns 0 (the record is a request, not an error condition).
    """
    # 1. Get configured binary name/path from env (abspath or bare name)
    binary_path = os.environ.get('WGET_BINARY', 'wget')

    # 2. Reduce to the bare binary name for lookup:
    #    /usr/local/bin/wget2 -> wget2, wget2 -> wget2
    bin_name = Path(binary_path).name if '/' in binary_path else binary_path

    # 3. Emit Dependency JSONL - bin provider hooks will check
    #    InstalledBinary and install if missing
    dependency = {
        'type': 'Dependency',
        'bin_name': bin_name,
        'bin_providers': 'apt,brew,pkg',  # Priority order, comma-separated
        'overrides': {},  # Customize the install process here if needed
    }
    print(json.dumps(dependency))
    return 0

if __name__ == '__main__':
    sys.exit(main())
```
#### Config Variable Handling
**ALL hooks MUST respect user-configured binary paths:**
- ✅ Read `XYZ_BINARY` env var (e.g., `WGET_BINARY`, `YTDLP_BINARY`, `CHROME_BINARY`)
- ✅ Support absolute paths: `WGET_BINARY=/usr/local/bin/wget2`
- ✅ Support bin names: `WGET_BINARY=wget2`
- ✅ Check for the CORRECT binary name in InstalledBinary
- ✅ If user provides `WGET_BINARY=wget2`, check for `wget2` not `wget`
**Example Config Handling:**
```python
# Get configured binary (could be path or name)
binary_path = os.environ.get('WGET_BINARY', 'wget')
# Extract just the binary name for InstalledBinary lookup
if '/' in binary_path:
# Absolute path: /usr/local/bin/wget2 -> wget2
bin_name = Path(binary_path).name
else:
# Just a name: wget2 -> wget2
bin_name = binary_path
# Now check InstalledBinary for bin_name (not hardcoded 'wget')
```
### 4.2 Snapshot Hook Standardization
All `on_Snapshot__*.*` hooks must follow the output format specified in **Phase 2**. Key points for implementation:
#### Output Format Requirements
**CRITICAL Legacy Issues to Fix:**
1. ❌ **Remove `RESULT_JSON=` prefix** - old hooks use `console.log('RESULT_JSON=' + ...)`
2. ❌ **Remove extra output lines** - old hooks print VERSION=, START_TS=, END_TS=, STATUS=, OUTPUT=
3. ❌ **Remove `--version` calls** - hooks should NOT run binary version checks
4. ✅ **Output clean JSONL only** - exactly ONE line: `console.log(JSON.stringify(result))`
**Before (WRONG):**
```javascript
console.log(`VERSION=${version}`);
console.log(`START_TS=${startTime.toISOString()}`);
console.log(`RESULT_JSON=${JSON.stringify(result)}`);
```
**After (CORRECT):**
```javascript
console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_str: 'Done'}));
```
> **See Phase 2 for complete JSONL format specification and examples.**
#### Using Configured Binaries
**ALL on_Snapshot hooks MUST:**
1. ✅ Read the correct `XYZ_BINARY` env var
2. ✅ Use that binary path/name in their commands
3. ✅ Pass cmd in JSONL output for binary FK lookup
**Example:**
```javascript
// ✅ CORRECT - uses env var
const wgetBinary = process.env.WGET_BINARY || 'wget';
const cmd = [wgetBinary, '-p', '-k', url];
// Execute command...
const result = execSync(cmd.join(' '));
// Report cmd in output for binary FK
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: 'Downloaded page',
cmd: cmd, // ✅ Includes configured binary
}));
```
```javascript
// ❌ WRONG - hardcoded binary name
const cmd = ['wget', '-p', '-k', url]; // Ignores WGET_BINARY
```
### 4.3 Per-Plugin Checklist
**For EACH plugin, verify ALL of these:**
#### Install Hook Checklist
- [ ] Renamed from `on_Crawl__*_validate_*` to `on_Crawl__*_install_*`
- [ ] Reads `XYZ_BINARY` env var and handles both absolute paths + bin names
- [ ] Emits `{"type": "Dependency", ...}` JSONL (NOT hardcoded to always check for 'wget')
- [ ] Does NOT call npm/apt/brew/pip directly
- [ ] Follows standard pattern from section 4.1
#### Snapshot Hook Checklist
- [ ] Reads correct `XYZ_BINARY` env var and uses it in cmd
- [ ] Outputs EXACTLY ONE JSONL line (NO `RESULT_JSON=` prefix)
- [ ] NO extra output lines (VERSION=, START_TS=, END_TS=, STATUS=, OUTPUT=)
- [ ] Does NOT run `--version` commands
- [ ] Only provides allowed fields (type, status, output_str, output_json, cmd)
- [ ] Does NOT include computed fields (see Phase 2 for forbidden fields list)
- [ ] Includes `cmd` array with configured binary path
### 4.4 Implementation Process
**MANDATORY PROCESS:**
1. ✅ List ALL plugins in archivebox/plugins/
2. ✅ For EACH plugin (DO NOT BATCH):
a. Read ALL hook files in the plugin directory
b. Check install hooks against checklist 4.3
c. Check snapshot hooks against checklist 4.3
d. Fix issues one by one
e. Test the plugin hooks
f. Move to next plugin
3. ❌ DO NOT skip any plugins
4. ❌ DO NOT batch-update multiple plugins
5. ❌ DO NOT assume plugins are similar enough to update together
**Why one-by-one?**
- Each plugin may have unique patterns
- Each plugin may use different languages (sh/py/js)
- Each plugin may have different edge cases
- Batch updates lead to copy-paste errors
### 4.5 Testing Each Plugin
After updating each plugin, verify:
1. ✅ Install hook can be executed: `python3 on_Crawl__01_install_wget.py`
2. ✅ Install hook outputs valid JSONL: `python3 ... | jq .`
3. ✅ Install hook respects `XYZ_BINARY` env var
4. ✅ Snapshot hook can be executed with test URL
5. ✅ Snapshot hook outputs EXACTLY ONE JSONL line
6. ✅ Snapshot hook JSONL parses correctly: `... | jq .type`
7. ✅ Snapshot hook uses configured binary from env
### 4.6 Common Pitfalls
When auditing plugins, watch for these common mistakes:
1. **Hardcoded binary names** - Check `InstalledBinary.filter(name='wget')` → should use configured name
2. **Old output format** - Look for `RESULT_JSON=`, `VERSION=`, `START_TS=` lines
3. **Computed fields in output** - Watch for `output_files`, `start_ts`, `duration` in JSONL
4. **Missing config variables** - Ensure hooks read `XYZ_BINARY` env vars
5. **Version checks** - Remove any `--version` command executions
> See sections 4.1 and 4.2 for detailed before/after examples.
---
## Phase 5: Update run_hook() Implementation
**Note:** Only do this AFTER Phase 4 (plugin standardization) is complete. By then, all plugins will output clean JSONL and this implementation will work smoothly.
### Location: `archivebox/hooks.py` ### Location: `archivebox/hooks.py`
@@ -546,7 +1106,9 @@ def run_hook(
--- ---
## Phase 5: Update ArchiveResult.run() ## Phase 6: Update ArchiveResult.run()
**Note:** Only do this AFTER Phase 5 (run_hook() implementation) is complete.
### Location: `archivebox/core/models.py` ### Location: `archivebox/core/models.py`
@@ -562,7 +1124,7 @@ def run(self):
computed fields (output_files, output_size, binary FK, etc.). computed fields (output_files, output_size, binary FK, etc.).
""" """
from django.utils import timezone from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, find_binary_for_cmd, create_model_record
from machine.models import Machine from machine.models import Machine
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot] config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
@@ -802,9 +1364,47 @@ All existing queries continue to work unchanged - the dict structure is backward
--- ---
## Phase 6: Background Hook Finalization ## Phase 7: Background Hook Support
### Helper Functions This phase adds support for long-running background hooks that don't block other extractors.
### 7.1 Background Hook Detection
Background hooks are identified by `.bg.` suffix in filename:
- `on_Snapshot__21_consolelog.bg.js` ← background
- `on_Snapshot__11_favicon.js` ← foreground
### 7.2 Rename Background Hooks
**Files to rename:**
```bash
# Use .bg. suffix (not __background)
mv archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js \
archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js
mv archivebox/plugins/ssl/on_Snapshot__23_ssl.js \
archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
mv archivebox/plugins/responses/on_Snapshot__24_responses.js \
archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
```
**Update hook content to emit proper JSON:**
Each hook should emit:
```javascript
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded', // or 'failed' or 'skipped'
output_str: 'Captured 15 console messages', // human-readable summary
output_json: { // optional structured metadata
// ... specific to each hook
}
}));
```
### 7.3 Finalization Helper Functions
Location: `archivebox/core/models.py` or new `archivebox/core/background_hooks.py` Location: `archivebox/core/models.py` or new `archivebox/core/background_hooks.py`
@@ -934,7 +1534,7 @@ def finalize_background_hook(archiveresult: 'ArchiveResult') -> None:
stderr_file.unlink() stderr_file.unlink()
``` ```
### Update SnapshotMachine ### 7.4 Update SnapshotMachine
Location: `archivebox/core/statemachines.py` Location: `archivebox/core/statemachines.py`
@@ -967,82 +1567,12 @@ class SnapshotMachine(StateMachine, strict_states=True):
return True return True
``` ```
--- ### 7.5 Deduplication
## Phase 6b: Deduplication
Deduplication is handled by external filesystem tools like `fdupes` (hardlinks), ZFS dedup, Btrfs duperemove, or rdfind. Users can run these tools periodically on the archive directory to identify and link duplicate files. ArchiveBox doesn't need to track hashes or manage deduplication itself - the filesystem layer handles it transparently. Deduplication is handled by external filesystem tools like `fdupes` (hardlinks), ZFS dedup, Btrfs duperemove, or rdfind. Users can run these tools periodically on the archive directory to identify and link duplicate files. ArchiveBox doesn't need to track hashes or manage deduplication itself - the filesystem layer handles it transparently.
--- ---
## Phase 7: Rename Background Hooks
### Files to rename:
```bash
# Use .bg. suffix (not __background)
mv archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js \
archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js
mv archivebox/plugins/ssl/on_Snapshot__23_ssl.js \
archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js
mv archivebox/plugins/responses/on_Snapshot__24_responses.js \
archivebox/plugins/responses/on_Snapshot__24_responses.bg.js
```
### Update hook content to emit proper JSON:
Each hook should emit:
```javascript
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded', // or 'failed' or 'skipped'
output_str: 'Captured 15 console messages', // human-readable summary
output_json: { // optional structured metadata
// ... specific to each hook
}
}));
```
---
## Phase 8: Update Existing Hooks
### Update all hooks to emit proper JSON format
**Example: favicon hook**
```python
# Before
print(f'Favicon saved ({size} bytes)')
print(f'OUTPUT={OUTPUT_FILE}')
print(f'STATUS=succeeded')
# After
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': f'Favicon saved ({size} bytes)',
'output_json': {
'size': size,
'format': 'ico'
}
}
print(json.dumps(result))
```
**Example: wget hook with explicit cmd**
```bash
# After wget completes
cat <<EOF
{"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded index.html", "cmd": ["wget", "-p", "-k", "$URL"]}
EOF
```
---
## Testing Strategy ## Testing Strategy
### 1. Unit Tests ### 1. Unit Tests
@@ -1166,13 +1696,18 @@ cd archivebox
python manage.py makemigrations core --name archiveresult_background_hooks python manage.py makemigrations core --name archiveresult_background_hooks
``` ```
### Step 2: Update run_hook() ### Step 2: **Plugin standardization (Phase 4)**
- Update ALL plugins to new JSONL format FIRST
- Test each plugin as you update it
- This ensures old run_hook() can still work during transition
### Step 3: Update run_hook() (Phase 5)
- Add background hook detection - Add background hook detection
- Add log file capture - Add log file capture
- Parse JSONL output (any line with {type: 'ModelName', ...}) - Parse JSONL output (any line with {type: 'ModelName', ...})
- Add plugin and plugin_hook metadata to each record - Add plugin and plugin_hook metadata to each record
### Step 3: Update ArchiveResult.run() ### Step 4: Update ArchiveResult.run() (Phase 6)
- Handle None result for background hooks (return immediately) - Handle None result for background hooks (return immediately)
- Parse records list from run_hook() - Parse records list from run_hook()
- Assert only one ArchiveResult record per hook - Assert only one ArchiveResult record per hook
@@ -1180,22 +1715,18 @@ python manage.py makemigrations core --name archiveresult_background_hooks
- Call `_populate_output_fields()` to walk directory and populate summary fields - Call `_populate_output_fields()` to walk directory and populate summary fields
- Call `create_model_record()` for any side-effect records (InstalledBinary, etc.) - Call `create_model_record()` for any side-effect records (InstalledBinary, etc.)
### Step 4: Add finalization helpers ### Step 5: Add finalization helpers (Phase 7)
- `find_background_hooks()` - `find_background_hooks()`
- `check_background_hook_completed()` - `check_background_hook_completed()`
- `finalize_background_hook()` - `finalize_background_hook()`
### Step 5: Update SnapshotMachine.is_finished() ### Step 6: Update SnapshotMachine.is_finished() (Phase 7)
- Check for background hooks - Check for background hooks
- Finalize completed ones - Finalize completed ones
### Step 6: Rename hooks ### Step 7: Rename background hooks (Phase 7)
- Rename 3 background hooks with .bg. suffix - Rename 3 background hooks with .bg. suffix
### Step 7: Update hook outputs
- Update all hooks to emit JSON format
- Remove manual timestamp/status calculation
### Step 8: Test ### Step 8: Test
- Unit tests - Unit tests
- Integration tests - Integration tests
@@ -1214,6 +1745,8 @@ python manage.py makemigrations core --name archiveresult_background_hooks
- ✅ Log files cleaned up on success, kept on failure - ✅ Log files cleaned up on success, kept on failure
- ✅ PID files cleaned up after completion - ✅ PID files cleaned up after completion
- ✅ No plugin-specific code in core (generic polling mechanism) - ✅ No plugin-specific code in core (generic polling mechanism)
- ✅ All plugins updated to clean JSONL format
- ✅ Safe incremental rollout (plugins first, then core code)
--- ---

View File

@@ -185,9 +185,26 @@ class CrawlMachine(StateMachine, strict_states=True):
machine.save(update_fields=['config']) machine.save(update_fields=['config'])
elif obj_type == 'Dependency': elif obj_type == 'Dependency':
# Dependency request - could trigger installation # Create Dependency record from JSONL
# For now just log it (installation hooks would be separate) from machine.models import Dependency
print(f'[yellow]Dependency requested: {obj.get("bin_name")}[/yellow]')
bin_name = obj.get('bin_name')
if not bin_name:
continue
# Create or get existing dependency
dependency, created = Dependency.objects.get_or_create(
bin_name=bin_name,
defaults={
'bin_providers': obj.get('bin_providers', '*'),
'overrides': obj.get('overrides', {}),
'config': obj.get('config', {}),
}
)
# Run dependency installation if not already installed
if not dependency.is_installed:
dependency.run()
except json.JSONDecodeError: except json.JSONDecodeError:
# Not JSON, skip # Not JSON, skip

View File

@@ -1,65 +0,0 @@
# Generated by Django 6.0 on 2025-12-25 09:34
import django.db.models.deletion
from archivebox import uuid_compat
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0001_squashed'),
]
operations = [
migrations.AlterField(
model_name='dependency',
name='bin_name',
field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
),
migrations.AlterField(
model_name='dependency',
name='bin_providers',
field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
),
migrations.AlterField(
model_name='dependency',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
),
migrations.AlterField(
model_name='dependency',
name='custom_cmds',
field=models.JSONField(blank=True, default=dict, help_text="JSON map of provider -> custom install command (e.g., {'apt': 'apt install -y wget'})"),
),
migrations.AlterField(
model_name='dependency',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='installedbinary',
name='dependency',
field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='installedbinary_set', to='machine.dependency'),
),
migrations.AlterField(
model_name='installedbinary',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='machine',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
),
migrations.AlterField(
model_name='machine',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='networkinterface',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
]

View File

@@ -0,0 +1,38 @@
# Generated manually on 2025-12-26
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0001_squashed'),
]
operations = [
migrations.RenameField(
model_name='dependency',
old_name='custom_cmds',
new_name='overrides',
),
migrations.AlterField(
model_name='dependency',
name='bin_name',
field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
),
migrations.AlterField(
model_name='dependency',
name='bin_providers',
field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
),
migrations.AlterField(
model_name='dependency',
name='overrides',
field=models.JSONField(blank=True, default=dict, help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}"),
),
migrations.AlterField(
model_name='dependency',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
),
]

View File

@@ -109,13 +109,13 @@ class NetworkInterface(ModelWithHealthStats):
class DependencyManager(models.Manager): class DependencyManager(models.Manager):
def get_or_create_for_extractor(self, bin_name: str, bin_providers: str = '*', custom_cmds: dict = None, config: dict = None) -> 'Dependency': def get_or_create_for_extractor(self, bin_name: str, bin_providers: str = '*', overrides: dict = None, config: dict = None) -> 'Dependency':
"""Get or create a Dependency for an extractor's binary.""" """Get or create a Dependency for an extractor's binary."""
dependency, created = self.get_or_create( dependency, created = self.get_or_create(
bin_name=bin_name, bin_name=bin_name,
defaults={ defaults={
'bin_providers': bin_providers, 'bin_providers': bin_providers,
'custom_cmds': custom_cmds or {}, 'overrides': overrides or {},
'config': config or {}, 'config': config or {},
} }
) )
@@ -132,11 +132,11 @@ class Dependency(models.Model):
Example: Example:
Dependency.objects.get_or_create( Dependency.objects.get_or_create(
bin_name='wget', bin_name='wget',
bin_providers='apt,brew,nix,custom', bin_providers='apt,brew,pip,env',
custom_cmds={ overrides={
'apt': 'apt install -y --no-install-recommends wget', 'apt': {'packages': ['wget']},
'brew': 'brew install wget', 'brew': {'packages': ['wget']},
'custom': 'curl https://example.com/get-wget.sh | bash', 'pip': {'packages': ['wget']},
} }
) )
""" """
@@ -161,8 +161,8 @@ class Dependency(models.Model):
help_text="Binary executable name (e.g., wget, yt-dlp, chromium)") help_text="Binary executable name (e.g., wget, yt-dlp, chromium)")
bin_providers = models.CharField(max_length=127, default='*', bin_providers = models.CharField(max_length=127, default='*',
help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any") help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any")
custom_cmds = models.JSONField(default=dict, blank=True, overrides = models.JSONField(default=dict, blank=True,
help_text="JSON map of provider -> custom install command (e.g., {'apt': 'apt install -y wget'})") help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}")
config = models.JSONField(default=dict, blank=True, config = models.JSONField(default=dict, blank=True,
help_text="JSON map of env var config to use during install") help_text="JSON map of env var config to use during install")
@@ -181,9 +181,9 @@ class Dependency(models.Model):
return True return True
return provider in self.bin_providers.split(',') return provider in self.bin_providers.split(',')
def get_install_cmd(self, provider: str) -> str | None: def get_overrides_for_provider(self, provider: str) -> dict | None:
"""Get the install command for a provider, or None for default.""" """Get the overrides for a provider, or None if not specified."""
return self.custom_cmds.get(provider) return self.overrides.get(provider)
@property @property
def installed_binaries(self): def installed_binaries(self):
@@ -195,6 +195,85 @@ class Dependency(models.Model):
"""Check if at least one valid InstalledBinary exists for this dependency.""" """Check if at least one valid InstalledBinary exists for this dependency."""
return self.installed_binaries.filter(abspath__isnull=False).exclude(abspath='').exists() return self.installed_binaries.filter(abspath__isnull=False).exclude(abspath='').exists()
def run(self):
"""
Execute dependency installation by running all on_Dependency hooks.
Each hook checks if it can handle this dependency and installs if possible.
Returns the InstalledBinary record on success, None on failure.
"""
import json
from pathlib import Path
from django.conf import settings
# Check if already installed
if self.is_installed:
return self.installed_binaries.first()
# Import here to avoid circular dependency
from archivebox.hooks import run_hooks
# Create output directory
DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd())
output_dir = Path(DATA_DIR) / 'tmp' / f'dependency_{self.id}'
output_dir.mkdir(parents=True, exist_ok=True)
# Build kwargs for hooks - pass overrides as JSON string
hook_kwargs = {
'dependency_id': str(self.id),
'bin_name': self.bin_name,
'bin_providers': self.bin_providers,
'overrides': json.dumps(self.overrides) if self.overrides else None,
}
# Run all on_Dependency hooks - each decides if it can handle this
results = run_hooks(
event_name='Dependency',
output_dir=output_dir,
timeout=600,
**hook_kwargs
)
# Process results - parse JSONL and create InstalledBinary records
for result in results:
if result['returncode'] != 0:
continue
# Parse JSONL output
for line in result['stdout'].strip().split('\n'):
if not line.strip():
continue
try:
obj = json.loads(line)
if obj.get('type') == 'InstalledBinary':
# Create InstalledBinary record
if not obj.get('name') or not obj.get('abspath') or not obj.get('version'):
continue
machine = Machine.current()
installed_binary, _ = InstalledBinary.objects.update_or_create(
machine=machine,
name=obj['name'],
defaults={
'abspath': obj['abspath'],
'version': obj['version'],
'sha256': obj.get('sha256') or '',
'binprovider': obj.get('binprovider') or 'env',
'dependency': self,
}
)
# Success! Return the installed binary
if self.is_installed:
return installed_binary
except json.JSONDecodeError:
continue
# Failed to install with any hook
return None
class InstalledBinaryManager(models.Manager): class InstalledBinaryManager(models.Manager):
def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'InstalledBinary': def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'InstalledBinary':

View File

@@ -25,7 +25,8 @@ AptProvider.model_rebuild()
@click.option('--bin-name', required=True, help="Binary name to install") @click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)") @click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command (overrides default)") @click.option('--custom-cmd', default=None, help="Custom install command (overrides default)")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None): @click.option('--overrides', default=None, help="JSON-encoded overrides dict")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
"""Install binary using apt package manager.""" """Install binary using apt package manager."""
# Check if apt provider is allowed # Check if apt provider is allowed
@@ -42,7 +43,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
click.echo(f"Installing {bin_name} via apt...", err=True) click.echo(f"Installing {bin_name} via apt...", err=True)
try: try:
binary = Binary(name=bin_name, binproviders=[provider]).install() # Parse overrides if provided
overrides_dict = None
if overrides:
try:
overrides_dict = json.loads(overrides)
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
except json.JSONDecodeError:
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
except Exception as e: except Exception as e:
click.echo(f"apt install failed: {e}", err=True) click.echo(f"apt install failed: {e}", err=True)
sys.exit(1) sys.exit(1)

View File

@@ -25,7 +25,8 @@ BrewProvider.model_rebuild()
@click.option('--bin-name', required=True, help="Binary name to install") @click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)") @click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command") @click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None): @click.option('--overrides', default=None, help="JSON-encoded overrides dict")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
"""Install binary using Homebrew.""" """Install binary using Homebrew."""
if bin_providers != '*' and 'brew' not in bin_providers.split(','): if bin_providers != '*' and 'brew' not in bin_providers.split(','):
@@ -41,7 +42,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
click.echo(f"Installing {bin_name} via brew...", err=True) click.echo(f"Installing {bin_name} via brew...", err=True)
try: try:
binary = Binary(name=bin_name, binproviders=[provider]).install() # Parse overrides if provided
overrides_dict = None
if overrides:
try:
overrides_dict = json.loads(overrides)
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
except json.JSONDecodeError:
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
except Exception as e: except Exception as e:
click.echo(f"brew install failed: {e}", err=True) click.echo(f"brew install failed: {e}", err=True)
sys.exit(1) sys.exit(1)

View File

@@ -6,103 +6,29 @@ Runs at crawl start to verify Chrome is available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
# Common Chrome/Chromium binary names and paths
CHROME_NAMES = [
'chromium',
'chromium-browser',
'google-chrome',
'google-chrome-stable',
'chrome',
]
CHROME_PATHS = [
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
'/usr/bin/google-chrome',
'/usr/bin/google-chrome-stable',
'/usr/bin/chromium',
'/usr/bin/chromium-browser',
'/snap/bin/chromium',
'/opt/google/chrome/chrome',
]
def get_binary_version(abspath: str) -> str | None:
"""Get version string from Chrome binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout:
# Chrome version string: "Google Chrome 120.0.6099.109" or "Chromium 120.0.6099.109"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
# Find version number (looks like 120.0.6099.109)
for part in parts:
if '.' in part and part[0].isdigit():
return part
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_chrome() -> dict | None: def find_chrome() -> dict | None:
"""Find Chrome/Chromium binary.""" """Find Chrome/Chromium binary."""
# Check env var first try:
env_path = os.environ.get('CHROME_BINARY', '') from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
if env_path and Path(env_path).is_file():
return {
'name': 'chrome',
'abspath': env_path,
'version': get_binary_version(env_path),
'sha256': get_binary_hash(env_path),
'binprovider': 'env',
}
# Try shutil.which for various names # Try common Chrome/Chromium binary names
for name in CHROME_NAMES: for name in ['google-chrome', 'chromium', 'chromium-browser', 'google-chrome-stable', 'chrome']:
abspath = shutil.which(name) binary = Binary(name=name, binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
if abspath: loaded = binary.load()
return { if loaded and loaded.abspath:
'name': 'chrome', return {
'abspath': abspath, 'name': 'chrome',
'version': get_binary_version(abspath), 'abspath': str(loaded.abspath),
'sha256': get_binary_hash(abspath), 'version': str(loaded.version) if loaded.version else None,
'binprovider': 'env', 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
} 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
# Check common paths except Exception:
for path in CHROME_PATHS: pass
if Path(path).is_file():
return {
'name': 'chrome',
'abspath': path,
'version': get_binary_version(path),
'sha256': get_binary_hash(path),
'binprovider': 'env',
}
return None return None

View File

@@ -6,39 +6,8 @@ Runs at crawl start to verify forum-dl binary is available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, version_flag],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_forumdl() -> dict | None: def find_forumdl() -> dict | None:
@@ -46,11 +15,7 @@ def find_forumdl() -> dict | None:
try: try:
from abx_pkg import Binary, PipProvider, EnvProvider from abx_pkg import Binary, PipProvider, EnvProvider
class ForumdlBinary(Binary): binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()])
name: str = 'forum-dl'
binproviders_supported = [PipProvider(), EnvProvider()]
binary = ForumdlBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -60,22 +25,9 @@ def find_forumdl() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('forum-dl') or os.environ.get('FORUMDL_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'forum-dl',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None
@@ -86,7 +38,7 @@ def main():
missing_deps = [] missing_deps = []
# Emit results for forum-dl # Emit results for forum-dl
if forumdl_result and forumdl_result.get('abspath'): if forumdl_result and forumdl_result.get('abspath') and forumdl_result.get('version'):
print(json.dumps({ print(json.dumps({
'type': 'InstalledBinary', 'type': 'InstalledBinary',
'name': forumdl_result['name'], 'name': forumdl_result['name'],
@@ -111,10 +63,19 @@ def main():
'value': forumdl_result['version'], 'value': forumdl_result['version'],
})) }))
else: else:
# forum-dl has cchardet dependency that doesn't compile on Python 3.14+
# Provide overrides to install with chardet instead
print(json.dumps({ print(json.dumps({
'type': 'Dependency', 'type': 'Dependency',
'bin_name': 'forum-dl', 'bin_name': 'forum-dl',
'bin_providers': 'pip,env', 'bin_providers': 'pip,env',
'overrides': {
'pip': {
'packages': ['--no-deps', 'forum-dl', 'chardet', 'pydantic', 'beautifulsoup4', 'lxml',
'requests', 'urllib3', 'tenacity', 'python-dateutil',
'html2text', 'warcio']
}
}
})) }))
missing_deps.append('forum-dl') missing_deps.append('forum-dl')

View File

@@ -137,6 +137,8 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
return True, None, '' # Not a forum site - success, no output return True, None, '' # Not a forum site - success, no output
if 'no content' in stderr_lower: if 'no content' in stderr_lower:
return True, None, '' # No forum found - success, no output return True, None, '' # No forum found - success, no output
if 'extractornotfounderror' in stderr_lower:
return True, None, '' # No forum extractor for this URL - success, no output
if result.returncode == 0: if result.returncode == 0:
return True, None, '' # forum-dl exited cleanly, just no forum - success return True, None, '' # forum-dl exited cleanly, just no forum - success

View File

@@ -15,6 +15,7 @@ import json
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
import uuid
from pathlib import Path from pathlib import Path
import pytest import pytest
@@ -24,6 +25,75 @@ FORUMDL_HOOK = PLUGIN_DIR / 'on_Snapshot__53_forumdl.py'
FORUMDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_forumdl.py' FORUMDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_forumdl.py'
TEST_URL = 'https://example.com' TEST_URL = 'https://example.com'
# Module-level cache for installed binary path
_forumdl_binary_path = None
def get_forumdl_binary_path():
"""Get the installed forum-dl binary path from cache or by running validation/installation."""
global _forumdl_binary_path
if _forumdl_binary_path:
return _forumdl_binary_path
# Run validation hook to find or install binary
result = subprocess.run(
[sys.executable, str(FORUMDL_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=300
)
# Check if binary was found
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary' and record.get('name') == 'forum-dl':
_forumdl_binary_path = record.get('abspath')
return _forumdl_binary_path
elif record.get('type') == 'Dependency' and record.get('bin_name') == 'forum-dl':
# Need to install via pip hook
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Dependency__install_using_pip_provider.py'
dependency_id = str(uuid.uuid4())
# Build command with overrides if present
cmd = [
sys.executable, str(pip_hook),
'--dependency-id', dependency_id,
'--bin-name', record['bin_name']
]
if 'overrides' in record:
cmd.extend(['--overrides', json.dumps(record['overrides'])])
install_result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300
)
# Parse InstalledBinary from pip installation
for install_line in install_result.stdout.strip().split('\n'):
if install_line.strip():
try:
install_record = json.loads(install_line)
if install_record.get('type') == 'InstalledBinary' and install_record.get('name') == 'forum-dl':
_forumdl_binary_path = install_record.get('abspath')
return _forumdl_binary_path
except json.JSONDecodeError:
pass
# Installation failed - print debug info
if not _forumdl_binary_path:
print(f"\n=== forum-dl installation failed ===", file=sys.stderr)
print(f"stdout: {install_result.stdout}", file=sys.stderr)
print(f"stderr: {install_result.stderr}", file=sys.stderr)
print(f"returncode: {install_result.returncode}", file=sys.stderr)
return None
except json.JSONDecodeError:
pass
return None
def test_hook_script_exists(): def test_hook_script_exists():
"""Verify on_Snapshot hook exists.""" """Verify on_Snapshot hook exists."""
assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}" assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}"
@@ -64,38 +134,40 @@ def test_forumdl_validate_hook():
def test_verify_deps_with_abx_pkg(): def test_verify_deps_with_abx_pkg():
"""Verify forum-dl is available via abx-pkg.""" """Verify forum-dl is installed by calling the REAL validation and installation hooks."""
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides binary_path = get_forumdl_binary_path()
assert binary_path, (
missing_binaries = [] "forum-dl must be installed successfully via validation hook and pip provider. "
"NOTE: forum-dl has a dependency on cchardet which does not compile on Python 3.14+ "
# Verify forum-dl is available "due to removed longintrepr.h header. This is a known compatibility issue with forum-dl."
forumdl_binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()]) )
forumdl_loaded = forumdl_binary.load() assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
if not (forumdl_loaded and forumdl_loaded.abspath):
missing_binaries.append('forum-dl')
if missing_binaries:
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
def test_handles_non_forum_url(): def test_handles_non_forum_url():
"""Test that forum-dl extractor handles non-forum URLs gracefully via hook.""" """Test that forum-dl extractor handles non-forum URLs gracefully via hook."""
# Prerequisites checked by earlier test import os
binary_path = get_forumdl_binary_path()
assert binary_path, "Binary must be installed for this test"
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir) tmpdir = Path(tmpdir)
env = os.environ.copy()
env['FORUMDL_BINARY'] = binary_path
# Run forum-dl extraction hook on non-forum URL # Run forum-dl extraction hook on non-forum URL
result = subprocess.run( result = subprocess.run(
[sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
cwd=tmpdir, cwd=tmpdir,
capture_output=True, capture_output=True,
text=True, text=True,
env=env,
timeout=60 timeout=60
) )
# Should exit 0 even for non-forum URL # Should exit 0 even for non-forum URL (graceful handling)
assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}" assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}"
# Verify JSONL output # Verify JSONL output
@@ -138,8 +210,12 @@ def test_config_timeout():
"""Test that FORUMDL_TIMEOUT config is respected.""" """Test that FORUMDL_TIMEOUT config is respected."""
import os import os
binary_path = get_forumdl_binary_path()
assert binary_path, "Binary must be installed for this test"
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy() env = os.environ.copy()
env['FORUMDL_BINARY'] = binary_path
env['FORUMDL_TIMEOUT'] = '5' env['FORUMDL_TIMEOUT'] = '5'
result = subprocess.run( result = subprocess.run(

View File

@@ -6,39 +6,8 @@ Runs at crawl start to verify gallery-dl binary is available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, version_flag],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_gallerydl() -> dict | None: def find_gallerydl() -> dict | None:
@@ -46,11 +15,7 @@ def find_gallerydl() -> dict | None:
try: try:
from abx_pkg import Binary, PipProvider, EnvProvider from abx_pkg import Binary, PipProvider, EnvProvider
class GalleryDlBinary(Binary): binary = Binary(name='gallery-dl', binproviders=[PipProvider(), EnvProvider()])
name: str = 'gallery-dl'
binproviders_supported = [PipProvider(), EnvProvider()]
binary = GalleryDlBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -60,22 +25,9 @@ def find_gallerydl() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('gallery-dl') or os.environ.get('GALLERYDL_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'gallery-dl',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None

View File

@@ -6,43 +6,8 @@ Runs at crawl start to verify git is available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# git version string: "git version 2.43.0"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
if len(parts) >= 3 and parts[0] == 'git':
return parts[2]
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_git() -> dict | None: def find_git() -> dict | None:
@@ -50,11 +15,7 @@ def find_git() -> dict | None:
try: try:
from abx_pkg import Binary, EnvProvider from abx_pkg import Binary, EnvProvider
class GitBinary(Binary): binary = Binary(name='git', binproviders=[EnvProvider()])
name: str = 'git'
binproviders_supported = [EnvProvider()]
binary = GitBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -64,22 +25,9 @@ def find_git() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('git') or os.environ.get('GIT_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'git',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None

View File

@@ -6,51 +6,16 @@ Runs at crawl start to verify yt-dlp and required binaries are available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, version_flag],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_ytdlp() -> dict | None: def find_ytdlp() -> dict | None:
"""Find yt-dlp binary.""" """Find yt-dlp binary."""
try: try:
from abx_pkg import Binary, PipProvider, EnvProvider from abx_pkg import Binary, PipProvider, BrewProvider, AptProvider, EnvProvider
class YtdlpBinary(Binary): binary = Binary(name='yt-dlp', binproviders=[PipProvider(), BrewProvider(), AptProvider(), EnvProvider()])
name: str = 'yt-dlp'
binproviders_supported = [PipProvider(), EnvProvider()]
binary = YtdlpBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -60,22 +25,9 @@ def find_ytdlp() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('yt-dlp') or os.environ.get('YTDLP_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'yt-dlp',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None
@@ -84,12 +36,7 @@ def find_node() -> dict | None:
try: try:
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
class NodeBinary(Binary): binary = Binary(name='node', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
name: str = 'node'
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
overrides: dict = {'apt': {'packages': ['nodejs']}}
binary = NodeBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -99,22 +46,9 @@ def find_node() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('node') or os.environ.get('NODE_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'node',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None
@@ -123,11 +57,7 @@ def find_ffmpeg() -> dict | None:
try: try:
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
class FfmpegBinary(Binary): binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
name: str = 'ffmpeg'
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
binary = FfmpegBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -137,22 +67,9 @@ def find_ffmpeg() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('ffmpeg') or os.environ.get('FFMPEG_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'ffmpeg',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None
@@ -197,7 +114,7 @@ def main():
print(json.dumps({ print(json.dumps({
'type': 'Dependency', 'type': 'Dependency',
'bin_name': 'yt-dlp', 'bin_name': 'yt-dlp',
'bin_providers': 'pip,env', 'bin_providers': 'pip,brew,apt,env',
})) }))
missing_deps.append('yt-dlp') missing_deps.append('yt-dlp')
@@ -227,10 +144,14 @@ def main():
'value': node_result['version'], 'value': node_result['version'],
})) }))
else: else:
# node is installed as 'nodejs' package on apt
print(json.dumps({ print(json.dumps({
'type': 'Dependency', 'type': 'Dependency',
'bin_name': 'node', 'bin_name': 'node',
'bin_providers': 'apt,brew,env', 'bin_providers': 'apt,brew,env',
'overrides': {
'apt': {'packages': ['nodejs']}
}
})) }))
missing_deps.append('node') missing_deps.append('node')

View File

@@ -6,39 +6,8 @@ Runs at crawl start to verify postlight-parser is available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_mercury() -> dict | None: def find_mercury() -> dict | None:
@@ -46,12 +15,7 @@ def find_mercury() -> dict | None:
try: try:
from abx_pkg import Binary, NpmProvider, EnvProvider from abx_pkg import Binary, NpmProvider, EnvProvider
class MercuryBinary(Binary): binary = Binary(name='postlight-parser', binproviders=[NpmProvider(), EnvProvider()])
name: str = 'postlight-parser'
binproviders_supported = [NpmProvider(), EnvProvider()]
overrides: dict = {'npm': {'packages': ['@postlight/parser']}}
binary = MercuryBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -61,22 +25,9 @@ def find_mercury() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('postlight-parser') or os.environ.get('MERCURY_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'postlight-parser',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None
@@ -110,10 +61,14 @@ def main():
sys.exit(0) sys.exit(0)
else: else:
# postlight-parser is installed as @postlight/parser in npm
print(json.dumps({ print(json.dumps({
'type': 'Dependency', 'type': 'Dependency',
'bin_name': 'postlight-parser', 'bin_name': 'postlight-parser',
'bin_providers': 'npm,env', 'bin_providers': 'npm,env',
'overrides': {
'npm': {'packages': ['@postlight/parser']}
}
})) }))
print(f"postlight-parser binary not found", file=sys.stderr) print(f"postlight-parser binary not found", file=sys.stderr)
sys.exit(1) sys.exit(1)

View File

@@ -25,7 +25,8 @@ NpmProvider.model_rebuild()
@click.option('--bin-name', required=True, help="Binary name to install") @click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)") @click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command") @click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None): @click.option('--overrides', default=None, help="JSON-encoded overrides dict")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
"""Install binary using npm.""" """Install binary using npm."""
if bin_providers != '*' and 'npm' not in bin_providers.split(','): if bin_providers != '*' and 'npm' not in bin_providers.split(','):
@@ -41,7 +42,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
click.echo(f"Installing {bin_name} via npm...", err=True) click.echo(f"Installing {bin_name} via npm...", err=True)
try: try:
binary = Binary(name=bin_name, binproviders=[provider]).install() # Parse overrides if provided
overrides_dict = None
if overrides:
try:
overrides_dict = json.loads(overrides)
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
except json.JSONDecodeError:
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
except Exception as e: except Exception as e:
click.echo(f"npm install failed: {e}", err=True) click.echo(f"npm install failed: {e}", err=True)
sys.exit(1) sys.exit(1)

View File

@@ -6,39 +6,8 @@ Runs at crawl start to verify papers-dl binary is available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, version_flag],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_papersdl() -> dict | None: def find_papersdl() -> dict | None:
@@ -46,11 +15,7 @@ def find_papersdl() -> dict | None:
try: try:
from abx_pkg import Binary, PipProvider, EnvProvider from abx_pkg import Binary, PipProvider, EnvProvider
class PapersdlBinary(Binary): binary = Binary(name='papers-dl', binproviders=[PipProvider(), EnvProvider()])
name: str = 'papers-dl'
binproviders_supported = [PipProvider(), EnvProvider()]
binary = PapersdlBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -60,22 +25,9 @@ def find_papersdl() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('papers-dl') or os.environ.get('PAPERSDL_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'papers-dl',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None

View File

@@ -15,6 +15,7 @@ import json
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
import uuid
from pathlib import Path from pathlib import Path
import pytest import pytest
@@ -24,6 +25,67 @@ PAPERSDL_HOOK = PLUGIN_DIR / 'on_Snapshot__54_papersdl.py'
PAPERSDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_papersdl.py' PAPERSDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_papersdl.py'
TEST_URL = 'https://example.com' TEST_URL = 'https://example.com'
# Module-level cache for installed binary path
_papersdl_binary_path = None
def get_papersdl_binary_path():
"""Get the installed papers-dl binary path from cache or by running validation/installation."""
global _papersdl_binary_path
if _papersdl_binary_path:
return _papersdl_binary_path
# Run validation hook to find or install binary
result = subprocess.run(
[sys.executable, str(PAPERSDL_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=300
)
# Check if binary was found
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary' and record.get('name') == 'papers-dl':
_papersdl_binary_path = record.get('abspath')
return _papersdl_binary_path
elif record.get('type') == 'Dependency' and record.get('bin_name') == 'papers-dl':
# Need to install via pip hook
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Dependency__install_using_pip_provider.py'
dependency_id = str(uuid.uuid4())
# Build command with overrides if present
cmd = [
sys.executable, str(pip_hook),
'--dependency-id', dependency_id,
'--bin-name', record['bin_name']
]
if 'overrides' in record:
cmd.extend(['--overrides', json.dumps(record['overrides'])])
install_result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300
)
# Parse InstalledBinary from pip installation
for install_line in install_result.stdout.strip().split('\n'):
if install_line.strip():
try:
install_record = json.loads(install_line)
if install_record.get('type') == 'InstalledBinary' and install_record.get('name') == 'papers-dl':
_papersdl_binary_path = install_record.get('abspath')
return _papersdl_binary_path
except json.JSONDecodeError:
pass
except json.JSONDecodeError:
pass
return None
def test_hook_script_exists(): def test_hook_script_exists():
"""Verify on_Snapshot hook exists.""" """Verify on_Snapshot hook exists."""
assert PAPERSDL_HOOK.exists(), f"Hook not found: {PAPERSDL_HOOK}" assert PAPERSDL_HOOK.exists(), f"Hook not found: {PAPERSDL_HOOK}"
@@ -64,34 +126,32 @@ def test_papersdl_validate_hook():
def test_verify_deps_with_abx_pkg(): def test_verify_deps_with_abx_pkg():
"""Verify papers-dl is available via abx-pkg.""" """Verify papers-dl is installed by calling the REAL validation and installation hooks."""
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides binary_path = get_papersdl_binary_path()
assert binary_path, "papers-dl must be installed successfully via validation hook and pip provider"
missing_binaries = [] assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
# Verify papers-dl is available
papersdl_binary = Binary(name='papers-dl', binproviders=[PipProvider(), EnvProvider()])
papersdl_loaded = papersdl_binary.load()
if not (papersdl_loaded and papersdl_loaded.abspath):
missing_binaries.append('papers-dl')
if missing_binaries:
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
def test_handles_non_paper_url(): def test_handles_non_paper_url():
"""Test that papers-dl extractor handles non-paper URLs gracefully via hook.""" """Test that papers-dl extractor handles non-paper URLs gracefully via hook."""
# Prerequisites checked by earlier test import os
binary_path = get_papersdl_binary_path()
assert binary_path, "Binary must be installed for this test"
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir) tmpdir = Path(tmpdir)
env = os.environ.copy()
env['PAPERSDL_BINARY'] = binary_path
# Run papers-dl extraction hook on non-paper URL # Run papers-dl extraction hook on non-paper URL
result = subprocess.run( result = subprocess.run(
[sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], [sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
cwd=tmpdir, cwd=tmpdir,
capture_output=True, capture_output=True,
text=True, text=True,
env=env,
timeout=60 timeout=60
) )
@@ -138,8 +198,12 @@ def test_config_timeout():
"""Test that PAPERSDL_TIMEOUT config is respected.""" """Test that PAPERSDL_TIMEOUT config is respected."""
import os import os
binary_path = get_papersdl_binary_path()
assert binary_path, "Binary must be installed for this test"
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy() env = os.environ.copy()
env['PAPERSDL_BINARY'] = binary_path
env['PAPERSDL_TIMEOUT'] = '5' env['PAPERSDL_TIMEOUT'] = '5'
result = subprocess.run( result = subprocess.run(

View File

@@ -25,7 +25,8 @@ PipProvider.model_rebuild()
@click.option('--bin-name', required=True, help="Binary name to install") @click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)") @click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command") @click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None): @click.option('--overrides', default=None, help="JSON-encoded overrides dict")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None, overrides: str | None):
"""Install binary using pip.""" """Install binary using pip."""
if bin_providers != '*' and 'pip' not in bin_providers.split(','): if bin_providers != '*' and 'pip' not in bin_providers.split(','):
@@ -41,7 +42,16 @@ def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str
click.echo(f"Installing {bin_name} via pip...", err=True) click.echo(f"Installing {bin_name} via pip...", err=True)
try: try:
binary = Binary(name=bin_name, binproviders=[provider]).install() # Parse overrides if provided
overrides_dict = None
if overrides:
try:
overrides_dict = json.loads(overrides)
click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
except json.JSONDecodeError:
click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
binary = Binary(name=bin_name, binproviders=[provider], overrides=overrides_dict or {}).install()
except Exception as e: except Exception as e:
click.echo(f"pip install failed: {e}", err=True) click.echo(f"pip install failed: {e}", err=True)
sys.exit(1) sys.exit(1)

View File

@@ -6,39 +6,8 @@ Runs at crawl start to verify readability-extractor is available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_readability() -> dict | None: def find_readability() -> dict | None:
@@ -46,12 +15,7 @@ def find_readability() -> dict | None:
try: try:
from abx_pkg import Binary, NpmProvider, EnvProvider from abx_pkg import Binary, NpmProvider, EnvProvider
class ReadabilityBinary(Binary): binary = Binary(name='readability-extractor', binproviders=[NpmProvider(), EnvProvider()])
name: str = 'readability-extractor'
binproviders_supported = [NpmProvider(), EnvProvider()]
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
binary = ReadabilityBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -61,22 +25,9 @@ def find_readability() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'readability-extractor',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None
@@ -110,10 +61,14 @@ def main():
sys.exit(0) sys.exit(0)
else: else:
# readability-extractor is installed from GitHub
print(json.dumps({ print(json.dumps({
'type': 'Dependency', 'type': 'Dependency',
'bin_name': 'readability-extractor', 'bin_name': 'readability-extractor',
'bin_providers': 'npm,env', 'bin_providers': 'npm,env',
'overrides': {
'npm': {'packages': ['github:ArchiveBox/readability-extractor']}
}
})) }))
print(f"readability-extractor binary not found", file=sys.stderr) print(f"readability-extractor binary not found", file=sys.stderr)
sys.exit(1) sys.exit(1)

View File

@@ -9,67 +9,25 @@ Outputs JSONL for InstalledBinary and Machine config updates.
import os import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from ripgrep binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# ripgrep version string: "ripgrep 14.1.0"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
for i, part in enumerate(parts):
if part.lower() == 'ripgrep' and i + 1 < len(parts):
return parts[i + 1]
# Try to find version number pattern
for part in parts:
if part[0].isdigit() and '.' in part:
return part
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_ripgrep() -> dict | None: def find_ripgrep() -> dict | None:
"""Find ripgrep binary using shutil.which or env var.""" """Find ripgrep binary."""
# Check env var first - if it's an absolute path and exists, use it try:
ripgrep_env = os.environ.get('RIPGREP_BINARY', '') from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
if ripgrep_env and '/' in ripgrep_env and Path(ripgrep_env).is_file():
abspath = ripgrep_env
else:
# Otherwise try shutil.which with the env var as the binary name
abspath = shutil.which(ripgrep_env) if ripgrep_env else None
if not abspath:
abspath = shutil.which('rg')
if abspath and Path(abspath).is_file(): binary = Binary(name='rg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
return { loaded = binary.load()
'name': 'rg', if loaded and loaded.abspath:
'abspath': abspath, return {
'version': get_binary_version(abspath), 'name': 'rg',
'sha256': get_binary_hash(abspath), 'abspath': str(loaded.abspath),
'binprovider': 'env', 'version': str(loaded.version) if loaded.version else None,
} 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
except Exception:
pass
return None return None

View File

@@ -6,82 +6,27 @@ Runs at crawl start to verify single-file (npm package) is available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from single-file binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout:
return result.stdout.strip().split('\n')[0][:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
# For scripts, hash the script content
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_singlefile() -> dict | None: def find_singlefile() -> dict | None:
"""Find single-file binary.""" """Find single-file binary."""
# Check env var first try:
env_path = os.environ.get('SINGLEFILE_BINARY', '') from abx_pkg import Binary, NpmProvider, EnvProvider
if env_path and Path(env_path).is_file():
return {
'name': 'single-file',
'abspath': env_path,
'version': get_binary_version(env_path),
'sha256': get_binary_hash(env_path),
'binprovider': 'env',
}
# Try shutil.which binary = Binary(name='single-file', binproviders=[NpmProvider(), EnvProvider()])
for name in ['single-file', 'singlefile']: loaded = binary.load()
abspath = shutil.which(name) if loaded and loaded.abspath:
if abspath:
return { return {
'name': 'single-file', 'name': 'single-file',
'abspath': abspath, 'abspath': str(loaded.abspath),
'version': get_binary_version(abspath), 'version': str(loaded.version) if loaded.version else None,
'sha256': get_binary_hash(abspath), 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': 'npm', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
# Check common npm paths
npm_paths = [
Path.home() / '.npm-global/bin/single-file',
Path.home() / 'node_modules/.bin/single-file',
Path('/usr/local/bin/single-file'),
Path('/usr/local/lib/node_modules/.bin/single-file'),
]
for path in npm_paths:
if path.is_file():
return {
'name': 'single-file',
'abspath': str(path),
'version': get_binary_version(str(path)),
'sha256': get_binary_hash(str(path)),
'binprovider': 'npm',
} }
except Exception:
pass
return None return None

View File

@@ -6,58 +6,16 @@ Runs at crawl start to verify wget is available.
Outputs JSONL for InstalledBinary and Machine config updates. Outputs JSONL for InstalledBinary and Machine config updates.
""" """
import os
import sys import sys
import json import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# wget version string: "GNU Wget 1.24.5 built on ..."
first_line = result.stdout.strip().split('\n')[0]
# Extract version number
parts = first_line.split()
for i, part in enumerate(parts):
if part.lower() == 'wget' and i + 1 < len(parts):
return parts[i + 1]
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_wget() -> dict | None: def find_wget() -> dict | None:
"""Find wget binary using abx-pkg or fallback to shutil.which.""" """Find wget binary using abx-pkg."""
# Try abx-pkg first
try: try:
from abx_pkg import Binary, EnvProvider from abx_pkg import Binary, EnvProvider
class WgetBinary(Binary): binary = Binary(name='wget', binproviders=[EnvProvider()])
name: str = 'wget'
binproviders_supported = [EnvProvider()]
binary = WgetBinary()
loaded = binary.load() loaded = binary.load()
if loaded and loaded.abspath: if loaded and loaded.abspath:
return { return {
@@ -67,22 +25,9 @@ def find_wget() -> dict | None:
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
} }
except ImportError:
pass
except Exception: except Exception:
pass pass
# Fallback to shutil.which
abspath = shutil.which('wget') or os.environ.get('WGET_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'wget',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None return None