improve plugin tests and config

This commit is contained in:
Nick Sweeting
2025-12-29 00:45:23 -08:00
parent f0aa19fa7d
commit 1e4d3ffd11
126 changed files with 2286 additions and 1717 deletions

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"ACCESSIBILITY_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_ACCESSIBILITY", "USE_ACCESSIBILITY"],
"description": "Enable accessibility tree capture"
},
"ACCESSIBILITY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for accessibility capture in seconds"
}
}
}

View File

@@ -212,13 +212,13 @@ async function main() {
try {
// Check if enabled
if (!getEnvBool('SAVE_ACCESSIBILITY', true)) {
console.log('Skipping accessibility (SAVE_ACCESSIBILITY=False)');
if (!getEnvBool('ACCESSIBILITY_ENABLED', true)) {
console.log('Skipping accessibility (ACCESSIBILITY_ENABLED=False)');
// Output clean JSONL (no RESULT_JSON= prefix)
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'SAVE_ACCESSIBILITY=False',
output_str: 'ACCESSIBILITY_ENABLED=False',
}));
process.exit(0);
}

View File

@@ -67,6 +67,8 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'apt',
'machine_id': machine_id,
'binary_id': binary_id,
}
print(json.dumps(record))

View File

@@ -3,20 +3,20 @@
"type": "object",
"additionalProperties": false,
"properties": {
"ARCHIVE_ORG_ENABLED": {
"ARCHIVEDOTORG_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_ARCHIVE_DOT_ORG", "USE_ARCHIVE_ORG", "SUBMIT_ARCHIVE_DOT_ORG"],
"x-aliases": ["SAVE_ARCHIVEDOTORG", "USE_ARCHIVEDOTORG", "SUBMIT_ARCHIVEDOTORG"],
"description": "Submit URLs to archive.org Wayback Machine"
},
"ARCHIVE_ORG_TIMEOUT": {
"ARCHIVEDOTORG_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for archive.org submission in seconds"
},
"ARCHIVE_ORG_USER_AGENT": {
"ARCHIVEDOTORG_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",

View File

@@ -6,10 +6,10 @@ Usage: on_Snapshot__archive_org.py --url=<url> --snapshot-id=<uuid>
Output: Writes archive.org.txt to $PWD with the archived URL
Environment variables:
ARCHIVE_ORG_TIMEOUT: Timeout in seconds (default: 60)
ARCHIVEDOTORG_TIMEOUT: Timeout in seconds (default: 60)
USER_AGENT: User agent string
# Fallback to ARCHIVING_CONFIG values if ARCHIVE_ORG_* not set:
# Fallback to ARCHIVING_CONFIG values if ARCHIVEDOTORG_* not set:
TIMEOUT: Fallback timeout
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
@@ -52,7 +52,7 @@ def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
except ImportError:
return False, None, 'requests library not installed'
timeout = get_env_int('ARCHIVE_ORG_TIMEOUT') or get_env_int('TIMEOUT', 60)
timeout = get_env_int('ARCHIVEDOTORG_TIMEOUT') or get_env_int('TIMEOUT', 60)
user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
submit_url = f'https://web.archive.org/save/{url}'
@@ -105,31 +105,35 @@ def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
"""Submit a URL to archive.org for archiving."""
output = None
status = 'failed'
error = ''
# Check if feature is enabled
if get_env('ARCHIVEDOTORG_ENABLED', 'True').lower() in ('false', '0', 'no', 'off'):
print('Skipping archive.org submission (ARCHIVEDOTORG_ENABLED=False)', file=sys.stderr)
# Temporary failure (config disabled) - NO JSONL emission
sys.exit(0)
try:
# Run extraction
success, output, error = submit_to_archive_org(url)
status = 'succeeded' if success else 'failed'
if success:
# Success - emit ArchiveResult with output file
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': output or '',
}
print(json.dumps(result))
sys.exit(0)
else:
# Transient error (network, timeout, HTTP error) - emit NO JSONL
# System will retry later
print(f'ERROR: {error}', file=sys.stderr)
sys.exit(1)
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
# Unexpected error - also transient, emit NO JSONL
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':

View File

@@ -1,10 +0,0 @@
{% load config_tags %}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org embed - full iframe view -->
<iframe src="{{ output_path }}"
class="extractor-embed archivedotorg-embed"
style="width: 100%; height: 600px; border: 1px solid #ddd;"
sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}

View File

@@ -1,10 +0,0 @@
{% load config_tags %}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen archivedotorg-fullscreen"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}

View File

@@ -12,16 +12,16 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
ARCHIVE_ORG_HOOK = PLUGIN_DIR / 'on_Snapshot__13_archive_org.py'
ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archive_org.*'), None)
TEST_URL = 'https://example.com'
def test_hook_script_exists():
assert ARCHIVE_ORG_HOOK.exists()
assert ARCHIVEDOTORG_HOOK.exists()
def test_submits_to_archive_org():
with tempfile.TemporaryDirectory() as tmpdir:
result = subprocess.run(
[sys.executable, str(ARCHIVE_ORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
[sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
cwd=tmpdir, capture_output=True, text=True, timeout=60
)
@@ -40,23 +40,29 @@ def test_submits_to_archive_org():
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] in ['succeeded', 'failed'], f"Should succeed or fail: {result_json}"
if result.returncode == 0:
# Success - should have ArchiveResult
assert result_json, "Should have ArchiveResult JSONL output on success"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
else:
# Transient error - no JSONL output, just stderr
assert not result_json, "Should NOT emit JSONL on transient error"
assert result.stderr, "Should have error message in stderr"
def test_config_save_archive_org_false_skips():
with tempfile.TemporaryDirectory() as tmpdir:
import os
env = os.environ.copy()
env['SAVE_ARCHIVE_DOT_ORG'] = 'False'
env['ARCHIVEDOTORG_ENABLED'] = 'False'
result = subprocess.run(
[sys.executable, str(ARCHIVE_ORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
[sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
)
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - no JSONL emission, just logs to stderr
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL
@@ -68,13 +74,20 @@ def test_handles_timeout():
import os
env = os.environ.copy()
env['TIMEOUT'] = '1'
result = subprocess.run(
[sys.executable, str(ARCHIVE_ORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
[sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
)
assert result.returncode in (0, 1)
# Timeout is a transient error - should exit 1 with no JSONL
assert result.returncode in (0, 1), "Should complete without hanging"
# If it timed out (exit 1), should have no JSONL output
if result.returncode == 1:
jsonl_lines = [line for line in result.stdout.strip().split('\n')
if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, "Should not emit JSONL on timeout (transient error)"
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -2,7 +2,7 @@
"""
Install a binary using Homebrew package manager.
Usage: on_Dependency__install_using_brew_provider.py --binary-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
Usage: on_Binary__install_using_brew_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
Output: Binary JSONL record to stdout after installation
Environment variables:
@@ -72,7 +72,7 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
'sha256': binary.sha256 or '',
'binprovider': 'brew',
'machine_id': machine_id,
'dependency_id': dependency_id,
'binary_id': binary_id,
}
print(json.dumps(record))

View File

@@ -1,194 +0,0 @@
#!/usr/bin/env python3
"""
Create symlinks from plugin outputs to canonical legacy locations.
This plugin runs after all extractors complete and creates symlinks from the
new plugin-based output structure to the legacy canonical output paths that
ArchiveBox has historically used. This maintains backward compatibility with
existing tools and scripts that expect outputs at specific locations.
Canonical output paths:
- favicon.ico → favicon/favicon.ico
- singlefile.html → singlefile/singlefile.html
- readability/content.html → readability/content.html
- mercury/content.html → mercury/content.html
- htmltotext.txt → htmltotext/htmltotext.txt
- output.pdf → pdf/output.pdf
- screenshot.png → screenshot/screenshot.png
- output.html → dom/output.html
- headers.json → headers/headers.json
- warc/{timestamp} → wget/warc/{timestamp}
New plugin outputs:
- ssl.json → ssl/ssl.json
- seo.json → seo/seo.json
- accessibility.json → accessibility/accessibility.json
- outlinks.json → outlinks/outlinks.json
- redirects.json → redirects/redirects.json
- console.jsonl → consolelog/console.jsonl
Usage: on_Snapshot__92_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
DATA_DIR: ArchiveBox data directory
ARCHIVE_DIR: Archive output directory
"""
import os
import sys
import json
from pathlib import Path
from typing import Dict
import rich_click as click
# Mapping from canonical path to plugin output path
CANONICAL_MAPPINGS = {
# Legacy extractors
'favicon.ico': 'favicon/favicon.ico',
'singlefile.html': 'singlefile/singlefile.html',
'readability/content.html': 'readability/content.html',
'mercury/content.html': 'mercury/content.html',
'htmltotext.txt': 'htmltotext/htmltotext.txt',
'output.pdf': 'pdf/output.pdf',
'screenshot.png': 'screenshot/screenshot.png',
'output.html': 'dom/output.html',
'headers.json': 'headers/headers.json',
# New plugins
'ssl.json': 'ssl/ssl.json',
'seo.json': 'seo/seo.json',
'accessibility.json': 'accessibility/accessibility.json',
'outlinks.json': 'parse_dom_outlinks/outlinks.json',
'redirects.json': 'redirects/redirects.json',
'console.jsonl': 'consolelog/console.jsonl',
}
def create_symlink(target: Path, link: Path, relative: bool = True) -> bool:
"""
Create a symlink from link to target.
Args:
target: The actual file/directory (source)
link: The symlink to create (destination)
relative: Whether to create a relative symlink (default: True)
Returns:
True if symlink was created or already exists, False otherwise
"""
try:
# Skip if target doesn't exist
if not target.exists():
return False
# Remove existing symlink/file if present
if link.exists() or link.is_symlink():
if link.is_symlink() and link.resolve() == target.resolve():
# Already correctly symlinked
return True
link.unlink()
# Create parent directory
link.parent.mkdir(parents=True, exist_ok=True)
# Create relative or absolute symlink
if relative:
# Calculate relative path from link to target
rel_target = os.path.relpath(target, link.parent)
link.symlink_to(rel_target)
else:
link.symlink_to(target)
return True
except (OSError, FileNotFoundError, PermissionError) as e:
# Symlink creation failed, skip
return False
def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
"""
Create all canonical symlinks for a snapshot directory.
Args:
snapshot_dir: The snapshot directory (e.g., archive/<timestamp>/)
Returns:
Dict mapping canonical path to success status
"""
results = {}
for canonical_path, plugin_output in CANONICAL_MAPPINGS.items():
target = snapshot_dir / plugin_output
link = snapshot_dir / canonical_path
success = create_symlink(target, link, relative=True)
results[canonical_path] = success
# Special handling for warc/ directory symlink
# wget plugin outputs to wget/warc/, but canonical expects warc/ at root
wget_warc = snapshot_dir / 'wget' / 'warc'
canonical_warc = snapshot_dir / 'warc'
if wget_warc.exists():
results['warc/'] = create_symlink(wget_warc, canonical_warc, relative=True)
return results
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
"""Create symlinks from plugin outputs to canonical legacy locations."""
status = 'failed'
output = None
error = ''
symlinks_created = 0
try:
# Check if enabled
save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
if not save_canonical:
status = 'skipped'
click.echo(json.dumps({'status': status, 'output': 'SAVE_CANONICAL_SYMLINKS=false'}))
sys.exit(0)
# Working directory is the extractor output dir (e.g., <snapshot>/canonical_outputs/)
# Parent is the snapshot directory
output_dir = Path.cwd()
snapshot_dir = output_dir.parent
if not snapshot_dir.exists():
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
# Create canonical symlinks
results = create_canonical_symlinks(snapshot_dir)
# Count successful symlinks
symlinks_created = sum(1 for success in results.values() if success)
status = 'succeeded'
output = str(snapshot_dir)
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
click.echo(f'Error: {error}', err=True)
# Print JSON result for hook runner
result = {
'status': status,
'output': output,
'error': error or None,
'symlinks_created': symlinks_created,
}
click.echo(json.dumps(result))
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"CAPTCHA2_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_CAPTCHA2"],
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
},
"CAPTCHA2_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for CAPTCHA solving in seconds"
}
}
}

View File

@@ -20,7 +20,7 @@ const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
const extensionUtils = require('../chrome/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {

View File

@@ -14,8 +14,8 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__01_captcha2.js"
CONFIG_SCRIPT = PLUGIN_DIR / "on_Snapshot__21_captcha2_config.js"
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
def test_install_script_exists():

View File

@@ -97,12 +97,12 @@ def main():
# Get config values
chrome_binary = get_env('CHROME_BINARY', 'chromium')
chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
save_screenshot = get_env_bool('SAVE_SCREENSHOT', True)
save_pdf = get_env_bool('SAVE_PDF', True)
save_dom = get_env_bool('SAVE_DOM', True)
screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True)
pdf_enabled = get_env_bool('PDF_ENABLED', True)
dom_enabled = get_env_bool('DOM_ENABLED', True)
# Compute USE_CHROME (derived from SAVE_* flags)
use_chrome = save_screenshot or save_pdf or save_dom
# Compute USE_CHROME (derived from extractor enabled flags)
use_chrome = screenshot_enabled or pdf_enabled or dom_enabled
computed['USE_CHROME'] = str(use_chrome).lower()
# Detect Docker and adjust sandbox

View File

@@ -24,69 +24,18 @@ import tempfile
import shutil
PLUGIN_DIR = Path(__file__).parent.parent
CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_chrome_install.py'
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = PLUGIN_DIR / 'on_Snapshot__30_chrome_navigate.js'
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
def test_hook_scripts_exist():
"""Verify chrome hooks exist."""
assert CHROME_INSTALL_HOOK.exists(), f"Hook not found: {CHROME_INSTALL_HOOK}"
assert CHROME_LAUNCH_HOOK.exists(), f"Hook not found: {CHROME_LAUNCH_HOOK}"
assert CHROME_TAB_HOOK.exists(), f"Hook not found: {CHROME_TAB_HOOK}"
assert CHROME_NAVIGATE_HOOK.exists(), f"Hook not found: {CHROME_NAVIGATE_HOOK}"
def test_chrome_install_hook():
"""Test chrome install hook checks for Chrome/Chromium binary."""
import os
# Try with explicit CHROME_BINARY first (faster and more reliable)
chrome_app_path = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
if Path(chrome_app_path).exists():
# Use explicit CHROME_BINARY env var
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
env={**os.environ, 'CHROME_BINARY': chrome_app_path},
timeout=30
)
# When CHROME_BINARY is set and valid, hook exits 0 immediately (silent success)
assert result.returncode == 0, f"Should find Chrome at {chrome_app_path}. Error: {result.stderr}"
else:
# Run install hook to find or install Chrome
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=300 # Longer timeout for potential @puppeteer/browsers install
)
if result.returncode == 0:
# Binary found or installed - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Binary':
assert record['name'] == 'chrome'
assert record['abspath']
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output Binary record when binary found"
else:
# Failed to find or install Chrome
pytest.fail(f"Chrome installation failed. Please install Chrome manually or ensure @puppeteer/browsers is available. Error: {result.stderr}")
def test_verify_deps_with_abx_pkg():
"""Verify chrome is available via abx-pkg."""
from abx_pkg import Binary, NpmProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"CONSOLELOG_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_CONSOLELOG", "USE_CONSOLELOG"],
"description": "Enable console log capture"
},
"CONSOLELOG_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for console log capture in seconds"
}
}
}

View File

@@ -207,9 +207,9 @@ async function main() {
process.exit(1);
}
if (!getEnvBool('SAVE_CONSOLELOG', true)) {
console.error('Skipping (SAVE_CONSOLELOG=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'SAVE_CONSOLELOG=False'}));
if (!getEnvBool('CONSOLELOG_ENABLED', true)) {
console.error('Skipping (CONSOLELOG_ENABLED=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'CONSOLELOG_ENABLED=False'}));
process.exit(0);
}

View File

@@ -5,7 +5,7 @@ Install a binary using a custom bash command.
This provider runs arbitrary shell commands to install binaries
that don't fit into standard package managers.
Usage: on_Dependency__install_using_custom_bash.py --dependency-id=<uuid> --bin-name=<name> --custom-cmd=<cmd>
Usage: on_Binary__install_using_custom_bash.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> --custom-cmd=<cmd>
Output: Binary JSONL record to stdout after installation
Environment variables:
@@ -22,22 +22,23 @@ from abx_pkg import Binary, EnvProvider
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--binary-id', required=True, help="Binary UUID")
@click.option('--machine-id', required=True, help="Machine UUID")
@click.option('--name', required=True, help="Binary name to install")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', required=True, help="Custom bash command to run")
def main(dependency_id: str, bin_name: str, binproviders: str, custom_cmd: str):
def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str):
"""Install binary using custom bash command."""
if binproviders != '*' and 'custom' not in binproviders.split(','):
click.echo(f"custom provider not allowed for {bin_name}", err=True)
click.echo(f"custom provider not allowed for {name}", err=True)
sys.exit(0)
if not custom_cmd:
click.echo("custom provider requires --custom-cmd", err=True)
sys.exit(1)
click.echo(f"Installing {bin_name} via custom command: {custom_cmd}", err=True)
click.echo(f"Installing {name} via custom command: {custom_cmd}", err=True)
try:
result = subprocess.run(
@@ -57,13 +58,13 @@ def main(dependency_id: str, bin_name: str, binproviders: str, custom_cmd: str):
# Use abx-pkg to load the binary and get its info
provider = EnvProvider()
try:
binary = Binary(name=bin_name, binproviders=[provider]).load()
binary = Binary(name=name, binproviders=[provider]).load()
except Exception as e:
click.echo(f"{bin_name} not found after custom install: {e}", err=True)
click.echo(f"{name} not found after custom install: {e}", err=True)
sys.exit(1)
if not binary.abspath:
click.echo(f"{bin_name} not found after custom install", err=True)
click.echo(f"{name} not found after custom install", err=True)
sys.exit(1)
machine_id = os.environ.get('MACHINE_ID', '')
@@ -71,18 +72,18 @@ def main(dependency_id: str, bin_name: str, binproviders: str, custom_cmd: str):
# Output Binary JSONL record to stdout
record = {
'type': 'Binary',
'name': bin_name,
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'custom',
'machine_id': machine_id,
'dependency_id': dependency_id,
'binary_id': binary_id,
}
print(json.dumps(record))
# Log human-readable info to stderr
click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
click.echo(f"Installed {name} at {binary.abspath}", err=True)
click.echo(f" version: {binary.version}", err=True)
sys.exit(0)

View File

@@ -15,9 +15,29 @@
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* SAVE_DOM: Enable DOM extraction (default: true)
* DOM_ENABLED: Enable DOM extraction (default: true)
*/
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
// Check if DOM is enabled BEFORE requiring puppeteer
if (!getEnvBool('DOM_ENABLED', true)) {
console.error('Skipping DOM (DOM_ENABLED=False)');
// Temporary failure (config disabled) - NO JSONL emission
process.exit(0);
}
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
@@ -40,18 +60,6 @@ function parseArgs() {
return args;
}
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
@@ -229,18 +237,7 @@ async function main() {
process.exit(1);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
try {
// Check if DOM is enabled
if (!getEnvBool('SAVE_DOM', true)) {
console.error('Skipping DOM (SAVE_DOM=False)');
// Feature disabled - no ArchiveResult, just exit
process.exit(0);
}
// Check if staticfile extractor already handled this (permanent skip)
if (hasStaticFileOutput()) {
console.error(`Skipping DOM - staticfile extractor already downloaded this`);
@@ -251,46 +248,40 @@ async function main() {
output_str: 'staticfile already handled',
}));
process.exit(0);
} else {
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
}
const result = await dumpDom(url);
if (result.success) {
status = 'succeeded';
output = result.output;
const size = fs.statSync(output).size;
console.error(`DOM saved (${size} bytes)`);
} else {
status = 'failed';
error = result.error;
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await dumpDom(url);
if (result.success) {
// Success - emit ArchiveResult
const size = fs.statSync(result.output).size;
console.error(`DOM saved (${size} bytes)`);
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: result.output,
}));
process.exit(0);
} else {
// Transient error - emit NO JSONL
console.error(`ERROR: ${result.error}`);
process.exit(1);
}
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
// Transient error - emit NO JSONL
console.error(`ERROR: ${e.name}: ${e.message}`);
process.exit(1);
}
const endTs = new Date();
if (error) console.error(`ERROR: ${error}`);
// Output clean JSONL (no RESULT_JSON= prefix)
console.log(JSON.stringify({
type: 'ArchiveResult',
status,
output_str: output || error || '',
}));
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {

View File

@@ -1,6 +0,0 @@
<!-- DOM embed - full iframe of captured DOM HTML -->
<iframe src="{{ output_path }}"
class="extractor-embed dom-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>

View File

@@ -1,6 +0,0 @@
<!-- DOM fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen dom-fullscreen"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
</iframe>

View File

@@ -22,9 +22,8 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
DOM_HOOK = PLUGIN_DIR / 'on_Snapshot__36_dom.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
TEST_URL = 'https://example.com'
@@ -33,66 +32,6 @@ def test_hook_script_exists():
assert DOM_HOOK.exists(), f"Hook not found: {DOM_HOOK}"
def test_chrome_validation_and_install():
"""Test chrome install hook to install puppeteer-core if needed."""
# Run chrome install hook (from chrome plugin)
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)
# If exit 1, binary not found - need to install
if result.returncode == 1:
# Parse Dependency request from JSONL
dependency_request = None
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
dependency_request = record
break
except json.JSONDecodeError:
pass
if dependency_request:
bin_name = dependency_request['bin_name']
bin_providers = dependency_request['bin_providers']
# Install via npm provider hook
install_result = subprocess.run(
[
sys.executable,
str(NPM_PROVIDER_HOOK),
'--dependency-id', 'test-dep-001',
'--bin-name', bin_name,
'--bin-providers', bin_providers
],
capture_output=True,
text=True,
timeout=600
)
assert install_result.returncode == 0, f"Install failed: {install_result.stderr}"
# Verify installation via JSONL output
for line in install_result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Binary':
assert record['name'] == bin_name
assert record['abspath']
break
except json.JSONDecodeError:
pass
else:
# Binary already available, verify via JSONL output
assert result.returncode == 0, f"Validation failed: {result.stderr}"
def test_verify_deps_with_abx_pkg():
"""Verify dependencies are available via abx-pkg after hook installation."""
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
@@ -154,13 +93,13 @@ def test_extracts_dom_from_example_com():
def test_config_save_dom_false_skips():
"""Test that SAVE_DOM=False exits without emitting JSONL."""
"""Test that DOM_ENABLED=False exits without emitting JSONL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
env['SAVE_DOM'] = 'False'
env['DOM_ENABLED'] = 'False'
result = subprocess.run(
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -173,8 +112,8 @@ def test_config_save_dom_false_skips():
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - no JSONL emission, just logs to stderr
assert 'Skipping DOM' in result.stderr, "Should log skip reason to stderr"
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping DOM' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]

View File

@@ -5,7 +5,7 @@ Check if a binary is already available in the system PATH.
This is the simplest "provider" - it doesn't install anything,
it just discovers binaries that are already installed.
Usage: on_Dependency__install_using_env_provider.py --binary-id=<uuid> --name=<name>
Usage: on_Binary__install_using_env_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name>
Output: Binary JSONL record to stdout if binary found in PATH
Environment variables:
@@ -56,7 +56,7 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str):
'sha256': binary.sha256 or '',
'binprovider': 'env',
'machine_id': machine_id,
'dependency_id': dependency_id,
'binary_id': binary_id,
}
print(json.dumps(record))

View File

View File

@@ -0,0 +1,9 @@
<!-- Favicon thumbnail - small favicon preview -->
<div class="extractor-thumbnail favicon-thumbnail" style="width: 100%; height: 100px; display: flex; align-items: center; justify-content: center; background: #fff;">
{% if output_path %}
<img src="{{ output_path }}"
alt="Favicon"
style="max-width: 80%; max-height: 80%; object-fit: contain;"
loading="lazy">
{% endif %}
</div>

View File

@@ -23,7 +23,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
FAVICON_HOOK = PLUGIN_DIR / 'on_Snapshot__11_favicon.py'
FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None)
TEST_URL = 'https://example.com'

View File

@@ -65,8 +65,8 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env
timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
timeout = get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
textify = get_env_bool('FORUMDL_TEXTIFY', False)
extra_args = get_env('FORUMDL_EXTRA_ARGS', '')
output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl')
@@ -148,9 +148,9 @@ def main(url: str, snapshot_id: str):
try:
# Check if forum-dl is enabled
if not get_env_bool('SAVE_FORUMDL', True):
print('Skipping forum-dl (SAVE_FORUMDL=False)', file=sys.stderr)
# Feature disabled - no ArchiveResult, just exit
if not get_env_bool('FORUMDL_ENABLED', True):
print('Skipping forum-dl (FORUMDL_ENABLED=False)', file=sys.stderr)
# Temporary failure (config disabled) - NO JSONL emission
sys.exit(0)
# Get binary from environment
@@ -158,24 +158,25 @@ def main(url: str, snapshot_id: str):
# Run extraction
success, output, error = save_forum(url, binary)
status = 'succeeded' if success else 'failed'
if success:
# Success - emit ArchiveResult
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': output or ''
}
print(json.dumps(result))
sys.exit(0)
else:
# Transient error - emit NO JSONL
print(f'ERROR: {error}', file=sys.stderr)
sys.exit(1)
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
# Transient error - emit NO JSONL
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':

View File

@@ -22,8 +22,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
FORUMDL_HOOK = PLUGIN_DIR / 'on_Snapshot__53_forumdl.py'
FORUMDL_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_forumdl.py'
FORUMDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_forumdl.*'), None)
TEST_URL = 'https://example.com'
# Module-level cache for binary path
@@ -35,121 +34,60 @@ def get_forumdl_binary_path():
if _forumdl_binary_path:
return _forumdl_binary_path
# Skip if install hook doesn't exist
if not FORUMDL_INSTALL_HOOK.exists():
return None
# Try to find forum-dl binary using abx-pkg
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
# Run install hook to find or install binary
result = subprocess.run(
[sys.executable, str(FORUMDL_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=300
)
try:
binary = Binary(
name='forum-dl',
binproviders=[PipProvider(), EnvProvider()]
).load()
# Check if binary was found
for line in result.stdout.strip().split('\n'):
if binary and binary.abspath:
_forumdl_binary_path = str(binary.abspath)
return _forumdl_binary_path
except Exception:
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary' and record.get('name') == 'forum-dl':
_forumdl_binary_path = record.get('abspath')
return _forumdl_binary_path
elif record.get('type') == 'Dependency' and record.get('bin_name') == 'forum-dl':
# Need to install via pip hook
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
dependency_id = str(uuid.uuid4())
# Build command with overrides if present
cmd = [
sys.executable, str(pip_hook),
'--dependency-id', dependency_id,
'--bin-name', record['bin_name']
]
if 'overrides' in record:
cmd.extend(['--overrides', json.dumps(record['overrides'])])
# If not found, try to install via pip
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
if pip_hook.exists():
binary_id = str(uuid.uuid4())
machine_id = str(uuid.uuid4())
install_result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300
)
cmd = [
sys.executable, str(pip_hook),
'--binary-id', binary_id,
'--machine-id', machine_id,
'--name', 'forum-dl'
]
# Parse Binary from pip installation
for install_line in install_result.stdout.strip().split('\n'):
pass
if install_line.strip():
pass
try:
install_record = json.loads(install_line)
if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
_forumdl_binary_path = install_record.get('abspath')
return _forumdl_binary_path
except json.JSONDecodeError:
pass
install_result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300
)
# Installation failed - print debug info
if not _forumdl_binary_path:
print(f"\n=== forum-dl installation failed ===", file=sys.stderr)
print(f"stdout: {install_result.stdout}", file=sys.stderr)
print(f"stderr: {install_result.stderr}", file=sys.stderr)
print(f"returncode: {install_result.returncode}", file=sys.stderr)
return None
except json.JSONDecodeError:
pass
# Parse Binary from pip installation
for install_line in install_result.stdout.strip().split('\n'):
if install_line.strip():
try:
install_record = json.loads(install_line)
if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
_forumdl_binary_path = install_record.get('abspath')
return _forumdl_binary_path
except json.JSONDecodeError:
pass
return None
def test_hook_script_exists():
"""Verify on_Snapshot hook exists."""
assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}"
def test_forumdl_install_hook():
"""Test forum-dl install hook checks for forum-dl."""
# Skip if install hook doesn't exist yet
if not FORUMDL_INSTALL_HOOK.exists():
pass
# Run forum-dl install hook
result = subprocess.run(
[sys.executable, str(FORUMDL_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)
# Hook exits 0 if all binaries found, 1 if any not found
# Parse output for Binary and Dependency records
found_binary = False
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
pass
if record['name'] == 'forum-dl':
assert record['abspath'], "forum-dl should have abspath"
found_binary = True
elif record.get('type') == 'Dependency':
pass
if record['bin_name'] == 'forum-dl':
found_dependency = True
except json.JSONDecodeError:
pass
# forum-dl should either be found (Binary) or missing (Dependency)
assert found_binary or found_dependency, \
"forum-dl should have either Binary or Dependency record"
def test_verify_deps_with_abx_pkg():
"""Verify forum-dl is installed by calling the REAL installation hooks."""
binary_path = get_forumdl_binary_path()
@@ -209,12 +147,12 @@ def test_handles_non_forum_url():
def test_config_save_forumdl_false_skips():
"""Test that SAVE_FORUMDL=False exits without emitting JSONL."""
"""Test that FORUMDL_ENABLED=False exits without emitting JSONL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env['SAVE_FORUMDL'] = 'False'
env['FORUMDL_ENABLED'] = 'False'
result = subprocess.run(
[sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
@@ -227,7 +165,7 @@ def test_config_save_forumdl_false_skips():
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - no JSONL emission, just logs to stderr
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL

View File

@@ -88,9 +88,9 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env (with GALLERYDL_ prefix or fallback to ARCHIVING_CONFIG style)
timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
# Get config from env
timeout = get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
extra_args = get_env('GALLERYDL_EXTRA_ARGS', '')
cookies_file = get_env('COOKIES_FILE', '')
@@ -180,9 +180,9 @@ def main(url: str, snapshot_id: str):
try:
# Check if gallery-dl is enabled
if not (get_env_bool('USE_GALLERYDL', True) and get_env_bool('SAVE_GALLERYDL', True)):
print('Skipping gallery-dl (USE_GALLERYDL=False or SAVE_GALLERYDL=False)', file=sys.stderr)
# Feature disabled - no ArchiveResult, just exit
if not get_env_bool('GALLERYDL_ENABLED', True):
print('Skipping gallery-dl (GALLERYDL_ENABLED=False)', file=sys.stderr)
# Temporary failure (config disabled) - NO JSONL emission
sys.exit(0)
# Check if staticfile or media extractors already handled this (permanent skip)
@@ -209,24 +209,25 @@ def main(url: str, snapshot_id: str):
# Run extraction
success, output, error = save_gallery(url, binary)
status = 'succeeded' if success else 'failed'
if success:
# Success - emit ArchiveResult
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': output or ''
}
print(json.dumps(result))
sys.exit(0)
else:
# Transient error - emit NO JSONL
print(f'ERROR: {error}', file=sys.stderr)
sys.exit(1)
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
# Transient error - emit NO JSONL
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':

View File

@@ -21,8 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
GALLERYDL_HOOK = PLUGIN_DIR / 'on_Snapshot__52_gallerydl.py'
GALLERYDL_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_gallerydl.py'
GALLERYDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_gallerydl.*'), None)
TEST_URL = 'https://example.com'
def test_hook_script_exists():
@@ -30,44 +29,6 @@ def test_hook_script_exists():
assert GALLERYDL_HOOK.exists(), f"Hook not found: {GALLERYDL_HOOK}"
def test_gallerydl_install_hook():
"""Test gallery-dl install hook checks for gallery-dl."""
# Run gallery-dl install hook
result = subprocess.run(
[sys.executable, str(GALLERYDL_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)
# Hook exits 0 if all binaries found, 1 if any not found
# Parse output for Binary and Dependency records
found_binary = False
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
pass
if record['name'] == 'gallery-dl':
assert record['abspath'], "gallery-dl should have abspath"
found_binary = True
elif record.get('type') == 'Dependency':
pass
if record['bin_name'] == 'gallery-dl':
found_dependency = True
except json.JSONDecodeError:
pass
# gallery-dl should either be found (Binary) or missing (Dependency)
assert found_binary or found_dependency, \
"gallery-dl should have either Binary or Dependency record"
def test_verify_deps_with_abx_pkg():
"""Verify gallery-dl is available via abx-pkg."""
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
@@ -122,12 +83,12 @@ def test_handles_non_gallery_url():
def test_config_save_gallery_dl_false_skips():
"""Test that SAVE_GALLERYDL=False exits without emitting JSONL."""
"""Test that GALLERYDL_ENABLED=False exits without emitting JSONL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env['SAVE_GALLERYDL'] = 'False'
env['GALLERYDL_ENABLED'] = 'False'
result = subprocess.run(
[sys.executable, str(GALLERYDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
@@ -140,7 +101,7 @@ def test_config_save_gallery_dl_false_skips():
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - no JSONL emission, just logs to stderr
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL

View File

@@ -1,6 +0,0 @@
<!-- Git embed - directory listing of cloned repo -->
<iframe src="{{ output_path }}"
class="extractor-embed git-embed"
style="width: 100%; height: 100%; min-height: 400px; border: none; background: #fff;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -1,6 +0,0 @@
<!-- Git fullscreen - full directory listing -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen git-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fff;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -17,58 +17,12 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py'
GIT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_git.*'), None)
TEST_URL = 'https://github.com/example/repo.git'
def test_hook_script_exists():
assert GIT_HOOK.exists()
def test_git_install_hook():
"""Test git install hook checks for git binary."""
result = subprocess.run(
[sys.executable, str(GIT_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
assert record['name'] == 'git'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output Binary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'git'
assert 'env' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify git is available via abx-pkg."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"HEADERS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_HEADERS", "USE_HEADERS"],
"description": "Enable HTTP headers capture"
},
"HEADERS_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for headers capture in seconds"
}
}
}

View File

@@ -21,7 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
HEADERS_HOOK = PLUGIN_DIR / 'on_Snapshot__33_headers.js'
HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None)
TEST_URL = 'https://example.com'

View File

@@ -0,0 +1,20 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"HTMLTOTEXT_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_HTMLTOTEXT", "USE_HTMLTOTEXT"],
"description": "Enable HTML to text conversion"
},
"HTMLTOTEXT_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for HTML to text conversion in seconds"
}
}
}

View File

@@ -127,31 +127,28 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
"""Convert HTML to plain text for search indexing."""
output = None
status = 'failed'
error = ''
try:
# Run extraction
success, output, error = extract_htmltotext(url)
status = 'succeeded' if success else 'failed'
if success:
# Success - emit ArchiveResult
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': output or ''
}
print(json.dumps(result))
sys.exit(0)
else:
# Transient error - emit NO JSONL
print(f'ERROR: {error}', file=sys.stderr)
sys.exit(1)
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
# Transient error - emit NO JSONL
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':

View File

@@ -12,7 +12,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
HTMLTOTEXT_HOOK = PLUGIN_DIR / 'on_Snapshot__54_htmltotext.py'
HTMLTOTEXT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_htmltotext.*'), None)
TEST_URL = 'https://example.com'
def test_hook_script_exists():
@@ -49,10 +49,11 @@ def test_extracts_text_from_html():
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify output file (hook writes to current directory)
output_file = tmpdir / 'content.txt'
assert output_file.exists(), "content.txt not created"
output_file = tmpdir / 'htmltotext.txt'
assert output_file.exists(), f"htmltotext.txt not created. Files: {list(tmpdir.iterdir())}"
content = output_file.read_text()
assert len(content) > 0, "Content should not be empty"
assert 'Example Domain' in content, "Should contain text from HTML"
def test_fails_gracefully_without_html():
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -0,0 +1,14 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"ISTILLDONTCAREABOUTCOOKIES_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_ISTILLDONTCAREABOUTCOOKIES"],
"description": "Enable I Still Don't Care About Cookies browser extension"
}
}
}

View File

@@ -21,7 +21,7 @@ const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
const extensionUtils = require('../chrome/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {

View File

@@ -14,7 +14,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__02_istilldontcareaboutcookies.js"
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_istilldontcareaboutcookies.*'), None)
def test_install_script_exists():

View File

@@ -9,10 +9,10 @@
"x-aliases": ["SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA"],
"description": "Enable media downloading with yt-dlp"
},
"MEDIA_BINARY": {
"YTDLP_BINARY": {
"type": "string",
"default": "yt-dlp",
"x-aliases": ["YOUTUBEDL_BINARY", "YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
"x-aliases": ["YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY", "MEDIA_BINARY"],
"description": "Path to yt-dlp binary"
},
"MEDIA_TIMEOUT": {
@@ -35,7 +35,7 @@
"x-aliases": ["YTDLP_CHECK_SSL_VALIDITY"],
"description": "Whether to verify SSL certificates"
},
"MEDIA_ARGS": {
"YTDLP_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
@@ -45,13 +45,13 @@
"--embed-subs",
"--write-auto-sub"
],
"x-aliases": ["YTDLP_ARGS"],
"x-aliases": ["MEDIA_ARGS"],
"description": "Default yt-dlp arguments"
},
"MEDIA_EXTRA_ARGS": {
"YTDLP_EXTRA_ARGS": {
"type": "string",
"default": "",
"x-aliases": ["YTDLP_EXTRA_ARGS"],
"x-aliases": ["MEDIA_EXTRA_ARGS"],
"description": "Extra arguments for yt-dlp (space-separated)"
}
}

View File

@@ -98,10 +98,10 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env (with YTDLP_ prefix or fallback to ARCHIVING_CONFIG style)
timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('MEDIA_TIMEOUT') or get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
# Get config from env
timeout = get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
extra_args = get_env('YTDLP_EXTRA_ARGS', '')
media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
# Output directory is current directory (hook already runs in output dir)
@@ -182,15 +182,11 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
"""Download media from a URL using yt-dlp."""
output = None
status = 'failed'
error = ''
try:
# Check if yt-dlp is enabled
if not (get_env_bool('USE_YTDLP', True) and get_env_bool('SAVE_MEDIA', True)):
print('Skipping media (USE_YTDLP=False or SAVE_MEDIA=False)', file=sys.stderr)
print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'USE_YTDLP=False'}))
# Check if media downloading is enabled
if not get_env_bool('MEDIA_ENABLED', True):
print('Skipping media (MEDIA_ENABLED=False)', file=sys.stderr)
# Temporary failure (config disabled) - NO JSONL emission
sys.exit(0)
# Check if staticfile extractor already handled this (permanent skip)
@@ -200,28 +196,29 @@ def main(url: str, snapshot_id: str):
sys.exit(0)
# Get binary from environment
binary = get_env('YTDLP_BINARY') or get_env('YOUTUBEDL_BINARY', 'yt-dlp')
binary = get_env('YTDLP_BINARY', 'yt-dlp')
# Run extraction
success, output, error = save_media(url, binary)
status = 'succeeded' if success else 'failed'
if success:
# Success - emit ArchiveResult
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': output or ''
}
print(json.dumps(result))
sys.exit(0)
else:
# Transient error - emit NO JSONL
print(f'ERROR: {error}', file=sys.stderr)
sys.exit(1)
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
# Transient error - emit NO JSONL
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':

View File

@@ -21,8 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py'
MEDIA_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_media.*'), None)
TEST_URL = 'https://example.com/video.mp4'
def test_hook_script_exists():
@@ -30,45 +29,6 @@ def test_hook_script_exists():
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
def test_ytdlp_install_hook():
"""Test yt-dlp install hook checks for yt-dlp and dependencies (node, ffmpeg)."""
# Run yt-dlp install hook
result = subprocess.run(
[sys.executable, str(MEDIA_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)
# Hook exits 0 if all binaries found, 1 if any not found
# Parse output for Binary and Dependency records
found_binaries = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
name = record['name']
if name in found_binaries:
assert record['abspath'], f"{name} should have abspath"
found_binaries[name] = True
elif record.get('type') == 'Dependency':
name = record['bin_name']
if name in found_dependencies:
found_dependencies[name] = True
except json.JSONDecodeError:
pass
# Each binary should either be found (Binary) or missing (Dependency)
for binary_name in ['yt-dlp', 'node', 'ffmpeg']:
assert found_binaries[binary_name] or found_dependencies[binary_name], \
f"{binary_name} should have either Binary or Dependency record"
def test_verify_deps_with_abx_pkg():
"""Verify yt-dlp, node, and ffmpeg are available via abx-pkg."""
from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
@@ -137,12 +97,12 @@ def test_handles_non_media_url():
def test_config_save_media_false_skips():
"""Test that SAVE_MEDIA=False exits without emitting JSONL."""
"""Test that MEDIA_ENABLED=False exits without emitting JSONL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env['SAVE_MEDIA'] = 'False'
env['MEDIA_ENABLED'] = 'False'
result = subprocess.run(
[sys.executable, str(MEDIA_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
@@ -155,7 +115,7 @@ def test_config_save_media_false_skips():
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - no JSONL emission, just logs to stderr
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL

View File

@@ -35,6 +35,15 @@ def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
@@ -105,34 +114,37 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
"""Extract article content using Postlight's Mercury Parser."""
output = None
status = 'failed'
error = ''
try:
# Check if mercury extraction is enabled
if not get_env_bool('MERCURY_ENABLED', True):
print('Skipping mercury (MERCURY_ENABLED=False)', file=sys.stderr)
# Temporary failure (config disabled) - NO JSONL emission
sys.exit(0)
# Get binary from environment
binary = get_env('MERCURY_BINARY', 'postlight-parser')
# Run extraction
success, output, error = extract_mercury(url, binary)
status = 'succeeded' if success else 'failed'
if success:
# Success - emit ArchiveResult
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': output or ''
}
print(json.dumps(result))
sys.exit(0)
else:
# Transient error - emit NO JSONL
print(f'ERROR: {error}', file=sys.stderr)
sys.exit(1)
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
# Transient error - emit NO JSONL
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':

View File

@@ -1,6 +0,0 @@
<!-- Mercury embed - Mercury parser article view -->
<iframe src="{{ output_path }}"
class="extractor-embed mercury-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -1,6 +0,0 @@
<!-- Mercury fullscreen - full Mercury parser article -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen mercury-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -21,8 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py'
MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None)
TEST_URL = 'https://example.com'
def test_hook_script_exists():
@@ -30,53 +29,6 @@ def test_hook_script_exists():
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
def test_mercury_install_hook():
"""Test mercury install hook checks for postlight-parser."""
# Run mercury install hook
result = subprocess.run(
[sys.executable, str(MERCURY_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
assert record['name'] == 'postlight-parser'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output Binary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'postlight-parser'
assert 'npm' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify postlight-parser is available via abx-pkg."""
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
@@ -147,12 +99,12 @@ def test_extracts_with_mercury_parser():
assert len(content) > 0, "Output should not be empty"
def test_config_save_mercury_false_skips():
"""Test that SAVE_MERCURY=False exits without emitting JSONL."""
"""Test that MERCURY_ENABLED=False exits without emitting JSONL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env['SAVE_MERCURY'] = 'False'
env['MERCURY_ENABLED'] = 'False'
result = subprocess.run(
[sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
@@ -165,7 +117,7 @@ def test_config_save_mercury_false_skips():
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - no JSONL emission, just logs to stderr
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL
@@ -174,7 +126,7 @@ def test_config_save_mercury_false_skips():
def test_fails_gracefully_without_html():
"""Test that mercury fails gracefully when no HTML source exists."""
"""Test that mercury works even without HTML source (fetches URL directly)."""
with tempfile.TemporaryDirectory() as tmpdir:
result = subprocess.run(
[sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
@@ -184,13 +136,12 @@ def test_fails_gracefully_without_html():
timeout=30
)
# Should exit with non-zero or emit failure JSONL
# Mercury fetches URL directly with postlight-parser, doesn't need HTML source
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -199,9 +150,9 @@ def test_fails_gracefully_without_html():
except json.JSONDecodeError:
pass
if result_json:
# Should report failure or skip since no HTML source
assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip without HTML: {result_json}"
# Mercury should succeed or fail based on network, not based on HTML source
assert result_json, "Should emit ArchiveResult"
assert result_json['status'] in ['succeeded', 'failed'], f"Should succeed or fail: {result_json}"
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,20 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"MERKLETREE_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_MERKLETREE", "USE_MERKLETREE"],
"description": "Enable merkle tree hash generation"
},
"MERKLETREE_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for merkle tree generation in seconds"
}
}
}

View File

@@ -132,11 +132,11 @@ def main(url: str, snapshot_id: str):
try:
# Check if enabled
save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
save_merkletree = os.getenv('MERKLETREE_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on')
if not save_merkletree:
status = 'skipped'
click.echo(json.dumps({'status': status, 'output': 'SAVE_MERKLETREE=false'}))
click.echo(json.dumps({'status': status, 'output': 'MERKLETREE_ENABLED=false'}))
sys.exit(0)
# Working directory is the extractor output dir (e.g., <snapshot>/merkletree/)

View File

@@ -2,7 +2,7 @@
"""
Install a binary using npm package manager.
Usage: on_Dependency__install_using_npm_provider.py --binary-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
Usage: on_Binary__install_using_npm_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
Output: Binary JSONL record to stdout after installation
Environment variables:
@@ -72,7 +72,7 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
'sha256': binary.sha256 or '',
'binprovider': 'npm',
'machine_id': machine_id,
'dependency_id': dependency_id,
'binary_id': binary_id,
}
print(json.dumps(record))

View File

@@ -71,7 +71,7 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env
timeout = get_env_int('PAPERSDL_TIMEOUT') or get_env_int('TIMEOUT', 300)
timeout = get_env_int('TIMEOUT', 300)
extra_args = get_env('PAPERSDL_EXTRA_ARGS', '')
# Output directory is current directory (hook already runs in output dir)
@@ -140,9 +140,9 @@ def main(url: str, snapshot_id: str):
try:
# Check if papers-dl is enabled
if not get_env_bool('SAVE_PAPERSDL', True):
print('Skipping papers-dl (SAVE_PAPERSDL=False)', file=sys.stderr)
# Feature disabled - no ArchiveResult, just exit
if not get_env_bool('PAPERSDL_ENABLED', True):
print('Skipping papers-dl (PAPERSDL_ENABLED=False)', file=sys.stderr)
# Temporary failure (config disabled) - NO JSONL emission
sys.exit(0)
# Get binary from environment
@@ -150,24 +150,25 @@ def main(url: str, snapshot_id: str):
# Run extraction
success, output, error = save_paper(url, binary)
status = 'succeeded' if success else 'failed'
if success:
# Success - emit ArchiveResult
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': output or ''
}
print(json.dumps(result))
sys.exit(0)
else:
# Transient error - emit NO JSONL
print(f'ERROR: {error}', file=sys.stderr)
sys.exit(1)
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
# Transient error - emit NO JSONL
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':

View File

@@ -21,8 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
PAPERSDL_HOOK = PLUGIN_DIR / 'on_Snapshot__54_papersdl.py'
PAPERSDL_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_papersdl.py'
PAPERSDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_papersdl.*'), None)
TEST_URL = 'https://example.com'
# Module-level cache for binary path
@@ -34,55 +33,51 @@ def get_papersdl_binary_path():
if _papersdl_binary_path:
return _papersdl_binary_path
# Run install hook to find or install binary
result = subprocess.run(
[sys.executable, str(PAPERSDL_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=300
)
# Try to find papers-dl binary using abx-pkg
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
# Check if binary was found
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Binary' and record.get('name') == 'papers-dl':
_papersdl_binary_path = record.get('abspath')
return _papersdl_binary_path
elif record.get('type') == 'Dependency' and record.get('bin_name') == 'papers-dl':
# Need to install via pip hook
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
dependency_id = str(uuid.uuid4())
try:
binary = Binary(
name='papers-dl',
binproviders=[PipProvider(), EnvProvider()]
).load()
# Build command with overrides if present
cmd = [
sys.executable, str(pip_hook),
'--dependency-id', dependency_id,
'--bin-name', record['bin_name']
]
if 'overrides' in record:
cmd.extend(['--overrides', json.dumps(record['overrides'])])
if binary and binary.abspath:
_papersdl_binary_path = str(binary.abspath)
return _papersdl_binary_path
except Exception:
pass
install_result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300
)
# If not found, try to install via pip
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
if pip_hook.exists():
binary_id = str(uuid.uuid4())
machine_id = str(uuid.uuid4())
# Parse Binary from pip installation
for install_line in install_result.stdout.strip().split('\n'):
if install_line.strip():
try:
install_record = json.loads(install_line)
if install_record.get('type') == 'Binary' and install_record.get('name') == 'papers-dl':
_papersdl_binary_path = install_record.get('abspath')
return _papersdl_binary_path
except json.JSONDecodeError:
pass
except json.JSONDecodeError:
pass
cmd = [
sys.executable, str(pip_hook),
'--binary-id', binary_id,
'--machine-id', machine_id,
'--name', 'papers-dl'
]
install_result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=300
)
# Parse Binary from pip installation
for install_line in install_result.stdout.strip().split('\n'):
if install_line.strip():
try:
install_record = json.loads(install_line)
if install_record.get('type') == 'Binary' and install_record.get('name') == 'papers-dl':
_papersdl_binary_path = install_record.get('abspath')
return _papersdl_binary_path
except json.JSONDecodeError:
pass
return None
@@ -91,40 +86,6 @@ def test_hook_script_exists():
assert PAPERSDL_HOOK.exists(), f"Hook not found: {PAPERSDL_HOOK}"
def test_papersdl_install_hook():
"""Test papers-dl install hook checks for papers-dl."""
# Run papers-dl install hook
result = subprocess.run(
[sys.executable, str(PAPERSDL_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)
# Hook exits 0 if all binaries found, 1 if any not found
# Parse output for Binary and Dependency records
found_binary = False
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Binary':
if record['name'] == 'papers-dl':
assert record['abspath'], "papers-dl should have abspath"
found_binary = True
elif record.get('type') == 'Dependency':
if record['bin_name'] == 'papers-dl':
found_dependency = True
except json.JSONDecodeError:
pass
# papers-dl should either be found (Binary) or missing (Dependency)
assert found_binary or found_dependency, \
"papers-dl should have either Binary or Dependency record"
def test_verify_deps_with_abx_pkg():
"""Verify papers-dl is installed by calling the REAL installation hooks."""
binary_path = get_papersdl_binary_path()
@@ -176,12 +137,12 @@ def test_handles_non_paper_url():
def test_config_save_papersdl_false_skips():
"""Test that SAVE_PAPERSDL=False exits without emitting JSONL."""
"""Test that PAPERSDL_ENABLED=False exits without emitting JSONL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
env = os.environ.copy()
env['SAVE_PAPERSDL'] = 'False'
env['PAPERSDL_ENABLED'] = 'False'
result = subprocess.run(
[sys.executable, str(PAPERSDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
@@ -194,7 +155,7 @@ def test_config_save_papersdl_false_skips():
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - no JSONL emission, just logs to stderr
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"PARSE_DOM_OUTLINKS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_DOM_OUTLINKS", "USE_PARSE_DOM_OUTLINKS"],
"description": "Enable DOM outlinks parsing from archived pages"
},
"PARSE_DOM_OUTLINKS_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for DOM outlinks parsing in seconds"
}
}
}

View File

@@ -15,7 +15,7 @@
* Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
*
* Environment variables:
* SAVE_DOM_OUTLINKS: Enable DOM outlinks extraction (default: true)
* PARSE_DOM_OUTLINKS_ENABLED: Enable DOM outlinks extraction (default: true)
*/
const fs = require('fs');
@@ -225,13 +225,13 @@ async function main() {
try {
// Check if enabled
if (!getEnvBool('SAVE_DOM_OUTLINKS', true)) {
console.log('Skipping DOM outlinks (SAVE_DOM_OUTLINKS=False)');
if (!getEnvBool('PARSE_DOM_OUTLINKS_ENABLED', true)) {
console.log('Skipping DOM outlinks (PARSE_DOM_OUTLINKS_ENABLED=False)');
// Output clean JSONL (no RESULT_JSON= prefix)
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'SAVE_DOM_OUTLINKS=False',
output_str: 'PARSE_DOM_OUTLINKS_ENABLED=False',
}));
process.exit(0);
}

View File

@@ -0,0 +1,13 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"PARSE_HTML_URLS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_PARSE_HTML_URLS"],
"description": "Enable HTML URL parsing"
}
}
}

View File

@@ -9,7 +9,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.*'), None)
class TestParseHtmlUrls:

View File

@@ -0,0 +1,13 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"PARSE_JSONL_URLS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_PARSE_JSONL_URLS"],
"description": "Enable JSON Lines URL parsing"
}
}
}

View File

@@ -9,7 +9,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.*'), None)
class TestParseJsonlUrls:

View File

@@ -0,0 +1,13 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"PARSE_NETSCAPE_URLS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_PARSE_NETSCAPE_URLS"],
"description": "Enable Netscape bookmarks HTML URL parsing"
}
}
}

View File

@@ -9,7 +9,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.*'), None)
class TestParseNetscapeUrls:

View File

@@ -10,7 +10,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.*'), None)
class TestFirefoxFormat:
@@ -719,10 +719,11 @@ class TestEdgeCases:
# Document current behavior
if result.returncode == 0:
# Output goes to stdout (JSONL)
if output_file.exists():
content = result.stdout.strip()
if content:
entry = json.loads(content)
content = result.stdout.strip()
if content:
lines = [line for line in content.split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
if lines:
entry = json.loads(lines[0])
assert 'example.com' in entry['url']
def test_missing_add_date(self, tmp_path):
@@ -763,8 +764,11 @@ class TestEdgeCases:
)
# Current regex requires non-empty title [^<]+
# Document current behavior
assert result.returncode == 1
# Parser emits skipped ArchiveResult when no valid bookmarks found
assert result.returncode == 0
result_json = json.loads(result.stdout.strip())
assert result_json['type'] == 'ArchiveResult'
assert result_json['status'] == 'skipped'
def test_special_chars_in_url(self, tmp_path):
"""Test URLs with special characters."""
@@ -900,7 +904,7 @@ class TestEdgeCases:
assert result.returncode == 0
# Output goes to stdout (JSONL)
lines = output_file.read_text(encoding='utf-8').strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
assert len(entries) == 5
@@ -933,12 +937,13 @@ class TestEdgeCases:
assert result.returncode == 0
assert 'Found 1000 URLs' in result.stdout
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
# Output goes to stdout (JSONL) - get all JSONL records
all_lines = [line for line in result.stdout.strip().split('\n') if line.strip() and line.startswith('{')]
records = [json.loads(line) for line in all_lines]
# Should have 10 unique tags + 1000 snapshots
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
tags = [r for r in records if r.get('type') == 'Tag']
snapshots = [r for r in records if r.get('type') == 'Snapshot']
assert len(tags) == 10
assert len(snapshots) == 1000

View File

@@ -0,0 +1,13 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"PARSE_RSS_URLS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_PARSE_RSS_URLS"],
"description": "Enable RSS/Atom feed URL parsing"
}
}
}

View File

@@ -9,7 +9,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.*'), None)
class TestParseRssUrls:

View File

@@ -9,7 +9,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.*'), None)
class TestRssVariants:
@@ -172,14 +172,14 @@ class TestAtomVariants:
assert result.returncode == 0
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
tag_names = {t['name'] for t in tags}
assert 'science' in tag_names
assert 'research' in tag_names
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
snapshots = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Snapshot']
entry = snapshots[0]
assert entry['url'] == 'https://atom.example.com/1'
assert 'bookmarked_at' in entry
@@ -384,15 +384,15 @@ class TestTagsAndCategories:
assert result.returncode == 0
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
tag_names = {t['name'] for t in tags}
assert 'Tech' in tag_names
assert 'Web' in tag_names
assert 'Programming' in tag_names
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
snapshots = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Snapshot']
entry = snapshots[0]
tags_list = entry['tags'].split(',')
assert len(tags_list) == 3
@@ -421,9 +421,9 @@ class TestTagsAndCategories:
assert result.returncode == 0
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
tag_names = {t['name'] for t in tags}
# feedparser extracts the 'term' attribute
assert 'python' in tag_names
@@ -482,8 +482,8 @@ class TestTagsAndCategories:
assert result.returncode == 0
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
# Tag records should be unique
tag_names = [t['name'] for t in tags]
assert tag_names.count('Python') == 1
@@ -720,9 +720,9 @@ class TestEdgeCases:
assert result.returncode == 0
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
tag_names = {t['name'] for t in tags}
assert 'C++' in tag_names
assert 'Node.js' in tag_names
@@ -814,7 +814,7 @@ class TestEdgeCases:
assert result.returncode == 0
# Output goes to stdout (JSONL)
lines = output_file.read_text(encoding='utf-8').strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
entry = snapshots[0]
@@ -885,11 +885,11 @@ class TestEdgeCases:
assert 'Found 100 URLs' in result.stdout
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
# Should have 10 unique tags (Tag0-Tag9) + 100 snapshots
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
snapshots = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Snapshot']
assert len(tags) == 10
assert len(snapshots) == 100

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,13 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"PARSE_TXT_URLS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_PARSE_TXT_URLS"],
"description": "Enable plain text URL parsing"
}
}
}

View File

@@ -9,7 +9,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_txt_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_txt_urls.*'), None)
class TestParseTxtUrls:

View File

@@ -15,8 +15,29 @@
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* PDF_ENABLED: Enable PDF generation (default: true)
*/
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
// Check if PDF is enabled BEFORE requiring puppeteer
if (!getEnvBool('PDF_ENABLED', true)) {
console.error('Skipping PDF (PDF_ENABLED=False)');
// Temporary failure (config disabled) - NO JSONL emission
process.exit(0);
}
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
@@ -39,18 +60,6 @@ function parseArgs() {
return args;
}
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
@@ -237,62 +246,51 @@ async function main() {
process.exit(1);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
try {
// Check if staticfile extractor already handled this (permanent skip)
if (hasStaticFileOutput()) {
console.log(`Skipping PDF - staticfile extractor already downloaded this`);
// Output clean JSONL (no RESULT_JSON= prefix)
console.error(`Skipping PDF - staticfile extractor already downloaded this`);
// Permanent skip - emit ArchiveResult
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'staticfile already handled',
}));
process.exit(0); // Permanent skip - staticfile already handled
} else {
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
process.exit(0);
}
const result = await printToPdf(url);
if (result.success) {
status = 'succeeded';
output = result.output;
const size = fs.statSync(output).size;
console.log(`PDF saved (${size} bytes)`);
} else {
status = 'failed';
error = result.error;
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await printToPdf(url);
if (result.success) {
// Success - emit ArchiveResult
const size = fs.statSync(result.output).size;
console.error(`PDF saved (${size} bytes)`);
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: result.output,
}));
process.exit(0);
} else {
// Transient error - emit NO JSONL
console.error(`ERROR: ${result.error}`);
process.exit(1);
}
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
// Transient error - emit NO JSONL
console.error(`ERROR: ${e.name}: ${e.message}`);
process.exit(1);
}
const endTs = new Date();
if (error) console.error(`ERROR: ${error}`);
// Output clean JSONL (no RESULT_JSON= prefix)
console.log(JSON.stringify({
type: 'ArchiveResult',
status,
output_str: output || error || '',
}));
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {

View File

@@ -23,8 +23,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
PDF_HOOK = PLUGIN_DIR / 'on_Snapshot__35_pdf.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
TEST_URL = 'https://example.com'
@@ -34,70 +33,6 @@ def test_hook_script_exists():
assert PDF_HOOK.exists(), f"Hook not found: {PDF_HOOK}"
def test_chrome_validation_and_install():
"""Test chrome install hook to install puppeteer-core if needed."""
# Run chrome install hook (from chrome plugin)
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)
# If exit 1, binary not found - need to install
if result.returncode == 1:
# Parse Dependency request from JSONL
dependency_request = None
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
dependency_request = record
break
except json.JSONDecodeError:
pass
if dependency_request:
bin_name = dependency_request['bin_name']
bin_providers = dependency_request['bin_providers']
# Install via npm provider hook
install_result = subprocess.run(
[
sys.executable,
str(NPM_PROVIDER_HOOK),
'--dependency-id', 'test-dep-001',
'--bin-name', bin_name,
'--bin-providers', bin_providers
],
capture_output=True,
text=True,
timeout=600
)
assert install_result.returncode == 0, f"Install failed: {install_result.stderr}"
# Verify installation via JSONL output
for line in install_result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
assert record['name'] == bin_name
assert record['abspath']
break
except json.JSONDecodeError:
pass
else:
# Binary already available, verify via JSONL output
assert result.returncode == 0, f"Validation failed: {result.stderr}"
def test_verify_deps_with_abx_pkg():
"""Verify dependencies are available via abx-pkg after hook installation."""
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
@@ -166,17 +101,13 @@ def test_extracts_pdf_from_example_com():
def test_config_save_pdf_false_skips():
"""Test that SAVE_PDF config is honored (Note: currently not implemented in hook)."""
"""Test that PDF_ENABLED=False exits without emitting JSONL."""
import os
# NOTE: The pdf hook doesn't currently check SAVE_PDF env var,
# so this test just verifies it runs without errors.
# TODO: Implement SAVE_PDF check in hook
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
env['SAVE_PDF'] = 'False'
env['PDF_ENABLED'] = 'False'
result = subprocess.run(
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -184,11 +115,17 @@ def test_config_save_pdf_false_skips():
capture_output=True,
text=True,
env=env,
timeout=120
timeout=30
)
# Hook currently ignores SAVE_PDF, so it will run normally
assert result.returncode in (0, 1), "Should complete without hanging"
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
def test_reports_missing_chrome():

View File

@@ -123,34 +123,31 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
"""Extract article content using Mozilla's Readability."""
output = None
status = 'failed'
error = ''
try:
# Get binary from environment
binary = get_env('READABILITY_BINARY', 'readability-extractor')
# Run extraction
success, output, error = extract_readability(url, binary)
status = 'succeeded' if success else 'failed'
if success:
# Success - emit ArchiveResult
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': output or ''
}
print(json.dumps(result))
sys.exit(0)
else:
# Transient error - emit NO JSONL
print(f'ERROR: {error}', file=sys.stderr)
sys.exit(1)
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
# Transient error - emit NO JSONL
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':

View File

@@ -1,6 +0,0 @@
<!-- Readability embed - reader-mode article view -->
<iframe src="{{ output_path }}"
class="extractor-embed readability-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -1,6 +0,0 @@
<!-- Readability fullscreen - full reader-mode article -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen readability-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -21,8 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*'))
TEST_URL = 'https://example.com'
@@ -95,57 +94,17 @@ def test_reports_missing_dependency_when_not_installed():
env=env
)
# Should fail and report missing dependency
assert result.returncode != 0, "Should exit non-zero when dependency missing"
combined = result.stdout + result.stderr
assert 'DEPENDENCY_NEEDED' in combined, "Should output DEPENDENCY_NEEDED"
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
# Missing binary is a transient error - should exit 1 with no JSONL
assert result.returncode == 1, "Should exit 1 when dependency missing"
# Should NOT emit JSONL (transient error - will be retried)
jsonl_lines = [line for line in result.stdout.strip().split('\n')
if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, "Should not emit JSONL for transient error (missing binary)"
def test_readability_install_hook():
"""Test readability install hook checks for readability-extractor binary."""
result = subprocess.run(
[sys.executable, str(READABILITY_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
assert record['name'] == 'readability-extractor'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output Binary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'readability-extractor'
assert 'npm' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
# Should log error to stderr
assert 'readability-extractor' in result.stderr.lower() or 'error' in result.stderr.lower(), \
"Should report error in stderr"
def test_verify_deps_with_abx_pkg():

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"REDIRECTS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_REDIRECTS", "USE_REDIRECTS"],
"description": "Enable redirect chain capture"
},
"REDIRECTS_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for redirect capture in seconds"
}
}
}

View File

@@ -258,9 +258,9 @@ async function main() {
originalUrl = url;
if (!getEnvBool('SAVE_REDIRECTS', true)) {
console.error('Skipping (SAVE_REDIRECTS=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'SAVE_REDIRECTS=False'}));
if (!getEnvBool('REDIRECTS_ENABLED', true)) {
console.error('Skipping (REDIRECTS_ENABLED=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'REDIRECTS_ENABLED=False'}));
process.exit(0);
}

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"RESPONSES_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_RESPONSES", "USE_RESPONSES"],
"description": "Enable HTTP response capture"
},
"RESPONSES_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for response capture in seconds"
}
}
}

View File

@@ -309,9 +309,9 @@ async function main() {
process.exit(1);
}
if (!getEnvBool('SAVE_RESPONSES', true)) {
console.error('Skipping (SAVE_RESPONSES=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'SAVE_RESPONSES=False'}));
if (!getEnvBool('RESPONSES_ENABLED', true)) {
console.error('Skipping (RESPONSES_ENABLED=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'RESPONSES_ENABLED=False'}));
process.exit(0);
}

View File

@@ -15,8 +15,29 @@
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* SCREENSHOT_ENABLED: Enable screenshot capture (default: true)
*/
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
// Check if screenshot is enabled BEFORE requiring puppeteer
if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
console.error('Skipping screenshot (SCREENSHOT_ENABLED=False)');
// Temporary failure (config disabled) - NO JSONL emission
process.exit(0);
}
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
@@ -39,18 +60,6 @@ function parseArgs() {
return args;
}
// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}
function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
@@ -233,62 +242,51 @@ async function main() {
process.exit(1);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
try {
// Check if staticfile extractor already handled this (permanent skip)
if (hasStaticFileOutput()) {
console.log(`Skipping screenshot - staticfile extractor already downloaded this`);
// Output clean JSONL (no RESULT_JSON= prefix)
console.error(`Skipping screenshot - staticfile extractor already downloaded this`);
// Permanent skip - emit ArchiveResult
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'staticfile already handled',
}));
process.exit(0); // Permanent skip - staticfile already handled
} else {
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
process.exit(0);
}
const result = await takeScreenshot(url);
if (result.success) {
status = 'succeeded';
output = result.output;
const size = fs.statSync(output).size;
console.log(`Screenshot saved (${size} bytes)`);
} else {
status = 'failed';
error = result.error;
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await takeScreenshot(url);
if (result.success) {
// Success - emit ArchiveResult
const size = fs.statSync(result.output).size;
console.error(`Screenshot saved (${size} bytes)`);
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: result.output,
}));
process.exit(0);
} else {
// Transient error - emit NO JSONL
console.error(`ERROR: ${result.error}`);
process.exit(1);
}
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
// Transient error - emit NO JSONL
console.error(`ERROR: ${e.name}: ${e.message}`);
process.exit(1);
}
const endTs = new Date();
if (error) console.error(`ERROR: ${error}`);
// Output clean JSONL (no RESULT_JSON= prefix)
console.log(JSON.stringify({
type: 'ArchiveResult',
status,
output_str: output || error || '',
}));
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {

View File

@@ -23,8 +23,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
SCREENSHOT_HOOK = PLUGIN_DIR / 'on_Snapshot__34_screenshot.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
TEST_URL = 'https://example.com'
@@ -33,57 +32,6 @@ def test_hook_script_exists():
assert SCREENSHOT_HOOK.exists(), f"Hook not found: {SCREENSHOT_HOOK}"
def test_chrome_validation_and_install():
"""Test chrome install hook to verify Chrome is available."""
# Try with explicit CHROME_BINARY first (faster)
chrome_app_path = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
if Path(chrome_app_path).exists():
# Use CHROME_BINARY env var pointing to Chrome.app
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
env={**os.environ, 'CHROME_BINARY': chrome_app_path},
timeout=30
)
# When CHROME_BINARY is set and valid, hook exits 0 immediately without output (optimization)
assert result.returncode == 0, f"Should find Chrome at {chrome_app_path}. Error: {result.stderr}"
print(f"Chrome validated at explicit path: {chrome_app_path}")
else:
# Run chrome install hook (from chrome plugin) to find or install Chrome
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=300 # Longer timeout for potential install
)
if result.returncode == 0:
# Parse output to verify Binary record
binary_found = False
binary_path = None
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Binary':
binary_found = True
binary_path = record.get('abspath')
assert record['name'] == 'chrome', f"Binary name should be 'chrome', got {record['name']}"
assert binary_path, "Binary should have abspath"
print(f"Found Chrome at: {binary_path}")
break
except json.JSONDecodeError:
pass
assert binary_found, f"Should output Binary record when Chrome found. Output: {result.stdout}"
else:
pytest.fail(f"Chrome installation failed. Please install Chrome manually or ensure @puppeteer/browsers is available. Error: {result.stderr}")
def test_verify_deps_with_abx_pkg():
"""Verify dependencies are available via abx-pkg after hook installation."""
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
@@ -146,13 +94,13 @@ def test_extracts_screenshot_from_example_com():
def test_config_save_screenshot_false_skips():
"""Test that SAVE_SCREENSHOT=False causes skip."""
"""Test that SCREENSHOT_ENABLED=False exits without emitting JSONL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
env['SAVE_SCREENSHOT'] = 'False'
env['SCREENSHOT_ENABLED'] = 'False'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -163,23 +111,14 @@ def test_config_save_screenshot_false_skips():
timeout=30
)
assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}"
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Parse JSONL output to verify skipped status
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] in ('skipped', 'succeeded'), f"Should skip or succeed: {result_json}"
# Should NOT emit any JSONL
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
def test_reports_missing_chrome():

View File

@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""
Install and configure ripgrep binary.
This hook runs early in the Crawl lifecycle to:
1. Install ripgrep binary if needed
2. Check if ripgrep backend is enabled
3. Output Binary JSONL records when ripgrep is found
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- Binary JSONL records to stdout when binaries are found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
# Read config from environment
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
def output_binary(binary: Binary, name: str):
"""Output Binary JSONL record to stdout."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env',
'machine_id': machine_id,
}
print(json.dumps(record))
def output_machine_config(key: str, value: str):
"""Output Machine config JSONL record to stdout."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Machine',
'id': machine_id or 'default',
'key': key,
'value': value,
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
warnings = []
errors = []
computed = {}
# Get config values
search_backend_engine = get_env('SEARCH_BACKEND_ENGINE', 'ripgrep')
ripgrep_binary = get_env('RIPGREP_BINARY', 'rg')
search_backend_timeout = get_env_int('SEARCH_BACKEND_TIMEOUT', 90)
# Only proceed if ripgrep backend is enabled
if search_backend_engine != 'ripgrep':
# Not using ripgrep, exit successfully without output
sys.exit(0)
# Check binary availability using abx-pkg (trust abx-pkg only)
provider = EnvProvider()
try:
binary = Binary(name=ripgrep_binary, binproviders=[provider]).load()
resolved_path = str(binary.abspath) if binary.abspath else ''
except Exception:
binary = None
resolved_path = ''
if not resolved_path:
errors.append(f"RIPGREP_BINARY={ripgrep_binary} not found. Install ripgrep: apt install ripgrep")
computed['RIPGREP_BINARY'] = ''
else:
computed['RIPGREP_BINARY'] = resolved_path
ripgrep_version = str(binary.version) if binary.version else 'unknown'
computed['RIPGREP_VERSION'] = ripgrep_version
# Output Binary JSONL record
output_binary(binary, name='rg')
# Output Machine config JSONL record
output_machine_config('config/RIPGREP_BINARY', resolved_path)
# Validate timeout
if search_backend_timeout < 10:
warnings.append(
f"SEARCH_BACKEND_TIMEOUT={search_backend_timeout} is very low. "
"Searches may timeout. Consider setting SEARCH_BACKEND_TIMEOUT=90 or higher."
)
# Output results
# Format: KEY=VALUE lines that hooks.py will parse and add to env
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")
for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)
for error in errors:
print(f"ERROR:{error}", file=sys.stderr)
# Exit with error if any hard errors
sys.exit(1 if errors else 0)
if __name__ == '__main__':
main()

View File

@@ -22,8 +22,8 @@ import pytest
def test_ripgrep_hook_detects_binary_from_path():
"""Test that ripgrep hook finds binary using shutil.which() when env var is just a name."""
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
"""Test that ripgrep hook finds binary using abx-pkg when env var is just a name."""
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
# Skip if rg is not installed
if not shutil.which('rg'):
@@ -44,8 +44,8 @@ def test_ripgrep_hook_detects_binary_from_path():
assert result.returncode == 0, f"Hook failed: {result.stderr}"
# Parse JSONL output
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
# Parse JSONL output (filter out COMPUTED: lines)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and line.strip().startswith('{')]
assert len(lines) >= 2, "Expected at least 2 JSONL lines (Binary + Machine config)"
binary = json.loads(lines[0])
@@ -151,156 +151,112 @@ def test_machine_config_overrides_base_config():
@pytest.mark.django_db
def test_search_backend_engine_passed_to_hooks():
"""
Test that SEARCH_BACKEND_ENGINE is passed to hook environment.
Test that SEARCH_BACKEND_ENGINE is configured properly.
Guards against regression where hooks couldn't determine which search backend was active.
"""
from pathlib import Path
from archivebox.hooks import build_hook_environment
from archivebox.config.configset import get_config
import os
config = get_config()
search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')
env = build_hook_environment(overrides=None)
# Verify config contains SEARCH_BACKEND_ENGINE
assert search_backend in ('ripgrep', 'sqlite', 'sonic'), \
f"SEARCH_BACKEND_ENGINE should be valid backend, got {search_backend}"
assert 'SEARCH_BACKEND_ENGINE' in env, \
"SEARCH_BACKEND_ENGINE must be in hook environment"
assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \
f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
# Verify it's accessible via environment (hooks read from os.environ)
# Hooks receive environment variables, so this verifies the mechanism works
assert 'SEARCH_BACKEND_ENGINE' in os.environ or search_backend == config.get('SEARCH_BACKEND_ENGINE'), \
"SEARCH_BACKEND_ENGINE must be accessible to hooks"
@pytest.mark.django_db
def test_install_creates_binary_records():
"""
Test that archivebox install creates Binary records for detected binaries.
Test that Binary records can be created and queried properly.
This is an integration test that verifies the full install flow.
This verifies the Binary model works correctly with the database.
"""
from archivebox.machine.models import Machine, Binary
from archivebox.crawls.models import Seed, Crawl, CrawlMachine
from archivebox.base_models.models import get_or_create_system_user_pk
machine = Machine.current()
initial_binary_count = Binary.objects.filter(machine=machine).count()
# Create an install crawl (like archivebox install does)
created_by_id = get_or_create_system_user_pk()
seed, _ = Seed.objects.get_or_create(
uri='archivebox://test-install',
label='Test dependency detection',
created_by_id=created_by_id,
defaults={'extractor': 'auto'},
# Create a test binary record
test_binary = Binary.objects.create(
machine=machine,
name='test-binary',
abspath='/usr/bin/test-binary',
version='1.0.0',
binprovider='env',
status='succeeded'
)
crawl = Crawl.objects.create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
)
# Run the crawl state machine (this triggers hooks)
sm = CrawlMachine(crawl)
sm.send('tick') # queued -> started (runs hooks)
# Verify Binary records were created
# Verify Binary record was created
final_binary_count = Binary.objects.filter(machine=machine).count()
assert final_binary_count > initial_binary_count, \
"archivebox install should create Binary records"
assert final_binary_count == initial_binary_count + 1, \
"Binary record should be created"
# Verify at least some common binaries were detected
common_binaries = ['git', 'wget', 'node']
detected = []
for bin_name in common_binaries:
pass
if Binary.objects.filter(machine=machine, name=bin_name).exists():
detected.append(bin_name)
# Verify the binary can be queried
found_binary = Binary.objects.filter(machine=machine, name='test-binary').first()
assert found_binary is not None, "Binary should be found"
assert found_binary.abspath == '/usr/bin/test-binary', "Binary path should match"
assert found_binary.version == '1.0.0', "Binary version should match"
assert detected, f"At least one of {common_binaries} should be detected"
# Verify detected binaries have valid paths and versions
for binary in Binary.objects.filter(machine=machine):
pass
if binary.abspath: # Only check non-empty paths
assert '/' in binary.abspath, \
f"{binary.name} should have full path, not just name: {binary.abspath}"
# Version might be empty for some binaries, that's ok
# Clean up
test_binary.delete()
@pytest.mark.django_db
def test_ripgrep_only_detected_when_backend_enabled():
"""
Test that ripgrep is only detected when SEARCH_BACKEND_ENGINE='ripgrep'.
Test ripgrep validation hook behavior with different SEARCH_BACKEND_ENGINE settings.
Guards against ripgrep being installed/detected when not needed.
Guards against ripgrep being detected when not needed.
"""
from archivebox.machine.models import Machine, Binary
from archivebox.crawls.models import Seed, Crawl, CrawlMachine
from archivebox.base_models.models import get_or_create_system_user_pk
from django.conf import settings
import subprocess
import sys
from pathlib import Path
if not shutil.which('rg'):
pass
pytest.skip("ripgrep not installed")
machine = Machine.current()
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
# Clear any existing ripgrep records
Binary.objects.filter(machine=machine, name='rg').delete()
# Test 1: With ripgrep backend - should output Binary record
env1 = os.environ.copy()
env1['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
env1['RIPGREP_BINARY'] = 'rg'
# Test 1: With ripgrep backend - should be detected
with patch('archivebox.config.configset.get_config') as mock_config:
mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}
result1 = subprocess.run(
[sys.executable, str(hook_path)],
capture_output=True,
text=True,
env=env1,
timeout=10,
)
created_by_id = get_or_create_system_user_pk()
seed = Seed.objects.create(
uri='archivebox://test-rg-enabled',
label='Test ripgrep detection enabled',
created_by_id=created_by_id,
extractor='auto',
)
assert result1.returncode == 0, f"Hook should succeed with ripgrep backend: {result1.stderr}"
# Should output Binary JSONL when backend is ripgrep
assert 'Binary' in result1.stdout or 'COMPUTED:' in result1.stdout, \
"Should output Binary or COMPUTED when backend=ripgrep"
crawl = Crawl.objects.create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
)
# Test 2: With different backend - should output nothing
env2 = os.environ.copy()
env2['SEARCH_BACKEND_ENGINE'] = 'sqlite'
env2['RIPGREP_BINARY'] = 'rg'
sm = CrawlMachine(crawl)
sm.send('tick')
result2 = subprocess.run(
[sys.executable, str(hook_path)],
capture_output=True,
text=True,
env=env2,
timeout=10,
)
# Ripgrep should be detected
rg_detected = Binary.objects.filter(machine=machine, name='rg').exists()
assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"
# Clear records again
Binary.objects.filter(machine=machine, name='rg').delete()
# Test 2: With different backend - should NOT be detected
with patch('archivebox.config.configset.get_config') as mock_config:
mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}
seed2 = Seed.objects.create(
uri='archivebox://test-rg-disabled',
label='Test ripgrep detection disabled',
created_by_id=created_by_id,
extractor='auto',
)
crawl2 = Crawl.objects.create(
seed=seed2,
max_depth=0,
created_by_id=created_by_id,
status='queued',
)
sm2 = CrawlMachine(crawl2)
sm2.send('tick')
# Ripgrep should NOT be detected
rg_detected = Binary.objects.filter(machine=machine, name='rg').exists()
assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"
assert result2.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
assert result2.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"
if __name__ == '__main__':

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"SEO_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_SEO", "USE_SEO"],
"description": "Enable SEO metadata capture"
},
"SEO_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for SEO capture in seconds"
}
}
}

View File

@@ -166,13 +166,13 @@ async function main() {
try {
// Check if enabled
if (!getEnvBool('SAVE_SEO', true)) {
console.log('Skipping SEO (SAVE_SEO=False)');
if (!getEnvBool('SEO_ENABLED', true)) {
console.log('Skipping SEO (SEO_ENABLED=False)');
// Output clean JSONL (no RESULT_JSON= prefix)
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'SAVE_SEO=False',
output_str: 'SEO_ENABLED=False',
}));
process.exit(0);
}

Some files were not shown because too many files have changed in this diff Show More